Tutorial

Basic .txt files spellchecker

Command line interface

"""__main__.py"""

import argparse
import sys

from hunspellcheck import (
    hunspellchecker_argument_parser,
    looks_like_a_word_creator,
    render_hunspell_word_error,
    HunspellChecker,
)


def build_parser():
    """Create the command line argument parser of the spellchecker.

    A plain ``argparse.ArgumentParser`` is created and then handed to
    ``hunspellchecker_argument_parser``, requesting a version option that
    reports ``1.0.0``.

    Returns
    -------

    argparse.ArgumentParser
      The parser object that was passed to the hunspellcheck helper.
    """
    cli_parser = argparse.ArgumentParser(description="TXT files spellchecker.")
    version_options = {"version": True, "version_number": "1.0.0"}
    hunspellchecker_argument_parser(cli_parser, **version_options)
    return cli_parser


def main():
    """Spellcheck the files given in the command line.

    Parses the command line arguments, reads every file listed in the
    ``files`` property of the parsed namespace, checks their contents with
    ``HunspellChecker`` and prints each rendered error to the standard
    error stream.

    Returns
    -------

    int
      ``0`` if the spellchecker reported no errors, ``1`` otherwise.
    """
    opts = build_parser().parse_args()

    # Extracting the content from the files is the task you must focus on.
    # By default, the files are passed as globs in positional arguments and
    # stored in the 'files' property of the namespace.
    filenames_contents = {}
    for filename in opts.files:
        # Decode the file with the encoding selected by the user, as the
        # public API example does; ``None`` keeps the default of ``open``.
        with open(filename, "r", encoding=opts.encoding) as f:
            filenames_contents[filename] = f.read()

    looks_like_a_word = looks_like_a_word_creator(
        digits_are_words=opts.digits_are_words,
        words_can_contain_digits=opts.words_can_contain_digits,
        words_can_startswith_dash=opts.words_can_startswith_dash,
        words_can_endswith_dash=opts.words_can_endswith_dash,
        words_can_contain_dash=opts.words_can_contain_dash,
        words_can_contain_two_upper=opts.words_can_contain_two_upper,
    )

    spellchecker = HunspellChecker(
        filenames_contents=filenames_contents,
        languages=opts.languages,
        personal_dicts=opts.personal_dicts,
        encoding=opts.encoding,
        looks_like_a_word=looks_like_a_word,
    )

    for word_error in spellchecker.check(
        include_filename=opts.include_filename,
        include_line_number=opts.include_line_number,
        include_word=opts.include_word,
        include_word_line_index=opts.include_word_line_index,
        include_line=opts.include_line,
        include_text=opts.include_text,
        include_error_number=opts.include_error_number,
        include_near_misses=opts.include_near_misses,
    ):
        print(render_hunspell_word_error(word_error), file=sys.stderr)

    # A non-zero exit code signals that at least one error was found.
    return 1 if spellchecker.errors else 0


if __name__ == "__main__":
    # Use the value returned by main() as the exit status of the process.
    raise SystemExit(main())

You can see the usage by passing the --help option to this script.

To use it, just create a .txt file and pass its filename as a positional argument, selecting the language with the --language option:

hola hello
$ python3 __main__.py --language es_ES foo.txt
foo.txt:hello:1:5

Public API interface

"""__init__.py"""

import glob

from hunspellcheck import (
   HunspellChecker,
   assert_is_valid_dictionary_language_or_filename,
   looks_like_a_word_creator,
)


def txt_file_to_content(filename, encoding=None):
    """Read a text file and return its whole content.

    Parameters
    ----------

    filename : str
      Path of the file to read.

    encoding : str, optional
      Encoding passed to ``open`` to decode the file.

    Returns
    -------

    str
      Full text of the file.
    """
    with open(filename, encoding=encoding) as txt_file:
        content = txt_file.read()
    return content


def txt_spell(
    files,
    languages,
    personal_dicts=None,
    negotiate_languages=False,
    encoding=None,
    digits_are_words=False,
    words_can_contain_digits=True,
    words_can_startswith_dash=True,
    words_can_endswith_dash=True,
    words_can_contain_dash=True,
    words_can_contain_two_upper=True,
    include_filename=True,
    include_line_number=True,
    include_word=True,
    include_word_line_index=True,
    include_line=False,
    include_text=False,
    include_error_number=False,
    include_near_misses=False,
):
    """Text files spellchecker function.

    This is a generator function: nothing is validated, read or checked
    until the returned generator starts being iterated.

    Parameters
    ----------

    files : list
      List of path globs to check.

    languages : list
      Languages whose dictionaries are used to exclude words from being
      considered misspelling errors.

    personal_dicts : list, optional
      Personal dictionaries used to exclude certain words from being
      considered misspelling errors.

    negotiate_languages : bool, optional
      If ``True``, you can pass territory codes as dictionary names, for
      example ``"es"`` instead of ``"es_ES"``.

    encoding : str, optional
      Input encoding. It is used to read the files and is also passed to
      the checker. If not defined, it will be autodetected by hunspell.

    digits_are_words : bool, optional
      If ``False``, values with all characters as digits will not be
      considered words, so they will not be checked for misspelling errors.

    words_can_contain_digits : bool, optional
      If ``False``, values with at least one digit character will not be
      considered words, so they will not be checked for misspelling errors.

    words_can_startswith_dash : bool, optional
      If ``False``, values starting with the ``-`` character will not be
      considered words, so they will not be checked for misspelling errors.

    words_can_endswith_dash : bool, optional
      If ``False``, values ending with the ``-`` character will not be
      considered words, so they will not be checked for misspelling errors.

    words_can_contain_dash : bool, optional
      If ``False``, values containing the ``-`` character will not be
      considered words, so they will not be checked for misspelling errors.

    words_can_contain_two_upper : bool, optional
      If ``False``, values containing two uppercase letters will not be
      considered words, so they will not be checked for misspelling errors.

    include_filename : bool, optional
      Include the filename in which a misspelling error has been found.

    include_line_number : bool, optional
      Include the line number in which a misspelling error has been found.

    include_word : bool, optional
      Include the misspelled word in each misspelling error message.

    include_word_line_index : bool, optional
      Include the index of the character in which the misspelled word
      starts in its line (starting at index 0).

    include_line : bool, optional
      Include the entire line where each misspelled word resides.

    include_text : bool, optional
      Include the full text where the misspelled word resides.

    include_error_number : bool, optional
      Include the number of the error in yielded data. This could be useful
      to avoid the need to define a counter.

    include_near_misses : bool, optional
      Include a list with the near misses for the misspelled word.

    Yields
    ------

    Each item produced by ``HunspellChecker.check`` for the given files,
    one per misspelling error found.
    """
    assert_is_valid_dictionary_language_or_filename(
        languages,
        negotiate_languages=negotiate_languages,
    )

    # Expand every glob and map each matching filename to its text content.
    filename_contents = {}
    for glob_files in files:
        for filename in glob.glob(glob_files):
            filename_contents[filename] = txt_file_to_content(
                filename,
                encoding=encoding,
            )

    yield from HunspellChecker(
        filename_contents,
        languages,
        personal_dicts=personal_dicts,
        looks_like_a_word=looks_like_a_word_creator(
            digits_are_words=digits_are_words,
            words_can_contain_digits=words_can_contain_digits,
            words_can_startswith_dash=words_can_startswith_dash,
            words_can_endswith_dash=words_can_endswith_dash,
            words_can_contain_dash=words_can_contain_dash,
            words_can_contain_two_upper=words_can_contain_two_upper,
        ),
        encoding=encoding,
    ).check(
        include_filename=include_filename,
        include_line_number=include_line_number,
        include_word=include_word,
        include_word_line_index=include_word_line_index,
        include_line=include_line,
        include_text=include_text,
        include_error_number=include_error_number,
        include_near_misses=include_near_misses,
    )

The function is a generator that yields one dictionary for each misspelling error found:

Input

hello hola
for word_error in txt_spell(["foo.txt"], "es_ES"):
    print(word_error)

Output

{'filename': 'foo.txt', 'line_number': 1, 'word': 'hello', 'word_line_index': 0}

See also

Public API