# Source code for acheck.checks.spelling

import re
from typing import List, Dict
import difflib

from collections import Counter
from acheck.config import config
from acheck.checking.check_interface import Check, ToolObjectsMeta, CheckGroup
from acheck.checking.error import Error, ErrorType, Sequence, Fix, FixCode, ErrorLevel
from acheck.utils.annotationhelper import parse_annotation
import logging

logger = logging.getLogger(__name__)



class SpellCheck(Check):
    """Checks whether the given annotation has spelling mistakes

    Every word the enchant dictionary rejects is reported as a
    ``WrongSpelling`` warning that carries replacement candidates plus the
    generic "custom replacement", "add to dictionary" and "remove" fixes.

    :Options:
        - language:
            Supported languages if installed correctly: en_US, en_GB, de_DE
    """

    # Dictionary language used when the "language" option is not set.
    language = config.load("Annotation","lang")

    def run(self, annotation_file, domain_file, problem_file, line_limit=-1) -> List[Error]:
        """Spell-check the annotation file and return the problems found.

        :param annotation_file: annotation file to check; it is handed to
            ``parse_annotation`` and recorded as ``file_name`` on every error
        :param domain_file: not used by this check
        :param problem_file: not used by this check
        :param line_limit: forwarded unchanged to ``parse_annotation``
        :return: one ``Error`` per occurrence of a word rejected by enchant
        :raises Exception: whatever importing ``enchant`` raised; a hint is
            appended to ``self.logs`` before the exception is re-raised
        """
        self.logs.clear()
        try:
            # Imported lazily so that a missing enchant C library only breaks
            # this check instead of the import of the whole module.
            import enchant.checker
        except Exception:
            self.logs.append("An error occurred during import. Try to install the enchant C-library")
            raise

        return self._check_spelling(
            annotation_file=annotation_file,
            pwl=self.tool_meta.pwl,
            pel=self.tool_meta.pel,
            auto_add=False,
            check_id=self.id,
            ench=enchant,
            lang=self.options.get("language", self.language),
            line_limit=line_limit,
        )

    @staticmethod
    def _spelling_error(annotation_file, check_id, line_number, start_index, word, replacements):
        """Build the ``WrongSpelling`` warning for one occurrence of ``word``.

        :param annotation_file: stored as ``file_name`` of the error
        :param check_id: id of the reporting check
        :param line_number: 1-based index of the expression containing the word
        :param start_index: offset of the word, stored in the ``Sequence``
        :param word: the rejected word
        :param replacements: candidate spellings, best candidate first
        :return: the assembled ``Error``
        """
        fixes = [Fix(candidate, fix_code=FixCode.ReplaceSequence) for candidate in replacements]
        fixes.append(Fix("{{custom}}", fix_code=FixCode.ReplaceSequence))
        fixes.append(Fix(fix_code=FixCode.AddToDict, correct_string=word))
        fixes.append(Fix(fix_code=FixCode.RemoveSequence))
        return Error(
            file_name=annotation_file,
            error_type=ErrorType.WrongSpelling,
            fixes=fixes,
            line_number=line_number,
            incorrect_sequence=Sequence(start_index, word),
            error_level=ErrorLevel.Warning,
            check_id=check_id,
        )

    @staticmethod
    def _select_candidates(word, suggestions, candidates_number, frequency_list, auto_add):
        """Reduce enchant's suggestions for ``word`` to a ranked candidate list.

        :param word: the rejected word
        :param suggestions: raw suggestions returned by the enchant dictionary
        :param candidates_number: maximum number of candidates to keep
        :param frequency_list: word -> count mapping used for ranking
        :param auto_add: if true, suggestions containing a space are joined
            into a single token; otherwise suggestions containing a space, a
            hyphen or matching the uppercase pattern are dropped
        :return: at most ``candidates_number`` close matches; words that occur
            more often in ``frequency_list`` come first
        """
        if auto_add:
            joined = [x.replace(" ", "").replace("-", "") if " " in x else x for x in suggestions]
            # Joining can yield the same token twice; dict.fromkeys removes the
            # duplicates while keeping the first-seen order.
            suggestions = list(dict.fromkeys(joined))
        else:
            # NOTE: maybe suggestions with uppercase letters should be kept.
            suggestions = [
                x for x in suggestions
                if " " not in x and "-" not in x and not re.match(r'\w*[A-Z]\w*', x)
            ]

        matches = difflib.get_close_matches(word, suggestions, candidates_number)
        # The sort is stable, so candidates with equal frequency keep the
        # similarity order produced by difflib.
        matches.sort(key=lambda x: _frequency_in_text(x, frequency_list), reverse=True)
        return matches

    @staticmethod
    def _check_spelling(annotation_file, check_id, line_limit, ench, lang, pwl=None, pel=None, auto_add=False):
        """Check every word of the annotation against an enchant dictionary.

        Each expression returned by ``parse_annotation`` is split into words
        at the configured whitespace and term dividers. A word rejected by the
        dictionary yields one error per occurrence; the candidates computed
        for its first occurrence are reused for all later ones.

        :param lang: language tag passed to ``DictWithPWL``
        :param ench: the imported ``enchant`` module
        :param line_limit: forwarded unchanged to ``parse_annotation``
        :param check_id: id stored on every created error
        :param annotation_file: annotation file to parse and report on
        :param pwl: personal word list passed to ``DictWithPWL``
        :param pel: personal exclude list passed to ``DictWithPWL``
        :param auto_add: selects how suggestions are filtered, see
            ``_select_candidates``
        :return: list of ``WrongSpelling`` errors in order of appearance
        """
        term_divider = config.load("Annotation","term_divider")
        whitespace_divider = config.load("Annotation","whitespace_divider")
        candidates_number = config.load("SpellCheck","candidates")

        times, divs, expressions = parse_annotation(annotation_file, line_limit)
        if (pwl or pel) is None:
            # True when pwl is unset/empty and pel is None: no word list was
            # initialised, so fall back to lists named "deleteme".
            logger.error("Trying to access enchant without initialized dictionary. \n"
                         "Try to initialize it before you run the spellcheck. \n"
                         "Otherwise it won't work properly")
            enchant_dict = ench.DictWithPWL(lang, "deleteme", "deleteme")
        else:
            enchant_dict = ench.DictWithPWL(lang, pwl, pel)

        frequency_list = _count_words(" ".join(expressions), whitespace_divider, term_divider)
        candidates_dict = {}  # rejected word -> candidates of its first occurrence
        error_list = []

        for line_index, expression in enumerate(expressions, start=1):
            if expression.strip() == "":
                continue

            words = expression.replace(whitespace_divider, " ").replace(term_divider, " ")

            # The expression starts right after the time and divider parts.
            # NOTE(review): the offsets assume every divider is exactly one
            # character long, so that the replacement above keeps positions.
            start_index = len(times[line_index - 1]) + len(divs[line_index - 1])
            for word in words.split(" "):
                if word:
                    matches = candidates_dict.get(word)
                    if matches is None and not enchant_dict.check(word):
                        matches = SpellCheck._select_candidates(
                            word, enchant_dict.suggest(word), candidates_number, frequency_list, auto_add
                        )
                        candidates_dict[word] = matches
                    if matches is not None:
                        error_list.append(
                            SpellCheck._spelling_error(
                                annotation_file, check_id, line_index, start_index, word, matches
                            )
                        )
                # Advance past the token and the separator behind it. Empty
                # tokens (consecutive, leading or trailing separators) must
                # advance too, otherwise all later offsets drift to the left.
                start_index += len(word) + 1

        return error_list


def _frequency_in_text(word: str, frequency_list: dict):
    frequency = 0
    if word in frequency_list.keys():
        frequency = frequency_list.get(word)
    return frequency


def _count_words(text: str, *divider: str) -> Dict:
    text = text.replace("\n", " ")
    for div in divider:
        text = text.replace(div, " ")
    return {x: y for x, y in Counter(text.split(" ")).items() if y > 1 and len(x) > 1}