Source code for acheck.checks.spelling

import re
from typing import List, Dict
import difflib

from collections import Counter
from acheck.config import config
from acheck.checking.check_interface import Check, ToolObjectsMeta, CheckGroup
from acheck.checking.error import Error, ErrorType, Sequence, Fix, FixCode, ErrorLevel
from acheck.utils.annotationhelper import parse_annotation
import logging

logger = logging.getLogger(__name__)



class SpellCheck(Check):
    """Checks whether the given annotation has spelling mistakes

    :Options:
        - language: Supported languages if installed correctly: en_US, en_GB, de_DE
    """

    # Default dictionary language, loaded from the "Annotation" config section.
    # Used by run() when the check options contain no "language" entry.
    language = config.load("Annotation", "lang")

    def run(self, annotation_file, domain_file, problem_file, line_limit=-1) -> List[Error]:
        """Run the spell check on an annotation file.

        :param annotation_file: annotation file whose expressions are checked
        :param domain_file: not used by this check
        :param problem_file: not used by this check
        :param line_limit: forwarded unchanged to the annotation parser
        :return: one warning per misspelled word occurrence
        :raises Exception: whatever importing ``enchant`` raised; a hint is
            appended to ``self.logs`` before the exception is re-raised
        """
        self.logs.clear()
        try:
            # Imported lazily so a missing enchant C library only breaks this check.
            import enchant.checker
        except Exception:
            self.logs.append("An error occurred during import. Try to install the enchant C-library")
            raise
        return self._check_spelling(
            annotation_file=annotation_file,
            pwl=self.tool_meta.pwl,
            pel=self.tool_meta.pel,
            auto_add=False,
            check_id=self.id,
            ench=enchant,
            lang=self.options.get("language", self.language),
            line_limit=line_limit,
        )

    @staticmethod
    def _check_spelling(annotation_file, check_id, line_limit, ench, lang, pwl=None, pel=None, auto_add=False):
        """Spell-check every expression of an annotation file.

        :param annotation_file: annotation file handed to ``parse_annotation``
        :param check_id: id stored on every reported error
        :param line_limit: forwarded unchanged to ``parse_annotation``
        :param ench: the imported ``enchant`` module; must provide ``DictWithPWL``
        :param lang: language tag of the dictionary, e.g. ``en_US``
        :param pwl: personal word list argument for ``DictWithPWL``
        :param pel: personal exclude list argument for ``DictWithPWL``
        :param auto_add: if true, suggestions containing a space are kept with
            their spaces and hyphens removed; otherwise suggestions containing
            a space, a hyphen or an uppercase letter are dropped
        :return: list of ``ErrorType.WrongSpelling`` warnings, in reading order
        """
        error_list = []
        term_divider = config.load("Annotation", "term_divider")
        whitespace_divider = config.load("Annotation", "whitespace_divider")
        candidates_number = config.load("SpellCheck", "candidates")
        times, divs, expressions = parse_annotation(annotation_file, line_limit)

        enchant_dict = ench.DictWithPWL(lang, pwl, pel)
        # NOTE: this is true only when pwl is falsy AND pel is None; a missing
        # pel alone keeps the dictionary built above.
        if (pwl or pel) is None:
            logger.error("Trying to access enchant without initialized dictionary. \n"
                         "Try to initialize it before you run the spellcheck. \n"
                         "Otherwise it won't work properly")
            # Fall back to word lists at the placeholder path "deleteme".
            enchant_dict = ench.DictWithPWL(lang, "deleteme", "deleteme")

        frequency_list = _count_words(" ".join(expressions), whitespace_divider, term_divider)
        # Compiled once instead of once per suggestion.
        uppercase_word = re.compile(r'\w*[A-Z]\w*')
        # Misspelled word -> replacement candidates, so each word is looked up once.
        candidates_dict = {}

        def find_candidates(word):
            """Return ranked replacement candidates for a misspelled word."""
            suggestions = enchant_dict.suggest(word)
            if auto_add:
                suggestions = [s.replace(" ", "").replace("-", "") if " " in s else s
                               for s in suggestions]
                # Drop duplicates while keeping the order of the suggestions.
                suggestions = list(dict.fromkeys(suggestions))
            else:
                # maybe not eliminate uppercase letters
                suggestions = [s for s in suggestions
                               if " " not in s and "-" not in s and not uppercase_word.match(s)]
            matches = difflib.get_close_matches(word, suggestions, candidates_number)
            # Prefer candidates that already occur repeatedly in the annotation.
            if set(matches).intersection(frequency_list.keys()):
                matches.sort(key=lambda m: _frequency_in_text(m, frequency_list), reverse=True)
            return matches

        def spelling_error(line_number, column, word, candidates):
            """Build the warning for one misspelled word occurrence."""
            fixes = [Fix(c, fix_code=FixCode.ReplaceSequence) for c in candidates]
            fixes.append(Fix("{{custom}}", fix_code=FixCode.ReplaceSequence))
            fixes.append(Fix(fix_code=FixCode.AddToDict, correct_string=word))
            fixes.append(Fix(fix_code=FixCode.RemoveSequence))
            return Error(
                file_name=annotation_file,
                error_type=ErrorType.WrongSpelling,
                fixes=fixes,
                line_number=line_number,
                incorrect_sequence=Sequence(column, word),
                error_level=ErrorLevel.Warning,
                check_id=check_id,
            )

        for line_index, expression in enumerate(expressions, start=1):
            if expression.strip() == "":
                continue
            # Replace each divider by as many spaces as it is long, so token
            # columns stay aligned with the original line. An empty divider
            # becomes a no-op instead of interleaving spaces.
            words = expression
            for div in (whitespace_divider, term_divider):
                words = words.replace(div, " " * len(div))

            # Column at which the expression starts within its line.
            start_index = len(times[line_index - 1]) + len(divs[line_index - 1])
            for word in words.split(" "):
                # Empty tokens come from consecutive, leading or trailing
                # dividers; they are not checked but still occupy one column.
                if word:
                    matches = candidates_dict.get(word)
                    if matches is None and not enchant_dict.check(word):
                        matches = find_candidates(word)
                        candidates_dict[word] = matches
                    if matches is not None:
                        error_list.append(spelling_error(line_index, start_index, word, matches))
                # Advance past the token and the single divider column after it.
                start_index += len(word) + 1
        return error_list
def _frequency_in_text(word: str, frequency_list: dict): frequency = 0 if word in frequency_list.keys(): frequency = frequency_list.get(word) return frequency def _count_words(text: str, *divider: str) -> Dict: text = text.replace("\n", " ") for div in divider: text = text.replace(div, " ") return {x: y for x, y in Counter(text.split(" ")).items() if y > 1 and len(x) > 1}