Source code for acheck.checks.file

import csv
import logging
import re
import string
from pathlib import Path
from typing import List
from acheck.config import config
from acheck.utils.annotationhelper import parse_annotation
from acheck.checking.check_interface import Check
from acheck.checking.error import ErrorType, Error, FixCode, Fix, Sequence

logger = logging.getLogger(__name__)


[docs]class CSVFormatCheck(Check): """Checks whether the given annotation file has a valid csv format :Options: - sniffer_size: Size parameter for reading the file and determine the csv dialect (e.g. "file_size=4096") - columns: Number of columns in the csv file - quotechar: Quotechar of the csv file - delimiter: Delimiter char of the csv file """ sniffer_size = config.load("Annotation","csv_sniffer_size") columns = config.load("Annotation","csv_columns") quotechar = config.load("Annotation","csv_quotechar") delimiter = config.load("Annotation","csv_delimiter")
[docs] def run(self, annotation_file, domain_file, problem_file, line_limit=-1) -> List[Error]: self.logs.clear() return self._check_csv_structure(annotation_file, sniffer_size=self.options.get("sniffer_size", self.sniffer_size), columns=self.options.get("columns", self.columns), delimiter=self.options.get("delimiter", self.quotechar), quotechar=self.options.get("quotechar", self.delimiter), check_id=self.id)
@staticmethod def _check_csv_structure(file_name: Path, sniffer_size: int, check_id: int, columns=2, delimiter=",", quotechar='|'): try: with open(file_name, 'r', newline='') as csvfile: data = csvfile.read(sniffer_size) if not all([c in string.printable or c.isprintable() for c in data]): return [Error( file_name=file_name, error_type=ErrorType.IllegalCSVFile, fixes=[Fix(f"Found not printable elements", fix_code=FixCode.Alert)], check_id=check_id, )] csvfile.seek(0) reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar) errors = [] for index, row in enumerate(reader): if len(row) != columns and len(row) > 0 or len(row) == 0: errors.append(Error( file_name=file_name, error_type=ErrorType.IllegalCSVFile, line_number=index + 1, fixes=[Fix(f"Illegal number of columns", fix_code=FixCode.Alert)], check_id=check_id, )) return errors except (IOError, OSError, UnicodeError, csv.Error) as e: logger.exception(e) return [ Error( file_name=file_name, error_type=ErrorType.IllegalCSVFile, fixes=[Fix(str(e), fix_code=FixCode.Alert)], check_id=check_id, ) ]
[docs]class ReadFileCheck(Check): """Checks if there are any issues during reading the file by opening it""" @staticmethod def _check_open_file(*infile, check_id): for file in infile: file_error = Error(file, ErrorType.IllegalFile, check_id, fixes=[Fix("Can't open the file")]) try: with open(file, "r") as f: f.close() return [] except (IOError, OSError) as e: file_error.advice = f"{repr(e)}: {infile}" logger.error(e) return [file_error]
[docs] def run(self, annotation_file, domain_file, problem_file, line_limit=-1) -> List[Error]: self.logs.clear() return self._check_open_file(domain_file, problem_file, annotation_file, check_id=self.id)
[docs]class CharacterCheck(Check): """Checks if there are any characters that are not allowed :Options: - regex_characters: The regular expression that matches all allowed characters for the whole annotation - regex_time: The regular expression that matches all allowed characters plus the punctuation mark period """ regex_characters = config.load("Annotation","regex_characters") regex_time = config.load("Annotation","regex_time")
[docs] def run(self, annotation_file, domain_file, problem_file, line_limit: int = -1) -> List[Error]: self.logs.clear() return self._check_characters(annotation_file=annotation_file, check_id=self.id, regex_characters=self.options.get("regex_characters", self.regex_characters), regex_time=self.options.get("regex_time", self.regex_time), line_limit=line_limit )
@staticmethod def _check_characters(annotation_file, check_id, regex_characters, regex_time, line_limit) -> List[Error]: error_list = [] times, divs, expressions = parse_annotation(annotation_file, line_limit) p_symbols = re.compile(regex_characters) p_time_symbols = re.compile(regex_time) for index, (time, div, exp) in enumerate(zip(times, divs, expressions), start=1): if (time + div + exp).strip() == "": continue for m in p_time_symbols.finditer(time): error_list.append( Error( annotation_file, incorrect_sequence=Sequence(m.start(), m.group()), fixes=[Fix(fix_code=FixCode.RemoveSequence)], error_type=ErrorType.IllegalCharacter, line_number=index, check_id=check_id, ) ) for n in p_symbols.finditer(div + exp): error_list.append( Error( annotation_file, incorrect_sequence=Sequence(len(time) + n.start(), n.group()), fixes=[Fix(fix_code=FixCode.RemoveSequence)], error_type=ErrorType.IllegalCharacter, line_number=index, check_id=check_id ) ) return error_list