from __future__ import annotations import re from abc import ABC, abstractmethod from pathlib import Path from app.parsers.models import ParsedReport # Fuzzy column-name patterns → canonical field name COLUMN_PATTERNS: dict[str, list[re.Pattern[str]]] = { "feature_name": [ re.compile(r"feat|char|dimen|label|id|name|item", re.I), ], "nominal": [ re.compile(r"nom|target|blueprint|print", re.I), ], "tolerance_plus": [ re.compile(r"tol.*\+|upper.*tol|\+.*tol|usl|dev.*\+|pos.*tol", re.I), ], "tolerance_minus": [ re.compile(r"tol.*-|lower.*tol|-.*tol|lsl|dev.*-|neg.*tol", re.I), ], "actual": [ re.compile(r"actual|meas|value|result|reading", re.I), ], "deviation": [ re.compile(r"dev(?!.*tol)|diff|error|delta", re.I), ], } def match_column(header: str) -> str | None: """Return the canonical field name for a header string, or None.""" header = header.strip() for field_name, patterns in COLUMN_PATTERNS.items(): for pat in patterns: if pat.search(header): return field_name return None class CMMParser(ABC): @abstractmethod def parse(self, path: Path) -> ParsedReport: ... def get_parser(filename: str) -> CMMParser: suffix = Path(filename).suffix.lower() if suffix == ".pdf": from app.parsers.pdf_parser import PDFParser return PDFParser() if suffix in (".xlsx", ".xls", ".csv"): from app.parsers.excel_parser import ExcelParser return ExcelParser() raise ValueError(f"Unsupported file type: {suffix}")