from __future__ import annotations from pathlib import Path import pandas as pd from app.parsers.base import CMMParser, match_column from app.parsers.models import MeasurementRecord, ParsedReport class ExcelParser(CMMParser): def parse(self, path: Path) -> ParsedReport: if path.suffix.lower() == ".csv": df = pd.read_csv(path) else: df = pd.read_excel(path, engine="openpyxl") col_map = self._map_columns(df.columns.tolist()) measurements = self._extract(df, col_map) return ParsedReport( filename=path.name, measurements=measurements, metadata={"source": "excel", "rows": str(len(df))}, raw_text=df.to_string(max_rows=200), ) def _map_columns(self, headers: list[str]) -> dict[str, str]: """Map canonical field names to actual DataFrame column names.""" mapping: dict[str, str] = {} for header in headers: canonical = match_column(str(header)) if canonical and canonical not in mapping: mapping[canonical] = str(header) return mapping def _extract( self, df: pd.DataFrame, col_map: dict[str, str] ) -> list[MeasurementRecord]: required = {"feature_name", "nominal", "actual"} if not required.issubset(col_map): return self._fallback_extract(df) records: list[MeasurementRecord] = [] for _, row in df.iterrows(): try: nominal = float(row[col_map["nominal"]]) actual = float(row[col_map["actual"]]) tol_plus = ( float(row[col_map["tolerance_plus"]]) if "tolerance_plus" in col_map else 0.0 ) tol_minus = ( float(row[col_map["tolerance_minus"]]) if "tolerance_minus" in col_map else 0.0 ) deviation = ( float(row[col_map["deviation"]]) if "deviation" in col_map else actual - nominal ) records.append( MeasurementRecord( feature_name=str(row[col_map["feature_name"]]), nominal=nominal, tolerance_plus=abs(tol_plus), tolerance_minus=-abs(tol_minus), actual=actual, deviation=deviation, ) ) except (ValueError, TypeError): continue return records def _fallback_extract(self, df: pd.DataFrame) -> list[MeasurementRecord]: """Best-effort extraction when column mapping is incomplete. Treats the first string column as the feature name and the first three numeric columns as nominal, actual, tolerance_plus (with tolerance_minus mirrored). """ numeric_cols = df.select_dtypes(include="number").columns.tolist() str_cols = df.select_dtypes(include="object").columns.tolist() if len(numeric_cols) < 2 or not str_cols: return [] name_col = str_cols[0] nom_col = numeric_cols[0] act_col = numeric_cols[1] tol_col = numeric_cols[2] if len(numeric_cols) > 2 else None records: list[MeasurementRecord] = [] for _, row in df.iterrows(): try: nominal = float(row[nom_col]) actual = float(row[act_col]) tol = float(row[tol_col]) if tol_col else 0.0 records.append( MeasurementRecord( feature_name=str(row[name_col]), nominal=nominal, tolerance_plus=abs(tol), tolerance_minus=-abs(tol), actual=actual, deviation=actual - nominal, ) ) except (ValueError, TypeError): continue return records