Files
cmm-report-analyzer/app/parsers/excel_parser.py
chrisryn 9abf9b4b58 Initial commit: CMM Report Analyzer
FastAPI app that parses CMM inspection reports (PDF/Excel/CSV),
computes SPC metrics (Cp/Cpk/Pp/Ppk, control limits, Shapiro-Wilk),
generates interactive Plotly charts, and provides AI-powered quality
summaries via Azure OpenAI with graceful fallback.

Includes 21 passing tests covering parsers, SPC calculations, and
API endpoints.
2026-02-19 10:38:51 -06:00

113 lines
4.1 KiB
Python

from __future__ import annotations
from pathlib import Path
import pandas as pd
from app.parsers.base import CMMParser, match_column
from app.parsers.models import MeasurementRecord, ParsedReport
class ExcelParser(CMMParser):
    """Parser for tabular reports stored as Excel workbooks or CSV files.

    Columns are matched to canonical field names via ``match_column``; when
    the required columns cannot be identified, a positional best-effort
    extraction is used instead.
    """

    def parse(self, path: Path) -> ParsedReport:
        """Read *path* into a DataFrame and convert its rows to measurements.

        Files with a ``.csv`` suffix (case-insensitive) are read with
        ``pandas.read_csv``; anything else is read as an Excel workbook using
        the ``openpyxl`` engine.

        The returned report's metadata always labels the source as
        ``"excel"`` (also for CSV input) and records the DataFrame row count
        as a string. ``raw_text`` is the DataFrame rendered with at most 200
        rows shown.
        """
        if path.suffix.lower() == ".csv":
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path, engine="openpyxl")

        col_map = self._map_columns(df.columns.tolist())
        measurements = self._extract(df, col_map)
        return ParsedReport(
            filename=path.name,
            measurements=measurements,
            metadata={"source": "excel", "rows": str(len(df))},
            raw_text=df.to_string(max_rows=200),
        )

    def _map_columns(self, headers: list[str]) -> dict[str, str]:
        """Map canonical field names to actual DataFrame column names.

        Each header is stringified and passed to ``match_column``; a truthy
        result is taken as the canonical name. When several headers resolve
        to the same canonical name, the first one wins.
        """
        mapping: dict[str, str] = {}
        for header in headers:
            canonical = match_column(str(header))
            if canonical and canonical not in mapping:
                mapping[canonical] = str(header)
        return mapping

    @staticmethod
    def _to_float(value: object, default: float | None = None) -> float | None:
        """Convert a cell to ``float``, returning *default* for NaN cells.

        pandas represents blank cells as NaN, which ``float()`` accepts
        without complaint, so missing values must be detected explicitly.

        Raises:
            ValueError, TypeError: if *value* cannot be converted at all
                (e.g. free text); callers use this to skip the row.
        """
        number = float(value)
        return default if pd.isna(number) else number

    def _extract(
        self, df: pd.DataFrame, col_map: dict[str, str]
    ) -> list[MeasurementRecord]:
        """Build measurement records using the canonical column mapping.

        Falls back to :meth:`_fallback_extract` unless ``feature_name``,
        ``nominal`` and ``actual`` are all present in *col_map*.

        Rows are skipped when nominal or actual is blank (NaN) or when any
        used cell cannot be converted to a number. Blank optional cells
        behave as if the column were absent: tolerances default to ``0.0``
        and deviation is computed as ``actual - nominal``.
        """
        required = {"feature_name", "nominal", "actual"}
        if not required.issubset(col_map):
            return self._fallback_extract(df)

        name_col = col_map["feature_name"]
        nom_col = col_map["nominal"]
        act_col = col_map["actual"]
        tol_plus_col = col_map.get("tolerance_plus")
        tol_minus_col = col_map.get("tolerance_minus")
        dev_col = col_map.get("deviation")

        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            try:
                nominal = self._to_float(row[nom_col])
                actual = self._to_float(row[act_col])
                if nominal is None or actual is None:
                    # Blank/trailing row: not a measurement.
                    continue

                tol_plus = (
                    self._to_float(row[tol_plus_col], 0.0)
                    if tol_plus_col is not None
                    else 0.0
                )
                tol_minus = (
                    self._to_float(row[tol_minus_col], 0.0)
                    if tol_minus_col is not None
                    else 0.0
                )
                deviation = (
                    self._to_float(row[dev_col]) if dev_col is not None else None
                )
                if deviation is None:
                    deviation = actual - nominal
            except (ValueError, TypeError):
                # Non-numeric content in a numeric field: skip the row.
                continue

            records.append(
                MeasurementRecord(
                    feature_name=str(row[name_col]),
                    nominal=nominal,
                    # Normalise signs: upper tolerance >= 0, lower <= 0.
                    tolerance_plus=abs(tol_plus),
                    tolerance_minus=-abs(tol_minus),
                    actual=actual,
                    deviation=deviation,
                )
            )
        return records

    def _fallback_extract(self, df: pd.DataFrame) -> list[MeasurementRecord]:
        """Best-effort extraction when column mapping is incomplete.

        Treats the first object-dtype column as the feature name and the
        first three numeric columns as nominal, actual, tolerance_plus (with
        tolerance_minus mirrored). Returns an empty list when fewer than two
        numeric columns or no object-dtype column exist.

        Rows with a blank (NaN) nominal or actual are skipped; a blank
        tolerance cell is treated as ``0.0``.
        """
        numeric_cols = df.select_dtypes(include="number").columns.tolist()
        str_cols = df.select_dtypes(include="object").columns.tolist()
        if len(numeric_cols) < 2 or not str_cols:
            return []

        name_col = str_cols[0]
        nom_col = numeric_cols[0]
        act_col = numeric_cols[1]
        tol_col = numeric_cols[2] if len(numeric_cols) > 2 else None

        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            try:
                nominal = self._to_float(row[nom_col])
                actual = self._to_float(row[act_col])
                if nominal is None or actual is None:
                    continue
                # Compare against None: a column label may itself be falsy
                # (e.g. the integer 0 or an empty string).
                tol = (
                    self._to_float(row[tol_col], 0.0)
                    if tol_col is not None
                    else 0.0
                )
            except (ValueError, TypeError):
                continue

            records.append(
                MeasurementRecord(
                    feature_name=str(row[name_col]),
                    nominal=nominal,
                    tolerance_plus=abs(tol),
                    tolerance_minus=-abs(tol),
                    actual=actual,
                    deviation=actual - nominal,
                )
            )
        return records