Initial commit: CMM Report Analyzer

FastAPI app that parses CMM inspection reports (PDF/Excel/CSV),
computes SPC metrics (Cp/Cpk/Pp/Ppk, control limits, Shapiro-Wilk),
generates interactive Plotly charts, and provides AI-powered quality
summaries via Azure OpenAI with graceful fallback.

Includes 21 passing tests covering parsers, SPC calculations, and
API endpoints.
This commit is contained in:
chrisryn
2026-02-19 10:38:51 -06:00
commit 9abf9b4b58
28 changed files with 1727 additions and 0 deletions

0
app/parsers/__init__.py Normal file
View File

55
app/parsers/base.py Normal file
View File

@@ -0,0 +1,55 @@
from __future__ import annotations
import re
from abc import ABC, abstractmethod
from pathlib import Path
from app.parsers.models import ParsedReport
# Fuzzy column-name patterns -> canonical field name.
# NOTE: match_column() tries fields in insertion order, so the specific
# tolerance patterns are consulted before the generic "deviation" one.
COLUMN_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "feature_name": [
        # \bid\b (instead of bare "id") avoids false positives on headers
        # that merely contain the letters "id", such as "Width".
        re.compile(r"feat|char|dimen|label|\bid\b|name|item", re.I),
    ],
    "nominal": [
        re.compile(r"nom|target|blueprint|print", re.I),
    ],
    "tolerance_plus": [
        re.compile(r"tol.*\+|upper.*tol|\+.*tol|usl|dev.*\+|pos.*tol", re.I),
    ],
    "tolerance_minus": [
        re.compile(r"tol.*-|lower.*tol|-.*tol|lsl|dev.*-|neg.*tol", re.I),
    ],
    "actual": [
        re.compile(r"actual|meas|value|result|reading", re.I),
    ],
    "deviation": [
        # Negative lookahead keeps "Dev Tol" headers from landing here.
        re.compile(r"dev(?!.*tol)|diff|error|delta", re.I),
    ],
}


def match_column(header: str) -> str | None:
    """Return the canonical field name for a header string, or None.

    Fields are tried in COLUMN_PATTERNS insertion order; the first pattern
    that matches anywhere in the stripped header wins.
    """
    stripped = header.strip()
    for field_name, patterns in COLUMN_PATTERNS.items():
        if any(pat.search(stripped) for pat in patterns):
            return field_name
    return None
class CMMParser(ABC):
    """Abstract interface implemented by each report-format parser."""

    @abstractmethod
    def parse(self, path: Path) -> ParsedReport:
        """Parse the CMM report at *path* into a ParsedReport."""
        ...
def get_parser(filename: str) -> CMMParser:
    """Select a parser implementation from the file extension.

    Imports are deferred so that using one parser does not require the
    other parser's dependencies to be installed.

    Raises:
        ValueError: if the extension is not .pdf, .xlsx, .xls, or .csv.
    """
    suffix = Path(filename).suffix.lower()
    if suffix in {".xlsx", ".xls", ".csv"}:
        from app.parsers.excel_parser import ExcelParser

        return ExcelParser()
    if suffix == ".pdf":
        from app.parsers.pdf_parser import PDFParser

        return PDFParser()
    raise ValueError(f"Unsupported file type: {suffix}")

112
app/parsers/excel_parser.py Normal file
View File

@@ -0,0 +1,112 @@
from __future__ import annotations
from pathlib import Path
import pandas as pd
from app.parsers.base import CMMParser, match_column
from app.parsers.models import MeasurementRecord, ParsedReport
class ExcelParser(CMMParser):
    """Parses CMM inspection reports from Excel (.xlsx/.xls) or CSV files."""

    def parse(self, path: Path) -> ParsedReport:
        """Load the spreadsheet, map its columns, and extract measurements."""
        if path.suffix.lower() == ".csv":
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path, engine="openpyxl")
        col_map = self._map_columns(df.columns.tolist())
        measurements = self._extract(df, col_map)
        return ParsedReport(
            filename=path.name,
            measurements=measurements,
            metadata={"source": "excel", "rows": str(len(df))},
            raw_text=df.to_string(max_rows=200),
        )

    def _map_columns(self, headers: list[str]) -> dict[str, str]:
        """Map canonical field names to actual DataFrame column names.

        The first header that matches a canonical field wins; later
        duplicates are ignored.
        """
        mapping: dict[str, str] = {}
        for header in headers:
            canonical = match_column(str(header))
            if canonical and canonical not in mapping:
                mapping[canonical] = str(header)
        return mapping

    @staticmethod
    def _num(row: pd.Series, col: str | None) -> float | None:
        """Convert one cell to float; None for a missing column, NaN, or junk.

        float(NaN) succeeds, so NaN must be filtered explicitly — pandas
        reads blank cells as NaN, and a NaN measurement would otherwise
        always compare as out-of-tolerance downstream.
        """
        if col is None:
            return None
        try:
            value = float(row[col])
        except (ValueError, TypeError):
            return None
        return None if pd.isna(value) else value

    def _extract(
        self, df: pd.DataFrame, col_map: dict[str, str]
    ) -> list[MeasurementRecord]:
        """Build records from mapped columns, skipping unusable rows.

        Rows whose nominal or actual cell is missing/unparseable are
        dropped; a bad tolerance or deviation cell no longer discards the
        whole row — tolerances default to 0.0 and deviation is recomputed
        as actual - nominal.
        """
        required = {"feature_name", "nominal", "actual"}
        if not required.issubset(col_map):
            return self._fallback_extract(df)
        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            nominal = self._num(row, col_map["nominal"])
            actual = self._num(row, col_map["actual"])
            if nominal is None or actual is None:
                continue
            tol_plus = self._num(row, col_map.get("tolerance_plus")) or 0.0
            tol_minus = self._num(row, col_map.get("tolerance_minus")) or 0.0
            deviation = self._num(row, col_map.get("deviation"))
            if deviation is None:
                deviation = actual - nominal
            records.append(
                MeasurementRecord(
                    feature_name=str(row[col_map["feature_name"]]),
                    nominal=nominal,
                    # Normalize sign conventions: plus-tolerance >= 0,
                    # minus-tolerance <= 0, regardless of how the sheet
                    # reported them.
                    tolerance_plus=abs(tol_plus),
                    tolerance_minus=-abs(tol_minus),
                    actual=actual,
                    deviation=deviation,
                )
            )
        return records

    def _fallback_extract(self, df: pd.DataFrame) -> list[MeasurementRecord]:
        """Best-effort extraction when column mapping is incomplete.

        Treats the first string column as the feature name and the first
        three numeric columns as nominal, actual, tolerance_plus (with
        tolerance_minus mirrored).
        """
        numeric_cols = df.select_dtypes(include="number").columns.tolist()
        str_cols = df.select_dtypes(include="object").columns.tolist()
        if len(numeric_cols) < 2 or not str_cols:
            return []
        name_col = str_cols[0]
        nom_col, act_col = numeric_cols[0], numeric_cols[1]
        tol_col = numeric_cols[2] if len(numeric_cols) > 2 else None
        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            nominal = self._num(row, nom_col)
            actual = self._num(row, act_col)
            if nominal is None or actual is None:
                continue
            tol = self._num(row, tol_col) or 0.0
            records.append(
                MeasurementRecord(
                    feature_name=str(row[name_col]),
                    nominal=nominal,
                    tolerance_plus=abs(tol),
                    tolerance_minus=-abs(tol),
                    actual=actual,
                    deviation=actual - nominal,
                )
            )
        return records

61
app/parsers/models.py Normal file
View File

@@ -0,0 +1,61 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class MeasurementRecord:
    """A single measured feature from a CMM inspection report.

    Tolerances are stored relative to nominal: ``tolerance_plus`` as a
    non-negative offset and ``tolerance_minus`` as a non-positive offset
    (parsers normalize the signs on ingest — verify against callers).
    """

    feature_name: str
    nominal: float
    tolerance_plus: float
    tolerance_minus: float
    actual: float
    deviation: float = 0.0
    unit: str = "mm"

    @property
    def usl(self) -> float:
        """Upper specification limit (nominal plus the plus-tolerance)."""
        return self.nominal + self.tolerance_plus

    @property
    def lsl(self) -> float:
        """Lower specification limit; tolerance_minus is already negative."""
        return self.nominal + self.tolerance_minus

    @property
    def in_tolerance(self) -> bool:
        """True when the actual value lies within [lsl, usl] inclusive."""
        return self.lsl <= self.actual <= self.usl

    def to_dict(self) -> dict:
        """Serialize to a plain dict, including the derived spec fields."""
        payload = {
            name: getattr(self, name)
            for name in (
                "feature_name",
                "nominal",
                "tolerance_plus",
                "tolerance_minus",
                "actual",
                "deviation",
                "unit",
            )
        }
        payload["usl"] = self.usl
        payload["lsl"] = self.lsl
        payload["in_tolerance"] = self.in_tolerance
        return payload
@dataclass
class ParsedReport:
    """A parsed CMM report: measurement records plus file-level metadata."""

    filename: str
    measurements: list[MeasurementRecord] = field(default_factory=list)
    metadata: dict[str, str] = field(default_factory=dict)
    raw_text: str = ""

    @property
    def out_of_tolerance(self) -> list[MeasurementRecord]:
        """Measurements whose actual value falls outside the spec limits."""
        failing: list[MeasurementRecord] = []
        for record in self.measurements:
            if not record.in_tolerance:
                failing.append(record)
        return failing

    def to_dict(self) -> dict:
        """Serialize to a plain dict with summary counts for the API layer."""
        failing = self.out_of_tolerance
        return {
            "filename": self.filename,
            "metadata": self.metadata,
            "measurement_count": len(self.measurements),
            "out_of_tolerance_count": len(failing),
            "measurements": [record.to_dict() for record in self.measurements],
        }

161
app/parsers/pdf_parser.py Normal file
View File

@@ -0,0 +1,161 @@
from __future__ import annotations
from pathlib import Path
import pdfplumber
from app.parsers.base import CMMParser, match_column
from app.parsers.models import MeasurementRecord, ParsedReport
class PDFParser(CMMParser):
    """Parses CMM inspection reports from PDFs via pdfplumber tables."""

    def parse(self, path: Path) -> ParsedReport:
        """Extract page text and tables, then build a ParsedReport.

        The first row of the first table found is treated as the header
        row for every subsequent table (tables continued across pages are
        assumed not to repeat their header).
        """
        text_parts: list[str] = []
        all_rows: list[dict[str, str | None]] = []
        headers: list[str] = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text_parts.append(page.extract_text() or "")
                for table in page.extract_tables():
                    if not table or not table[0]:
                        continue
                    if headers:
                        data_rows = table
                    else:
                        headers = [str(c or "").strip() for c in table[0]]
                        data_rows = table[1:]
                    for row in data_rows:
                        # Skip fully-empty rows (common pdfplumber artifact).
                        if not row or not any(cell for cell in row):
                            continue
                        all_rows.append(
                            {
                                headers[i]: (str(cell).strip() if cell else None)
                                for i, cell in enumerate(row)
                                if i < len(headers)
                            }
                        )
        raw_text = "\n".join(text_parts)
        # First match wins, consistent with ExcelParser._map_columns.
        # (The previous dict comprehension called match_column twice per
        # header and let a later duplicate header overwrite an earlier one.)
        col_map: dict[str, str] = {}
        for header in headers:
            canonical = match_column(header)
            if canonical and canonical not in col_map:
                col_map[canonical] = header
        measurements = self._extract(all_rows, col_map)
        metadata = self._extract_metadata(raw_text)
        metadata["source"] = "pdf"
        return ParsedReport(
            filename=path.name,
            measurements=measurements,
            metadata=metadata,
            raw_text=raw_text[:10_000],  # cap stored text for payload size
        )

    def _extract(
        self,
        rows: list[dict[str, str | None]],
        col_map: dict[str, str],
    ) -> list[MeasurementRecord]:
        """Build records from mapped table rows, skipping unusable rows."""
        required = {"feature_name", "nominal", "actual"}
        if not required.issubset(col_map):
            return self._fallback_extract(rows)

        def cell(row: dict[str, str | None], field: str) -> float | None:
            # Numeric value of the mapped cell, or None when the field is
            # unmapped, the cell is absent, or the text isn't a number.
            column = col_map.get(field)
            return _to_float(row.get(column)) if column else None

        records: list[MeasurementRecord] = []
        for row in rows:
            name = row.get(col_map["feature_name"]) or ""
            nominal = cell(row, "nominal")
            actual = cell(row, "actual")
            if nominal is None or actual is None or not name:
                continue
            tol_plus = cell(row, "tolerance_plus") or 0.0
            tol_minus = cell(row, "tolerance_minus") or 0.0
            # Explicit None check: the old `dev or actual - nominal`
            # discarded a legitimately reported deviation of exactly 0.0.
            deviation = cell(row, "deviation")
            if deviation is None:
                deviation = actual - nominal
            records.append(
                MeasurementRecord(
                    feature_name=name,
                    nominal=nominal,
                    tolerance_plus=abs(tol_plus),
                    tolerance_minus=-abs(tol_minus),
                    actual=actual,
                    deviation=deviation,
                )
            )
        return records

    def _fallback_extract(
        self, rows: list[dict[str, str | None]]
    ) -> list[MeasurementRecord]:
        """Try to extract from rows even without full column mapping.

        Heuristic: a column whose first few non-empty cells all parse as
        numbers is numeric; the first non-numeric column with data is the
        feature name. The first two numeric columns become nominal and
        actual, the third (if any) a symmetric tolerance.
        """
        if not rows:
            return []
        headers = list(rows[0].keys())
        numeric_cols: list[str] = []
        name_col: str | None = None
        for header in headers:
            samples = [r.get(header) for r in rows[:5] if r.get(header)]
            if samples and all(_to_float(v) is not None for v in samples):
                numeric_cols.append(header)
            elif name_col is None and samples:
                name_col = header
        if not name_col or len(numeric_cols) < 2:
            return []
        nom_col, act_col = numeric_cols[0], numeric_cols[1]
        tol_col = numeric_cols[2] if len(numeric_cols) > 2 else None
        records: list[MeasurementRecord] = []
        for row in rows:
            name = row.get(name_col) or ""
            nominal = _to_float(row.get(nom_col))
            actual = _to_float(row.get(act_col))
            if nominal is None or actual is None or not name:
                continue
            tol = (_to_float(row.get(tol_col)) if tol_col else None) or 0.0
            records.append(
                MeasurementRecord(
                    feature_name=name,
                    nominal=nominal,
                    tolerance_plus=abs(tol),
                    tolerance_minus=-abs(tol),
                    actual=actual,
                    deviation=actual - nominal,
                )
            )
        return records

    def _extract_metadata(self, text: str) -> dict[str, str]:
        """Scrape report-header fields (part/serial number, date, program,
        operator) from the raw page text with loose regex patterns."""
        import re  # local import kept; re is not imported at module level here

        metadata: dict[str, str] = {}
        for pattern, key in [
            (r"(?i)part\s*(?:no|number|#|:)\s*[:\s]*(\S+)", "part_number"),
            (r"(?i)serial\s*(?:no|number|#|:)\s*[:\s]*(\S+)", "serial_number"),
            (r"(?i)date\s*[:\s]+(\d[\d/\-\.]+\d)", "inspection_date"),
            (r"(?i)program\s*[:\s]+(.+?)(?:\n|$)", "program"),
            (r"(?i)operator\s*[:\s]+(.+?)(?:\n|$)", "operator"),
        ]:
            m = re.search(pattern, text)
            if m:
                metadata[key] = m.group(1).strip()
        return metadata
def _to_float(val: str | None) -> float | None:
if val is None:
return None
val = val.strip().replace(",", "")
try:
return float(val)
except ValueError:
return None