Initial commit: CMM Report Analyzer
FastAPI app that parses CMM inspection reports (PDF/Excel/CSV), computes SPC metrics (Cp/Cpk/Pp/Ppk, control limits, Shapiro-Wilk), generates interactive Plotly charts, and provides AI-powered quality summaries via Azure OpenAI with graceful fallback. Includes 21 passing tests covering parsers, SPC calculations, and API endpoints.
This commit is contained in:
0
app/parsers/__init__.py
Normal file
0
app/parsers/__init__.py
Normal file
55
app/parsers/base.py
Normal file
55
app/parsers/base.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from app.parsers.models import ParsedReport
|
||||
|
||||
# Fuzzy column-name patterns → canonical field name.
# NOTE: match_column scans this dict in insertion order and returns the first
# hit, so broader patterns earlier in the dict shadow later ones.
COLUMN_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "feature_name": [
        # \b around "id" so it matches "ID" / "Feat ID" as a whole word but
        # not as an accidental substring of headers like "Width".
        re.compile(r"feat|char|dimen|label|\bid\b|name|item", re.I),
    ],
    "nominal": [
        re.compile(r"nom|target|blueprint|print", re.I),
    ],
    "tolerance_plus": [
        re.compile(r"tol.*\+|upper.*tol|\+.*tol|usl|dev.*\+|pos.*tol", re.I),
    ],
    "tolerance_minus": [
        re.compile(r"tol.*-|lower.*tol|-.*tol|lsl|dev.*-|neg.*tol", re.I),
    ],
    "actual": [
        re.compile(r"actual|meas|value|result|reading", re.I),
    ],
    "deviation": [
        # Negative lookahead keeps "Dev Tol"-style headers out of this bucket
        # (they belong to the tolerance patterns above).
        re.compile(r"dev(?!.*tol)|diff|error|delta", re.I),
    ],
}
|
||||
|
||||
|
||||
def match_column(header: str) -> str | None:
    """Return the canonical field name for a header string, or None.

    The first pattern (in COLUMN_PATTERNS insertion order) that matches
    anywhere in the stripped header wins.
    """
    stripped = header.strip()
    for canonical, patterns in COLUMN_PATTERNS.items():
        if any(pattern.search(stripped) for pattern in patterns):
            return canonical
    return None
|
||||
|
||||
|
||||
class CMMParser(ABC):
    """Abstract base class for CMM report parsers.

    Concrete implementations (PDFParser for .pdf, ExcelParser for
    .xlsx/.xls/.csv — see get_parser) turn a report file on disk into a
    ParsedReport of measurement records.
    """

    @abstractmethod
    def parse(self, path: Path) -> ParsedReport: ...
|
||||
|
||||
|
||||
def get_parser(filename: str) -> CMMParser:
    """Select a parser implementation from the filename extension.

    Raises ValueError for extensions with no registered parser.
    Parser modules are imported lazily so that, e.g., pdfplumber is only
    loaded when a PDF is actually being parsed.
    """
    ext = Path(filename).suffix.lower()
    if ext in (".xlsx", ".xls", ".csv"):
        from app.parsers.excel_parser import ExcelParser

        return ExcelParser()
    if ext == ".pdf":
        from app.parsers.pdf_parser import PDFParser

        return PDFParser()
    raise ValueError(f"Unsupported file type: {ext}")
|
||||
112
app/parsers/excel_parser.py
Normal file
112
app/parsers/excel_parser.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from app.parsers.base import CMMParser, match_column
|
||||
from app.parsers.models import MeasurementRecord, ParsedReport
|
||||
|
||||
|
||||
class ExcelParser(CMMParser):
    """Parser for tabular CMM reports in .xlsx/.xls/.csv form."""

    def parse(self, path: Path) -> ParsedReport:
        """Load the spreadsheet/CSV at *path* and extract measurements."""
        if path.suffix.lower() == ".csv":
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path, engine="openpyxl")

        col_map = self._map_columns(df.columns.tolist())
        measurements = self._extract(df, col_map)
        return ParsedReport(
            filename=path.name,
            measurements=measurements,
            metadata={"source": "excel", "rows": str(len(df))},
            raw_text=df.to_string(max_rows=200),
        )

    def _map_columns(self, headers: list[str]) -> dict[str, str]:
        """Map canonical field names to actual DataFrame column names.

        The first header that matches a canonical field wins; later
        duplicates are ignored.
        """
        mapping: dict[str, str] = {}
        for header in headers:
            canonical = match_column(str(header))
            if canonical and canonical not in mapping:
                mapping[canonical] = str(header)
        return mapping

    def _extract(
        self, df: pd.DataFrame, col_map: dict[str, str]
    ) -> list[MeasurementRecord]:
        """Convert mapped rows to MeasurementRecords, skipping bad rows.

        Falls back to positional extraction when the required columns
        (feature_name, nominal, actual) could not all be mapped.
        """
        required = {"feature_name", "nominal", "actual"}
        if not required.issubset(col_map):
            return self._fallback_extract(df)

        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            try:
                nominal = float(row[col_map["nominal"]])
                actual = float(row[col_map["actual"]])
                # float() on a NaN cell does NOT raise, so missing values
                # must be skipped explicitly or they pollute tolerance
                # checks and downstream SPC statistics.
                if pd.isna(nominal) or pd.isna(actual):
                    continue
                tol_plus = (
                    float(row[col_map["tolerance_plus"]])
                    if "tolerance_plus" in col_map
                    else 0.0
                )
                if pd.isna(tol_plus):
                    tol_plus = 0.0
                tol_minus = (
                    float(row[col_map["tolerance_minus"]])
                    if "tolerance_minus" in col_map
                    else 0.0
                )
                if pd.isna(tol_minus):
                    tol_minus = 0.0
                deviation = (
                    float(row[col_map["deviation"]])
                    if "deviation" in col_map
                    else actual - nominal
                )
                if pd.isna(deviation):
                    deviation = actual - nominal
                records.append(
                    MeasurementRecord(
                        feature_name=str(row[col_map["feature_name"]]),
                        nominal=nominal,
                        # Normalize tolerance signs: plus is stored positive,
                        # minus negative, regardless of how the sheet wrote them.
                        tolerance_plus=abs(tol_plus),
                        tolerance_minus=-abs(tol_minus),
                        actual=actual,
                        deviation=deviation,
                    )
                )
            except (ValueError, TypeError):
                # Non-numeric junk rows (units rows, section headers) are
                # skipped rather than aborting the whole parse.
                continue
        return records

    def _fallback_extract(self, df: pd.DataFrame) -> list[MeasurementRecord]:
        """Best-effort extraction when column mapping is incomplete.

        Treats the first string column as the feature name and the first
        three numeric columns as nominal, actual, tolerance_plus (with
        tolerance_minus mirrored).
        """
        numeric_cols = df.select_dtypes(include="number").columns.tolist()
        str_cols = df.select_dtypes(include="object").columns.tolist()
        if len(numeric_cols) < 2 or not str_cols:
            return []

        name_col = str_cols[0]
        nom_col = numeric_cols[0]
        act_col = numeric_cols[1]
        tol_col = numeric_cols[2] if len(numeric_cols) > 2 else None

        records: list[MeasurementRecord] = []
        for _, row in df.iterrows():
            try:
                nominal = float(row[nom_col])
                actual = float(row[act_col])
                # Same NaN guard as _extract: float() passes NaN through.
                if pd.isna(nominal) or pd.isna(actual):
                    continue
                tol = float(row[tol_col]) if tol_col else 0.0
                if pd.isna(tol):
                    tol = 0.0
                records.append(
                    MeasurementRecord(
                        feature_name=str(row[name_col]),
                        nominal=nominal,
                        tolerance_plus=abs(tol),
                        tolerance_minus=-abs(tol),
                        actual=actual,
                        deviation=actual - nominal,
                    )
                )
            except (ValueError, TypeError):
                continue
        return records
|
||||
61
app/parsers/models.py
Normal file
61
app/parsers/models.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class MeasurementRecord:
    """One measured feature from a CMM report.

    Tolerances are stored sign-normalized: tolerance_plus >= 0 and
    tolerance_minus <= 0, both relative to nominal.
    """

    feature_name: str
    nominal: float
    tolerance_plus: float
    tolerance_minus: float
    actual: float
    deviation: float = 0.0
    unit: str = "mm"

    @property
    def usl(self) -> float:
        """Upper specification limit (nominal plus the plus-tolerance)."""
        return self.nominal + self.tolerance_plus

    @property
    def lsl(self) -> float:
        """Lower specification limit; tolerance_minus is already negative."""
        return self.nominal + self.tolerance_minus

    @property
    def in_tolerance(self) -> bool:
        """Whether the actual value lies within [lsl, usl] inclusive."""
        return self.lsl <= self.actual and self.actual <= self.usl

    def to_dict(self) -> dict:
        """Serialize the record, including the derived limit fields."""
        out = {
            name: getattr(self, name)
            for name in (
                "feature_name",
                "nominal",
                "tolerance_plus",
                "tolerance_minus",
                "actual",
                "deviation",
                "unit",
            )
        }
        out["usl"] = self.usl
        out["lsl"] = self.lsl
        out["in_tolerance"] = self.in_tolerance
        return out
|
||||
|
||||
|
||||
@dataclass
class ParsedReport:
    """Everything parsed out of a single CMM report file."""

    filename: str
    measurements: list[MeasurementRecord] = field(default_factory=list)
    metadata: dict[str, str] = field(default_factory=dict)
    raw_text: str = ""

    @property
    def out_of_tolerance(self) -> list[MeasurementRecord]:
        """The subset of measurements whose actual value is out of spec."""
        return [rec for rec in self.measurements if not rec.in_tolerance]

    def to_dict(self) -> dict:
        """Serialize the report with summary counts plus every record."""
        serialized = [rec.to_dict() for rec in self.measurements]
        return {
            "filename": self.filename,
            "metadata": self.metadata,
            "measurement_count": len(self.measurements),
            "out_of_tolerance_count": len(self.out_of_tolerance),
            "measurements": serialized,
        }
|
||||
161
app/parsers/pdf_parser.py
Normal file
161
app/parsers/pdf_parser.py
Normal file
@@ -0,0 +1,161 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pdfplumber
|
||||
|
||||
from app.parsers.base import CMMParser, match_column
|
||||
from app.parsers.models import MeasurementRecord, ParsedReport
|
||||
|
||||
|
||||
class PDFParser(CMMParser):
    """Parser for CMM reports exported as PDF, using pdfplumber tables."""

    def parse(self, path: Path) -> ParsedReport:
        """Extract page text, table rows, and header metadata from *path*."""
        text_parts: list[str] = []
        all_rows: list[dict[str, str | None]] = []
        headers: list[str] = []

        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text_parts.append(page_text)

                for table in page.extract_tables():
                    if not table or not table[0]:
                        continue
                    if not headers:
                        # The first table's first row supplies the headers;
                        # subsequent tables are assumed to continue the same
                        # layout (multi-page reports).
                        headers = [str(c or "").strip() for c in table[0]]
                        data_rows = table[1:]
                    else:
                        data_rows = table
                    for row in data_rows:
                        if row and any(cell for cell in row):
                            all_rows.append(
                                {
                                    headers[i]: (str(cell).strip() if cell else None)
                                    for i, cell in enumerate(row)
                                    if i < len(headers)
                                }
                            )

        raw_text = "\n".join(text_parts)
        # First matching header wins, mirroring ExcelParser._map_columns
        # (the old dict comprehension called match_column twice per header
        # and let a later duplicate header overwrite an earlier one).
        col_map: dict[str, str] = {}
        for h in headers:
            canonical = match_column(h)
            if canonical and canonical not in col_map:
                col_map[canonical] = h
        measurements = self._extract(all_rows, col_map)
        metadata = self._extract_metadata(raw_text)
        metadata["source"] = "pdf"

        return ParsedReport(
            filename=path.name,
            measurements=measurements,
            metadata=metadata,
            # Cap stored raw text so huge PDFs don't bloat the report object.
            raw_text=raw_text[:10_000],
        )

    def _extract(
        self,
        rows: list[dict[str, str | None]],
        col_map: dict[str, str],
    ) -> list[MeasurementRecord]:
        """Convert mapped table rows into MeasurementRecords.

        Falls back to positional extraction when the required columns
        (feature_name, nominal, actual) could not all be mapped.
        """
        required = {"feature_name", "nominal", "actual"}
        if not required.issubset(col_map):
            return self._fallback_extract(rows)

        records: list[MeasurementRecord] = []
        for row in rows:
            try:
                name = row.get(col_map["feature_name"]) or ""
                nominal = _to_float(row.get(col_map["nominal"]))
                actual = _to_float(row.get(col_map["actual"]))
                if nominal is None or actual is None or not name:
                    continue
                tol_plus = (
                    _to_float(row.get(col_map["tolerance_plus"]))
                    if "tolerance_plus" in col_map
                    else None
                ) or 0.0
                tol_minus = (
                    _to_float(row.get(col_map["tolerance_minus"]))
                    if "tolerance_minus" in col_map
                    else None
                ) or 0.0
                # Distinguish "deviation column absent/unparseable" from an
                # explicit 0.0 value: the previous `or actual - nominal`
                # fallback silently replaced a reported 0.0 deviation.
                dev = (
                    _to_float(row.get(col_map["deviation"]))
                    if "deviation" in col_map
                    else None
                )
                deviation = dev if dev is not None else actual - nominal
                records.append(
                    MeasurementRecord(
                        feature_name=name,
                        nominal=nominal,
                        # Sign-normalize tolerances: plus positive, minus negative.
                        tolerance_plus=abs(tol_plus),
                        tolerance_minus=-abs(tol_minus),
                        actual=actual,
                        deviation=deviation,
                    )
                )
            except (ValueError, TypeError):
                # Junk rows (units, repeated headers) are skipped.
                continue
        return records

    def _fallback_extract(
        self, rows: list[dict[str, str | None]]
    ) -> list[MeasurementRecord]:
        """Try to extract from rows even without full column mapping.

        Heuristic: the first column whose sampled values are non-numeric is
        the feature name; the first two numeric columns are nominal and
        actual; a third numeric column, if present, is a symmetric tolerance.
        """
        if not rows:
            return []
        headers = list(rows[0].keys())
        numeric_cols: list[str] = []
        name_col: str | None = None
        for h in headers:
            # Sample the first few non-empty cells to classify the column.
            sample_vals = [r.get(h) for r in rows[:5] if r.get(h)]
            if sample_vals and all(_to_float(v) is not None for v in sample_vals):
                numeric_cols.append(h)
            elif name_col is None and sample_vals:
                name_col = h
        if not name_col or len(numeric_cols) < 2:
            return []

        records: list[MeasurementRecord] = []
        for row in rows:
            try:
                name = row.get(name_col) or ""
                nominal = _to_float(row.get(numeric_cols[0]))
                actual = _to_float(row.get(numeric_cols[1]))
                if nominal is None or actual is None or not name:
                    continue
                tol = _to_float(row.get(numeric_cols[2])) if len(numeric_cols) > 2 else 0.0
                tol = tol or 0.0
                records.append(
                    MeasurementRecord(
                        feature_name=name,
                        nominal=nominal,
                        tolerance_plus=abs(tol),
                        tolerance_minus=-abs(tol),
                        actual=actual,
                        deviation=actual - nominal,
                    )
                )
            except (ValueError, TypeError):
                continue
        return records

    def _extract_metadata(self, text: str) -> dict[str, str]:
        """Scrape part/serial/date/program/operator fields from report text.

        Each pattern captures the value following a labelled field; only the
        first occurrence in the document is kept.
        """
        metadata: dict[str, str] = {}
        import re

        for pattern, key in [
            (r"(?i)part\s*(?:no|number|#|:)\s*[:\s]*(\S+)", "part_number"),
            (r"(?i)serial\s*(?:no|number|#|:)\s*[:\s]*(\S+)", "serial_number"),
            (r"(?i)date\s*[:\s]+(\d[\d/\-\.]+\d)", "inspection_date"),
            (r"(?i)program\s*[:\s]+(.+?)(?:\n|$)", "program"),
            (r"(?i)operator\s*[:\s]+(.+?)(?:\n|$)", "operator"),
        ]:
            m = re.search(pattern, text)
            if m:
                metadata[key] = m.group(1).strip()
        return metadata
|
||||
|
||||
|
||||
def _to_float(val: str | None) -> float | None:
|
||||
if val is None:
|
||||
return None
|
||||
val = val.strip().replace(",", "")
|
||||
try:
|
||||
return float(val)
|
||||
except ValueError:
|
||||
return None
|
||||
Reference in New Issue
Block a user