From 90401dc1ba20caf81d2d7b5eca4c3848225b5ea2 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:14:13 -0600 Subject: [PATCH] feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150) - New MaintenanceReceiptExtractor: Gemini-primary extraction with regex cross-validation for dates, amounts, and odometer readings - New maintenance_receipt_validation.py: cross-validation patterns for structured field confidence adjustment - New POST /extract/maintenance-receipt endpoint reusing ReceiptExtractionResponse model - Per-field confidence scores (0.0-1.0) with Gemini base 0.85, boosted/reduced by regex agreement Co-Authored-By: Claude Opus 4.6 --- ocr/app/extractors/__init__.py | 6 + .../maintenance_receipt_extractor.py | 312 ++++++++++++++++++ ocr/app/patterns/__init__.py | 6 + .../maintenance_receipt_validation.py | 299 +++++++++++++++++ ocr/app/routers/extract.py | 90 +++++ 5 files changed, 713 insertions(+) create mode 100644 ocr/app/extractors/maintenance_receipt_extractor.py create mode 100644 ocr/app/patterns/maintenance_receipt_validation.py diff --git a/ocr/app/extractors/__init__.py b/ocr/app/extractors/__init__.py index c0026a7..97d7480 100644 --- a/ocr/app/extractors/__init__.py +++ b/ocr/app/extractors/__init__.py @@ -8,6 +8,10 @@ from app.extractors.receipt_extractor import ( ExtractedField, ) from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor +from app.extractors.maintenance_receipt_extractor import ( + MaintenanceReceiptExtractor, + maintenance_receipt_extractor, +) from app.extractors.manual_extractor import ( ManualExtractor, manual_extractor, @@ -27,6 +31,8 @@ __all__ = [ "ExtractedField", "FuelReceiptExtractor", "fuel_receipt_extractor", + "MaintenanceReceiptExtractor", + "maintenance_receipt_extractor", "ManualExtractor", "manual_extractor", "ManualExtractionResult", diff --git 
class MaintenanceReceiptExtractor:
    """Maintenance receipt extractor using Gemini for semantic extraction.

    Wraps ``receipt_extractor`` for OCR, sends the resulting raw text to
    Gemini for field extraction, then cross-validates the structured fields
    against regex matches in the OCR text to adjust per-field confidence.
    """

    def __init__(self) -> None:
        # Both are created lazily by _get_model() on first use.
        self._model: Any | None = None
        self._generation_config: Any | None = None

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """Extract maintenance receipt fields from an image.

        Args:
            image_bytes: Raw image bytes.
            content_type: MIME type, forwarded unchanged to receipt_extractor.

        Returns:
            The OCR result itself when OCR fails; otherwise a
            ReceiptExtractionResult that is unsuccessful when the OCR text is
            empty or no field could be extracted, and successful with
            confidence-adjusted fields in every other case.
        """
        start_time = time.time()

        def elapsed_ms() -> int:
            return int((time.time() - start_time) * 1000)

        # Step 1: OCR the image via receipt_extractor
        ocr_result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
        )
        if not ocr_result.success:
            return ocr_result

        # Treat a missing raw_text the same as empty text instead of crashing
        # on ``None.strip()``.
        raw_text = ocr_result.raw_text or ""
        if not raw_text.strip():
            return ReceiptExtractionResult(
                success=False,
                error="No text found in image",
                processing_time_ms=elapsed_ms(),
            )

        # Step 2: Extract fields with Gemini. Deliberately best-effort: any
        # failure results in an empty field set, which is reported below as
        # an unsuccessful extraction (there is no regex-only fallback).
        try:
            gemini_fields = self._extract_with_gemini(raw_text)
        except Exception as e:
            logger.warning(
                "Gemini extraction failed, no fields will be extracted: %s", e
            )
            gemini_fields = {}

        # Step 3: Build extracted fields with base confidence
        extracted_fields = self._build_fields(gemini_fields)
        if not extracted_fields:
            return ReceiptExtractionResult(
                success=False,
                receipt_type="maintenance",
                error="No maintenance receipt fields could be extracted",
                raw_text=raw_text,
                processing_time_ms=elapsed_ms(),
            )

        # Step 4: Cross-validate structured fields with regex.
        # Pass the normalized values rather than the raw Gemini dict: the
        # response schema makes every key required-but-nullable, so the raw
        # dict contains None entries (which the validator would try to
        # validate) and may carry floats such as 45231.0 for the odometer.
        validation_input = {
            name: extracted.value for name, extracted in extracted_fields.items()
        }
        validation = maintenance_receipt_validator.validate(validation_input, raw_text)
        if validation.issues:
            logger.info("Maintenance receipt validation issues: %s", validation.issues)

        # Step 5: Adjust confidences based on cross-validation
        adjusted_fields = self._adjust_confidences(extracted_fields, validation)

        processing_time_ms = elapsed_ms()
        logger.info(
            "Maintenance receipt extraction: fields=%d, validated=%s, time=%dms",
            len(adjusted_fields),
            validation.is_valid,
            processing_time_ms,
        )

        return ReceiptExtractionResult(
            success=True,
            receipt_type="maintenance",
            extracted_fields=adjusted_fields,
            raw_text=raw_text,
            processing_time_ms=processing_time_ms,
        )

    def _get_model(self) -> Any:
        """Return the Vertex AI Gemini model, creating it on first call.

        Raises:
            RuntimeError: If the credential file configured in
                ``settings.google_vision_key_path`` does not exist.
        """
        if self._model is not None:
            return self._model

        key_path = settings.google_vision_key_path
        if not os.path.isfile(key_path):
            raise RuntimeError(
                f"Google credential config not found at {key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        # Imported lazily so the module can be loaded without the Vertex SDK.
        from google.cloud import aiplatform  # type: ignore[import-untyped]
        from vertexai.generative_models import (  # type: ignore[import-untyped]
            GenerationConfig,
            GenerativeModel,
        )

        # NOTE: these mutate the environment of the whole process.
        # NOTE(review): ALLOW_EXECUTABLES presumably enables executable-sourced
        # external-account credentials in google-auth -- confirm this is
        # intended for every deployment.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
        os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"

        aiplatform.init(
            project=settings.vertex_ai_project,
            location=settings.vertex_ai_location,
        )

        model_name = settings.gemini_model
        self._model = GenerativeModel(model_name)
        self._generation_config = GenerationConfig(
            response_mime_type="application/json",
            response_schema=_RECEIPT_RESPONSE_SCHEMA,
        )

        logger.info(
            "Maintenance receipt Gemini model initialized (model=%s)",
            model_name,
        )
        return self._model

    def _extract_with_gemini(self, ocr_text: str) -> dict:
        """Send OCR text to Gemini for semantic field extraction.

        Args:
            ocr_text: Raw OCR text from the receipt image.

        Returns:
            Dictionary of field_name -> extracted value (values may be None).

        Raises:
            ValueError: If the response is not a JSON object (this also covers
                malformed JSON, since json.JSONDecodeError is a ValueError).
        """
        model = self._get_model()
        prompt = _RECEIPT_EXTRACTION_PROMPT.format(ocr_text=ocr_text)

        response = model.generate_content(
            [prompt],
            generation_config=self._generation_config,
        )

        raw = json.loads(response.text)
        if not isinstance(raw, dict):
            # A list/scalar would otherwise fail later with an opaque
            # AttributeError on ``.items()``.
            raise ValueError(
                f"Expected a JSON object from Gemini, got {type(raw).__name__}"
            )

        logger.info(
            "Gemini extracted maintenance fields: %s",
            [k for k, v in raw.items() if v is not None],
        )
        return raw

    def _build_fields(self, gemini_fields: dict) -> dict[str, ExtractedField]:
        """Convert a Gemini response into ExtractedField objects.

        Null values, blank strings and numeric fields that cannot be parsed
        into a finite number are dropped. Cost fields are rounded to two
        decimals; the odometer is truncated to an int.

        Args:
            gemini_fields: Raw Gemini response dict.

        Returns:
            Dictionary of field_name -> ExtractedField at the base confidence.
        """
        import math  # local import: math is not imported at file level

        fields: dict[str, ExtractedField] = {}

        for field_name, value in gemini_fields.items():
            if value is None:
                continue

            if field_name in ("totalCost", "laborCost", "partsCost"):
                try:
                    value = round(float(value), 2)
                except (ValueError, TypeError, OverflowError):
                    continue
                # json.loads accepts NaN/Infinity; neither is a usable amount.
                if not math.isfinite(value):
                    continue
            elif field_name == "odometerReading":
                try:
                    # int(float("inf")) raises OverflowError, not ValueError.
                    value = int(float(value))
                except (ValueError, TypeError, OverflowError):
                    continue
            elif isinstance(value, str) and not value.strip():
                continue

            fields[field_name] = ExtractedField(
                value=value,
                confidence=DEFAULT_GEMINI_CONFIDENCE,
            )

        return fields

    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: MaintenanceReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """Apply cross-validation multipliers to field confidences.

        Args:
            fields: Extracted fields with base confidence.
            validation: Cross-validation results keyed by field name.

        Returns:
            New ExtractedField objects; confidence is capped at 1.0 and
            rounded to three decimals. Fields without a validation entry keep
            their confidence.
        """
        adjusted: dict[str, ExtractedField] = {}

        for name, extracted_field in fields.items():
            field_validation = validation.field_validations.get(name)
            if field_validation is None:
                # Semantic fields (no regex validation) keep base confidence
                new_confidence = extracted_field.confidence
            else:
                new_confidence = min(
                    1.0,
                    extracted_field.confidence
                    * field_validation.confidence_adjustment,
                )

            adjusted[name] = ExtractedField(
                value=extracted_field.value,
                confidence=round(new_confidence, 3),
            )

        return adjusted
+""" + +import re +from dataclasses import dataclass, field +from typing import Optional + +from app.patterns.currency_patterns import currency_matcher +from app.patterns.date_patterns import date_matcher + + +@dataclass +class FieldValidation: + """Validation result for a single extracted field.""" + + field_name: str + regex_confirmed: bool + confidence_adjustment: float # Multiplier: >1.0 boosts, <1.0 reduces + regex_value: Optional[str] = None # Value found by regex, if any + + +@dataclass +class MaintenanceReceiptValidation: + """Aggregated validation result for a maintenance receipt.""" + + is_valid: bool + issues: list[str] + field_validations: dict[str, FieldValidation] = field(default_factory=dict) + overall_confidence: float = 1.0 + + +# Odometer patterns: 5-7 digit numbers near odometer keywords +ODOMETER_PATTERNS = [ + # "Odometer: 45,231" or "Mileage: 45231" + ( + r"(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})", + "labeled_odometer", + 0.95, + ), + # "45,231 mi" or "45231 miles" + ( + r"(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)", + "unit_odometer", + 0.90, + ), + # Standalone 5-6 digit number (lower confidence) + ( + r"(? MaintenanceReceiptValidation: + """Validate Gemini-extracted fields against regex patterns in raw OCR text. + + Args: + gemini_fields: Fields extracted by Gemini (field_name -> value). + raw_text: Raw OCR text for regex cross-validation. + + Returns: + MaintenanceReceiptValidation with per-field results. 
+ """ + issues: list[str] = [] + field_validations: dict[str, FieldValidation] = {} + overall_confidence = 1.0 + + # Validate date field + if "serviceDate" in gemini_fields: + date_validation = self._validate_date( + gemini_fields["serviceDate"], raw_text + ) + field_validations["serviceDate"] = date_validation + if not date_validation.regex_confirmed: + issues.append( + f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex" + ) + overall_confidence *= 0.85 + + # Validate total cost + if "totalCost" in gemini_fields: + cost_validation = self._validate_amount( + "totalCost", gemini_fields["totalCost"], raw_text + ) + field_validations["totalCost"] = cost_validation + if not cost_validation.regex_confirmed: + issues.append( + f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex" + ) + overall_confidence *= 0.85 + + # Validate labor cost + if "laborCost" in gemini_fields: + labor_validation = self._validate_amount( + "laborCost", gemini_fields["laborCost"], raw_text + ) + field_validations["laborCost"] = labor_validation + if not labor_validation.regex_confirmed: + issues.append( + f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex" + ) + overall_confidence *= 0.90 + + # Validate odometer + if "odometerReading" in gemini_fields: + odo_validation = self._validate_odometer( + gemini_fields["odometerReading"], raw_text + ) + field_validations["odometerReading"] = odo_validation + if not odo_validation.regex_confirmed: + issues.append( + f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex" + ) + overall_confidence *= 0.90 + + is_valid = len(issues) == 0 + return MaintenanceReceiptValidation( + is_valid=is_valid, + issues=issues, + field_validations=field_validations, + overall_confidence=overall_confidence, + ) + + def extract_odometer(self, text: str) -> Optional[OdometerMatch]: + """Extract odometer reading from text using regex patterns. + + Args: + text: OCR text to search. 
def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
    """Check whether the Gemini-extracted date matches a regex-found date.

    Args:
        gemini_date: Date string from Gemini. The caller may pass None or a
            non-string when Gemini returned no date; that case is handled
            here rather than raising.
        raw_text: Raw OCR text searched with ``date_matcher``.

    Returns:
        FieldValidation with a confidence multiplier: 1.10 on a match, 0.95
        when the text contains no regex date at all, 0.80 on a mismatch and
        0.70 when the Gemini value is not a usable string.
    """
    # A null/blank/non-string value cannot be compared; report it the same
    # way the amount and odometer validators report unparseable input
    # instead of crashing on ``None.strip()``.
    if not isinstance(gemini_date, str) or not gemini_date.strip():
        return FieldValidation(
            field_name="serviceDate",
            regex_confirmed=False,
            confidence_adjustment=0.70,
        )

    regex_dates = date_matcher.extract_dates(raw_text)

    if not regex_dates:
        # No dates found by regex -- cannot confirm or deny
        return FieldValidation(
            field_name="serviceDate",
            regex_confirmed=False,
            confidence_adjustment=0.95,
        )

    # Normalize Gemini date for comparison.
    # NOTE(review): assumes date_matcher values are YYYY-MM-DD strings -- confirm.
    gemini_normalized = gemini_date.strip().replace("/", "-")

    for regex_date in regex_dates:
        if regex_date.value == gemini_normalized:
            return FieldValidation(
                field_name="serviceDate",
                regex_confirmed=True,
                confidence_adjustment=1.10,
                regex_value=regex_date.value,
            )

    # Gemini found a date but it doesn't match any regex date; report the
    # first regex candidate (regex_dates is known to be non-empty here).
    return FieldValidation(
        field_name="serviceDate",
        regex_confirmed=False,
        confidence_adjustment=0.80,
        regex_value=regex_dates[0].value,
    )
def _validate_odometer(
    self, gemini_odometer: str | int, raw_text: str
) -> FieldValidation:
    """Check whether the Gemini-extracted odometer matches a regex reading.

    Args:
        gemini_odometer: Odometer value from Gemini, as a number or a string
            that may contain thousands separators.
        raw_text: Raw OCR text searched via ``extract_odometer``.

    Returns:
        FieldValidation with a confidence multiplier: 1.10 when the values
        agree within 1%, 0.95 when the text has no regex reading, 0.80 on a
        mismatch and 0.70 when the Gemini value cannot be parsed.
    """
    try:
        if isinstance(gemini_odometer, bool):
            raise TypeError("bool is not an odometer reading")
        if isinstance(gemini_odometer, (int, float)):
            # Numeric values must not go through the separator-stripping
            # path: str(45231.0) is "45231.0", and removing "." would turn
            # it into 452310.
            gemini_value = int(gemini_odometer)
        else:
            # In strings, "," and "." are treated as thousands separators.
            gemini_value = int(
                str(gemini_odometer).replace(",", "").replace(".", "").strip()
            )
    except (ValueError, TypeError, OverflowError):
        # OverflowError/ValueError also cover float inf/NaN.
        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=False,
            confidence_adjustment=0.70,
        )

    regex_match = self.extract_odometer(raw_text)

    if not regex_match:
        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=False,
            confidence_adjustment=0.95,
        )

    # Check if values match within 1% tolerance (OCR might misread a digit)
    if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
        return FieldValidation(
            field_name="odometerReading",
            regex_confirmed=True,
            confidence_adjustment=1.10,
            regex_value=str(regex_match.value),
        )

    return FieldValidation(
        field_name="odometerReading",
        regex_confirmed=False,
        confidence_adjustment=0.80,
        regex_value=str(regex_match.value),
    )
@router.post("/maintenance-receipt", response_model=ReceiptExtractionResponse)
async def extract_maintenance_receipt(
    file: UploadFile = File(..., description="Maintenance receipt image file"),
) -> ReceiptExtractionResponse:
    """
    Extract data from a maintenance receipt image using OCR + Gemini.

    Gemini-primary extraction with regex cross-validation:
    - OCR preprocessing (HEIC conversion, contrast, thresholding)
    - PaddleOCR text extraction
    - Gemini semantic field extraction from OCR text
    - Regex cross-validation for dates, amounts, odometer

    Supports HEIC, JPEG, PNG formats.

    - **file**: Maintenance receipt image file (max 10MB)

    Returns:
    - **receiptType**: "maintenance"
    - **extractedFields**: Dictionary of extracted fields with confidence scores
        - serviceName: Service performed (e.g., "Oil Change")
        - serviceDate: Date in YYYY-MM-DD format
        - totalCost: Total cost
        - shopName: Shop or business name
        - laborCost: Labor cost (if broken out)
        - partsCost: Parts cost (if broken out)
        - odometerReading: Odometer reading (if present)
        - vehicleInfo: Vehicle description (if present)
    - **rawText**: Full OCR text
    - **processingTimeMs**: Processing time in milliseconds
    """
    # Validate file presence
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    content = await file.read(MAX_SYNC_SIZE + 1)
    file_size = len(content)

    # Validate file size
    if file_size > MAX_SYNC_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
        )

    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    logger.info(
        "Maintenance receipt extraction: %s, size: %d bytes, content_type: %s",
        file.filename,
        file_size,
        file.content_type,
    )

    # Perform maintenance receipt extraction
    result = maintenance_receipt_extractor.extract(
        image_bytes=content,
        content_type=file.content_type,
    )

    if not result.success:
        logger.warning(
            "Maintenance receipt extraction failed for %s: %s",
            file.filename,
            result.error,
        )
        raise HTTPException(
            status_code=422,
            detail=result.error or "Failed to extract data from maintenance receipt",
        )

    # Convert internal fields to API response format
    extracted_fields = {
        name: ReceiptExtractedField(
            value=extracted.value,
            confidence=extracted.confidence,
        )
        for name, extracted in result.extracted_fields.items()
    }

    return ReceiptExtractionResponse(
        success=result.success,
        receiptType=result.receipt_type,
        extractedFields=extracted_fields,
        rawText=result.raw_text,
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )