feat: add maintenance receipt extraction pipeline with Gemini + regex (refs #150)
- New MaintenanceReceiptExtractor: Gemini-primary extraction with regex cross-validation for dates, amounts, and odometer readings
- New maintenance_receipt_validation.py: cross-validation patterns for structured field confidence adjustment
- New POST /extract/maintenance-receipt endpoint reusing ReceiptExtractionResponse model
- Per-field confidence scores (0.0-1.0) with Gemini base 0.85, boosted/reduced by regex agreement

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,10 @@ from app.extractors.receipt_extractor import (
|
|||||||
ExtractedField,
|
ExtractedField,
|
||||||
)
|
)
|
||||||
from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
|
from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor
|
||||||
|
from app.extractors.maintenance_receipt_extractor import (
|
||||||
|
MaintenanceReceiptExtractor,
|
||||||
|
maintenance_receipt_extractor,
|
||||||
|
)
|
||||||
from app.extractors.manual_extractor import (
|
from app.extractors.manual_extractor import (
|
||||||
ManualExtractor,
|
ManualExtractor,
|
||||||
manual_extractor,
|
manual_extractor,
|
||||||
@@ -27,6 +31,8 @@ __all__ = [
|
|||||||
"ExtractedField",
|
"ExtractedField",
|
||||||
"FuelReceiptExtractor",
|
"FuelReceiptExtractor",
|
||||||
"fuel_receipt_extractor",
|
"fuel_receipt_extractor",
|
||||||
|
"MaintenanceReceiptExtractor",
|
||||||
|
"maintenance_receipt_extractor",
|
||||||
"ManualExtractor",
|
"ManualExtractor",
|
||||||
"manual_extractor",
|
"manual_extractor",
|
||||||
"ManualExtractionResult",
|
"ManualExtractionResult",
|
||||||
|
|||||||
312
ocr/app/extractors/maintenance_receipt_extractor.py
Normal file
312
ocr/app/extractors/maintenance_receipt_extractor.py
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
"""Maintenance receipt extraction with Gemini-primary and regex cross-validation.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. Preprocess image and OCR via receipt_extractor (PaddleOCR)
|
||||||
|
2. Send OCR text to Gemini text API for semantic field extraction
|
||||||
|
3. Cross-validate structured fields (date, cost, odometer) with regex
|
||||||
|
4. Return ReceiptExtractionResult with per-field confidence scores
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.extractors.receipt_extractor import (
|
||||||
|
ExtractedField,
|
||||||
|
ReceiptExtractionResult,
|
||||||
|
receipt_extractor,
|
||||||
|
)
|
||||||
|
from app.patterns.maintenance_receipt_validation import (
|
||||||
|
MaintenanceReceiptValidation,
|
||||||
|
maintenance_receipt_validator,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Baseline confidence assigned to every Gemini-extracted field; regex
# cross-validation later multiplies it up or down.
DEFAULT_GEMINI_CONFIDENCE = 0.85

# Prompt template sent to Gemini. ``{ocr_text}`` is the only placeholder and is
# filled in with ``str.format``; the trailing backslash keeps the string from
# ending in a newline.
_RECEIPT_EXTRACTION_PROMPT = """\
Extract maintenance service receipt fields from the following OCR text.

For each field, extract the value if present. Return null for fields not found.

Fields to extract:
- serviceName: The maintenance service performed (e.g., "Oil Change", "Brake Pad Replacement", "Tire Rotation")
- serviceDate: Date of service in YYYY-MM-DD format
- totalCost: Total cost as a number (e.g., 89.95)
- shopName: Name of the shop or business
- laborCost: Labor cost as a number, or null if not broken out
- partsCost: Parts cost as a number, or null if not broken out
- odometerReading: Odometer/mileage reading as a number, or null if not present
- vehicleInfo: Vehicle description if present (e.g., "2022 Toyota Camry"), or null

Return a JSON object with these field names and their extracted values.

OCR Text:
---
{ocr_text}
---\
"""

# Field name -> JSON type, in the order the fields appear in the schema.
# Both ``properties`` and ``required`` below are derived from this table so
# the two can never disagree.
_RECEIPT_FIELD_TYPES: dict[str, str] = {
    "serviceName": "string",
    "serviceDate": "string",
    "totalCost": "number",
    "shopName": "string",
    "laborCost": "number",
    "partsCost": "number",
    "odometerReading": "number",
    "vehicleInfo": "string",
}

# Structured-output schema handed to the Gemini generation config: every field
# is required to be present but may be null.
_RECEIPT_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        field_name: {"type": json_type, "nullable": True}
        for field_name, json_type in _RECEIPT_FIELD_TYPES.items()
    },
    "required": list(_RECEIPT_FIELD_TYPES),
}
|
||||||
|
|
||||||
|
|
||||||
|
class MaintenanceReceiptExtractor:
    """Maintenance receipt extractor using Gemini for semantic extraction.

    Runs OCR through ``receipt_extractor``, sends the raw OCR text to Gemini
    for field extraction, and then scales each field's confidence by the
    regex cross-validation result from ``maintenance_receipt_validator``.
    """

    def __init__(self) -> None:
        # Both attributes are populated lazily by _get_model() on first use.
        self._model: Any | None = None
        self._generation_config: Any | None = None

    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ReceiptExtractionResult:
        """Extract maintenance receipt fields from an image.

        Args:
            image_bytes: Raw image bytes, forwarded to ``receipt_extractor``.
            content_type: MIME type, forwarded to ``receipt_extractor``.

        Returns:
            ReceiptExtractionResult. On OCR failure the OCR result is returned
            unchanged; when no text or no usable field is found a
            ``success=False`` result is returned; otherwise a
            ``receipt_type="maintenance"`` result with per-field confidences.
        """
        start_time = time.time()

        # Step 1: OCR the image via receipt_extractor
        ocr_result = receipt_extractor.extract(
            image_bytes=image_bytes,
            content_type=content_type,
        )
        if not ocr_result.success:
            return ocr_result

        # Guard against a missing raw_text so the emptiness check cannot raise.
        raw_text = ocr_result.raw_text or ""
        if not raw_text.strip():
            return ReceiptExtractionResult(
                success=False,
                error="No text found in image",
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 2: Extract fields with Gemini. Any failure here (credentials,
        # network, malformed JSON) is deliberately downgraded to "no fields".
        try:
            gemini_fields = self._extract_with_gemini(raw_text)
        except Exception as e:
            logger.warning(
                "Gemini extraction failed, falling back to OCR-only: %s", e
            )
            gemini_fields = {}

        # Step 3: Build extracted fields with base confidence
        extracted_fields = self._build_fields(gemini_fields)
        if not extracted_fields:
            return ReceiptExtractionResult(
                success=False,
                receipt_type="maintenance",
                error="No maintenance receipt fields could be extracted",
                raw_text=raw_text,
                processing_time_ms=int((time.time() - start_time) * 1000),
            )

        # Step 4: Cross-validate structured fields with regex.
        # Validate the values we actually report rather than the raw Gemini
        # dict: the response schema makes every key required-but-nullable, so
        # the raw dict carries None entries (which the validator would treat
        # as real values), and raw numbers may be floats such as 45231.0.
        reported_values = {
            name: extracted.value for name, extracted in extracted_fields.items()
        }
        validation = maintenance_receipt_validator.validate(reported_values, raw_text)

        if validation.issues:
            logger.info(
                "Maintenance receipt validation issues: %s", validation.issues
            )

        # Step 5: Adjust confidences based on cross-validation
        adjusted_fields = self._adjust_confidences(extracted_fields, validation)

        processing_time_ms = int((time.time() - start_time) * 1000)
        logger.info(
            "Maintenance receipt extraction: fields=%s, validated=%s, time=%sms",
            len(adjusted_fields),
            validation.is_valid,
            processing_time_ms,
        )

        return ReceiptExtractionResult(
            success=True,
            receipt_type="maintenance",
            extracted_fields=adjusted_fields,
            raw_text=raw_text,
            processing_time_ms=processing_time_ms,
        )

    def _get_model(self) -> Any:
        """Lazily create and cache the Vertex AI Gemini model.

        Returns:
            The cached ``GenerativeModel`` instance.

        Raises:
            RuntimeError: If the credential file configured in
                ``settings.google_vision_key_path`` does not exist.
        """
        if self._model is not None:
            return self._model

        key_path = settings.google_vision_key_path
        if not os.path.isfile(key_path):
            raise RuntimeError(
                f"Google credential config not found at {key_path}. "
                "Set GOOGLE_VISION_KEY_PATH or mount the secret."
            )

        # Imported here so the SDK is only loaded when Gemini is actually used.
        from google.cloud import aiplatform  # type: ignore[import-untyped]
        from vertexai.generative_models import (  # type: ignore[import-untyped]
            GenerationConfig,
            GenerativeModel,
        )

        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
        os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"

        aiplatform.init(
            project=settings.vertex_ai_project,
            location=settings.vertex_ai_location,
        )

        model_name = settings.gemini_model
        model = GenerativeModel(model_name)
        generation_config = GenerationConfig(
            response_mime_type="application/json",
            response_schema=_RECEIPT_RESPONSE_SCHEMA,
        )

        # Publish both together: if either constructor raised above, nothing
        # is cached and the next call retries instead of reusing a model with
        # a missing generation config.
        self._generation_config = generation_config
        self._model = model

        logger.info(
            "Maintenance receipt Gemini model initialized (model=%s)",
            model_name,
        )
        return self._model

    def _extract_with_gemini(self, ocr_text: str) -> dict:
        """Send OCR text to Gemini for semantic field extraction.

        Args:
            ocr_text: Raw OCR text from the receipt image.

        Returns:
            Dictionary of field_name -> extracted_value parsed from the
            model's JSON response.

        Raises:
            ValueError: If the response is not valid JSON
                (``json.JSONDecodeError``) or is not a JSON object.
        """
        model = self._get_model()

        prompt = _RECEIPT_EXTRACTION_PROMPT.format(ocr_text=ocr_text)
        response = model.generate_content(
            [prompt],
            generation_config=self._generation_config,
        )

        raw = json.loads(response.text)
        if not isinstance(raw, dict):
            # Fail with a clear message instead of an AttributeError on .items().
            raise ValueError(
                f"Expected a JSON object from Gemini, got {type(raw).__name__}"
            )

        logger.info(
            "Gemini extracted maintenance fields: %s",
            [k for k, v in raw.items() if v is not None],
        )
        return raw

    def _build_fields(self, gemini_fields: dict) -> dict[str, ExtractedField]:
        """Convert the Gemini response to ExtractedField objects.

        Null values, blank strings, and numeric fields that cannot be
        converted to a finite number are dropped. Costs are rounded to two
        decimals and the odometer is truncated to an int.

        Args:
            gemini_fields: Raw Gemini response dict.

        Returns:
            Dictionary of field_name -> ExtractedField, each carrying
            ``DEFAULT_GEMINI_CONFIDENCE``.
        """
        import math

        fields: dict[str, ExtractedField] = {}

        for field_name, value in gemini_fields.items():
            if value is None:
                continue

            if field_name in ("totalCost", "laborCost", "partsCost"):
                try:
                    amount = float(value)
                except (ValueError, TypeError, OverflowError):
                    continue
                # json.loads accepts NaN/Infinity literals; neither is a cost.
                if not math.isfinite(amount):
                    continue
                value = round(amount, 2)
            elif field_name == "odometerReading":
                try:
                    # int() raises ValueError for NaN and OverflowError for inf.
                    value = int(float(value))
                except (ValueError, TypeError, OverflowError):
                    continue
            elif isinstance(value, str) and not value.strip():
                continue

            fields[field_name] = ExtractedField(
                value=value,
                confidence=DEFAULT_GEMINI_CONFIDENCE,
            )

        return fields

    def _adjust_confidences(
        self,
        fields: dict[str, ExtractedField],
        validation: MaintenanceReceiptValidation,
    ) -> dict[str, ExtractedField]:
        """Scale field confidences by the cross-validation multipliers.

        Args:
            fields: Extracted fields with base confidence.
            validation: Cross-validation results.

        Returns:
            New dict of fields whose confidence is the base confidence times
            the field's ``confidence_adjustment``, clamped to [0.0, 1.0] and
            rounded to three decimals. Fields without a validation entry keep
            their confidence.
        """
        adjusted: dict[str, ExtractedField] = {}

        for name, extracted_field in fields.items():
            field_validation = validation.field_validations.get(name)
            if field_validation is None:
                # Semantic fields (no regex validation) keep base confidence
                new_confidence = extracted_field.confidence
            else:
                scaled = (
                    extracted_field.confidence
                    * field_validation.confidence_adjustment
                )
                new_confidence = min(1.0, max(0.0, scaled))

            adjusted[name] = ExtractedField(
                value=extracted_field.value,
                confidence=round(new_confidence, 3),
            )

        return adjusted


# Singleton instance
maintenance_receipt_extractor = MaintenanceReceiptExtractor()
|
||||||
@@ -4,6 +4,10 @@ from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matc
|
|||||||
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
|
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
|
||||||
from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
|
from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
|
||||||
from app.patterns.service_mapping import ServiceMapper, service_mapper
|
from app.patterns.service_mapping import ServiceMapper, service_mapper
|
||||||
|
from app.patterns.maintenance_receipt_validation import (
|
||||||
|
MaintenanceReceiptValidator,
|
||||||
|
maintenance_receipt_validator,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DatePatternMatcher",
|
"DatePatternMatcher",
|
||||||
@@ -16,4 +20,6 @@ __all__ = [
|
|||||||
"maintenance_matcher",
|
"maintenance_matcher",
|
||||||
"ServiceMapper",
|
"ServiceMapper",
|
||||||
"service_mapper",
|
"service_mapper",
|
||||||
|
"MaintenanceReceiptValidator",
|
||||||
|
"maintenance_receipt_validator",
|
||||||
]
|
]
|
||||||
|
|||||||
299
ocr/app/patterns/maintenance_receipt_validation.py
Normal file
299
ocr/app/patterns/maintenance_receipt_validation.py
Normal file
@@ -0,0 +1,299 @@
|
|||||||
|
"""Cross-validation patterns for maintenance receipt field extraction.
|
||||||
|
|
||||||
|
Validates structured fields (dates, amounts, odometer) extracted by Gemini
|
||||||
|
against regex patterns found in the OCR raw text. Boosts or reduces confidence
|
||||||
|
based on regex agreement.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from app.patterns.currency_patterns import currency_matcher
|
||||||
|
from app.patterns.date_patterns import date_matcher
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class FieldValidation:
|
||||||
|
"""Validation result for a single extracted field."""
|
||||||
|
|
||||||
|
field_name: str
|
||||||
|
regex_confirmed: bool
|
||||||
|
confidence_adjustment: float # Multiplier: >1.0 boosts, <1.0 reduces
|
||||||
|
regex_value: Optional[str] = None # Value found by regex, if any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MaintenanceReceiptValidation:
|
||||||
|
"""Aggregated validation result for a maintenance receipt."""
|
||||||
|
|
||||||
|
is_valid: bool
|
||||||
|
issues: list[str]
|
||||||
|
field_validations: dict[str, FieldValidation] = field(default_factory=dict)
|
||||||
|
overall_confidence: float = 1.0
|
||||||
|
|
||||||
|
|
||||||
|
# Odometer patterns: 5-7 digit numbers near odometer keywords
|
||||||
|
ODOMETER_PATTERNS = [
|
||||||
|
# "Odometer: 45,231" or "Mileage: 45231"
|
||||||
|
(
|
||||||
|
r"(?:ODOMETER|MILEAGE|MILES|ODO|MI)\s*[:\s]\s*(\d{1,3}[,.]?\d{3,4})",
|
||||||
|
"labeled_odometer",
|
||||||
|
0.95,
|
||||||
|
),
|
||||||
|
# "45,231 mi" or "45231 miles"
|
||||||
|
(
|
||||||
|
r"(\d{1,3}[,.]?\d{3,4})\s*(?:MI|MILES|KM)",
|
||||||
|
"unit_odometer",
|
||||||
|
0.90,
|
||||||
|
),
|
||||||
|
# Standalone 5-6 digit number (lower confidence)
|
||||||
|
(
|
||||||
|
r"(?<!\d)(\d{5,6})(?!\d)",
|
||||||
|
"standalone_odometer",
|
||||||
|
0.60,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class OdometerMatch:
|
||||||
|
"""Result of odometer pattern matching."""
|
||||||
|
|
||||||
|
value: int
|
||||||
|
raw_match: str
|
||||||
|
confidence: float
|
||||||
|
pattern_name: str
|
||||||
|
|
||||||
|
|
||||||
|
class MaintenanceReceiptValidator:
|
||||||
|
"""Cross-validates Gemini-extracted maintenance receipt fields against regex patterns."""
|
||||||
|
|
||||||
|
def validate(
|
||||||
|
self,
|
||||||
|
gemini_fields: dict,
|
||||||
|
raw_text: str,
|
||||||
|
) -> MaintenanceReceiptValidation:
|
||||||
|
"""Validate Gemini-extracted fields against regex patterns in raw OCR text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gemini_fields: Fields extracted by Gemini (field_name -> value).
|
||||||
|
raw_text: Raw OCR text for regex cross-validation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
MaintenanceReceiptValidation with per-field results.
|
||||||
|
"""
|
||||||
|
issues: list[str] = []
|
||||||
|
field_validations: dict[str, FieldValidation] = {}
|
||||||
|
overall_confidence = 1.0
|
||||||
|
|
||||||
|
# Validate date field
|
||||||
|
if "serviceDate" in gemini_fields:
|
||||||
|
date_validation = self._validate_date(
|
||||||
|
gemini_fields["serviceDate"], raw_text
|
||||||
|
)
|
||||||
|
field_validations["serviceDate"] = date_validation
|
||||||
|
if not date_validation.regex_confirmed:
|
||||||
|
issues.append(
|
||||||
|
f"Service date '{gemini_fields['serviceDate']}' not confirmed by regex"
|
||||||
|
)
|
||||||
|
overall_confidence *= 0.85
|
||||||
|
|
||||||
|
# Validate total cost
|
||||||
|
if "totalCost" in gemini_fields:
|
||||||
|
cost_validation = self._validate_amount(
|
||||||
|
"totalCost", gemini_fields["totalCost"], raw_text
|
||||||
|
)
|
||||||
|
field_validations["totalCost"] = cost_validation
|
||||||
|
if not cost_validation.regex_confirmed:
|
||||||
|
issues.append(
|
||||||
|
f"Total cost '{gemini_fields['totalCost']}' not confirmed by regex"
|
||||||
|
)
|
||||||
|
overall_confidence *= 0.85
|
||||||
|
|
||||||
|
# Validate labor cost
|
||||||
|
if "laborCost" in gemini_fields:
|
||||||
|
labor_validation = self._validate_amount(
|
||||||
|
"laborCost", gemini_fields["laborCost"], raw_text
|
||||||
|
)
|
||||||
|
field_validations["laborCost"] = labor_validation
|
||||||
|
if not labor_validation.regex_confirmed:
|
||||||
|
issues.append(
|
||||||
|
f"Labor cost '{gemini_fields['laborCost']}' not confirmed by regex"
|
||||||
|
)
|
||||||
|
overall_confidence *= 0.90
|
||||||
|
|
||||||
|
# Validate odometer
|
||||||
|
if "odometerReading" in gemini_fields:
|
||||||
|
odo_validation = self._validate_odometer(
|
||||||
|
gemini_fields["odometerReading"], raw_text
|
||||||
|
)
|
||||||
|
field_validations["odometerReading"] = odo_validation
|
||||||
|
if not odo_validation.regex_confirmed:
|
||||||
|
issues.append(
|
||||||
|
f"Odometer '{gemini_fields['odometerReading']}' not confirmed by regex"
|
||||||
|
)
|
||||||
|
overall_confidence *= 0.90
|
||||||
|
|
||||||
|
is_valid = len(issues) == 0
|
||||||
|
return MaintenanceReceiptValidation(
|
||||||
|
is_valid=is_valid,
|
||||||
|
issues=issues,
|
||||||
|
field_validations=field_validations,
|
||||||
|
overall_confidence=overall_confidence,
|
||||||
|
)
|
||||||
|
|
||||||
|
def extract_odometer(self, text: str) -> Optional[OdometerMatch]:
|
||||||
|
"""Extract odometer reading from text using regex patterns.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: OCR text to search.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Best OdometerMatch or None.
|
||||||
|
"""
|
||||||
|
text_upper = text.upper()
|
||||||
|
best_match: Optional[OdometerMatch] = None
|
||||||
|
|
||||||
|
for pattern, name, confidence in ODOMETER_PATTERNS:
|
||||||
|
for match in re.finditer(pattern, text_upper):
|
||||||
|
raw_value = match.group(1)
|
||||||
|
parsed = self._parse_odometer(raw_value)
|
||||||
|
if parsed is not None and self._is_reasonable_odometer(parsed):
|
||||||
|
candidate = OdometerMatch(
|
||||||
|
value=parsed,
|
||||||
|
raw_match=match.group(0).strip(),
|
||||||
|
confidence=confidence,
|
||||||
|
pattern_name=name,
|
||||||
|
)
|
||||||
|
if best_match is None or candidate.confidence > best_match.confidence:
|
||||||
|
best_match = candidate
|
||||||
|
|
||||||
|
return best_match
|
||||||
|
|
||||||
|
def _validate_date(self, gemini_date: str, raw_text: str) -> FieldValidation:
|
||||||
|
"""Check if Gemini-extracted date matches a regex-found date."""
|
||||||
|
regex_dates = date_matcher.extract_dates(raw_text)
|
||||||
|
|
||||||
|
if not regex_dates:
|
||||||
|
# No dates found by regex -- cannot confirm or deny
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="serviceDate",
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.95,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Normalize Gemini date for comparison
|
||||||
|
gemini_normalized = gemini_date.strip().replace("/", "-")
|
||||||
|
|
||||||
|
for regex_date in regex_dates:
|
||||||
|
if regex_date.value == gemini_normalized:
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="serviceDate",
|
||||||
|
regex_confirmed=True,
|
||||||
|
confidence_adjustment=1.10,
|
||||||
|
regex_value=regex_date.value,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Gemini found a date but it doesn't match regex dates
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="serviceDate",
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.80,
|
||||||
|
regex_value=regex_dates[0].value if regex_dates else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _validate_amount(
|
||||||
|
self, field_name: str, gemini_amount: str | float, raw_text: str
|
||||||
|
) -> FieldValidation:
|
||||||
|
"""Check if Gemini-extracted amount matches a regex-found amount."""
|
||||||
|
try:
|
||||||
|
gemini_value = float(str(gemini_amount).replace("$", "").replace(",", ""))
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return FieldValidation(
|
||||||
|
field_name=field_name,
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.70,
|
||||||
|
)
|
||||||
|
|
||||||
|
regex_amounts = currency_matcher.extract_all_amounts(raw_text)
|
||||||
|
|
||||||
|
if not regex_amounts:
|
||||||
|
return FieldValidation(
|
||||||
|
field_name=field_name,
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.95,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if any regex amount matches within 5% tolerance
|
||||||
|
for regex_amount in regex_amounts:
|
||||||
|
if gemini_value > 0 and abs(regex_amount.value - gemini_value) / gemini_value < 0.05:
|
||||||
|
return FieldValidation(
|
||||||
|
field_name=field_name,
|
||||||
|
regex_confirmed=True,
|
||||||
|
confidence_adjustment=1.10,
|
||||||
|
regex_value=str(regex_amount.value),
|
||||||
|
)
|
||||||
|
|
||||||
|
return FieldValidation(
|
||||||
|
field_name=field_name,
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.80,
|
||||||
|
regex_value=str(regex_amounts[0].value) if regex_amounts else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _validate_odometer(
|
||||||
|
self, gemini_odometer: str | int, raw_text: str
|
||||||
|
) -> FieldValidation:
|
||||||
|
"""Check if Gemini-extracted odometer matches a regex-found reading."""
|
||||||
|
try:
|
||||||
|
gemini_value = int(
|
||||||
|
str(gemini_odometer).replace(",", "").replace(".", "").strip()
|
||||||
|
)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="odometerReading",
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.70,
|
||||||
|
)
|
||||||
|
|
||||||
|
regex_match = self.extract_odometer(raw_text)
|
||||||
|
|
||||||
|
if not regex_match:
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="odometerReading",
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.95,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if values match within 1% tolerance (OCR might misread a digit)
|
||||||
|
if gemini_value > 0 and abs(regex_match.value - gemini_value) / gemini_value < 0.01:
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="odometerReading",
|
||||||
|
regex_confirmed=True,
|
||||||
|
confidence_adjustment=1.10,
|
||||||
|
regex_value=str(regex_match.value),
|
||||||
|
)
|
||||||
|
|
||||||
|
return FieldValidation(
|
||||||
|
field_name="odometerReading",
|
||||||
|
regex_confirmed=False,
|
||||||
|
confidence_adjustment=0.80,
|
||||||
|
regex_value=str(regex_match.value),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _parse_odometer(self, raw: str) -> Optional[int]:
|
||||||
|
"""Parse odometer string to integer."""
|
||||||
|
cleaned = raw.replace(",", "").replace(".", "").strip()
|
||||||
|
try:
|
||||||
|
return int(cleaned)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _is_reasonable_odometer(self, value: int) -> bool:
|
||||||
|
"""Check if odometer reading is in a reasonable range."""
|
||||||
|
return 100 <= value <= 999_999
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
|
||||||
|
maintenance_receipt_validator = MaintenanceReceiptValidator()
|
||||||
@@ -6,6 +6,7 @@ from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query
|
|||||||
|
|
||||||
from app.extractors.vin_extractor import vin_extractor
|
from app.extractors.vin_extractor import vin_extractor
|
||||||
from app.extractors.receipt_extractor import receipt_extractor
|
from app.extractors.receipt_extractor import receipt_extractor
|
||||||
|
from app.extractors.maintenance_receipt_extractor import maintenance_receipt_extractor
|
||||||
from app.extractors.manual_extractor import manual_extractor
|
from app.extractors.manual_extractor import manual_extractor
|
||||||
from app.models import (
|
from app.models import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
@@ -267,6 +268,95 @@ async def extract_receipt(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/maintenance-receipt", response_model=ReceiptExtractionResponse)
async def extract_maintenance_receipt(
    file: UploadFile = File(..., description="Maintenance receipt image file"),
) -> ReceiptExtractionResponse:
    """
    Extract data from a maintenance receipt image using OCR + Gemini.

    Gemini-primary extraction with regex cross-validation:
    - OCR preprocessing (HEIC conversion, contrast, thresholding)
    - PaddleOCR text extraction
    - Gemini semantic field extraction from OCR text
    - Regex cross-validation for dates, amounts, odometer

    Supports HEIC, JPEG, PNG formats.

    - **file**: Maintenance receipt image file (max 10MB)

    Returns:
    - **receiptType**: "maintenance"
    - **extractedFields**: Dictionary of extracted fields with confidence scores
        - serviceName: Service performed (e.g., "Oil Change")
        - serviceDate: Date in YYYY-MM-DD format
        - totalCost: Total cost
        - shopName: Shop or business name
        - laborCost: Labor cost (if broken out)
        - partsCost: Parts cost (if broken out)
        - odometerReading: Odometer reading (if present)
        - vehicleInfo: Vehicle description (if present)
    - **rawText**: Full OCR text
    - **processingTimeMs**: Processing time in milliseconds
    """
    # Reject uploads that arrive without a filename.
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The whole upload is read into memory before it is measured.
    payload = await file.read()
    size_bytes = len(payload)

    # Oversized uploads are rejected before the empty-file check.
    if size_bytes > MAX_SYNC_SIZE:
        limit_mb = MAX_SYNC_SIZE // (1024 * 1024)
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {limit_mb}MB",
        )

    if size_bytes == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    logger.info(
        f"Maintenance receipt extraction: {file.filename}, "
        f"size: {size_bytes} bytes, "
        f"content_type: {file.content_type}"
    )

    # Run OCR + Gemini extraction on the uploaded bytes.
    outcome = maintenance_receipt_extractor.extract(
        image_bytes=payload,
        content_type=file.content_type,
    )

    if not outcome.success:
        logger.warning(
            f"Maintenance receipt extraction failed for {file.filename}: {outcome.error}"
        )
        failure_detail = (
            outcome.error or "Failed to extract data from maintenance receipt"
        )
        raise HTTPException(status_code=422, detail=failure_detail)

    # Map each internal field onto the API response model.
    response_fields = {}
    for field_name, extracted in outcome.extracted_fields.items():
        response_fields[field_name] = ReceiptExtractedField(
            value=extracted.value,
            confidence=extracted.confidence,
        )

    return ReceiptExtractionResponse(
        success=outcome.success,
        receiptType=outcome.receipt_type,
        extractedFields=response_fields,
        rawText=outcome.raw_text,
        processingTimeMs=outcome.processing_time_ms,
        error=outcome.error,
    )
|
||||||
|
|
||||||
|
|
||||||
@router.post("/manual", response_model=ManualJobResponse)
|
@router.post("/manual", response_model=ManualJobResponse)
|
||||||
async def extract_manual(
|
async def extract_manual(
|
||||||
background_tasks: BackgroundTasks,
|
background_tasks: BackgroundTasks,
|
||||||
|
|||||||
Reference in New Issue
Block a user