# File: motovaultpro/ocr/app/extractors/manual_extractor.py
# Repository viewer metadata: 2026-02-11 14:40:11 -06:00 · 167 lines · 5.1 KiB · Python

"""Owner's manual extractor for maintenance schedule extraction via Gemini."""
import logging
import time
from dataclasses import dataclass, field
from typing import Callable, Optional
from app.engines.gemini_engine import GeminiEngine, GeminiEngineError
from app.patterns.service_mapping import service_mapper
logger = logging.getLogger(__name__)
@dataclass
class ExtractedSchedule:
    """One maintenance schedule entry produced by manual extraction.

    Attributes:
        service: Name of the maintenance service.
        interval_miles: Mileage interval, or None when not available.
        interval_months: Time interval in months, or None when not available.
        details: Optional free-text details for the entry.
        confidence: Confidence score attached to the entry.
        subtypes: Maintenance subtypes associated with the service; every
            instance gets its own fresh empty list by default.
    """

    # Required fields (positional order is part of the constructor interface).
    service: str
    interval_miles: Optional[int]
    interval_months: Optional[int]
    details: Optional[str]
    confidence: float

    # default_factory avoids sharing one mutable list between instances.
    subtypes: list[str] = field(default_factory=list)
@dataclass
class VehicleInfo:
    """Vehicle identification details taken from an owner's manual.

    Attributes:
        make: Vehicle make, or None when unknown.
        model: Vehicle model, or None when unknown.
        year: Vehicle year, or None when unknown.
    """

    # All three fields are required constructor arguments but may be None.
    make: Optional[str]
    model: Optional[str]
    year: Optional[int]
@dataclass
class ManualExtractionResult:
    """Outcome of one manual extraction run, successful or not.

    Attributes:
        success: True when extraction completed, False when it failed.
        vehicle_info: Vehicle details, or None when not available.
        maintenance_schedules: Extracted schedule entries.
        raw_tables: Raw table data as a list of dicts.
        processing_time_ms: Elapsed processing time in milliseconds.
        total_pages: Total page count reported for the document.
        pages_processed: Number of pages reported as processed.
        error: Error message for a failed run; None by default.
    """

    # Outcome flag and extracted payload.
    success: bool
    vehicle_info: Optional[VehicleInfo]
    maintenance_schedules: list[ExtractedSchedule]
    raw_tables: list[dict]

    # Run statistics.
    processing_time_ms: int
    total_pages: int
    pages_processed: int

    # Only populated on failure.
    error: Optional[str] = None
class ManualExtractor:
    """Extract maintenance schedules from owner's manuals using Gemini.

    Processing pipeline:
    1. Send entire PDF to Gemini for semantic extraction
    2. Map extracted service names to system maintenance subtypes via fuzzy matching
    3. Return structured results
    """

    # Confidence assigned to Gemini-extracted items that the service mapper
    # could not match to a known maintenance subtype.
    DEFAULT_CONFIDENCE = 0.85

    def __init__(self) -> None:
        """Create the extractor together with the Gemini engine it delegates to."""
        self._engine = GeminiEngine()

    def extract(
        self,
        pdf_bytes: bytes,
        progress_callback: Optional[Callable[[int, str], None]] = None,
    ) -> ManualExtractionResult:
        """Extract maintenance schedules from an owner's manual PDF.

        This method does not raise for extraction failures: any exception
        raised inside the pipeline (including one raised by
        ``progress_callback``) is logged and reported through a result with
        ``success=False`` and ``error`` set to the exception text.

        Args:
            pdf_bytes: Raw PDF bytes
            progress_callback: Optional callback for progress updates
                (percent, message). Every update is also logged at INFO level.

        Returns:
            ManualExtractionResult with extracted data. ``vehicle_info`` is
            always None, ``raw_tables`` is always empty and both page counters
            are always 0 in the results built here.
        """
        # perf_counter() is monotonic, so the measured duration cannot become
        # wrong or negative when the system wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        def update_progress(percent: int, message: str) -> None:
            if progress_callback:
                progress_callback(percent, message)
            logger.info("Progress %s%%: %s", percent, message)

        try:
            update_progress(10, "Preparing extraction")
            update_progress(50, "Processing with Gemini")
            gemini_result = self._engine.extract_maintenance(pdf_bytes)

            update_progress(95, "Mapping results")
            schedules: list[ExtractedSchedule] = [
                self._to_schedule(item) for item in gemini_result.items
            ]

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                "Extraction complete: %s schedules in %sms",
                len(schedules),
                processing_time_ms,
            )
            update_progress(100, "Complete")

            return ManualExtractionResult(
                success=True,
                vehicle_info=None,
                maintenance_schedules=schedules,
                raw_tables=[],
                processing_time_ms=processing_time_ms,
                total_pages=0,
                pages_processed=0,
            )
        except GeminiEngineError as e:
            # Engine failures get their own log message; the result shape is
            # the same as for any other failure.
            logger.exception("Gemini extraction failed: %s", e)
            return self._failure_result(e, start_time)
        except Exception as e:
            # Boundary handler: callers always receive a result object.
            logger.exception("Manual extraction failed: %s", e)
            return self._failure_result(e, start_time)

    def _to_schedule(self, item) -> ExtractedSchedule:
        """Convert one Gemini result item into an ExtractedSchedule.

        When the service mapper finds a fuzzy match, its normalized name,
        subtypes and confidence are used; otherwise the item's own service
        name is kept with no subtypes and DEFAULT_CONFIDENCE.
        """
        mapping = service_mapper.map_service_fuzzy(item.service_name)
        if mapping:
            subtypes = mapping.subtypes
            confidence = mapping.confidence
            service_name = mapping.normalized_name
        else:
            subtypes = []
            confidence = self.DEFAULT_CONFIDENCE
            service_name = item.service_name

        return ExtractedSchedule(
            service=service_name,
            interval_miles=item.interval_miles,
            interval_months=item.interval_months,
            details=item.details,
            confidence=confidence,
            subtypes=subtypes,
        )

    def _failure_result(
        self, error: Exception, start_time: float
    ) -> ManualExtractionResult:
        """Build the empty, unsuccessful result reported for ``error``."""
        return ManualExtractionResult(
            success=False,
            vehicle_info=None,
            maintenance_schedules=[],
            raw_tables=[],
            processing_time_ms=self._elapsed_ms(start_time),
            total_pages=0,
            pages_processed=0,
            error=str(error),
        )

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since a ``perf_counter`` reading."""
        return int((time.perf_counter() - start_time) * 1000)
# Singleton instance: a shared ManualExtractor created when this module is
# imported. Constructing it also instantiates a GeminiEngine, because
# ManualExtractor.__init__ creates one.
manual_extractor = ManualExtractor()