diff --git a/ocr/app/extractors/__init__.py b/ocr/app/extractors/__init__.py index d0468f1..c0026a7 100644 --- a/ocr/app/extractors/__init__.py +++ b/ocr/app/extractors/__init__.py @@ -8,6 +8,13 @@ from app.extractors.receipt_extractor import ( ExtractedField, ) from app.extractors.fuel_receipt import FuelReceiptExtractor, fuel_receipt_extractor +from app.extractors.manual_extractor import ( + ManualExtractor, + manual_extractor, + ManualExtractionResult, + ExtractedSchedule, + VehicleInfo, +) __all__ = [ "BaseExtractor", @@ -20,4 +27,9 @@ __all__ = [ "ExtractedField", "FuelReceiptExtractor", "fuel_receipt_extractor", + "ManualExtractor", + "manual_extractor", + "ManualExtractionResult", + "ExtractedSchedule", + "VehicleInfo", ] diff --git a/ocr/app/extractors/manual_extractor.py b/ocr/app/extractors/manual_extractor.py new file mode 100644 index 0000000..e447882 --- /dev/null +++ b/ocr/app/extractors/manual_extractor.py @@ -0,0 +1,417 @@ +"""Owner's manual extractor for maintenance schedule extraction.""" +import io +import logging +import time +from dataclasses import dataclass, field +from typing import Callable, Optional + +import pytesseract +from PIL import Image + +from app.preprocessors.pdf_preprocessor import pdf_preprocessor, PdfInfo +from app.table_extraction.detector import table_detector, DetectedTable +from app.table_extraction.parser import table_parser, ParsedScheduleRow +from app.patterns.maintenance_patterns import maintenance_matcher + +logger = logging.getLogger(__name__) + + +@dataclass +class ExtractedSchedule: + """A single extracted maintenance schedule.""" + + service: str + interval_miles: Optional[int] + interval_months: Optional[int] + details: Optional[str] + confidence: float + subtypes: list[str] = field(default_factory=list) + + +@dataclass +class VehicleInfo: + """Vehicle information extracted from manual.""" + + make: Optional[str] + model: Optional[str] + year: Optional[int] + + +@dataclass +class ManualExtractionResult: + 
"""Complete result of manual extraction.""" + + success: bool + vehicle_info: Optional[VehicleInfo] + maintenance_schedules: list[ExtractedSchedule] + raw_tables: list[dict] + processing_time_ms: int + total_pages: int + pages_processed: int + error: Optional[str] = None + + +class ManualExtractor: + """Extract maintenance schedules from owner's manuals. + + Processing pipeline: + 1. Analyze PDF structure + 2. Find maintenance section pages + 3. Extract text (native) or OCR (scanned) + 4. Detect tables + 5. Parse schedules + 6. Normalize and deduplicate + """ + + # Maximum pages to process for performance + MAX_PAGES_TO_PROCESS = 50 + + # Minimum confidence to include schedule + MIN_CONFIDENCE = 0.5 + + def extract( + self, + pdf_bytes: bytes, + progress_callback: Optional[Callable[[int, str], None]] = None, + ) -> ManualExtractionResult: + """ + Extract maintenance schedules from an owner's manual PDF. + + Args: + pdf_bytes: Raw PDF bytes + progress_callback: Optional callback for progress updates (percent, message) + + Returns: + ManualExtractionResult with extracted data + """ + start_time = time.time() + + def update_progress(percent: int, message: str) -> None: + if progress_callback: + progress_callback(percent, message) + logger.info(f"Progress {percent}%: {message}") + + try: + update_progress(5, "Analyzing PDF structure") + + # Get PDF info + pdf_info = pdf_preprocessor.get_pdf_info(pdf_bytes) + logger.info( + f"PDF: {pdf_info.total_pages} pages, " + f"has_text={pdf_info.has_text_layer}, " + f"is_scanned={pdf_info.is_scanned}" + ) + + update_progress(10, "Finding maintenance sections") + + # Find pages likely to contain maintenance schedules + maintenance_pages = pdf_preprocessor.find_maintenance_section(pdf_bytes) + + if not maintenance_pages: + # If no specific pages found, process first N pages + maintenance_pages = list(range(min(self.MAX_PAGES_TO_PROCESS, pdf_info.total_pages))) + logger.info("No specific maintenance section found, processing all 
pages") + else: + # Include pages before and after detected maintenance pages + expanded_pages: set[int] = set() + for page in maintenance_pages: + for offset in range(-2, 5): # Include 2 before, 4 after + new_page = page + offset + if 0 <= new_page < pdf_info.total_pages: + expanded_pages.add(new_page) + maintenance_pages = sorted(expanded_pages)[:self.MAX_PAGES_TO_PROCESS] + logger.info(f"Processing {len(maintenance_pages)} pages around maintenance section") + + update_progress(15, "Extracting page content") + + # Extract content from pages + all_schedules: list[ParsedScheduleRow] = [] + all_tables: list[dict] = [] + pages_processed = 0 + + for i, page_num in enumerate(maintenance_pages): + page_progress = 15 + int((i / len(maintenance_pages)) * 60) + update_progress(page_progress, f"Processing page {page_num + 1}") + + # Extract page content + page_content = pdf_preprocessor.extract_text_from_page(pdf_bytes, page_num) + pages_processed += 1 + + # Process based on content type + if page_content.has_text: + # Native PDF - use text directly + schedules, tables = self._process_text_page( + page_content.text_content, page_num + ) + elif page_content.image_bytes: + # Scanned PDF - OCR required + schedules, tables = self._process_scanned_page( + page_content.image_bytes, page_num + ) + else: + continue + + all_schedules.extend(schedules) + all_tables.extend(tables) + + update_progress(75, "Normalizing results") + + # Deduplicate and normalize schedules + normalized_schedules = self._normalize_schedules(all_schedules) + + update_progress(85, "Extracting vehicle information") + + # Try to extract vehicle info from first few pages + vehicle_info = self._extract_vehicle_info(pdf_bytes, pdf_info) + + update_progress(95, "Finalizing results") + + processing_time_ms = int((time.time() - start_time) * 1000) + + logger.info( + f"Extraction complete: {len(normalized_schedules)} schedules from " + f"{pages_processed} pages in {processing_time_ms}ms" + ) + + update_progress(100, 
"Complete") + + return ManualExtractionResult( + success=True, + vehicle_info=vehicle_info, + maintenance_schedules=normalized_schedules, + raw_tables=[{"page": t.get("page", 0), "rows": t.get("rows", 0)} for t in all_tables], + processing_time_ms=processing_time_ms, + total_pages=pdf_info.total_pages, + pages_processed=pages_processed, + ) + + except Exception as e: + logger.error(f"Manual extraction failed: {e}", exc_info=True) + processing_time_ms = int((time.time() - start_time) * 1000) + + return ManualExtractionResult( + success=False, + vehicle_info=None, + maintenance_schedules=[], + raw_tables=[], + processing_time_ms=processing_time_ms, + total_pages=0, + pages_processed=0, + error=str(e), + ) + + def _process_text_page( + self, text: str, page_number: int + ) -> tuple[list[ParsedScheduleRow], list[dict]]: + """Process a native PDF page with text.""" + schedules: list[ParsedScheduleRow] = [] + tables: list[dict] = [] + + # Detect tables in text + detected_tables = table_detector.detect_tables_in_text(text, page_number) + + for table in detected_tables: + if table.is_maintenance_table and table.header_row: + # Parse table + parsed = table_parser.parse_table( + table.header_row, + table.raw_content, + ) + schedules.extend(parsed) + + tables.append({ + "page": page_number, + "rows": len(table.raw_content), + "is_maintenance": True, + }) + + # Also try to extract from unstructured text + text_schedules = table_parser.parse_text_block(text) + schedules.extend(text_schedules) + + return schedules, tables + + def _process_scanned_page( + self, image_bytes: bytes, page_number: int + ) -> tuple[list[ParsedScheduleRow], list[dict]]: + """Process a scanned PDF page with OCR.""" + schedules: list[ParsedScheduleRow] = [] + tables: list[dict] = [] + + # Detect tables in image + detected_tables = table_detector.detect_tables_in_image(image_bytes, page_number) + + # OCR the full page + try: + image = Image.open(io.BytesIO(image_bytes)) + ocr_text = 
pytesseract.image_to_string(image) + + # Mark tables as maintenance if page contains maintenance keywords + for table in detected_tables: + table.is_maintenance_table = table_detector.is_maintenance_table( + table, ocr_text + ) + + # Try to extract from OCR text + text_tables = table_detector.detect_tables_in_text(ocr_text, page_number) + + for table in text_tables: + if table.is_maintenance_table and table.header_row: + parsed = table_parser.parse_table( + table.header_row, + table.raw_content, + ) + schedules.extend(parsed) + + tables.append({ + "page": page_number, + "rows": len(table.raw_content), + "is_maintenance": True, + }) + + # Also try unstructured text + text_schedules = table_parser.parse_text_block(ocr_text) + schedules.extend(text_schedules) + + except Exception as e: + logger.warning(f"OCR failed for page {page_number}: {e}") + + return schedules, tables + + def _normalize_schedules( + self, schedules: list[ParsedScheduleRow] + ) -> list[ExtractedSchedule]: + """Normalize and deduplicate extracted schedules.""" + # Group by normalized service name + by_service: dict[str, list[ParsedScheduleRow]] = {} + + for schedule in schedules: + if schedule.confidence < self.MIN_CONFIDENCE: + continue + + key = schedule.normalized_service or schedule.service.lower() + if key not in by_service: + by_service[key] = [] + by_service[key].append(schedule) + + # Merge duplicates, keeping highest confidence + results: list[ExtractedSchedule] = [] + + for service_key, items in by_service.items(): + # Sort by confidence + items.sort(key=lambda x: x.confidence, reverse=True) + best = items[0] + + # Merge interval info from other items if missing + miles = best.interval_miles + months = best.interval_months + details = best.details + fluid_spec = best.fluid_spec + + for item in items[1:]: + if not miles and item.interval_miles: + miles = item.interval_miles + if not months and item.interval_months: + months = item.interval_months + if not details and item.details: + 
details = item.details + if not fluid_spec and item.fluid_spec: + fluid_spec = item.fluid_spec + + # Build details string + detail_parts = [] + if details: + detail_parts.append(details) + if fluid_spec: + detail_parts.append(f"Use {fluid_spec}") + + results.append( + ExtractedSchedule( + service=best.normalized_service or best.service, + interval_miles=miles, + interval_months=months, + details=" - ".join(detail_parts) if detail_parts else None, + confidence=best.confidence, + subtypes=best.subtypes, + ) + ) + + # Sort by confidence + results.sort(key=lambda x: x.confidence, reverse=True) + + return results + + def _extract_vehicle_info( + self, pdf_bytes: bytes, pdf_info: PdfInfo + ) -> Optional[VehicleInfo]: + """Extract vehicle make/model/year from manual.""" + # Check metadata first + if pdf_info.title: + info = self._parse_vehicle_from_title(pdf_info.title) + if info: + return info + + # Try first page + try: + first_page = pdf_preprocessor.extract_text_from_page(pdf_bytes, 0) + text = first_page.text_content + + if not text and first_page.image_bytes: + # OCR first page + image = Image.open(io.BytesIO(first_page.image_bytes)) + text = pytesseract.image_to_string(image) + + if text: + return self._parse_vehicle_from_text(text) + + except Exception as e: + logger.warning(f"Failed to extract vehicle info: {e}") + + return None + + def _parse_vehicle_from_title(self, title: str) -> Optional[VehicleInfo]: + """Parse vehicle info from document title.""" + import re + + # Common patterns: "2024 Honda Civic Owner's Manual" + year_match = re.search(r"(20\d{2}|19\d{2})", title) + year = int(year_match.group(1)) if year_match else None + + # Common makes + makes = [ + "Acura", "Alfa Romeo", "Audi", "BMW", "Buick", "Cadillac", + "Chevrolet", "Chrysler", "Dodge", "Ferrari", "Fiat", "Ford", + "Genesis", "GMC", "Honda", "Hyundai", "Infiniti", "Jaguar", + "Jeep", "Kia", "Lamborghini", "Land Rover", "Lexus", "Lincoln", + "Maserati", "Mazda", "McLaren", "Mercedes", "Mini", 
"Mitsubishi", + "Nissan", "Porsche", "Ram", "Rolls-Royce", "Subaru", "Tesla", + "Toyota", "Volkswagen", "Volvo", + ] + + make = None + model = None + + for m in makes: + if m.lower() in title.lower(): + make = m + # Try to find model after make + idx = title.lower().find(m.lower()) + after = title[idx + len(m):].strip() + # First word after make is likely model + model_match = re.match(r"^(\w+)", after) + if model_match: + model = model_match.group(1) + break + + if year or make: + return VehicleInfo(make=make, model=model, year=year) + + return None + + def _parse_vehicle_from_text(self, text: str) -> Optional[VehicleInfo]: + """Parse vehicle info from page text.""" + return self._parse_vehicle_from_title(text[:500]) # Use first 500 chars + + +# Singleton instance +manual_extractor = ManualExtractor() diff --git a/ocr/app/main.py b/ocr/app/main.py index dca38c1..d5c87ab 100644 --- a/ocr/app/main.py +++ b/ocr/app/main.py @@ -56,6 +56,8 @@ async def root() -> dict: "endpoints": [ "POST /extract - Synchronous OCR extraction", "POST /extract/vin - VIN-specific extraction with validation", + "POST /extract/receipt - Receipt extraction (fuel, general)", + "POST /extract/manual - Owner's manual extraction (async)", "POST /jobs - Submit async OCR job", "GET /jobs/{job_id} - Get async job status", ], diff --git a/ocr/app/models/__init__.py b/ocr/app/models/__init__.py index eecbf23..6c5a8aa 100644 --- a/ocr/app/models/__init__.py +++ b/ocr/app/models/__init__.py @@ -6,6 +6,10 @@ from .schemas import ( JobResponse, JobStatus, JobSubmitRequest, + ManualExtractionResponse, + ManualJobResponse, + ManualMaintenanceSchedule, + ManualVehicleInfo, OcrResponse, ReceiptExtractedField, ReceiptExtractionResponse, @@ -20,6 +24,10 @@ __all__ = [ "JobResponse", "JobStatus", "JobSubmitRequest", + "ManualExtractionResponse", + "ManualJobResponse", + "ManualMaintenanceSchedule", + "ManualVehicleInfo", "OcrResponse", "ReceiptExtractedField", "ReceiptExtractionResponse", diff --git 
a/ocr/app/models/schemas.py b/ocr/app/models/schemas.py index d1c9536..d6a8737 100644 --- a/ocr/app/models/schemas.py +++ b/ocr/app/models/schemas.py @@ -115,3 +115,57 @@ class ReceiptExtractionResponse(BaseModel): error: Optional[str] = None model_config = {"populate_by_name": True} + + +# Manual extraction models + + +class ManualVehicleInfo(BaseModel): + """Vehicle information extracted from manual.""" + + make: Optional[str] = None + model: Optional[str] = None + year: Optional[int] = None + + +class ManualMaintenanceSchedule(BaseModel): + """A single maintenance schedule entry.""" + + service: str + interval_miles: Optional[int] = Field(default=None, alias="intervalMiles") + interval_months: Optional[int] = Field(default=None, alias="intervalMonths") + details: Optional[str] = None + confidence: float = Field(ge=0.0, le=1.0) + subtypes: list[str] = Field(default_factory=list) + + model_config = {"populate_by_name": True} + + +class ManualExtractionResponse(BaseModel): + """Response from manual extraction endpoint.""" + + success: bool + vehicle_info: Optional[ManualVehicleInfo] = Field(default=None, alias="vehicleInfo") + maintenance_schedules: list[ManualMaintenanceSchedule] = Field( + default_factory=list, alias="maintenanceSchedules" + ) + raw_tables: list[dict] = Field(default_factory=list, alias="rawTables") + processing_time_ms: int = Field(alias="processingTimeMs") + total_pages: int = Field(alias="totalPages") + pages_processed: int = Field(alias="pagesProcessed") + error: Optional[str] = None + + model_config = {"populate_by_name": True} + + +class ManualJobResponse(BaseModel): + """Response for async manual extraction job.""" + + job_id: str = Field(alias="jobId") + status: JobStatus + progress: Optional[int] = Field(default=None, ge=0, le=100) + estimated_seconds: Optional[int] = Field(default=None, alias="estimatedSeconds") + result: Optional[ManualExtractionResponse] = None + error: Optional[str] = None + + model_config = {"populate_by_name": 
True} diff --git a/ocr/app/patterns/__init__.py b/ocr/app/patterns/__init__.py index e4d94c3..f9a8bc4 100644 --- a/ocr/app/patterns/__init__.py +++ b/ocr/app/patterns/__init__.py @@ -1,7 +1,9 @@ -"""Pattern matching modules for receipt field extraction.""" +"""Pattern matching modules for receipt and manual field extraction.""" from app.patterns.date_patterns import DatePatternMatcher, date_matcher from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher +from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher +from app.patterns.service_mapping import ServiceMapper, service_mapper __all__ = [ "DatePatternMatcher", @@ -10,4 +12,8 @@ __all__ = [ "currency_matcher", "FuelPatternMatcher", "fuel_matcher", + "MaintenancePatternMatcher", + "maintenance_matcher", + "ServiceMapper", + "service_mapper", ] diff --git a/ocr/app/patterns/maintenance_patterns.py b/ocr/app/patterns/maintenance_patterns.py new file mode 100644 index 0000000..81afb52 --- /dev/null +++ b/ocr/app/patterns/maintenance_patterns.py @@ -0,0 +1,335 @@ +"""Maintenance schedule pattern matching for owner's manual extraction.""" +import re +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class MileageIntervalMatch: + """Result of mileage interval pattern matching.""" + + value: int # Miles + raw_match: str + confidence: float + pattern_name: str + + +@dataclass +class TimeIntervalMatch: + """Result of time interval pattern matching.""" + + value: int # Months + raw_match: str + confidence: float + pattern_name: str + + +@dataclass +class FluidSpecMatch: + """Result of fluid specification pattern matching.""" + + value: str # e.g., "0W-20", "ATF-Z1", "DOT 4" + fluid_type: str # e.g., "oil", "transmission", "brake" + raw_match: str + confidence: float + + +class MaintenancePatternMatcher: + """Extract maintenance-specific data from owner's 
manual text.""" + + # Mileage interval patterns + MILEAGE_PATTERNS = [ + # "every 5,000 miles" or "every 5000 miles" + ( + r"every\s+([\d,]+)\s*(?:miles?|mi\.?)", + "every_miles", + 0.95, + ), + # "at 30,000 mi" or "at 30000 miles" + ( + r"at\s+([\d,]+)\s*(?:miles?|mi\.?)", + "at_miles", + 0.93, + ), + # "5,000 miles or" (interval before "or") + ( + r"([\d,]+)\s*(?:miles?|mi\.?)\s*(?:or|/)", + "miles_or", + 0.90, + ), + # "every 5,000-7,500 miles" (range - take lower) + ( + r"every\s+([\d,]+)\s*[-–]\s*[\d,]+\s*(?:miles?|mi\.?)", + "miles_range", + 0.88, + ), + # "7,500 mi/12 months" (interval with slash) + ( + r"([\d,]+)\s*(?:miles?|mi\.?)\s*/", + "miles_slash", + 0.87, + ), + # Standalone "X,XXX miles" in table context + ( + r"(? 12 months + ( + r"\bannually\b", + "annually", + 0.95, + ), + # "semi-annually" or "semi-annual" -> 6 months + ( + r"\bsemi-?annual(?:ly)?\b", + "semi_annual", + 0.95, + ), + # "every year" -> 12 months + ( + r"every\s+year", + "every_year", + 0.93, + ), + # "every 2 years" -> 24 months + ( + r"every\s+(\d+)\s*years?", + "every_years", + 0.93, + ), + # "12 mo/7,500 mi" or "12 months/" + ( + r"(\d+)\s*(?:mo(?:nths?)?\.?)\s*/", + "months_slash", + 0.87, + ), + # Standalone "X months" in table context + ( + r"(? Optional[MileageIntervalMatch]: + """ + Extract mileage interval from text. 
+ + Args: + text: Text to search for mileage intervals + + Returns: + MileageIntervalMatch or None if no interval found + """ + text_lower = text.lower() + + for pattern, name, confidence in self.MILEAGE_PATTERNS: + match = re.search(pattern, text_lower, re.IGNORECASE) + if match: + # Extract the number and remove commas + mileage_str = match.group(1).replace(",", "") + mileage = int(mileage_str) + + if self._is_reasonable_mileage(mileage): + return MileageIntervalMatch( + value=mileage, + raw_match=match.group(0), + confidence=confidence, + pattern_name=name, + ) + + return None + + def extract_time_interval(self, text: str) -> Optional[TimeIntervalMatch]: + """ + Extract time interval from text. + + Args: + text: Text to search for time intervals + + Returns: + TimeIntervalMatch or None if no interval found + """ + text_lower = text.lower() + + for pattern, name, confidence in self.TIME_PATTERNS: + match = re.search(pattern, text_lower, re.IGNORECASE) + if match: + # Handle special cases + if name == "annually": + months = 12 + elif name == "semi_annual": + months = 6 + elif name == "every_year": + months = 12 + elif name == "every_years": + years = int(match.group(1)) + months = years * 12 + else: + months = int(match.group(1)) + + if self._is_reasonable_months(months): + return TimeIntervalMatch( + value=months, + raw_match=match.group(0), + confidence=confidence, + pattern_name=name, + ) + + return None + + def extract_fluid_spec(self, text: str) -> Optional[FluidSpecMatch]: + """ + Extract fluid specification from text. 
+ + Args: + text: Text to search for fluid specs + + Returns: + FluidSpecMatch or None if no spec found + """ + for pattern, fluid_type, confidence in self.FLUID_PATTERNS: + match = re.search(pattern, text, re.IGNORECASE) + if match: + return FluidSpecMatch( + value=match.group(1).upper() if fluid_type != "coolant" else match.group(1), + fluid_type=fluid_type, + raw_match=match.group(0), + confidence=confidence, + ) + + return None + + def extract_all_fluid_specs(self, text: str) -> list[FluidSpecMatch]: + """ + Extract all fluid specifications from text. + + Args: + text: Text to search for fluid specs + + Returns: + List of FluidSpecMatch objects + """ + results = [] + seen_values: set[str] = set() + + for pattern, fluid_type, confidence in self.FLUID_PATTERNS: + for match in re.finditer(pattern, text, re.IGNORECASE): + value = match.group(1).upper() if fluid_type != "coolant" else match.group(1) + if value not in seen_values: + seen_values.add(value) + results.append( + FluidSpecMatch( + value=value, + fluid_type=fluid_type, + raw_match=match.group(0), + confidence=confidence, + ) + ) + + return results + + def extract_combined_interval( + self, text: str + ) -> tuple[Optional[MileageIntervalMatch], Optional[TimeIntervalMatch]]: + """ + Extract both mileage and time intervals from a combined pattern. + + Many schedules use patterns like "every 5,000 miles or 6 months". 
+ + Args: + text: Text to search + + Returns: + Tuple of (mileage_match, time_match) + """ + mileage = self.extract_mileage_interval(text) + time = self.extract_time_interval(text) + return mileage, time + + def _is_reasonable_mileage(self, mileage: int) -> bool: + """Check if mileage interval is reasonable for maintenance.""" + # Typical ranges: 1,000 to 100,000 miles + return 500 <= mileage <= 150000 + + def _is_reasonable_months(self, months: int) -> bool: + """Check if month interval is reasonable for maintenance.""" + # Typical ranges: 1 to 120 months (10 years) + return 1 <= months <= 120 + + +# Singleton instance +maintenance_matcher = MaintenancePatternMatcher() diff --git a/ocr/app/patterns/service_mapping.py b/ocr/app/patterns/service_mapping.py new file mode 100644 index 0000000..7f7e398 --- /dev/null +++ b/ocr/app/patterns/service_mapping.py @@ -0,0 +1,259 @@ +"""Service name normalization and mapping to maintenance subtypes.""" +import re +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ServiceMapping: + """Mapping result from extracted text to maintenance subtypes.""" + + normalized_name: str # Standardized service name + subtypes: list[str] # Maintenance subtypes from the system + category: str # routine_maintenance, repair, performance_upgrade + confidence: float + + +# Maintenance subtypes from the system (must match exactly) +ROUTINE_MAINTENANCE_SUBTYPES = [ + "Accelerator Pedal", + "Air Filter Element", + "Brakes and Traction Control", + "Cabin Air Filter / Purifier", + "Coolant", + "Doors", + "Drive Belt", + "Engine Oil", + "Evaporative Emissions System", + "Exhaust System", + "Fluid - A/T", + "Fluid - Differential", + "Fluid - M/T", + "Fluid Filter - A/T", + "Fluids", + "Fuel Delivery and Air Induction", + "Hood Shock / Support", + "Neutral Safety Switch", + "Parking Brake System", + "Restraints and Safety Systems", + "Shift Interlock A/T", + "Spark Plug", + "Steering and Suspension", + "Tires", + "Trunk / 
Liftgate Shock / Support", + "Washer Fluid", + "Wiper Blade", +] + + +class ServiceMapper: + """Map extracted service names to maintenance subtypes.""" + + # Mapping from common service terms to system subtypes + # Keys are lowercase patterns, values are (normalized_name, subtypes, category, confidence) + SERVICE_MAPPINGS: dict[str, tuple[str, list[str], str, float]] = { + # Oil related + "engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + "oil change": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + "motor oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.93), + "oil and filter": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + "oil & filter": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + "change engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + "replace engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + # Air filter + "air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.90), + "engine air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95), + "air cleaner": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.88), + "air cleaner element": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.93), + "replace air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95), + # Cabin filter + "cabin air filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.95), + "cabin filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.93), + "a/c filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.88), + "hvac filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.88), + 
"interior air filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.90), + "dust and pollen filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.90), + # Tires + "tire rotation": ("Tire Rotation", ["Tires"], "routine_maintenance", 0.98), + "rotate tires": ("Tire Rotation", ["Tires"], "routine_maintenance", 0.95), + "tire inspection": ("Tire Inspection", ["Tires"], "routine_maintenance", 0.93), + "inspect tires": ("Tire Inspection", ["Tires"], "routine_maintenance", 0.93), + "check tire pressure": ("Tire Pressure Check", ["Tires"], "routine_maintenance", 0.90), + "tire pressure": ("Tire Pressure Check", ["Tires"], "routine_maintenance", 0.85), + # Brakes + "brake inspection": ("Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.95), + "inspect brakes": ("Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.93), + "brake fluid": ("Brake Fluid Service", ["Brakes and Traction Control"], "routine_maintenance", 0.93), + "brake pads": ("Brake Pad Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.90), + "parking brake": ("Parking Brake Inspection", ["Parking Brake System"], "routine_maintenance", 0.93), + # Coolant + "coolant": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.90), + "engine coolant": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.93), + "antifreeze": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.90), + "cooling system": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.88), + "radiator fluid": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.88), + "replace coolant": ("Coolant Replacement", ["Coolant"], "routine_maintenance", 0.95), + # Transmission + "transmission fluid": ("Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93), + "automatic transmission fluid": ("Transmission Fluid Service", ["Fluid - A/T"], 
"routine_maintenance", 0.95), + "atf": ("Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.90), + "manual transmission fluid": ("Manual Transmission Fluid", ["Fluid - M/T"], "routine_maintenance", 0.95), + "cvt fluid": ("CVT Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93), + "transmission filter": ("Transmission Filter", ["Fluid Filter - A/T"], "routine_maintenance", 0.93), + # Differential + "differential fluid": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.95), + "rear differential": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.93), + "front differential": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.93), + "transfer case": ("Transfer Case Fluid", ["Fluid - Differential"], "routine_maintenance", 0.90), + # Spark plugs + "spark plug": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95), + "spark plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95), + "replace spark plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95), + "ignition plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.88), + # Drive belt + "drive belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.93), + "serpentine belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.93), + "accessory belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.90), + "timing belt": ("Timing Belt Service", ["Drive Belt"], "routine_maintenance", 0.90), + "v-belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.88), + # Wipers + "wiper blade": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.95), + "wiper blades": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.95), + "windshield wiper": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.93), + 
"replace wipers": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.93), + # Washer fluid + "washer fluid": ("Washer Fluid", ["Washer Fluid"], "routine_maintenance", 0.95), + "windshield washer": ("Washer Fluid", ["Washer Fluid"], "routine_maintenance", 0.90), + # Steering/Suspension + "steering": ("Steering Inspection", ["Steering and Suspension"], "routine_maintenance", 0.85), + "suspension": ("Suspension Inspection", ["Steering and Suspension"], "routine_maintenance", 0.85), + "power steering": ("Power Steering Fluid", ["Steering and Suspension"], "routine_maintenance", 0.90), + "power steering fluid": ("Power Steering Fluid", ["Steering and Suspension"], "routine_maintenance", 0.93), + # Exhaust + "exhaust": ("Exhaust System Inspection", ["Exhaust System"], "routine_maintenance", 0.88), + "exhaust system": ("Exhaust System Inspection", ["Exhaust System"], "routine_maintenance", 0.93), + # Fuel system + "fuel filter": ("Fuel Filter Replacement", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.93), + "fuel system": ("Fuel System Inspection", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.88), + "fuel injection": ("Fuel Injection Service", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.88), + # Emissions + "evaporative emissions": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.93), + "evap system": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.90), + "emissions": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.80), + # Safety systems + "seat belt": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.90), + "airbag": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.85), + "restraint": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.85), + 
# Miscellaneous + "battery": ("Battery Inspection", ["Fluids"], "routine_maintenance", 0.80), + "inspect battery": ("Battery Inspection", ["Fluids"], "routine_maintenance", 0.85), + "door hinges": ("Door Lubrication", ["Doors"], "routine_maintenance", 0.85), + "hood shock": ("Hood Shock Inspection", ["Hood Shock / Support"], "routine_maintenance", 0.90), + "trunk shock": ("Trunk Shock Inspection", ["Trunk / Liftgate Shock / Support"], "routine_maintenance", 0.90), + "liftgate": ("Liftgate Inspection", ["Trunk / Liftgate Shock / Support"], "routine_maintenance", 0.88), + } + + # Pattern-based mappings for fuzzy matching + SERVICE_PATTERNS: list[tuple[str, str, list[str], str, float]] = [ + # (regex_pattern, normalized_name, subtypes, category, confidence) + (r"oil\s+(?:and|&)\s+filter", "Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95), + (r"(?:change|replace)\s+(?:the\s+)?oil", "Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.93), + (r"(?:inspect|check)\s+(?:the\s+)?brakes?", "Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.90), + (r"(?:inspect|check)\s+(?:the\s+)?tires?", "Tire Inspection", ["Tires"], "routine_maintenance", 0.90), + (r"(?:rotate|rotation)\s+(?:the\s+)?tires?", "Tire Rotation", ["Tires"], "routine_maintenance", 0.95), + (r"(?:replace|change)\s+(?:the\s+)?(?:engine\s+)?air\s+filter", "Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95), + (r"(?:replace|change)\s+(?:the\s+)?cabin\s+(?:air\s+)?filter", "Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.95), + (r"(?:replace|change)\s+(?:the\s+)?spark\s+plugs?", "Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95), + (r"(?:replace|change)\s+(?:the\s+)?coolant", "Coolant Replacement", ["Coolant"], "routine_maintenance", 0.93), + (r"(?:flush|drain)\s+(?:the\s+)?coolant", "Coolant Flush", ["Coolant"], "routine_maintenance", 0.93), + 
(r"(?:replace|change)\s+(?:the\s+)?(?:a/?t|automatic\s+transmission)\s+fluid", "Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93), + (r"(?:inspect|check)\s+(?:the\s+)?(?:drive|serpentine|accessory)\s+belt", "Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.90), + ] + + def map_service(self, service_text: str) -> Optional[ServiceMapping]: + """ + Map extracted service text to maintenance subtypes. + + Args: + service_text: Service name or description from the manual + + Returns: + ServiceMapping or None if no mapping found + """ + normalized_text = service_text.lower().strip() + + # Try exact mapping first + for key, (name, subtypes, category, conf) in self.SERVICE_MAPPINGS.items(): + if key in normalized_text: + return ServiceMapping( + normalized_name=name, + subtypes=subtypes, + category=category, + confidence=conf, + ) + + # Try pattern matching + for pattern, name, subtypes, category, conf in self.SERVICE_PATTERNS: + if re.search(pattern, normalized_text, re.IGNORECASE): + return ServiceMapping( + normalized_name=name, + subtypes=subtypes, + category=category, + confidence=conf, + ) + + return None + + def map_service_fuzzy(self, service_text: str, threshold: float = 0.6) -> Optional[ServiceMapping]: + """ + Map service text with fuzzy matching for typos and variations. 
+ + Args: + service_text: Service name or description + threshold: Minimum similarity threshold (0.0-1.0) + + Returns: + ServiceMapping or None + """ + # First try exact matching + result = self.map_service(service_text) + if result: + return result + + # Fall back to word overlap matching + words = set(service_text.lower().split()) + + best_match: Optional[ServiceMapping] = None + best_score = 0.0 + + for key, (name, subtypes, category, conf) in self.SERVICE_MAPPINGS.items(): + key_words = set(key.split()) + overlap = len(words & key_words) + total = len(words | key_words) + + if total > 0: + score = overlap / total + if score > best_score and score >= threshold: + best_score = score + best_match = ServiceMapping( + normalized_name=name, + subtypes=subtypes, + category=category, + confidence=conf * score, # Reduce confidence by match quality + ) + + return best_match + + def get_all_service_keywords(self) -> list[str]: + """Get all service keywords for table header detection.""" + keywords = list(self.SERVICE_MAPPINGS.keys()) + # Add common header terms + keywords.extend([ + "service", "maintenance", "item", "operation", + "inspection", "replacement", "interval", "schedule", + ]) + return keywords + + +# Singleton instance +service_mapper = ServiceMapper() diff --git a/ocr/app/preprocessors/__init__.py b/ocr/app/preprocessors/__init__.py index 50e04ed..fb5cab5 100644 --- a/ocr/app/preprocessors/__init__.py +++ b/ocr/app/preprocessors/__init__.py @@ -5,6 +5,12 @@ from app.preprocessors.receipt_preprocessor import ( ReceiptPreprocessor, receipt_preprocessor, ) +from app.preprocessors.pdf_preprocessor import ( + PdfPreprocessor, + pdf_preprocessor, + PdfPageContent, + PdfInfo, +) __all__ = [ "ImagePreprocessor", @@ -13,4 +19,8 @@ __all__ = [ "vin_preprocessor", "ReceiptPreprocessor", "receipt_preprocessor", + "PdfPreprocessor", + "pdf_preprocessor", + "PdfPageContent", + "PdfInfo", ] diff --git a/ocr/app/preprocessors/pdf_preprocessor.py 
"""PDF preprocessing for owner's manual extraction."""
import io
import logging
from dataclasses import dataclass, field
from typing import Iterator, Optional

import fitz  # PyMuPDF
from PIL import Image

logger = logging.getLogger(__name__)


@dataclass
class PdfPageContent:
    """Content extracted from a single PDF page."""

    page_number: int              # zero-indexed
    has_text: bool                # True when the page has a usable text layer
    text_content: str             # extracted text (may be short if no text layer)
    image_bytes: Optional[bytes]  # Rendered PNG for pages that need OCR
    width: int                    # page width in PDF points
    height: int                   # page height in PDF points


@dataclass
class PdfInfo:
    """Information about a PDF document."""

    total_pages: int
    has_text_layer: bool
    is_scanned: bool  # True if most sampled pages lack a text layer
    file_size_bytes: int
    title: Optional[str]
    author: Optional[str]
    metadata: dict = field(default_factory=dict)


class PdfPreprocessor:
    """Preprocess PDFs for OCR extraction.

    Handles two scenarios:
    1. Native PDFs with text layer - extract text directly
    2. Scanned PDFs - render pages to images for OCR

    Uses PyMuPDF (fitz) for both text extraction and image rendering.
    """

    # DPI for rendering scanned pages
    DEFAULT_DPI = 300

    # Minimum text length to consider a page has text
    MIN_TEXT_LENGTH = 50

    # Maximum pages to sample for scan detection
    SAMPLE_PAGES = 10

    def get_pdf_info(self, pdf_bytes: bytes) -> PdfInfo:
        """
        Analyze PDF and return metadata.

        Samples pages from the beginning, middle, and end to decide whether
        the document is scanned (mostly image-only pages).

        Args:
            pdf_bytes: Raw PDF bytes

        Returns:
            PdfInfo with document metadata
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            total_pages = len(doc)
            metadata = doc.metadata or {}

            # Pick sample pages: everything for short docs, otherwise three
            # from the beginning, middle, and end.
            if total_pages <= self.SAMPLE_PAGES:
                sample_indices = list(range(total_pages))
            else:
                mid = total_pages // 2
                sample_indices = [
                    0, 1, 2,                                        # Beginning
                    mid - 1, mid, mid + 1,                          # Middle
                    total_pages - 3, total_pages - 2, total_pages - 1,  # End
                ]
                sample_indices = [i for i in sample_indices if 0 <= i < total_pages]

            # Count sampled pages that carry a meaningful text layer.
            text_pages = sum(
                1
                for page_idx in sample_indices
                if len(doc[page_idx].get_text().strip()) >= self.MIN_TEXT_LENGTH
            )

            # Scanned if fewer than half the sampled pages have text.
            has_text_layer = text_pages > 0
            is_scanned = text_pages < len(sample_indices) / 2

            return PdfInfo(
                total_pages=total_pages,
                has_text_layer=has_text_layer,
                is_scanned=is_scanned,
                file_size_bytes=len(pdf_bytes),
                title=metadata.get("title"),
                author=metadata.get("author"),
                metadata=metadata,
            )

        finally:
            doc.close()

    def _extract_page(
        self,
        page: "fitz.Page",
        page_number: int,
        dpi: int,
        force_render: bool = False,
    ) -> PdfPageContent:
        """Build PdfPageContent for one open page.

        Shared by the single-page, range, and all-pages extractors so the
        text/render decision lives in one place. Renders the page to PNG
        when it lacks a text layer or force_render is set.
        """
        text = page.get_text().strip()
        has_text = len(text) >= self.MIN_TEXT_LENGTH

        image_bytes = None
        if not has_text or force_render:
            image_bytes = self._render_page_to_image(page, dpi)

        rect = page.rect
        return PdfPageContent(
            page_number=page_number,
            has_text=has_text,
            # Always return whatever text was extracted (previously the bulk
            # paths blanked short text while the single-page path kept it).
            text_content=text,
            image_bytes=image_bytes,
            width=int(rect.width),
            height=int(rect.height),
        )

    def extract_text_from_page(
        self, pdf_bytes: bytes, page_number: int
    ) -> PdfPageContent:
        """
        Extract content from a single PDF page.

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Zero-indexed page number

        Returns:
            PdfPageContent with text and/or image

        Raises:
            ValueError: If page_number is out of range.
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            if page_number >= len(doc):
                raise ValueError(f"Page {page_number} does not exist (max: {len(doc) - 1})")
            return self._extract_page(doc[page_number], page_number, self.DEFAULT_DPI)
        finally:
            doc.close()

    def extract_all_pages(
        self,
        pdf_bytes: bytes,
        dpi: int = DEFAULT_DPI,
        force_ocr: bool = False,
    ) -> Iterator[PdfPageContent]:
        """
        Extract content from all pages as a generator.

        Args:
            pdf_bytes: Raw PDF bytes
            dpi: DPI for rendering scanned pages
            force_ocr: If True, render all pages regardless of text layer

        Yields:
            PdfPageContent for each page
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            for page_number in range(len(doc)):
                yield self._extract_page(
                    doc[page_number], page_number, dpi, force_render=force_ocr
                )
        finally:
            doc.close()

    def extract_page_range(
        self,
        pdf_bytes: bytes,
        start_page: int,
        end_page: int,
        dpi: int = DEFAULT_DPI,
    ) -> list[PdfPageContent]:
        """
        Extract content from a range of pages.

        Args:
            pdf_bytes: Raw PDF bytes
            start_page: First page (zero-indexed)
            end_page: Last page (exclusive, clamped to document length)
            dpi: DPI for rendering

        Returns:
            List of PdfPageContent
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            end_page = min(end_page, len(doc))
            return [
                self._extract_page(doc[page_number], page_number, dpi)
                for page_number in range(start_page, end_page)
            ]
        finally:
            doc.close()

    def find_maintenance_section(
        self, pdf_bytes: bytes, keywords: Optional[list[str]] = None
    ) -> list[int]:
        """
        Find pages likely containing maintenance schedules.

        Args:
            pdf_bytes: Raw PDF bytes
            keywords: Keywords to search for (defaults to common terms)

        Returns:
            List of page numbers likely containing maintenance info
        """
        if keywords is None:
            keywords = [
                "maintenance schedule",
                "maintenance interval",
                "service schedule",
                "service interval",
                "recommended maintenance",
                "scheduled maintenance",
                "routine maintenance",
                "periodic maintenance",
                "owner's maintenance",
                "maintenance requirements",
            ]

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            maintenance_pages = []

            for page_number in range(len(doc)):
                text = doc[page_number].get_text().lower()

                # One hit is enough to flag the page; stop scanning keywords.
                for keyword in keywords:
                    if keyword.lower() in text:
                        maintenance_pages.append(page_number)
                        break

            return maintenance_pages

        finally:
            doc.close()

    def _render_page_to_image(self, page: fitz.Page, dpi: int) -> bytes:
        """
        Render a PDF page to PNG image bytes.

        Args:
            page: PyMuPDF page object
            dpi: Target DPI for rendering

        Returns:
            PNG image bytes
        """
        # PDF native resolution is 72 DPI, so scale by dpi/72.
        scale = dpi / 72.0
        matrix = fitz.Matrix(scale, scale)

        pixmap = page.get_pixmap(matrix=matrix)
        return pixmap.tobytes("png")

    def render_page_for_table_detection(
        self, pdf_bytes: bytes, page_number: int, dpi: int = 150
    ) -> bytes:
        """
        Render a page at lower DPI for table detection (faster).

        Args:
            pdf_bytes: Raw PDF bytes
            page_number: Page to render
            dpi: DPI for rendering (lower for faster processing)

        Returns:
            PNG image bytes

        Raises:
            ValueError: If page_number is out of range.
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            if page_number >= len(doc):
                raise ValueError(f"Page {page_number} does not exist")

            return self._render_page_to_image(doc[page_number], dpi)

        finally:
            doc.close()


# Singleton instance
pdf_preprocessor = PdfPreprocessor()

# === diff continues: ocr/app/routers/extract.py imports gain BackgroundTasks,
# manual_extractor, Manual* response models, and job_queue (see next chunk). ===
router = APIRouter(prefix="/extract", tags=["extract"])

# Maximum file size for synchronous processing (10MB)
MAX_SYNC_SIZE = 10 * 1024 * 1024

# Maximum file size for manual/PDF processing (200MB)
MAX_MANUAL_SIZE = 200 * 1024 * 1024

# (existing endpoints extract_text / extract_vin / extract_receipt unchanged,
#  elided by the diff)


@router.post("/manual", response_model=ManualJobResponse)
async def extract_manual(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="Owner's manual PDF file"),
    vehicle_id: Optional[str] = Form(None, description="Vehicle ID for context"),
) -> ManualJobResponse:
    """
    Submit an async job to extract maintenance schedules from an owner's manual.

    Supports PDF files up to 200MB. Processing is done asynchronously due to
    the time required for large documents.

    Pipeline:
    1. Analyze PDF structure (text layer vs scanned)
    2. Find maintenance schedule sections
    3. Extract text or perform OCR on scanned pages
    4. Detect and parse maintenance tables
    5. Extract service intervals and fluid specifications

    - **file**: Owner's manual PDF (max 200MB)
    - **vehicle_id**: Optional vehicle ID for context

    Returns immediately with job_id. Poll GET /jobs/{job_id} for status and results.

    Response when completed:
    - **vehicleInfo**: Detected make/model/year
    - **maintenanceSchedules**: List of extracted maintenance items with intervals
    - **rawTables**: Metadata about detected tables
    - **processingTimeMs**: Total processing time
    """
    # Validate file presence
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Validate file type: accept a PDF content type or a .pdf filename.
    content_type = file.content_type or ""
    if not content_type.startswith("application/pdf") and not file.filename.lower().endswith(".pdf"):
        raise HTTPException(
            status_code=400,
            detail="File must be a PDF document",
        )

    # Read file content
    content = await file.read()
    file_size = len(content)

    # Validate file size
    if file_size > MAX_MANUAL_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {MAX_MANUAL_SIZE // (1024*1024)}MB.",
        )

    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")

    logger.info(
        f"Manual extraction: {file.filename}, "
        f"size: {file_size} bytes, "
        f"vehicle_id: {vehicle_id}"
    )

    # Estimate processing time from file size: ~2 seconds per MB,
    # with a 30-second floor for small files.
    estimated_seconds = max(30, (file_size // (1024 * 1024)) * 2)

    # Submit job to queue
    job_id = await job_queue.submit_manual_job(
        file_bytes=content,
        vehicle_id=vehicle_id,
    )

    # Schedule background processing
    background_tasks.add_task(process_manual_job, job_id)

    # Return initial status
    return ManualJobResponse(
        jobId=job_id,
        status="pending",
        progress=0,
        estimatedSeconds=estimated_seconds,
    )


async def process_manual_job(job_id: str) -> None:
    """Background task to process a manual extraction job.

    Runs the CPU-bound extraction in the default executor and bridges
    progress updates from the worker thread back onto the event loop.
    """
    import asyncio

    logger.info(f"Starting manual extraction job {job_id}")

    try:
        # Update status to processing
        await job_queue.update_manual_job_progress(job_id, 5, "Starting extraction")

        # Get job data.
        # NOTE(review): get_job_data appears to read the regular-job key
        # (ocr:job:data:) while submit_manual_job stores uploads under
        # ocr:manual:job:data: — verify this lookup actually finds manual
        # job data.
        file_bytes = await job_queue.get_job_data(job_id)
        if not file_bytes:
            await job_queue.fail_manual_job(job_id, "Job data not found")
            return

        # Use get_running_loop(): get_event_loop() is deprecated inside
        # coroutines and this function always runs under a running loop.
        loop = asyncio.get_running_loop()

        def sync_progress_callback(percent: int, message: str) -> None:
            # Called from the executor thread; schedule the async update
            # back on the event loop thread.
            asyncio.run_coroutine_threadsafe(
                job_queue.update_manual_job_progress(job_id, percent, message),
                loop,
            )

        # Run extraction in thread pool (CPU-bound)
        result = await loop.run_in_executor(
            None,
            lambda: manual_extractor.extract(
                pdf_bytes=file_bytes,
                progress_callback=sync_progress_callback,
            ),
        )

        if result.success:
            # Convert to response model
            vehicle_info = None
            if result.vehicle_info:
                vehicle_info = ManualVehicleInfo(
                    make=result.vehicle_info.make,
                    model=result.vehicle_info.model,
                    year=result.vehicle_info.year,
                )

            schedules = [
                ManualMaintenanceSchedule(
                    service=s.service,
                    intervalMiles=s.interval_miles,
                    intervalMonths=s.interval_months,
                    details=s.details,
                    confidence=s.confidence,
                    subtypes=s.subtypes,
                )
                for s in result.maintenance_schedules
            ]

            response = ManualExtractionResponse(
                success=True,
                vehicleInfo=vehicle_info,
                maintenanceSchedules=schedules,
                rawTables=result.raw_tables,
                processingTimeMs=result.processing_time_ms,
                totalPages=result.total_pages,
                pagesProcessed=result.pages_processed,
            )

            await job_queue.complete_manual_job(job_id, response)
        else:
            await job_queue.fail_manual_job(job_id, result.error or "Extraction failed")

    except Exception as e:
        logger.error(f"Manual job {job_id} failed: {e}", exc_info=True)
        await job_queue.fail_manual_job(job_id, str(e))

# === diff continues: ocr/app/routers/jobs.py (see next chunk). ===
"""Async OCR job endpoints."""
import asyncio
import logging
from typing import Optional, Union

from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile

from app.models import JobResponse, JobSubmitRequest, ManualJobResponse
from app.services import job_queue, ocr_service

logger = logging.getLogger(__name__)

# (submit_job endpoint unchanged, elided by the diff)


@router.get("/{job_id}", response_model=Union[JobResponse, ManualJobResponse])
async def get_job_status(job_id: str) -> Union[JobResponse, ManualJobResponse]:
    """
    Get the status of an async OCR job.

    Poll this endpoint to check job progress and retrieve results.
    Works for both regular OCR jobs and manual extraction jobs.

    Returns:
    - **pending**: Job is queued
    - **processing**: Job is running
    - **completed**: Job finished successfully (includes result)
    - **failed**: Job failed (includes error message)
    """
    # Check the regular OCR job store first, then the manual-job store.
    regular = await job_queue.get_job_status(job_id)
    if regular is not None:
        return regular

    manual = await job_queue.get_manual_job_status(job_id)
    if manual is not None:
        return manual

    raise HTTPException(
        status_code=404,
        detail=f"Job {job_id} not found. Jobs expire after 1-2 hours.",
    )


# === diff continues: ocr/app/services/job_queue.py ===

if TYPE_CHECKING:
    from app.models import ManualExtractionResponse, ManualJobResponse

# Job TTL in seconds (1 hour)
JOB_TTL = 3600

# Manual job TTL (2 hours for larger files)
MANUAL_JOB_TTL = 7200

# Key prefixes
JOB_PREFIX = "ocr:job:"
JOB_DATA_PREFIX = "ocr:job:data:"
JOB_RESULT_PREFIX = "ocr:job:result:"

# Manual job prefixes
MANUAL_JOB_PREFIX = "ocr:manual:job:"
MANUAL_JOB_DATA_PREFIX = "ocr:manual:job:data:"
MANUAL_JOB_RESULT_PREFIX = "ocr:manual:job:result:"


class JobQueue:
    """Manages async OCR jobs using Redis.

    (Existing regular-job methods unchanged; manual extraction job methods
    shown below.)
    """

    # Manual extraction job methods

    async def submit_manual_job(
        self,
        file_bytes: bytes,
        vehicle_id: Optional[str] = None,
    ) -> str:
        """
        Submit a new manual extraction job.

        Stores metadata as a Redis hash and the upload as a binary blob
        under MANUAL_JOB_DATA_PREFIX (readers must use the same prefix).

        Args:
            file_bytes: Raw PDF bytes
            vehicle_id: Optional vehicle ID for context

        Returns:
            Job ID
        """
        r = await self.get_redis()
        job_id = str(uuid.uuid4())

        meta_key = f"{MANUAL_JOB_PREFIX}{job_id}"
        data_key = f"{MANUAL_JOB_DATA_PREFIX}{job_id}"

        initial_meta = {
            "status": JobStatus.PENDING.value,
            "progress": 0,
            "progress_message": "",
            "vehicle_id": vehicle_id or "",
            "job_type": "manual",
        }

        # Write metadata + payload atomically with matching TTLs.
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping=initial_meta)  # type: ignore
            await pipe.expire(meta_key, MANUAL_JOB_TTL)
            await pipe.set(data_key, file_bytes)
            await pipe.expire(data_key, MANUAL_JOB_TTL)
            await pipe.execute()

        logger.info(f"Manual job {job_id} submitted")
        return job_id

    async def get_manual_job_status(self, job_id: str) -> Optional["ManualJobResponse"]:
        """
        Get the status of a manual extraction job.

        Args:
            job_id: Job ID to check

        Returns:
            ManualJobResponse or None if job doesn't exist
        """
        from app.models import ManualJobResponse, ManualExtractionResponse

        r = await self.get_redis()

        meta = await r.hgetall(f"{MANUAL_JOB_PREFIX}{job_id}")  # type: ignore
        if not meta:
            return None

        status = JobStatus(meta.get("status", JobStatus.PENDING.value))

        # Result payload exists only for completed jobs.
        result = None
        if status == JobStatus.COMPLETED:
            raw_result = await r.get(f"{MANUAL_JOB_RESULT_PREFIX}{job_id}")
            if raw_result:
                result = ManualExtractionResponse(**json.loads(raw_result))

        return ManualJobResponse(
            jobId=job_id,
            status=status,
            progress=int(meta.get("progress", 0)) if status == JobStatus.PROCESSING else None,
            result=result,
            error=meta.get("error") if status == JobStatus.FAILED else None,
        )

    async def update_manual_job_progress(
        self, job_id: str, progress: int, message: str = ""
    ) -> None:
        """Update manual job progress percentage and message."""
        r = await self.get_redis()

        fields = {
            "status": JobStatus.PROCESSING.value,
            "progress": progress,
            "progress_message": message,
        }
        await r.hset(f"{MANUAL_JOB_PREFIX}{job_id}", mapping=fields)  # type: ignore

    async def complete_manual_job(
        self, job_id: str, result: "ManualExtractionResponse"
    ) -> None:
        """Mark manual job as completed, persist the result, drop the upload."""
        r = await self.get_redis()
        meta_key = f"{MANUAL_JOB_PREFIX}{job_id}"
        result_key = f"{MANUAL_JOB_RESULT_PREFIX}{job_id}"
        data_key = f"{MANUAL_JOB_DATA_PREFIX}{job_id}"

        payload = json.dumps(result.model_dump(by_alias=True))

        async with r.pipeline() as pipe:
            # Flip status, store the serialized result, and free the PDF blob.
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.COMPLETED.value,
                "progress": 100,
            })
            await pipe.set(result_key, payload)
            await pipe.expire(result_key, MANUAL_JOB_TTL)
            await pipe.delete(data_key)
            await pipe.execute()

        logger.info(f"Manual job {job_id} completed")

    async def fail_manual_job(self, job_id: str, error: str) -> None:
        """Mark manual job as failed with error message."""
        r = await self.get_redis()

        async with r.pipeline() as pipe:
            await pipe.hset(f"{MANUAL_JOB_PREFIX}{job_id}", mapping={  # type: ignore
                "status": JobStatus.FAILED.value,
                "error": error,
            })
            # The upload is no longer needed once the job has failed.
            await pipe.delete(f"{MANUAL_JOB_DATA_PREFIX}{job_id}")
            await pipe.execute()

        logger.error(f"Manual job {job_id} failed: {error}")


# Singleton instance
job_queue = JobQueue()

# === new file: ocr/app/table_extraction/__init__.py ===

"""Table extraction components for maintenance schedule parsing."""
from app.table_extraction.detector import TableDetector, table_detector, DetectedTable
from app.table_extraction.parser import TableParser, table_parser, ParsedScheduleRow

__all__ = [
    "TableDetector",
    "table_detector",
    "DetectedTable",
    "TableParser",
    "table_parser",
    "ParsedScheduleRow",
]

# === new file: ocr/app/table_extraction/detector.py (continues in next chunk) ===

"""Table detection for maintenance schedule extraction."""
import io
import logging
import re
from dataclasses import dataclass, field
from typing import Optional

import cv2
import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)

# DetectedTable dataclass and TableDetector follow.
@dataclass
class DetectedTable:
    """A detected table in a document."""

    page_number: int
    x: int
    y: int
    width: int
    height: int
    confidence: float
    is_maintenance_table: bool
    header_row: Optional[list[str]] = None
    raw_content: list[list[str]] = field(default_factory=list)


class TableDetector:
    """Detect tables in document pages.

    Uses computer vision techniques to identify table regions:
    1. Line detection for bordered tables
    2. Text alignment analysis for borderless tables
    3. Header keyword matching for maintenance schedule identification
    """

    # Keywords indicating maintenance schedule table headers
    MAINTENANCE_HEADERS = [
        "service", "maintenance", "item", "operation",
        "miles", "mi", "km", "kilometers",
        "months", "mo", "interval",
        "check", "replace", "inspect", "change",
        "schedule", "frequency",
    ]

    # Keywords in content that indicate maintenance
    MAINTENANCE_CONTENT_KEYWORDS = [
        "oil", "filter", "brake", "tire", "coolant",
        "fluid", "spark plug", "belt", "hose",
        "inspect", "replace", "change", "check",
    ]

    def detect_tables_in_image(
        self, image_bytes: bytes, page_number: int = 0
    ) -> list[DetectedTable]:
        """
        Detect tables in an image using line detection.

        Args:
            image_bytes: PNG/JPEG image bytes
            page_number: Page number for the result

        Returns:
            List of DetectedTable objects
        """
        buf = np.frombuffer(image_bytes, np.uint8)
        gray = cv2.imdecode(buf, cv2.IMREAD_GRAYSCALE)

        if gray is None:
            logger.warning("Failed to decode image for table detection")
            return []

        # Binarize with ink as foreground.
        _, ink = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

        # Extract long horizontal and vertical strokes via morphology.
        h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
        h_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, h_kernel, iterations=2)
        v_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, v_kernel, iterations=2)

        # Grid mask = union of both stroke directions.
        grid_mask = cv2.add(h_lines, v_lines)

        contours, _ = cv2.findContours(
            grid_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        page_h, page_w = gray.shape[:2]
        found: list[DetectedTable] = []

        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            # Too small to be a table on this page.
            if w < page_w * 0.3 or h < page_h * 0.05:
                continue
            # Skip full-page rectangles (page border, not a table).
            if w > page_w * 0.95 and h > page_h * 0.95:
                continue

            aspect_ratio = w / h if h > 0 else 0
            size_ratio = (w * h) / (page_w * page_h)

            # Keep regions with table-like aspect ratio and footprint.
            if 0.5 <= aspect_ratio <= 10 and 0.01 <= size_ratio <= 0.8:
                confidence = min(0.9, 0.5 + size_ratio + (1 - abs(aspect_ratio - 2) / 10))
                found.append(
                    DetectedTable(
                        page_number=page_number,
                        x=x,
                        y=y,
                        width=w,
                        height=h,
                        confidence=confidence,
                        is_maintenance_table=False,  # Classified later
                    )
                )

        logger.debug(f"Detected {len(found)} potential tables on page {page_number}")
        return found

    def detect_tables_in_text(
        self, text: str, page_number: int = 0
    ) -> list[DetectedTable]:
        """
        Detect table-like structures in text using pattern analysis.

        Useful for native PDFs where text is available.

        Args:
            text: Extracted text content
            page_number: Page number

        Returns:
            List of DetectedTable with content populated
        """
        detected: list[DetectedTable] = []
        lines = text.split("\n")

        # Collect consecutive runs of table-looking lines; a run of at
        # least 3 rows is treated as a candidate table.
        run: list[str] = []
        capturing = False
        run_start = 0

        for idx, line in enumerate(lines):
            if self._is_table_row(line):
                if not capturing:
                    capturing = True
                    run_start = idx
                    run = []
                run.append(line)
                continue

            # Non-row line terminates the current run.
            if capturing and len(run) >= 3:
                candidate = self._process_text_table(run, page_number, run_start)
                if candidate:
                    detected.append(candidate)
            capturing = False
            run = []

        # A run may extend to the end of the text.
        if capturing and len(run) >= 3:
            candidate = self._process_text_table(run, page_number, run_start)
            if candidate:
                detected.append(candidate)

        return detected

    def is_maintenance_table(
        self, table: DetectedTable, full_text: Optional[str] = None
    ) -> bool:
        """
        Determine if a detected table is a maintenance schedule.

        Args:
            table: Detected table to analyze
            full_text: Optional surrounding text for context

        Returns:
            True if likely a maintenance schedule table
        """
        # Header row: two or more maintenance keywords is a strong signal.
        if table.header_row:
            header_text = " ".join(table.header_row).lower()
            if sum(kw in header_text for kw in self.MAINTENANCE_HEADERS) >= 2:
                return True

        # Body content: three or more service-item keywords.
        if table.raw_content:
            content_text = " ".join(
                " ".join(row) for row in table.raw_content
            ).lower()
            if sum(kw in content_text for kw in self.MAINTENANCE_CONTENT_KEYWORDS) >= 3:
                return True

        # Surrounding page text mentioning a schedule also qualifies.
        if full_text:
            text_lower = full_text.lower()
            context_keywords = [
                "maintenance schedule",
                "service schedule",
                "maintenance interval",
                "recommended maintenance",
            ]
            if any(kw in text_lower for kw in context_keywords):
                return True

        return False

    def _is_table_row(self, line: str) -> bool:
        """Check if a line looks like a table row."""
        stripped = line.strip()
        if not stripped:
            return False

        # Two or more columns separated by runs of whitespace or tabs.
        parts = re.split(r"\s{2,}|\t", stripped)
        if len(parts) >= 2:
            populated = [p for p in parts if p.strip()]
            return len(populated) >= 2

        # Single-column line still counts if it carries an interval,
        # e.g. "Service Item 5,000 miles 6 months".
        if re.search(r"\d+[,.]?\d*\s*(miles?|mi\.?|km|months?|mo\.?)", stripped, re.I):
            return True

        return False

    def _process_text_table(
        self, lines: list[str], page_number: int, start_line: int
    ) -> Optional[DetectedTable]:
        """Process extracted text lines into a table structure.

        NOTE: body continues beyond this excerpt; only the visible portion
        is reproduced here.
        """
        if not lines:
            return None

        # Parse rows
        rows = []
        for line in lines:
            # Split on multiple whitespace or tabs
            parts = re.split(r"\s{2,}|\t", line.strip())
            cells = [p.strip() for p in parts if p.strip()]
+ if cells: + rows.append(cells) + + if len(rows) < 2: + return None + + # First row is likely header + header_row = rows[0] + + # Check if this looks like a maintenance table + table = DetectedTable( + page_number=page_number, + x=0, # Text tables don't have coordinates + y=start_line, + width=0, + height=len(rows), + confidence=0.7, + is_maintenance_table=False, + header_row=header_row, + raw_content=rows[1:], + ) + + # Determine if it's a maintenance table + table.is_maintenance_table = self.is_maintenance_table(table) + + if table.is_maintenance_table: + table.confidence = 0.85 + + return table + + def extract_table_text_from_region( + self, image_bytes: bytes, table: DetectedTable + ) -> list[list[str]]: + """ + Extract text from a table region using OCR. + + Args: + image_bytes: Full page image + table: Detected table with coordinates + + Returns: + 2D list of cell contents + """ + # This would use Tesseract on the cropped region + # For now, return empty - actual OCR will be done in manual_extractor + logger.debug(f"Table region: ({table.x}, {table.y}) {table.width}x{table.height}") + return [] + + +# Singleton instance +table_detector = TableDetector() diff --git a/ocr/app/table_extraction/parser.py b/ocr/app/table_extraction/parser.py new file mode 100644 index 0000000..905d947 --- /dev/null +++ b/ocr/app/table_extraction/parser.py @@ -0,0 +1,357 @@ +"""Parse maintenance schedule tables into structured data.""" +import logging +import re +from dataclasses import dataclass, field +from typing import Optional + +from app.patterns.maintenance_patterns import maintenance_matcher +from app.patterns.service_mapping import service_mapper + +logger = logging.getLogger(__name__) + + +@dataclass +class ParsedScheduleRow: + """A parsed maintenance schedule row.""" + + service: str + normalized_service: Optional[str] + subtypes: list[str] + interval_miles: Optional[int] + interval_months: Optional[int] + details: Optional[str] + fluid_spec: Optional[str] + confidence: 
float + raw_row: list[str] = field(default_factory=list) + + +class TableParser: + """Parse detected tables into maintenance schedules. + + Handles various table formats: + - Service | Miles | Months | Notes + - Service | Interval | Description + - Miles/Months header with service rows + """ + + # Common column header patterns + COLUMN_PATTERNS = { + "service": [ + r"service", r"item", r"maintenance", r"operation", + r"component", r"part", r"system", r"description", + ], + "miles": [ + r"miles?", r"mi\.?", r"mileage", r"odometer", + r"km", r"kilometers?", + ], + "months": [ + r"months?", r"mo\.?", r"time", r"interval", + r"years?", r"yr\.?", + ], + "details": [ + r"notes?", r"details?", r"remarks?", r"comments?", + r"specification", r"specs?", r"procedure", + ], + } + + def parse_table( + self, + header_row: list[str], + data_rows: list[list[str]], + ) -> list[ParsedScheduleRow]: + """ + Parse a maintenance table into structured schedule rows. + + Args: + header_row: Table header cells + data_rows: Table data rows + + Returns: + List of ParsedScheduleRow objects + """ + # Identify column types + column_types = self._identify_columns(header_row) + + if not column_types: + logger.warning("Could not identify table columns") + return self._parse_without_headers(data_rows) + + results = [] + + for row in data_rows: + parsed = self._parse_row(row, column_types) + if parsed: + results.append(parsed) + + return results + + def parse_text_block(self, text: str) -> list[ParsedScheduleRow]: + """ + Parse maintenance schedules from unstructured text. + + Useful when table detection fails but text contains schedule info. 
+ + Args: + text: Text block that may contain maintenance schedules + + Returns: + List of ParsedScheduleRow objects + """ + results = [] + lines = text.split("\n") + + for line in lines: + # Look for lines with service + interval pattern + service_match = service_mapper.map_service(line) + mileage_match = maintenance_matcher.extract_mileage_interval(line) + time_match = maintenance_matcher.extract_time_interval(line) + + if service_match and (mileage_match or time_match): + # Extract fluid spec if present + fluid_match = maintenance_matcher.extract_fluid_spec(line) + + results.append( + ParsedScheduleRow( + service=line.strip(), + normalized_service=service_match.normalized_name, + subtypes=service_match.subtypes, + interval_miles=mileage_match.value if mileage_match else None, + interval_months=time_match.value if time_match else None, + details=None, + fluid_spec=fluid_match.value if fluid_match else None, + confidence=min( + service_match.confidence, + mileage_match.confidence if mileage_match else 1.0, + time_match.confidence if time_match else 1.0, + ), + raw_row=[line], + ) + ) + + return results + + def _identify_columns( + self, header_row: list[str] + ) -> dict[int, str]: + """ + Identify column types from header row. 
+ + Args: + header_row: Table header cells + + Returns: + Dict mapping column index to type + """ + column_types: dict[int, str] = {} + + for i, header in enumerate(header_row): + header_lower = header.lower().strip() + + for col_type, patterns in self.COLUMN_PATTERNS.items(): + for pattern in patterns: + if re.search(pattern, header_lower, re.IGNORECASE): + column_types[i] = col_type + break + if i in column_types: + break + + # If no service column found, assume first column + if "service" not in column_types.values() and header_row: + for i, header in enumerate(header_row): + if i not in column_types: + column_types[i] = "service" + break + + return column_types + + def _parse_row( + self, + row: list[str], + column_types: dict[int, str], + ) -> Optional[ParsedScheduleRow]: + """ + Parse a single data row using identified column types. + + Args: + row: Table row cells + column_types: Column index to type mapping + + Returns: + ParsedScheduleRow or None + """ + service = "" + interval_miles: Optional[int] = None + interval_months: Optional[int] = None + details: Optional[str] = None + fluid_spec: Optional[str] = None + + # Extract values based on column types + for i, cell in enumerate(row): + cell_value = cell.strip() + if not cell_value: + continue + + col_type = column_types.get(i) + + if col_type == "service": + service = cell_value + elif col_type == "miles": + miles = self._extract_miles(cell_value) + if miles: + interval_miles = miles + elif col_type == "months": + months = self._extract_months(cell_value) + if months: + interval_months = months + elif col_type == "details": + details = cell_value + # Also check for fluid specs in details + fluid_match = maintenance_matcher.extract_fluid_spec(cell_value) + if fluid_match: + fluid_spec = fluid_match.value + + # If no explicit miles/months columns, try to extract from service text + if not interval_miles and not interval_months: + mileage_match = maintenance_matcher.extract_mileage_interval(service) + 
time_match = maintenance_matcher.extract_time_interval(service) + if mileage_match: + interval_miles = mileage_match.value + if time_match: + interval_months = time_match.value + + # Check for intervals in any cell + if not interval_miles: + for cell in row: + mileage_match = maintenance_matcher.extract_mileage_interval(cell) + if mileage_match: + interval_miles = mileage_match.value + break + + if not interval_months: + for cell in row: + time_match = maintenance_matcher.extract_time_interval(cell) + if time_match: + interval_months = time_match.value + break + + # Skip if no service identified + if not service: + return None + + # Map service to normalized name and subtypes + service_match = service_mapper.map_service(service) + + normalized_service = service_match.normalized_name if service_match else None + subtypes = service_match.subtypes if service_match else [] + service_confidence = service_match.confidence if service_match else 0.5 + + # Calculate overall confidence + interval_confidence = 0.0 + if interval_miles: + interval_confidence = max(interval_confidence, 0.8) + if interval_months: + interval_confidence = max(interval_confidence, 0.8) + + confidence = (service_confidence + interval_confidence) / 2 if interval_confidence else service_confidence * 0.7 + + return ParsedScheduleRow( + service=service, + normalized_service=normalized_service, + subtypes=subtypes, + interval_miles=interval_miles, + interval_months=interval_months, + details=details, + fluid_spec=fluid_spec, + confidence=confidence, + raw_row=row, + ) + + def _parse_without_headers( + self, data_rows: list[list[str]] + ) -> list[ParsedScheduleRow]: + """ + Parse table without clear headers by analyzing content. 
+ + Args: + data_rows: Table rows + + Returns: + List of ParsedScheduleRow + """ + results = [] + + for row in data_rows: + if not row: + continue + + # Join all cells and try to extract info + row_text = " ".join(row) + + service_match = service_mapper.map_service(row_text) + mileage_match = maintenance_matcher.extract_mileage_interval(row_text) + time_match = maintenance_matcher.extract_time_interval(row_text) + fluid_match = maintenance_matcher.extract_fluid_spec(row_text) + + if service_match: + results.append( + ParsedScheduleRow( + service=row[0] if row else row_text, + normalized_service=service_match.normalized_name, + subtypes=service_match.subtypes, + interval_miles=mileage_match.value if mileage_match else None, + interval_months=time_match.value if time_match else None, + details=None, + fluid_spec=fluid_match.value if fluid_match else None, + confidence=service_match.confidence * 0.8, # Reduce for no-header parsing + raw_row=row, + ) + ) + + return results + + def _extract_miles(self, text: str) -> Optional[int]: + """Extract mileage value from cell text.""" + # First try pattern matcher + match = maintenance_matcher.extract_mileage_interval(text) + if match: + return match.value + + # Try simple number extraction + # Look for patterns like "5,000", "5000", "5K" + number_match = re.search(r"([\d,]+)(?:K)?", text.replace(" ", ""), re.IGNORECASE) + if number_match: + num_str = number_match.group(1).replace(",", "") + try: + value = int(num_str) + # Handle "5K" notation + if "K" in text.upper() and value < 1000: + value *= 1000 + if 500 <= value <= 150000: + return value + except ValueError: + pass + + return None + + def _extract_months(self, text: str) -> Optional[int]: + """Extract month interval from cell text.""" + # First try pattern matcher + match = maintenance_matcher.extract_time_interval(text) + if match: + return match.value + + # Try simple number extraction + number_match = re.search(r"(\d+)", text) + if number_match: + try: + value = 
int(number_match.group(1)) + if 1 <= value <= 120: + return value + except ValueError: + pass + + return None + + +# Singleton instance +table_parser = TableParser() diff --git a/ocr/requirements.txt b/ocr/requirements.txt index 5e374c1..8138d85 100644 --- a/ocr/requirements.txt +++ b/ocr/requirements.txt @@ -16,6 +16,9 @@ numpy>=1.24.0 # OCR Engines pytesseract>=0.3.10 +# PDF Processing +PyMuPDF>=1.23.0 + # Redis for job queue redis>=5.0.0 diff --git a/ocr/tests/test_maintenance_patterns.py b/ocr/tests/test_maintenance_patterns.py new file mode 100644 index 0000000..d9b43d1 --- /dev/null +++ b/ocr/tests/test_maintenance_patterns.py @@ -0,0 +1,164 @@ +"""Tests for maintenance pattern matching.""" +import pytest + +from app.patterns.maintenance_patterns import maintenance_matcher + + +class TestMileageIntervalExtraction: + """Tests for mileage interval extraction.""" + + def test_every_miles_pattern(self) -> None: + """Test 'every X miles' pattern.""" + result = maintenance_matcher.extract_mileage_interval("every 5,000 miles") + assert result is not None + assert result.value == 5000 + assert result.confidence >= 0.9 + + def test_every_miles_no_comma(self) -> None: + """Test 'every X miles' without comma.""" + result = maintenance_matcher.extract_mileage_interval("every 5000 miles") + assert result is not None + assert result.value == 5000 + + def test_at_miles_pattern(self) -> None: + """Test 'at X miles' pattern.""" + result = maintenance_matcher.extract_mileage_interval("at 30,000 mi") + assert result is not None + assert result.value == 30000 + + def test_miles_or_pattern(self) -> None: + """Test 'X miles or' pattern.""" + result = maintenance_matcher.extract_mileage_interval("7,500 miles or 12 months") + assert result is not None + assert result.value == 7500 + + def test_miles_slash_pattern(self) -> None: + """Test 'X mi/Y months' pattern.""" + result = maintenance_matcher.extract_mileage_interval("5000 mi/6 months") + assert result is not None + assert 
result.value == 5000 + + def test_no_mileage(self) -> None: + """Test text without mileage.""" + result = maintenance_matcher.extract_mileage_interval("check brake fluid") + assert result is None + + def test_unreasonable_mileage(self) -> None: + """Test unreasonably low/high mileage is rejected.""" + result = maintenance_matcher.extract_mileage_interval("every 10 miles") + assert result is None + + result = maintenance_matcher.extract_mileage_interval("every 1,000,000 miles") + assert result is None + + +class TestTimeIntervalExtraction: + """Tests for time interval extraction.""" + + def test_every_months_pattern(self) -> None: + """Test 'every X months' pattern.""" + result = maintenance_matcher.extract_time_interval("every 6 months") + assert result is not None + assert result.value == 6 + assert result.confidence >= 0.9 + + def test_months_or_pattern(self) -> None: + """Test 'X months or' pattern.""" + result = maintenance_matcher.extract_time_interval("12 months or 10,000 miles") + assert result is not None + assert result.value == 12 + + def test_annually_pattern(self) -> None: + """Test 'annually' keyword.""" + result = maintenance_matcher.extract_time_interval("check annually") + assert result is not None + assert result.value == 12 + + def test_semi_annual_pattern(self) -> None: + """Test 'semi-annually' keyword.""" + result = maintenance_matcher.extract_time_interval("inspect semi-annually") + assert result is not None + assert result.value == 6 + + def test_every_years_pattern(self) -> None: + """Test 'every X years' pattern.""" + result = maintenance_matcher.extract_time_interval("replace every 2 years") + assert result is not None + assert result.value == 24 + + def test_no_time_interval(self) -> None: + """Test text without time interval.""" + result = maintenance_matcher.extract_time_interval("change oil filter") + assert result is None + + +class TestFluidSpecExtraction: + """Tests for fluid specification extraction.""" + + def 
test_oil_viscosity(self) -> None: + """Test oil viscosity patterns.""" + result = maintenance_matcher.extract_fluid_spec("Use 0W-20 oil") + assert result is not None + assert result.value == "0W-20" + assert result.fluid_type == "oil" + + result = maintenance_matcher.extract_fluid_spec("5W-30 synthetic") + assert result is not None + assert result.value == "5W-30" + + def test_transmission_fluid(self) -> None: + """Test transmission fluid patterns.""" + result = maintenance_matcher.extract_fluid_spec("ATF-Z1 transmission fluid") + assert result is not None + assert "ATF" in result.value + assert result.fluid_type == "transmission" + + result = maintenance_matcher.extract_fluid_spec("Dexron VI") + assert result is not None + assert result.fluid_type == "transmission" + + def test_brake_fluid(self) -> None: + """Test brake fluid patterns.""" + result = maintenance_matcher.extract_fluid_spec("DOT 4 brake fluid") + assert result is not None + assert "DOT" in result.value + assert result.fluid_type == "brake" + + def test_extract_all_fluid_specs(self) -> None: + """Test extracting multiple fluid specs.""" + text = "Use 0W-20 oil and DOT 4 brake fluid" + results = maintenance_matcher.extract_all_fluid_specs(text) + assert len(results) >= 2 + + +class TestCombinedInterval: + """Tests for combined interval extraction.""" + + def test_mileage_and_time(self) -> None: + """Test extracting both intervals.""" + text = "every 5,000 miles or 6 months, whichever comes first" + mileage, time = maintenance_matcher.extract_combined_interval(text) + + assert mileage is not None + assert mileage.value == 5000 + + assert time is not None + assert time.value == 6 + + def test_only_mileage(self) -> None: + """Test with only mileage.""" + text = "replace every 30,000 miles" + mileage, time = maintenance_matcher.extract_combined_interval(text) + + assert mileage is not None + assert mileage.value == 30000 + assert time is None + + def test_only_time(self) -> None: + """Test with only 
time.""" + text = "inspect annually" + mileage, time = maintenance_matcher.extract_combined_interval(text) + + assert mileage is None + assert time is not None + assert time.value == 12 diff --git a/ocr/tests/test_service_mapping.py b/ocr/tests/test_service_mapping.py new file mode 100644 index 0000000..6355925 --- /dev/null +++ b/ocr/tests/test_service_mapping.py @@ -0,0 +1,116 @@ +"""Tests for service name mapping.""" +import pytest + +from app.patterns.service_mapping import service_mapper + + +class TestServiceMapping: + """Tests for service to subtype mapping.""" + + def test_engine_oil_mapping(self) -> None: + """Test engine oil service mapping.""" + result = service_mapper.map_service("engine oil") + assert result is not None + assert result.normalized_name == "Engine Oil Change" + assert "Engine Oil" in result.subtypes + assert result.category == "routine_maintenance" + + def test_oil_change_mapping(self) -> None: + """Test oil change service mapping.""" + result = service_mapper.map_service("oil change") + assert result is not None + assert "Engine Oil" in result.subtypes + + def test_air_filter_mapping(self) -> None: + """Test air filter service mapping.""" + result = service_mapper.map_service("engine air filter") + assert result is not None + assert result.normalized_name == "Air Filter Replacement" + assert "Air Filter Element" in result.subtypes + + def test_cabin_filter_mapping(self) -> None: + """Test cabin air filter mapping.""" + result = service_mapper.map_service("cabin air filter") + assert result is not None + assert "Cabin Air Filter / Purifier" in result.subtypes + + def test_tire_rotation_mapping(self) -> None: + """Test tire rotation mapping.""" + result = service_mapper.map_service("tire rotation") + assert result is not None + assert "Tires" in result.subtypes + assert result.confidence >= 0.95 + + def test_brake_inspection_mapping(self) -> None: + """Test brake inspection mapping.""" + result = service_mapper.map_service("brake 
inspection") + assert result is not None + assert "Brakes and Traction Control" in result.subtypes + + def test_coolant_mapping(self) -> None: + """Test coolant service mapping.""" + result = service_mapper.map_service("engine coolant") + assert result is not None + assert "Coolant" in result.subtypes + + def test_transmission_fluid_mapping(self) -> None: + """Test transmission fluid mapping.""" + result = service_mapper.map_service("automatic transmission fluid") + assert result is not None + assert "Fluid - A/T" in result.subtypes + + def test_spark_plug_mapping(self) -> None: + """Test spark plug mapping.""" + result = service_mapper.map_service("spark plugs") + assert result is not None + assert "Spark Plug" in result.subtypes + + def test_wiper_blade_mapping(self) -> None: + """Test wiper blade mapping.""" + result = service_mapper.map_service("wiper blades") + assert result is not None + assert "Wiper Blade" in result.subtypes + + def test_unknown_service(self) -> None: + """Test unknown service returns None.""" + result = service_mapper.map_service("quantum flux capacitor") + assert result is None + + def test_case_insensitive(self) -> None: + """Test mapping is case insensitive.""" + result = service_mapper.map_service("ENGINE OIL") + assert result is not None + assert "Engine Oil" in result.subtypes + + def test_partial_match(self) -> None: + """Test partial matching in longer text.""" + result = service_mapper.map_service("Replace engine oil and filter") + assert result is not None + assert "Engine Oil" in result.subtypes + + +class TestFuzzyMapping: + """Tests for fuzzy service mapping.""" + + def test_fuzzy_oil_change(self) -> None: + """Test fuzzy matching for oil change.""" + result = service_mapper.map_service_fuzzy("change the engine oil") + assert result is not None + assert "Engine Oil" in result.subtypes + + def test_fuzzy_low_threshold(self) -> None: + """Test fuzzy matching with low similarity.""" + result = 
service_mapper.map_service_fuzzy("oil", threshold=0.3) + assert result is not None # Should match "engine oil" partially + + +class TestKeywords: + """Tests for keyword extraction.""" + + def test_get_keywords(self) -> None: + """Test getting service keywords.""" + keywords = service_mapper.get_all_service_keywords() + assert len(keywords) > 0 + assert "engine oil" in keywords + assert "service" in keywords + assert "maintenance" in keywords diff --git a/ocr/tests/test_table_parser.py b/ocr/tests/test_table_parser.py new file mode 100644 index 0000000..a804f45 --- /dev/null +++ b/ocr/tests/test_table_parser.py @@ -0,0 +1,122 @@ +"""Tests for table parsing.""" +import pytest + +from app.table_extraction.parser import table_parser + + +class TestTableParsing: + """Tests for maintenance table parsing.""" + + def test_parse_simple_table(self) -> None: + """Test parsing a simple maintenance table.""" + header = ["Service", "Miles", "Months"] + data = [ + ["Engine Oil", "5,000", "6"], + ["Air Filter", "30,000", "24"], + ["Cabin Filter", "15,000", "12"], + ] + + results = table_parser.parse_table(header, data) + + assert len(results) == 3 + + # Check oil change + oil = next(r for r in results if "oil" in r.service.lower()) + assert oil.interval_miles == 5000 + assert oil.interval_months == 6 + + def test_parse_table_with_notes(self) -> None: + """Test parsing table with notes column.""" + header = ["Item", "Interval", "Notes"] + data = [ + ["Engine Oil", "5,000 miles or 6 months", "Use 0W-20"], + ["Brake Fluid", "30,000 miles", "DOT 4"], + ] + + results = table_parser.parse_table(header, data) + + assert len(results) == 2 + + def test_parse_without_headers(self) -> None: + """Test parsing table without clear headers.""" + data = [ + ["Engine oil change", "5,000 miles", "6 months"], + ["Tire rotation", "7,500 miles", ""], + ] + + results = table_parser._parse_without_headers(data) + + assert len(results) >= 1 + + def test_parse_text_block(self) -> None: + """Test parsing 
unstructured text.""" + text = """ + Engine oil: replace every 5,000 miles or 6 months + Air filter: replace every 30,000 miles + Tire rotation: every 7,500 miles + """ + + results = table_parser.parse_text_block(text) + + assert len(results) >= 2 + + +class TestColumnIdentification: + """Tests for column type identification.""" + + def test_identify_service_column(self) -> None: + """Test identifying service column.""" + header = ["Service Item", "Miles", "Months"] + columns = table_parser._identify_columns(header) + + assert columns.get(0) == "service" + assert columns.get(1) == "miles" + assert columns.get(2) == "months" + + def test_identify_maintenance_column(self) -> None: + """Test identifying 'maintenance' as service column.""" + header = ["Maintenance", "Interval", "Notes"] + columns = table_parser._identify_columns(header) + + assert columns.get(0) == "service" + + def test_identify_details_column(self) -> None: + """Test identifying details/notes column.""" + header = ["Item", "Miles", "Notes"] + columns = table_parser._identify_columns(header) + + assert columns.get(2) == "details" + + +class TestIntervalExtraction: + """Tests for interval extraction from cells.""" + + def test_extract_miles_with_comma(self) -> None: + """Test extracting miles with comma separator.""" + result = table_parser._extract_miles("5,000") + assert result == 5000 + + def test_extract_miles_without_comma(self) -> None: + """Test extracting miles without comma.""" + result = table_parser._extract_miles("5000") + assert result == 5000 + + def test_extract_miles_with_unit(self) -> None: + """Test extracting miles with unit.""" + result = table_parser._extract_miles("5,000 miles") + assert result == 5000 + + def test_extract_miles_k_notation(self) -> None: + """Test extracting miles with K notation.""" + result = table_parser._extract_miles("5K") + assert result == 5000 + + def test_extract_months(self) -> None: + """Test extracting months.""" + result = 
table_parser._extract_months("6") + assert result == 6 + + def test_extract_months_with_unit(self) -> None: + """Test extracting months with unit.""" + result = table_parser._extract_months("12 months") + assert result == 12