feat: add owner's manual OCR pipeline (refs #71)

Implement async PDF processing for owner's manuals with maintenance schedule
extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:

1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00
parent b226ca59de
commit 3eb54211cb
20 changed files with 2904 additions and 14 deletions
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -2,19 +2,24 @@
 import logging
 from typing import Optional

-from fastapi import APIRouter, File, Form, HTTPException, Query, UploadFile
+from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query, UploadFile

 from app.extractors.vin_extractor import vin_extractor
 from app.extractors.receipt_extractor import receipt_extractor
+from app.extractors.manual_extractor import manual_extractor
 from app.models import (
    BoundingBox,
+    ManualExtractionResponse,
+    ManualJobResponse,
+    ManualMaintenanceSchedule,
+    ManualVehicleInfo,
    OcrResponse,
    ReceiptExtractedField,
    ReceiptExtractionResponse,
    VinAlternative,
    VinExtractionResponse,
 )
-from app.services import ocr_service
+from app.services import ocr_service, job_queue

 logger = logging.getLogger(__name__)

@@ -23,6 +28,9 @@ router = APIRouter(prefix="/extract", tags=["extract"])
 # Maximum file size for synchronous processing (10MB)
 MAX_SYNC_SIZE = 10 * 1024 * 1024

+# Maximum file size for manual/PDF processing (200MB)
+MAX_MANUAL_SIZE = 200 * 1024 * 1024
+

@router.post("", response_model=OcrResponse)
 async def extract_text(
@@ -257,3 +265,166 @@ async def extract_receipt(
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
+
+
+@router.post("/manual", response_model=ManualJobResponse)
+async def extract_manual(
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(..., description="Owner's manual PDF file"),
+    vehicle_id: Optional[str] = Form(None, description="Vehicle ID for context"),
+) -> ManualJobResponse:
+    """
+    Submit an async job to extract maintenance schedules from an owner's manual.
+
+    Supports PDF files up to 200MB. Processing is done asynchronously due to
+    the time required for large documents.
+
+    Pipeline:
+    1. Analyze PDF structure (text layer vs scanned)
+    2. Find maintenance schedule sections
+    3. Extract text or perform OCR on scanned pages
+    4. Detect and parse maintenance tables
+    5. Extract service intervals and fluid specifications
+
+    - **file**: Owner's manual PDF (max 200MB)
+    - **vehicle_id**: Optional vehicle ID for context
+
+    Returns immediately with job_id. Poll GET /jobs/{job_id} for status and results.
+    Fails with 400 when the file is unnamed, not a PDF, or empty, and with 413
+    when it is larger than 200MB.
+
+    Response when completed:
+    - **vehicleInfo**: Detected make/model/year
+    - **maintenanceSchedules**: List of extracted maintenance items with intervals
+    - **rawTables**: Metadata about detected tables
+    - **processingTimeMs**: Total processing time
+    """
+    # Validate file presence
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file provided")
+
+    # Validate file type: a PDF content type or a .pdf extension is enough
+    content_type = file.content_type or ""
+    if not content_type.startswith("application/pdf") and not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="File must be a PDF document")
+
+    # Read at most one byte past the limit: enough to detect an oversized
+    # upload without loading all of it into memory
+    content = await file.read(MAX_MANUAL_SIZE + 1)
+    file_size = len(content)
+
+    # Validate file size
+    if file_size > MAX_MANUAL_SIZE:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Max: {MAX_MANUAL_SIZE // (1024*1024)}MB.",
+        )
+
+    if file_size == 0:
+        raise HTTPException(status_code=400, detail="Empty file provided")
+
+    logger.info(
+        f"Manual extraction: {file.filename}, "
+        f"size: {file_size} bytes, "
+        f"vehicle_id: {vehicle_id}"
+    )
+
+    # Estimate processing time: ~1 s/MB for native PDFs, ~3 s/MB for scanned; the
+    # type is not known yet, so use 2 s/MB with a 30 s floor
+    estimated_seconds = max(30, (file_size // (1024 * 1024)) * 2)
+
+    # Submit job to queue
+    job_id = await job_queue.submit_manual_job(
+        file_bytes=content,
+        vehicle_id=vehicle_id,
+    )
+
+    # Schedule background processing
+    background_tasks.add_task(process_manual_job, job_id)
+
+    # Return initial status
+    return ManualJobResponse(
+        jobId=job_id,
+        status="pending",
+        progress=0,
+        estimatedSeconds=estimated_seconds,
+    )
+
+
+async def process_manual_job(job_id: str) -> None:
+    """Background task to process a manual extraction job.
+
+    Runs the extractor in an executor thread and records the outcome on the job
+    queue. Exceptions are logged and reported as a failed job.
+    """
+    import asyncio
+
+    logger.info(f"Starting manual extraction job {job_id}")
+
+    try:
+        # Update status to processing
+        await job_queue.update_manual_job_progress(job_id, 5, "Starting extraction")
+
+        # Get job data
+        file_bytes = await job_queue.get_job_data(job_id)
+        if not file_bytes:
+            await job_queue.fail_manual_job(job_id, "Job data not found")
+            return
+
+        # The extractor is synchronous, so it runs in the default executor
+        loop = asyncio.get_running_loop()
+        progress_updates = []
+
+        def sync_progress_callback(percent: int, message: str) -> None:
+            # Runs in the worker thread: hand the update to the loop, keep its future
+            update = job_queue.update_manual_job_progress(job_id, percent, message)
+            progress_updates.append(asyncio.run_coroutine_threadsafe(update, loop))
+
+        try:
+            result = await loop.run_in_executor(
+                None,
+                lambda: manual_extractor.extract(
+                    pdf_bytes=file_bytes, progress_callback=sync_progress_callback
+                ),
+            )
+        finally:
+            # Drain pending progress writes (ignoring their errors) so none of
+            # them can land after the terminal status written below
+            await asyncio.gather(*map(asyncio.wrap_future, progress_updates), return_exceptions=True)
+
+        if result.success:
+            # Convert to response model
+            vehicle_info = None
+            if result.vehicle_info:
+                info = result.vehicle_info
+                vehicle_info = ManualVehicleInfo(make=info.make, model=info.model, year=info.year)
+
+            schedules = [
+                ManualMaintenanceSchedule(
+                    service=s.service,
+                    intervalMiles=s.interval_miles,
+                    intervalMonths=s.interval_months,
+                    details=s.details,
+                    confidence=s.confidence,
+                    subtypes=s.subtypes,
+                )
+                for s in result.maintenance_schedules
+            ]
+
+            response = ManualExtractionResponse(
+                success=True,
+                vehicleInfo=vehicle_info,
+                maintenanceSchedules=schedules,
+                rawTables=result.raw_tables,
+                processingTimeMs=result.processing_time_ms,
+                totalPages=result.total_pages,
+                pagesProcessed=result.pages_processed,
+            )
+
+            await job_queue.complete_manual_job(job_id, response)
+        else:
+            await job_queue.fail_manual_job(job_id, result.error or "Extraction failed")
+
+    except Exception as e:
+        logger.error(f"Manual job {job_id} failed: {e}", exc_info=True)
+        await job_queue.fail_manual_job(job_id, str(e))