feat: add owner's manual OCR pipeline (refs #71)

Implement async PDF processing for owner's manuals with maintenance schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:

1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00
parent b226ca59de
commit 3eb54211cb
20 changed files with 2904 additions and 14 deletions
--- a/ocr/app/routers/jobs.py
+++ b/ocr/app/routers/jobs.py
@@ -1,11 +1,11 @@
 """Async OCR job endpoints."""
 import asyncio
 import logging
-from typing import Optional
+from typing import Optional, Union

 from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile

-from app.models import JobResponse, JobSubmitRequest
+from app.models import JobResponse, JobSubmitRequest, ManualJobResponse
 from app.services import job_queue, ocr_service

 logger = logging.getLogger(__name__)
@@ -73,12 +73,13 @@ async def submit_job(
    )


-@router.get("/{job_id}", response_model=JobResponse)
-async def get_job_status(job_id: str) -> JobResponse:
+@router.get("/{job_id}", response_model=Union[JobResponse, ManualJobResponse])
+async def get_job_status(job_id: str) -> Union[JobResponse, ManualJobResponse]:
    """
    Get the status of an async OCR job.

    Poll this endpoint to check job progress and retrieve results.
+    Works for both regular OCR jobs and manual extraction jobs.

    Returns:
    - **pending**: Job is queued
@@ -86,15 +87,20 @@ async def get_job_status(job_id: str) -> JobResponse:
    - **completed**: Job finished successfully (includes result)
    - **failed**: Job failed (includes error message)
    """
+    # Try regular job first
    result = await job_queue.get_job_status(job_id)
+    if result is not None:
+        return result

-    if result is None:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Job {job_id} not found. Jobs expire after 1 hour.",
-        )
+    # Try manual job
+    manual_result = await job_queue.get_manual_job_status(job_id)
+    if manual_result is not None:
+        return manual_result

-    return result
+    raise HTTPException(
+        status_code=404,
+        detail=f"Job {job_id} not found. Jobs expire after 1-2 hours.",
+    )


 async def process_job(job_id: str) -> None: