feat: rewrite ManualExtractor progress to spec-aligned 10/50/95/100 pattern (refs #143)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 14:40:11 -06:00
parent f9a650a4d7
commit 209425a908
3 changed files with 9 additions and 12 deletions
--- a/ocr/app/extractors/manual_extractor.py
+++ b/ocr/app/extractors/manual_extractor.py
@@ -82,11 +82,12 @@ class ManualExtractor:
            logger.info(f"Progress {percent}%: {message}")

        try:
-            update_progress(5, "Sending PDF to Gemini for analysis")
+            update_progress(10, "Preparing extraction")

+            update_progress(50, "Processing with Gemini")
            gemini_result = self._engine.extract_maintenance(pdf_bytes)

-            update_progress(50, "Mapping service names to maintenance subtypes")
+            update_progress(95, "Mapping results")

            schedules: list[ExtractedSchedule] = []
            for item in gemini_result.items:
@@ -112,8 +113,6 @@ class ManualExtractor:
                    )
                )

-            update_progress(90, "Finalizing results")
-
            processing_time_ms = int((time.time() - start_time) * 1000)

            logger.info(
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -280,11 +280,9 @@ async def extract_manual(
    the time required for large documents.

    Pipeline:
-    1. Analyze PDF structure (text layer vs scanned)
-    2. Find maintenance schedule sections
-    3. Extract text or perform OCR on scanned pages
-    4. Detect and parse maintenance tables
-    5. Extract service intervals and fluid specifications
+    1. Send entire PDF to Gemini for semantic extraction
+    2. Map extracted service names to system maintenance subtypes
+    3. Return structured results with confidence scores

    - **file**: Owner's manual PDF (max 200MB)
    - **vehicle_id**: Optional vehicle ID for context