From 209425a908ddd9437975bd0d0ecac76b88f02e54 Mon Sep 17 00:00:00 2001
From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com>
Date: Wed, 11 Feb 2026 14:40:11 -0600
Subject: [PATCH] feat: rewrite ManualExtractor progress to spec-aligned
 10/50/95/100 pattern (refs #143)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ocr/app/extractors/manual_extractor.py | 7 +++----
 ocr/app/routers/extract.py             | 8 +++-----
 ocr/tests/test_manual_extractor.py     | 6 +++---
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/ocr/app/extractors/manual_extractor.py b/ocr/app/extractors/manual_extractor.py
index 174828e..c2ed271 100644
--- a/ocr/app/extractors/manual_extractor.py
+++ b/ocr/app/extractors/manual_extractor.py
@@ -82,11 +82,12 @@ class ManualExtractor:
             logger.info(f"Progress {percent}%: {message}")
 
         try:
-            update_progress(5, "Sending PDF to Gemini for analysis")
+            update_progress(10, "Preparing extraction")
 
+            update_progress(50, "Processing with Gemini")
             gemini_result = self._engine.extract_maintenance(pdf_bytes)
 
-            update_progress(50, "Mapping service names to maintenance subtypes")
+            update_progress(95, "Mapping results")
 
             schedules: list[ExtractedSchedule] = []
             for item in gemini_result.items:
@@ -112,8 +113,6 @@ class ManualExtractor:
                     )
                 )
 
-            update_progress(90, "Finalizing results")
-
             processing_time_ms = int((time.time() - start_time) * 1000)
 
             logger.info(
diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py
index edec582..45f657e 100644
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -280,11 +280,9 @@ async def extract_manual(
     the time required for large documents.
 
     Pipeline:
-    1. Analyze PDF structure (text layer vs scanned)
-    2. Find maintenance schedule sections
-    3. Extract text or perform OCR on scanned pages
-    4. Detect and parse maintenance tables
-    5. Extract service intervals and fluid specifications
+    1. Send entire PDF to Gemini for semantic extraction
+    2. Map extracted service names to system maintenance subtypes
+    3. Return structured results with confidence scores
 
     - **file**: Owner's manual PDF (max 200MB)
     - **vehicle_id**: Optional vehicle ID for context
diff --git a/ocr/tests/test_manual_extractor.py b/ocr/tests/test_manual_extractor.py
index 38481b2..adf39d0 100644
--- a/ocr/tests/test_manual_extractor.py
+++ b/ocr/tests/test_manual_extractor.py
@@ -108,11 +108,11 @@ class TestNormalExtraction:
 
         extractor.extract(_make_pdf_bytes(), progress_callback=track_progress)
 
-        # Should have progress calls at 5, 50, 90, 100
+        # Should have progress calls at 10, 50, 95, 100
         percents = [p for p, _ in progress_calls]
-        assert 5 in percents
+        assert 10 in percents
         assert 50 in percents
-        assert 90 in percents
+        assert 95 in percents
         assert 100 in percents
         # Percents should be non-decreasing
         assert percents == sorted(percents)