From 209425a908ddd9437975bd0d0ecac76b88f02e54 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:40:11 -0600 Subject: [PATCH] feat: rewrite ManualExtractor progress to spec-aligned 10/50/95/100 pattern (refs #143) Co-Authored-By: Claude Opus 4.6 --- ocr/app/extractors/manual_extractor.py | 7 +++---- ocr/app/routers/extract.py | 8 +++----- ocr/tests/test_manual_extractor.py | 6 +++--- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/ocr/app/extractors/manual_extractor.py b/ocr/app/extractors/manual_extractor.py index 174828e..c2ed271 100644 --- a/ocr/app/extractors/manual_extractor.py +++ b/ocr/app/extractors/manual_extractor.py @@ -82,11 +82,12 @@ class ManualExtractor: logger.info(f"Progress {percent}%: {message}") try: - update_progress(5, "Sending PDF to Gemini for analysis") + update_progress(10, "Preparing extraction") + update_progress(50, "Processing with Gemini") gemini_result = self._engine.extract_maintenance(pdf_bytes) - update_progress(50, "Mapping service names to maintenance subtypes") + update_progress(95, "Mapping results") schedules: list[ExtractedSchedule] = [] for item in gemini_result.items: @@ -112,8 +113,6 @@ class ManualExtractor: ) ) - update_progress(90, "Finalizing results") - processing_time_ms = int((time.time() - start_time) * 1000) logger.info( diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py index edec582..45f657e 100644 --- a/ocr/app/routers/extract.py +++ b/ocr/app/routers/extract.py @@ -280,11 +280,9 @@ async def extract_manual( the time required for large documents. Pipeline: - 1. Analyze PDF structure (text layer vs scanned) - 2. Find maintenance schedule sections - 3. Extract text or perform OCR on scanned pages - 4. Detect and parse maintenance tables - 5. Extract service intervals and fluid specifications + 1. Send entire PDF to Gemini for semantic extraction + 2. Map extracted service names to system maintenance subtypes + 3. Return structured results with confidence scores - **file**: Owner's manual PDF (max 200MB) - **vehicle_id**: Optional vehicle ID for context diff --git a/ocr/tests/test_manual_extractor.py b/ocr/tests/test_manual_extractor.py index 38481b2..adf39d0 100644 --- a/ocr/tests/test_manual_extractor.py +++ b/ocr/tests/test_manual_extractor.py @@ -108,11 +108,11 @@ class TestNormalExtraction: extractor.extract(_make_pdf_bytes(), progress_callback=track_progress) - # Should have progress calls at 5, 50, 90, 100 + # Should have progress calls at 10, 50, 95, 100 percents = [p for p, _ in progress_calls] - assert 5 in percents + assert 10 in percents assert 50 in percents - assert 90 in percents + assert 95 in percents assert 100 in percents # Percents should be non-decreasing assert percents == sorted(percents)