feat: Expand OCR with fuel receipt scanning and maintenance extraction (#129) #147

Merged
egullickson merged 26 commits from issue-129-expand-ocr-fuel-receipt-maintenance into main 2026-02-13 02:25:55 +00:00
3 changed files with 9 additions and 12 deletions
Showing only changes of commit 209425a908 - Show all commits

View File

@@ -82,11 +82,12 @@ class ManualExtractor:
logger.info(f"Progress {percent}%: {message}")
try:
-update_progress(5, "Sending PDF to Gemini for analysis")
+update_progress(10, "Preparing extraction")
+update_progress(50, "Processing with Gemini")
gemini_result = self._engine.extract_maintenance(pdf_bytes)
-update_progress(50, "Mapping service names to maintenance subtypes")
+update_progress(95, "Mapping results")
schedules: list[ExtractedSchedule] = []
for item in gemini_result.items:
@@ -112,8 +113,6 @@ class ManualExtractor:
)
)
-update_progress(90, "Finalizing results")
processing_time_ms = int((time.time() - start_time) * 1000)
logger.info(

View File

@@ -280,11 +280,9 @@ async def extract_manual(
the time required for large documents.
Pipeline:
-1. Analyze PDF structure (text layer vs scanned)
-2. Find maintenance schedule sections
-3. Extract text or perform OCR on scanned pages
-4. Detect and parse maintenance tables
-5. Extract service intervals and fluid specifications
+1. Send entire PDF to Gemini for semantic extraction
+2. Map extracted service names to system maintenance subtypes
+3. Return structured results with confidence scores
- **file**: Owner's manual PDF (max 200MB)
- **vehicle_id**: Optional vehicle ID for context

View File

@@ -108,11 +108,11 @@ class TestNormalExtraction:
extractor.extract(_make_pdf_bytes(), progress_callback=track_progress)
-# Should have progress calls at 5, 50, 90, 100
+# Should have progress calls at 10, 50, 95, 100
percents = [p for p, _ in progress_calls]
-assert 5 in percents
+assert 10 in percents
assert 50 in percents
-assert 90 in percents
+assert 95 in percents
assert 100 in percents
# Percents should be non-decreasing
assert percents == sorted(percents)