Files
motovaultpro/ocr/tests/test_manual_extractor.py
Eric Gullickson 57ed04d955 feat: rewrite ManualExtractor to use Gemini engine (refs #134)
Replace traditional OCR pipeline (table_detector, table_parser,
maintenance_patterns) with GeminiEngine for semantic PDF extraction.
Map Gemini serviceName values to 27 maintenance subtypes via
ServiceMapper fuzzy matching. Add 8 unit tests covering normal
extraction, unusual names, empty response, and error handling.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 10:24:11 -06:00

273 lines
9.2 KiB
Python

"""Tests for ManualExtractor Gemini-based maintenance schedule extraction.
Covers: normal extraction with subtype mapping, progress callbacks, unusual
service names, empty Gemini response, Gemini call failure, and the result
fields required by the job queue.
All GeminiEngine calls are mocked.
"""
from unittest.mock import MagicMock, patch
import pytest
from app.engines.gemini_engine import (
GeminiProcessingError,
MaintenanceExtractionResult,
MaintenanceItem,
)
from app.extractors.manual_extractor import (
ExtractedSchedule,
ManualExtractionResult,
ManualExtractor,
)
# --- Helpers ---
def _make_pdf_bytes(size: int = 1024) -> bytes:
"""Create fake PDF bytes of a given size."""
header = b"%PDF-1.4 fake"
return header + b"\x00" * max(0, size - len(header))
def _make_gemini_result(items: list[MaintenanceItem]) -> MaintenanceExtractionResult:
    """Wrap ``items`` in the result object handed back by the mocked engine."""
    model_name = "gemini-2.5-flash"
    return MaintenanceExtractionResult(items=items, model=model_name)
# --- Successful extraction ---
class TestNormalExtraction:
    """Happy path: extracted items come back mapped to maintenance subtypes."""

    @staticmethod
    def _extractor_returning(items):
        """Return a ManualExtractor whose engine is a mock yielding ``items``."""
        extractor = ManualExtractor()
        extractor._engine = MagicMock()
        extractor._engine.extract_maintenance.return_value = _make_gemini_result(items)
        return extractor

    def test_pdf_with_maintenance_schedule_returns_mapped_items(self):
        """Normal: a PDF with a maintenance schedule yields items with subtypes."""
        oil_item = MaintenanceItem(
            service_name="Engine Oil Change",
            interval_miles=5000,
            interval_months=6,
            details="Use 0W-20 full synthetic oil",
        )
        tire_item = MaintenanceItem(
            service_name="Tire Rotation",
            interval_miles=5000,
            interval_months=6,
            details=None,
        )
        cabin_item = MaintenanceItem(
            service_name="Cabin Filter",
            interval_miles=15000,
            interval_months=12,
            details=None,
        )
        extractor = self._extractor_returning([oil_item, tire_item, cabin_item])

        result = extractor.extract(_make_pdf_bytes())

        assert result.success is True
        assert result.error is None
        assert len(result.maintenance_schedules) == 3
        oil, tire, cabin = result.maintenance_schedules

        # The oil change keeps every extracted field and maps to Engine Oil.
        assert oil.service == "Engine Oil Change"
        assert oil.interval_miles == 5000
        assert oil.interval_months == 6
        assert oil.details == "Use 0W-20 full synthetic oil"
        assert "Engine Oil" in oil.subtypes
        assert oil.confidence > 0.0

        # The tire rotation maps to the Tires subtype.
        assert tire.service == "Tire Rotation"
        assert "Tires" in tire.subtypes

        # The cabin filter maps to Cabin Air Filter / Purifier.
        assert "Cabin Air Filter / Purifier" in cabin.subtypes

    def test_progress_callbacks_fire_at_intervals(self):
        """Progress callbacks report 5, 50, 90 and 100 percent, in order."""
        extractor = self._extractor_returning(
            [MaintenanceItem(service_name="Oil Change", interval_miles=5000)]
        )
        recorded: list[tuple[int, str]] = []

        def track_progress(percent: int, message: str) -> None:
            recorded.append((percent, message))

        extractor.extract(_make_pdf_bytes(), progress_callback=track_progress)

        percents = [entry[0] for entry in recorded]
        # Each of these milestones must have been reported.
        for milestone in (5, 50, 90, 100):
            assert milestone in percents
        # Reported progress must be non-decreasing.
        assert percents == sorted(percents)
# --- Unusual service names ---
class TestUnusualServiceNames:
    """Oddly worded service names should still resolve to the closest subtype."""

    @staticmethod
    def _extract(items):
        """Run extraction against a mocked engine that returns ``items``."""
        extractor = ManualExtractor()
        extractor._engine = MagicMock()
        extractor._engine.extract_maintenance.return_value = _make_gemini_result(items)
        return extractor.extract(_make_pdf_bytes())

    def test_unusual_names_fuzzy_match_to_subtypes(self):
        """Edge: unusual service names still map to the closest subtype."""
        result = self._extract(
            [
                MaintenanceItem(
                    service_name="Replace engine air cleaner element",
                    interval_miles=30000,
                ),
                MaintenanceItem(
                    service_name="Inspect drive belt for cracks",
                    interval_miles=60000,
                ),
            ]
        )

        assert result.success is True
        assert len(result.maintenance_schedules) == 2
        air_filter, belt = result.maintenance_schedules

        # "air cleaner element" is expected to fuzzy match Air Filter Element.
        assert "Air Filter Element" in air_filter.subtypes
        # "drive belt" is expected to match Drive Belt.
        assert "Drive Belt" in belt.subtypes

    def test_unmapped_service_uses_gemini_name_directly(self):
        """Edge: an unmatched name is kept verbatim with the default confidence."""
        unknown_name = "Recalibrate Quantum Flux Capacitor"
        result = self._extract(
            [MaintenanceItem(service_name=unknown_name, interval_miles=100000)]
        )

        assert result.success is True
        assert len(result.maintenance_schedules) == 1

        (schedule,) = result.maintenance_schedules
        assert schedule.service == unknown_name
        assert schedule.subtypes == []
        assert schedule.confidence == ManualExtractor.DEFAULT_CONFIDENCE
# --- Empty response ---
class TestEmptyResponse:
    """An empty Gemini response is a success with nothing extracted."""

    def test_empty_gemini_response_returns_empty_schedules(self):
        """Edge: an empty Gemini response returns an empty schedules list."""
        no_items = _make_gemini_result([])
        manual_extractor = ManualExtractor()
        manual_extractor._engine = MagicMock()
        manual_extractor._engine.extract_maintenance.return_value = no_items

        pdf_bytes = _make_pdf_bytes()
        outcome = manual_extractor.extract(pdf_bytes)

        assert outcome.success is True
        assert outcome.maintenance_schedules == []
        assert outcome.error is None
        assert outcome.processing_time_ms >= 0
# --- Error handling ---
class TestErrorHandling:
    """Exceptions raised by the Gemini engine surface as failed results."""

    @staticmethod
    def _extract_with_failing_engine(exc):
        """Run extraction against a mocked engine that raises ``exc``."""
        extractor = ManualExtractor()
        extractor._engine = MagicMock()
        extractor._engine.extract_maintenance.side_effect = exc
        return extractor.extract(_make_pdf_bytes())

    def test_gemini_failure_returns_error_result(self):
        """Error: a Gemini call failure returns a ManualExtractionResult with error."""
        failure = GeminiProcessingError(
            "Gemini maintenance extraction failed: API quota exceeded"
        )

        result = self._extract_with_failing_engine(failure)

        assert result.success is False
        assert result.maintenance_schedules == []
        assert result.error is not None
        assert "quota exceeded" in result.error.lower()

    def test_unexpected_exception_returns_error_result(self):
        """Error: an unexpected exception is caught and returned as an error."""
        result = self._extract_with_failing_engine(RuntimeError("Unexpected failure"))

        assert result.success is False
        assert result.error is not None
        assert "Unexpected failure" in result.error
# --- Job queue integration ---
class TestJobQueueIntegration:
    """The extraction result exposes what the existing job queue flow reads."""

    def test_extract_returns_all_required_fields(self):
        """The result carries every field needed by process_manual_job in extract.py."""
        extractor = ManualExtractor()
        extractor._engine = MagicMock()
        extractor._engine.extract_maintenance.return_value = _make_gemini_result(
            [MaintenanceItem(service_name="Oil Change", interval_miles=5000)]
        )

        result = extractor.extract(_make_pdf_bytes())

        # Every field used by process_manual_job must be present on the result.
        result_fields = (
            "success",
            "vehicle_info",
            "maintenance_schedules",
            "raw_tables",
            "processing_time_ms",
            "total_pages",
            "pages_processed",
            "error",
        )
        for field_name in result_fields:
            assert hasattr(result, field_name)

        # Each schedule must expose its required fields as well.
        schedule = result.maintenance_schedules[0]
        schedule_fields = (
            "service",
            "interval_miles",
            "interval_months",
            "details",
            "confidence",
            "subtypes",
        )
        for field_name in schedule_fields:
            assert hasattr(schedule, field_name)