feat: add owner's manual OCR pipeline (refs #71)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

Implement async PDF processing for owner's manuals with maintenance
schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:
1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 21:30:20 -06:00
parent b226ca59de
commit 3eb54211cb
20 changed files with 2904 additions and 14 deletions

View File

@@ -0,0 +1,164 @@
"""Tests for maintenance pattern matching."""
import pytest
from app.patterns.maintenance_patterns import maintenance_matcher
class TestMileageIntervalExtraction:
    """Exercise ``maintenance_matcher.extract_mileage_interval`` on sample phrases."""

    def test_every_miles_pattern(self) -> None:
        """'every X miles' with a thousands separator yields a high-confidence match."""
        phrase = "every 5,000 miles"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is not None
        assert interval.value == 5000
        assert interval.confidence >= 0.9

    def test_every_miles_no_comma(self) -> None:
        """'every X miles' is also recognised when the number has no comma."""
        phrase = "every 5000 miles"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is not None
        assert interval.value == 5000

    def test_at_miles_pattern(self) -> None:
        """'at X mi' (abbreviated unit) is recognised."""
        phrase = "at 30,000 mi"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is not None
        assert interval.value == 30000

    def test_miles_or_pattern(self) -> None:
        """'X miles or ...' yields the mileage part of a combined phrase."""
        phrase = "7,500 miles or 12 months"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is not None
        assert interval.value == 7500

    def test_miles_slash_pattern(self) -> None:
        """'X mi/Y months' yields the mileage before the slash."""
        phrase = "5000 mi/6 months"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is not None
        assert interval.value == 5000

    def test_no_mileage(self) -> None:
        """Text that mentions no mileage produces no result."""
        phrase = "check brake fluid"
        interval = maintenance_matcher.extract_mileage_interval(phrase)

        assert interval is None

    def test_unreasonable_mileage(self) -> None:
        """Implausibly small or large mileage figures are rejected."""
        # Checked in the same order as before: the too-low value first,
        # then the too-high one.
        for phrase in ("every 10 miles", "every 1,000,000 miles"):
            interval = maintenance_matcher.extract_mileage_interval(phrase)
            assert interval is None
class TestTimeIntervalExtraction:
    """Exercise ``maintenance_matcher.extract_time_interval`` on sample phrases."""

    def test_every_months_pattern(self) -> None:
        """'every X months' yields X with high confidence."""
        phrase = "every 6 months"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is not None
        assert interval.value == 6
        assert interval.confidence >= 0.9

    def test_months_or_pattern(self) -> None:
        """'X months or ...' yields the month count of a combined phrase."""
        phrase = "12 months or 10,000 miles"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is not None
        assert interval.value == 12

    def test_annually_pattern(self) -> None:
        """The keyword 'annually' is expected to map to 12."""
        phrase = "check annually"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is not None
        assert interval.value == 12

    def test_semi_annual_pattern(self) -> None:
        """The keyword 'semi-annually' is expected to map to 6."""
        phrase = "inspect semi-annually"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is not None
        assert interval.value == 6

    def test_every_years_pattern(self) -> None:
        """'every X years' is expected as 12 * X (2 years -> 24)."""
        phrase = "replace every 2 years"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is not None
        assert interval.value == 24

    def test_no_time_interval(self) -> None:
        """Text that mentions no time interval produces no result."""
        phrase = "change oil filter"
        interval = maintenance_matcher.extract_time_interval(phrase)

        assert interval is None
class TestFluidSpecExtraction:
    """Exercise the fluid-specification extractors of ``maintenance_matcher``."""

    def test_oil_viscosity(self) -> None:
        """Oil viscosity grades such as 0W-20 and 5W-30 are picked up."""
        zero_w_twenty = maintenance_matcher.extract_fluid_spec("Use 0W-20 oil")
        assert zero_w_twenty is not None
        assert zero_w_twenty.value == "0W-20"
        assert zero_w_twenty.fluid_type == "oil"

        # The second phrase only checks the extracted grade, not the fluid type.
        five_w_thirty = maintenance_matcher.extract_fluid_spec("5W-30 synthetic")
        assert five_w_thirty is not None
        assert five_w_thirty.value == "5W-30"

    def test_transmission_fluid(self) -> None:
        """ATF and Dexron designations are classified as transmission fluid."""
        atf_spec = maintenance_matcher.extract_fluid_spec("ATF-Z1 transmission fluid")
        assert atf_spec is not None
        assert "ATF" in atf_spec.value
        assert atf_spec.fluid_type == "transmission"

        # For Dexron only the classification is checked, not the value.
        dexron_spec = maintenance_matcher.extract_fluid_spec("Dexron VI")
        assert dexron_spec is not None
        assert dexron_spec.fluid_type == "transmission"

    def test_brake_fluid(self) -> None:
        """A DOT rating is classified as brake fluid."""
        dot_spec = maintenance_matcher.extract_fluid_spec("DOT 4 brake fluid")

        assert dot_spec is not None
        assert "DOT" in dot_spec.value
        assert dot_spec.fluid_type == "brake"

    def test_extract_all_fluid_specs(self) -> None:
        """A sentence naming two fluids yields at least two specs."""
        sentence = "Use 0W-20 oil and DOT 4 brake fluid"
        specs = maintenance_matcher.extract_all_fluid_specs(sentence)

        assert len(specs) >= 2
class TestCombinedInterval:
    """Exercise ``maintenance_matcher.extract_combined_interval``.

    The call is unpacked as a ``(mileage, time)`` pair in every test.
    """

    def test_mileage_and_time(self) -> None:
        """A phrase with both parts yields both intervals."""
        phrase = "every 5,000 miles or 6 months, whichever comes first"
        miles, duration = maintenance_matcher.extract_combined_interval(phrase)

        assert miles is not None
        assert miles.value == 5000
        assert duration is not None
        assert duration.value == 6

    def test_only_mileage(self) -> None:
        """A mileage-only phrase leaves the time slot empty."""
        phrase = "replace every 30,000 miles"
        miles, duration = maintenance_matcher.extract_combined_interval(phrase)

        assert miles is not None
        assert miles.value == 30000
        assert duration is None

    def test_only_time(self) -> None:
        """A time-only phrase leaves the mileage slot empty."""
        phrase = "inspect annually"
        miles, duration = maintenance_matcher.extract_combined_interval(phrase)

        assert miles is None
        assert duration is not None
        assert duration.value == 12