feat: add owner's manual OCR pipeline (refs #71)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement async PDF processing for owner's manuals with maintenance schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:

1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
164
ocr/tests/test_maintenance_patterns.py
Normal file
164
ocr/tests/test_maintenance_patterns.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""Tests for maintenance pattern matching."""
|
||||
import pytest
|
||||
|
||||
from app.patterns.maintenance_patterns import maintenance_matcher
|
||||
|
||||
|
||||
class TestMileageIntervalExtraction:
    """Checks for ``maintenance_matcher.extract_mileage_interval``."""

    def test_every_miles_pattern(self) -> None:
        """'every 5,000 miles' yields 5000 with confidence of at least 0.9."""
        interval = maintenance_matcher.extract_mileage_interval("every 5,000 miles")
        assert interval is not None
        assert interval.value == 5000
        assert interval.confidence >= 0.9

    def test_every_miles_no_comma(self) -> None:
        """The thousands separator is optional: 'every 5000 miles' yields 5000."""
        interval = maintenance_matcher.extract_mileage_interval("every 5000 miles")
        assert interval is not None
        assert interval.value == 5000

    def test_at_miles_pattern(self) -> None:
        """'at 30,000 mi' (abbreviated unit) yields 30000."""
        interval = maintenance_matcher.extract_mileage_interval("at 30,000 mi")
        assert interval is not None
        assert interval.value == 30000

    def test_miles_or_pattern(self) -> None:
        """The mileage half of 'X miles or Y months' is picked up."""
        interval = maintenance_matcher.extract_mileage_interval(
            "7,500 miles or 12 months"
        )
        assert interval is not None
        assert interval.value == 7500

    def test_miles_slash_pattern(self) -> None:
        """The mileage half of the slash form 'X mi/Y months' is picked up."""
        interval = maintenance_matcher.extract_mileage_interval("5000 mi/6 months")
        assert interval is not None
        assert interval.value == 5000

    def test_no_mileage(self) -> None:
        """Text that mentions no mileage produces ``None``."""
        assert maintenance_matcher.extract_mileage_interval("check brake fluid") is None

    def test_unreasonable_mileage(self) -> None:
        """Implausibly small and implausibly large mileages are both rejected."""
        # Too small first, then too large, in the same order as before.
        for text in ("every 10 miles", "every 1,000,000 miles"):
            assert maintenance_matcher.extract_mileage_interval(text) is None
|
||||
|
||||
|
||||
class TestTimeIntervalExtraction:
    """Checks for ``maintenance_matcher.extract_time_interval`` (values in months)."""

    def test_every_months_pattern(self) -> None:
        """'every 6 months' yields 6 with confidence of at least 0.9."""
        period = maintenance_matcher.extract_time_interval("every 6 months")
        assert period is not None
        assert period.value == 6
        assert period.confidence >= 0.9

    def test_months_or_pattern(self) -> None:
        """The time half of 'X months or Y miles' is picked up."""
        period = maintenance_matcher.extract_time_interval(
            "12 months or 10,000 miles"
        )
        assert period is not None
        assert period.value == 12

    def test_annually_pattern(self) -> None:
        """The keyword 'annually' is expected to map to 12."""
        period = maintenance_matcher.extract_time_interval("check annually")
        assert period is not None
        assert period.value == 12

    def test_semi_annual_pattern(self) -> None:
        """The keyword 'semi-annually' is expected to map to 6."""
        period = maintenance_matcher.extract_time_interval("inspect semi-annually")
        assert period is not None
        assert period.value == 6

    def test_every_years_pattern(self) -> None:
        """'every 2 years' is expected to be reported as 24."""
        period = maintenance_matcher.extract_time_interval("replace every 2 years")
        assert period is not None
        assert period.value == 24

    def test_no_time_interval(self) -> None:
        """Text that mentions no time span produces ``None``."""
        assert maintenance_matcher.extract_time_interval("change oil filter") is None
|
||||
|
||||
|
||||
class TestFluidSpecExtraction:
    """Checks for fluid specification extraction on ``maintenance_matcher``."""

    def test_oil_viscosity(self) -> None:
        """Viscosity grades such as 0W-20 and 5W-30 are recognised."""
        with_keyword = maintenance_matcher.extract_fluid_spec("Use 0W-20 oil")
        assert with_keyword is not None
        assert with_keyword.value == "0W-20"
        assert with_keyword.fluid_type == "oil"

        # Second sample: only the grade value is checked, not the fluid type.
        grade_only = maintenance_matcher.extract_fluid_spec("5W-30 synthetic")
        assert grade_only is not None
        assert grade_only.value == "5W-30"

    def test_transmission_fluid(self) -> None:
        """ATF and Dexron designations are classified as transmission fluid."""
        atf = maintenance_matcher.extract_fluid_spec("ATF-Z1 transmission fluid")
        assert atf is not None
        assert "ATF" in atf.value
        assert atf.fluid_type == "transmission"

        dexron = maintenance_matcher.extract_fluid_spec("Dexron VI")
        assert dexron is not None
        assert dexron.fluid_type == "transmission"

    def test_brake_fluid(self) -> None:
        """A DOT rating is classified as brake fluid."""
        spec = maintenance_matcher.extract_fluid_spec("DOT 4 brake fluid")
        assert spec is not None
        assert "DOT" in spec.value
        assert spec.fluid_type == "brake"

    def test_extract_all_fluid_specs(self) -> None:
        """A sentence naming two fluids yields at least two specs."""
        specs = maintenance_matcher.extract_all_fluid_specs(
            "Use 0W-20 oil and DOT 4 brake fluid"
        )
        assert len(specs) >= 2
|
||||
|
||||
|
||||
class TestCombinedInterval:
    """Checks for ``maintenance_matcher.extract_combined_interval``.

    The call is unpacked as a ``(mileage, time)`` pair in every test.
    """

    def test_mileage_and_time(self) -> None:
        """Both halves of 'N miles or M months' are returned."""
        by_miles, by_time = maintenance_matcher.extract_combined_interval(
            "every 5,000 miles or 6 months, whichever comes first"
        )

        assert by_miles is not None
        assert by_miles.value == 5000

        assert by_time is not None
        assert by_time.value == 6

    def test_only_mileage(self) -> None:
        """With no time phrase the time slot is ``None``."""
        by_miles, by_time = maintenance_matcher.extract_combined_interval(
            "replace every 30,000 miles"
        )

        assert by_miles is not None
        assert by_miles.value == 30000
        assert by_time is None

    def test_only_time(self) -> None:
        """With no mileage phrase the mileage slot is ``None``."""
        by_miles, by_time = maintenance_matcher.extract_combined_interval(
            "inspect annually"
        )

        assert by_miles is None
        assert by_time is not None
        assert by_time.value == 12
|
||||
116
ocr/tests/test_service_mapping.py
Normal file
116
ocr/tests/test_service_mapping.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""Tests for service name mapping."""
|
||||
import pytest
|
||||
|
||||
from app.patterns.service_mapping import service_mapper
|
||||
|
||||
|
||||
class TestServiceMapping:
    """Checks for ``service_mapper.map_service`` (service text to subtypes)."""

    def test_engine_oil_mapping(self) -> None:
        """'engine oil' maps to the Engine Oil Change routine-maintenance entry."""
        mapping = service_mapper.map_service("engine oil")
        assert mapping is not None
        assert mapping.normalized_name == "Engine Oil Change"
        assert "Engine Oil" in mapping.subtypes
        assert mapping.category == "routine_maintenance"

    def test_oil_change_mapping(self) -> None:
        """'oil change' resolves to the Engine Oil subtype."""
        mapping = service_mapper.map_service("oil change")
        assert mapping is not None
        assert "Engine Oil" in mapping.subtypes

    def test_air_filter_mapping(self) -> None:
        """'engine air filter' maps to Air Filter Replacement."""
        mapping = service_mapper.map_service("engine air filter")
        assert mapping is not None
        assert mapping.normalized_name == "Air Filter Replacement"
        assert "Air Filter Element" in mapping.subtypes

    def test_cabin_filter_mapping(self) -> None:
        """'cabin air filter' resolves to the cabin filter subtype."""
        mapping = service_mapper.map_service("cabin air filter")
        assert mapping is not None
        assert "Cabin Air Filter / Purifier" in mapping.subtypes

    def test_tire_rotation_mapping(self) -> None:
        """'tire rotation' resolves to Tires with confidence of at least 0.95."""
        mapping = service_mapper.map_service("tire rotation")
        assert mapping is not None
        assert "Tires" in mapping.subtypes
        assert mapping.confidence >= 0.95

    def test_brake_inspection_mapping(self) -> None:
        """'brake inspection' resolves to the brakes subtype."""
        mapping = service_mapper.map_service("brake inspection")
        assert mapping is not None
        assert "Brakes and Traction Control" in mapping.subtypes

    def test_coolant_mapping(self) -> None:
        """'engine coolant' resolves to the Coolant subtype."""
        mapping = service_mapper.map_service("engine coolant")
        assert mapping is not None
        assert "Coolant" in mapping.subtypes

    def test_transmission_fluid_mapping(self) -> None:
        """'automatic transmission fluid' resolves to the A/T fluid subtype."""
        mapping = service_mapper.map_service("automatic transmission fluid")
        assert mapping is not None
        assert "Fluid - A/T" in mapping.subtypes

    def test_spark_plug_mapping(self) -> None:
        """'spark plugs' resolves to the Spark Plug subtype."""
        mapping = service_mapper.map_service("spark plugs")
        assert mapping is not None
        assert "Spark Plug" in mapping.subtypes

    def test_wiper_blade_mapping(self) -> None:
        """'wiper blades' resolves to the Wiper Blade subtype."""
        mapping = service_mapper.map_service("wiper blades")
        assert mapping is not None
        assert "Wiper Blade" in mapping.subtypes

    def test_unknown_service(self) -> None:
        """An unrecognised service name produces ``None``."""
        assert service_mapper.map_service("quantum flux capacitor") is None

    def test_case_insensitive(self) -> None:
        """Upper-case input resolves the same subtype as lower-case input."""
        mapping = service_mapper.map_service("ENGINE OIL")
        assert mapping is not None
        assert "Engine Oil" in mapping.subtypes

    def test_partial_match(self) -> None:
        """A service name embedded in a longer phrase is still resolved."""
        mapping = service_mapper.map_service("Replace engine oil and filter")
        assert mapping is not None
        assert "Engine Oil" in mapping.subtypes
|
||||
|
||||
|
||||
class TestFuzzyMapping:
    """Checks for ``service_mapper.map_service_fuzzy``."""

    def test_fuzzy_oil_change(self) -> None:
        """A reworded oil-change phrase still resolves to Engine Oil."""
        mapping = service_mapper.map_service_fuzzy("change the engine oil")
        assert mapping is not None
        assert "Engine Oil" in mapping.subtypes

    def test_fuzzy_low_threshold(self) -> None:
        """With a permissive threshold, a bare 'oil' still finds a match."""
        mapping = service_mapper.map_service_fuzzy("oil", threshold=0.3)
        # Expected to be a partial match against "engine oil".
        assert mapping is not None
|
||||
|
||||
|
||||
class TestKeywords:
    """Checks for ``service_mapper.get_all_service_keywords``."""

    def test_get_keywords(self) -> None:
        """The keyword collection is non-empty and contains the expected terms."""
        keywords = service_mapper.get_all_service_keywords()
        assert len(keywords) > 0
        # Checked in the same order as the original individual assertions.
        for expected in ("engine oil", "service", "maintenance"):
            assert expected in keywords
|
||||
122
ocr/tests/test_table_parser.py
Normal file
122
ocr/tests/test_table_parser.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Tests for table parsing."""
|
||||
import pytest
|
||||
|
||||
from app.table_extraction.parser import table_parser
|
||||
|
||||
|
||||
class TestTableParsing:
    """Checks for maintenance table parsing through ``table_parser``."""

    def test_parse_simple_table(self) -> None:
        """A Service/Miles/Months table yields one result per data row."""
        header = ["Service", "Miles", "Months"]
        data = [
            ["Engine Oil", "5,000", "6"],
            ["Air Filter", "30,000", "24"],
            ["Cabin Filter", "15,000", "12"],
        ]

        results = table_parser.parse_table(header, data)

        assert len(results) == 3

        # Look up the oil row with a default so that a missing row fails as a
        # readable assertion instead of a bare StopIteration from next().
        oil = next((r for r in results if "oil" in r.service.lower()), None)
        assert oil is not None, "no parsed row mentions oil"
        assert oil.interval_miles == 5000
        assert oil.interval_months == 6

    def test_parse_table_with_notes(self) -> None:
        """A table with a combined interval column and a notes column parses."""
        header = ["Item", "Interval", "Notes"]
        data = [
            ["Engine Oil", "5,000 miles or 6 months", "Use 0W-20"],
            ["Brake Fluid", "30,000 miles", "DOT 4"],
        ]

        results = table_parser.parse_table(header, data)

        assert len(results) == 2

    def test_parse_without_headers(self) -> None:
        """Rows passed without a header row still yield at least one result."""
        data = [
            ["Engine oil change", "5,000 miles", "6 months"],
            ["Tire rotation", "7,500 miles", ""],
        ]

        results = table_parser._parse_without_headers(data)

        assert len(results) >= 1

    def test_parse_text_block(self) -> None:
        """Free-form 'service: interval' lines yield at least two results."""
        # Same value as the original triple-quoted literal as shown: a leading
        # newline, three lines, and a trailing newline.
        text = (
            "\n"
            "Engine oil: replace every 5,000 miles or 6 months\n"
            "Air filter: replace every 30,000 miles\n"
            "Tire rotation: every 7,500 miles\n"
        )

        results = table_parser.parse_text_block(text)

        assert len(results) >= 2
|
||||
|
||||
|
||||
class TestColumnIdentification:
    """Checks for ``table_parser._identify_columns`` (column index to role)."""

    def test_identify_service_column(self) -> None:
        """Service Item / Miles / Months headers map to their three roles."""
        roles = table_parser._identify_columns(["Service Item", "Miles", "Months"])

        assert roles.get(0) == "service"
        assert roles.get(1) == "miles"
        assert roles.get(2) == "months"

    def test_identify_maintenance_column(self) -> None:
        """A 'Maintenance' header is treated as the service column."""
        roles = table_parser._identify_columns(["Maintenance", "Interval", "Notes"])

        assert roles.get(0) == "service"

    def test_identify_details_column(self) -> None:
        """A 'Notes' header is treated as the details column."""
        roles = table_parser._identify_columns(["Item", "Miles", "Notes"])

        assert roles.get(2) == "details"
|
||||
|
||||
|
||||
class TestIntervalExtraction:
    """Checks for the cell-level helpers ``_extract_miles`` / ``_extract_months``."""

    def test_extract_miles_with_comma(self) -> None:
        """A comma-separated number parses to an integer mileage."""
        assert table_parser._extract_miles("5,000") == 5000

    def test_extract_miles_without_comma(self) -> None:
        """A plain number parses to an integer mileage."""
        assert table_parser._extract_miles("5000") == 5000

    def test_extract_miles_with_unit(self) -> None:
        """A trailing 'miles' unit does not affect the parsed value."""
        assert table_parser._extract_miles("5,000 miles") == 5000

    def test_extract_miles_k_notation(self) -> None:
        """'5K' is expected to expand to 5000."""
        assert table_parser._extract_miles("5K") == 5000

    def test_extract_months(self) -> None:
        """A bare number parses to an integer month count."""
        assert table_parser._extract_months("6") == 6

    def test_extract_months_with_unit(self) -> None:
        """A trailing 'months' unit does not affect the parsed value."""
        assert table_parser._extract_months("12 months") == 12
|
||||
Reference in New Issue
Block a user