Files
motovaultpro/ocr/app/patterns/maintenance_patterns.py
Eric Gullickson 3eb54211cb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add owner's manual OCR pipeline (refs #71)
Implement async PDF processing for owner's manuals with maintenance
schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:
1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00

336 lines
9.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Maintenance schedule pattern matching for owner's manual extraction."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class MileageIntervalMatch:
    """A distance-based service interval located in manual text.

    Fields are declared in constructor order: value, raw_match,
    confidence, pattern_name.
    """

    value: int  # Interval length in miles
    raw_match: str  # Text span the pattern matched
    confidence: float  # Score associated with the pattern that matched
    pattern_name: str  # Identifier of the pattern that matched
@dataclass
class TimeIntervalMatch:
    """A time-based service interval located in manual text.

    Fields are declared in constructor order: value, raw_match,
    confidence, pattern_name.
    """

    value: int  # Interval length in months
    raw_match: str  # Text span the pattern matched
    confidence: float  # Score associated with the pattern that matched
    pattern_name: str  # Identifier of the pattern that matched
@dataclass
class FluidSpecMatch:
    """A fluid specification located in manual text.

    Fields are declared in constructor order: value, fluid_type,
    raw_match, confidence.
    """

    value: str  # The specification itself, e.g. "0W-20", "ATF-Z1", "DOT 4"
    fluid_type: str  # Fluid category, e.g. "oil", "transmission", "brake"
    raw_match: str  # Text span the pattern matched
    confidence: float  # Score associated with the pattern that matched
class MaintenancePatternMatcher:
    """Extract maintenance-specific data from owner's manual text.

    Each pattern table is a list of ``(regex, label, confidence)`` tuples.
    Tables are scanned in list order and the first pattern that yields an
    acceptable match wins, so the position of an entry in its table is its
    priority.
    """

    # Mileage interval patterns: (regex, pattern_name, confidence).
    # The number group starts with a digit so that a lone comma
    # ("kilometers, miles") can never be captured and fed to int().
    MILEAGE_PATTERNS = [
        # "every 5,000 miles" or "every 5000 miles"
        (
            r"every\s+(\d[\d,]*)\s*(?:miles?|mi\.?)",
            "every_miles",
            0.95,
        ),
        # "at 30,000 mi" or "at 30000 miles"
        (
            r"at\s+(\d[\d,]*)\s*(?:miles?|mi\.?)",
            "at_miles",
            0.93,
        ),
        # "every 5,000-7,500 miles" (range - take lower). Must run before
        # "miles_or": otherwise "every 5,000-7,500 miles or 6 months" is
        # claimed by "miles_or", which reports the upper bound. Hyphen,
        # en dash and em dash are all accepted as the range separator.
        (
            r"every\s+(\d[\d,]*)\s*[-\u2013\u2014]\s*\d[\d,]*"
            r"\s*(?:miles?|mi\.?)",
            "miles_range",
            0.88,
        ),
        # "5,000 miles or" (interval before "or" or "/")
        (
            r"(\d[\d,]*)\s*(?:miles?|mi\.?)\s*(?:or|/)",
            "miles_or",
            0.90,
        ),
        # "7,500 mi/12 months" (interval with slash).
        # NOTE: every text this matches is already matched by "miles_or"
        # above, so this entry is never the one reported.
        (
            r"(\d[\d,]*)\s*(?:miles?|mi\.?)\s*/",
            "miles_slash",
            0.87,
        ),
        # Standalone "X,XXX miles" in table context
        (
            r"(?<![0-9])(\d[\d,]*)\s*(?:miles?|mi\.?)(?![a-z])",
            "standalone_miles",
            0.75,
        ),
    ]

    # Time interval patterns: (regex, pattern_name, confidence).
    TIME_PATTERNS = [
        # "every 6 months"
        (
            r"every\s+(\d+)\s*months?",
            "every_months",
            0.95,
        ),
        # "6 months or" (interval before "or" or "/")
        (
            r"(\d+)\s*months?\s*(?:or|/)",
            "months_or",
            0.90,
        ),
        # "annually" -> 12 months. The lookbehind stops this pattern from
        # claiming the tail of "semi-annually" (the hyphen is a word
        # boundary), which must resolve to 6 months via "semi_annual".
        (
            r"(?<!semi-)\bannually\b",
            "annually",
            0.95,
        ),
        # "semi-annually" or "semi-annual" -> 6 months
        (
            r"\bsemi-?annual(?:ly)?\b",
            "semi_annual",
            0.95,
        ),
        # "every year" -> 12 months
        (
            r"every\s+year",
            "every_year",
            0.93,
        ),
        # "every 2 years" -> 24 months
        (
            r"every\s+(\d+)\s*years?",
            "every_years",
            0.93,
        ),
        # "12 mo/7,500 mi" or "12 months/"
        (
            r"(\d+)\s*(?:mo(?:nths?)?\.?)\s*/",
            "months_slash",
            0.87,
        ),
        # Standalone "X months" in table context
        (
            r"(?<![0-9])(\d+)\s*months?(?![a-z])",
            "standalone_months",
            0.75,
        ),
    ]

    # Time patterns that carry no number; the label alone fixes the months.
    _FIXED_MONTHS = {
        "annually": 12,
        "semi_annual": 6,
        "every_year": 12,
    }

    # Fluid specification patterns: (regex, fluid_type, confidence).
    FLUID_PATTERNS = [
        # Oil viscosity: 0W-20, 5W-30, 10W-40
        (
            r"\b(\d+W-\d+)\b",
            "oil",
            0.95,
        ),
        # Full synthetic variants. Because the bare viscosity pattern comes
        # first, extract_fluid_spec reports the bare grade for such text;
        # extract_all_fluid_specs reports both.
        (
            r"(full\s+synthetic\s+\d+W-\d+)",
            "oil",
            0.93,
        ),
        # Transmission fluid: ATF-Z1, ATF+4, Dexron VI
        # ("+" is an accepted separator so that "ATF+4" matches).
        (
            r"\b(ATF[-+ ]?\w+)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Dexron\s*(?:VI|IV|III)?)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Mercon\s*(?:V|LV|SP)?)\b",
            "transmission",
            0.90,
        ),
        # Brake fluid: DOT 3, DOT 4, DOT 5.1
        (
            r"\b(DOT\s*\d(?:\.\d)?)\b",
            "brake",
            0.95,
        ),
        # Coolant types
        (
            r"\b((?:Type\s+)?(?:2|II)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.88,
        ),
        (
            r"\b((?:50/50|pre-mixed)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.85,
        ),
        # Power steering fluid
        (
            r"\b(power\s+steering\s+fluid)\b",
            "power_steering",
            0.90,
        ),
    ]

    def extract_mileage_interval(self, text: str) -> Optional[MileageIntervalMatch]:
        """
        Extract mileage interval from text.

        Patterns are tried in MILEAGE_PATTERNS order; within one pattern
        every occurrence is examined, so an implausible first hit does not
        hide a plausible later one.

        Args:
            text: Text to search for mileage intervals

        Returns:
            MileageIntervalMatch or None if no interval found
        """
        if not text:
            return None

        # Matching runs on the lower-cased text, so raw_match is lower case.
        text_lower = text.lower()

        for pattern, name, confidence in self.MILEAGE_PATTERNS:
            for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                # Drop thousands separators before converting.
                mileage = int(match.group(1).replace(",", ""))
                if self._is_reasonable_mileage(mileage):
                    return MileageIntervalMatch(
                        value=mileage,
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )

        return None

    def extract_time_interval(self, text: str) -> Optional[TimeIntervalMatch]:
        """
        Extract time interval from text.

        Patterns are tried in TIME_PATTERNS order; within one pattern every
        occurrence is examined until one yields a plausible month count.

        Args:
            text: Text to search for time intervals

        Returns:
            TimeIntervalMatch (value in months) or None if no interval found
        """
        if not text:
            return None

        # Matching runs on the lower-cased text, so raw_match is lower case.
        text_lower = text.lower()

        for pattern, name, confidence in self.TIME_PATTERNS:
            for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                if name in self._FIXED_MONTHS:
                    months = self._FIXED_MONTHS[name]
                elif name == "every_years":
                    months = int(match.group(1)) * 12
                else:
                    months = int(match.group(1))

                if self._is_reasonable_months(months):
                    return TimeIntervalMatch(
                        value=months,
                        raw_match=match.group(0),
                        confidence=confidence,
                        pattern_name=name,
                    )

        return None

    @staticmethod
    def _fluid_value(match: "re.Match[str]", fluid_type: str) -> str:
        """Return the captured spec, stripped; upper-cased unless coolant."""
        # Patterns ending in an optional suffix after \s* (Dexron, Mercon)
        # can capture a trailing space; strip it so values compare cleanly.
        captured = match.group(1).strip()
        return captured if fluid_type == "coolant" else captured.upper()

    def extract_fluid_spec(self, text: str) -> Optional[FluidSpecMatch]:
        """
        Extract fluid specification from text.

        Returns the first occurrence of the first pattern in FLUID_PATTERNS
        that matches.

        Args:
            text: Text to search for fluid specs

        Returns:
            FluidSpecMatch or None if no spec found
        """
        if not text:
            return None

        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return FluidSpecMatch(
                    value=self._fluid_value(match, fluid_type),
                    fluid_type=fluid_type,
                    raw_match=match.group(0),
                    confidence=confidence,
                )

        return None

    def extract_all_fluid_specs(self, text: str) -> list[FluidSpecMatch]:
        """
        Extract all fluid specifications from text.

        Results are ordered by pattern, then by position in the text, and
        de-duplicated on the normalized value.

        Args:
            text: Text to search for fluid specs

        Returns:
            List of FluidSpecMatch objects
        """
        results: list[FluidSpecMatch] = []
        if not text:
            return results

        seen_values: set[str] = set()

        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                value = self._fluid_value(match, fluid_type)
                if value in seen_values:
                    continue
                seen_values.add(value)
                results.append(
                    FluidSpecMatch(
                        value=value,
                        fluid_type=fluid_type,
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                )

        return results

    def extract_combined_interval(
        self, text: str
    ) -> tuple[Optional[MileageIntervalMatch], Optional[TimeIntervalMatch]]:
        """
        Extract both mileage and time intervals from a combined pattern.

        Many schedules use patterns like "every 5,000 miles or 6 months".
        The two extractions run independently on the same text.

        Args:
            text: Text to search

        Returns:
            Tuple of (mileage_match, time_match)
        """
        mileage_match = self.extract_mileage_interval(text)
        time_match = self.extract_time_interval(text)
        return mileage_match, time_match

    def _is_reasonable_mileage(self, mileage: int) -> bool:
        """Check if mileage interval is reasonable for maintenance."""
        # Accepted range: 500 to 150,000 miles, inclusive.
        return 500 <= mileage <= 150000

    def _is_reasonable_months(self, months: int) -> bool:
        """Check if month interval is reasonable for maintenance."""
        # Accepted range: 1 to 120 months (10 years), inclusive.
        return 1 <= months <= 120
# Shared module-level instance, created once when the module is imported.
# The class itself does not prevent callers from creating further instances.
maintenance_matcher = MaintenancePatternMatcher()