All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement async PDF processing for owner's manuals with maintenance schedule extraction: - Add PDF preprocessor with PyMuPDF for text/scanned PDF handling - Add maintenance pattern matching (mileage, time, fluid specs) - Add service name mapping to maintenance subtypes - Add table detection and parsing for schedule tables - Add manual extractor orchestrating the complete pipeline - Add POST /extract/manual endpoint for async job submission - Add Redis job queue support for manual extraction jobs - Add progress tracking during processing Processing pipeline: 1. Analyze PDF structure (text layer vs scanned) 2. Find maintenance schedule sections 3. Extract text or OCR scanned pages at 300 DPI 4. Detect and parse maintenance tables 5. Normalize service names and extract intervals 6. Return structured maintenance schedules with confidence scores Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
336 lines
9.3 KiB
Python
336 lines
9.3 KiB
Python
"""Maintenance schedule pattern matching for owner's manual extraction."""
|
||
import re
|
||
from dataclasses import dataclass
|
||
from typing import Optional
|
||
|
||
|
||
@dataclass
class MileageIntervalMatch:
    """A mileage-based service interval found in manual text.

    Attributes:
        value: Interval length in miles.
        raw_match: The exact span of text the regex matched.
        confidence: Score attached to the pattern that produced the match.
        pattern_name: Label of the pattern that produced the match.
    """

    value: int
    raw_match: str
    confidence: float
    pattern_name: str
|
||
|
||
|
||
@dataclass
class TimeIntervalMatch:
    """A time-based service interval found in manual text.

    Attributes:
        value: Interval length in months.
        raw_match: The exact span of text the regex matched.
        confidence: Score attached to the pattern that produced the match.
        pattern_name: Label of the pattern that produced the match.
    """

    value: int
    raw_match: str
    confidence: float
    pattern_name: str
|
||
|
||
|
||
@dataclass
class FluidSpecMatch:
    """A fluid specification found in manual text.

    Attributes:
        value: The specification itself, e.g. "0W-20", "ATF-Z1", "DOT 4".
        fluid_type: Category of fluid, e.g. "oil", "transmission", "brake".
        raw_match: The exact span of text the regex matched.
        confidence: Score attached to the pattern that produced the match.
    """

    value: str
    fluid_type: str
    raw_match: str
    confidence: float
|
||
|
||
|
||
class MaintenancePatternMatcher:
    """Extract maintenance-specific data from owner's manual text.

    Each ``*_PATTERNS`` table is an ordered list of
    ``(regex, label, confidence)`` tuples. The ``extract_*`` methods try the
    entries top to bottom, so list position (not the confidence number)
    decides which pattern wins when several could match the same text.
    """

    # Mileage interval patterns: (regex, pattern_name, confidence)
    MILEAGE_PATTERNS = [
        # "every 5,000 miles" or "every 5000 miles"
        (
            r"every\s+([\d,]+)\s*(?:miles?|mi\.?)",
            "every_miles",
            0.95,
        ),
        # "at 30,000 mi" or "at 30000 miles"
        (
            r"at\s+([\d,]+)\s*(?:miles?|mi\.?)",
            "at_miles",
            0.93,
        ),
        # "every 5,000-7,500 miles" (range - take lower).
        # Must be tried before "miles_or": otherwise
        # "every 5,000-7,500 miles or 6 months" is claimed by "miles_or",
        # which captures the upper bound (7,500).
        (
            r"every\s+([\d,]+)\s*[-–]\s*[\d,]+\s*(?:miles?|mi\.?)",
            "miles_range",
            0.88,
        ),
        # "5,000 miles or" (interval before "or")
        (
            r"([\d,]+)\s*(?:miles?|mi\.?)\s*(?:or|/)",
            "miles_or",
            0.90,
        ),
        # "7,500 mi/12 months" (interval with slash).
        # "miles_or" above also accepts "/", so it sees these first.
        (
            r"([\d,]+)\s*(?:miles?|mi\.?)\s*/",
            "miles_slash",
            0.87,
        ),
        # Standalone "X,XXX miles" in table context
        (
            r"(?<![0-9])([\d,]+)\s*(?:miles?|mi\.?)(?![a-z])",
            "standalone_miles",
            0.75,
        ),
    ]

    # Time interval patterns: (regex, pattern_name, confidence)
    TIME_PATTERNS = [
        # "every 6 months"
        (
            r"every\s+(\d+)\s*months?",
            "every_months",
            0.95,
        ),
        # "6 months or" (interval before "or")
        (
            r"(\d+)\s*months?\s*(?:or|/)",
            "months_or",
            0.90,
        ),
        # "semi-annually", "semi annually" or "semi-annual" -> 6 months.
        # Must be tried before "annually": the hyphen is a word boundary, so
        # r"\bannually\b" also matches inside "semi-annually" and would
        # report 12 months.
        (
            r"\bsemi[-\s]?annual(?:ly)?\b",
            "semi_annual",
            0.95,
        ),
        # "annually" -> 12 months
        (
            r"\bannually\b",
            "annually",
            0.95,
        ),
        # "every year" -> 12 months
        (
            r"every\s+year",
            "every_year",
            0.93,
        ),
        # "every 2 years" -> 24 months
        (
            r"every\s+(\d+)\s*years?",
            "every_years",
            0.93,
        ),
        # "12 mo/7,500 mi" or "12 months/"
        (
            r"(\d+)\s*(?:mo(?:nths?)?\.?)\s*/",
            "months_slash",
            0.87,
        ),
        # Standalone "X months" in table context
        (
            r"(?<![0-9])(\d+)\s*months?(?![a-z])",
            "standalone_months",
            0.75,
        ),
    ]

    # Patterns whose month count is implied by the wording, not captured.
    _FIXED_MONTHS = {
        "annually": 12,
        "semi_annual": 6,
        "every_year": 12,
    }

    # Fluid specification patterns: (regex, fluid_type, confidence)
    FLUID_PATTERNS = [
        # Oil viscosity: 0W-20, 5W-30, 10W-40
        (
            r"\b(\d+W-\d+)\b",
            "oil",
            0.95,
        ),
        # Full synthetic variants
        (
            r"(full\s+synthetic\s+\d+W-\d+)",
            "oil",
            0.93,
        ),
        # Transmission fluid: ATF-Z1, ATF+4, Dexron VI
        # "+" is allowed as a separator so that "ATF+4" is recognised.
        (
            r"\b(ATF[-+ ]?\w+)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Dexron\s*(?:VI|IV|III)?)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Mercon\s*(?:V|LV|SP)?)\b",
            "transmission",
            0.90,
        ),
        # Brake fluid: DOT 3, DOT 4, DOT 5.1
        (
            r"\b(DOT\s*\d(?:\.\d)?)\b",
            "brake",
            0.95,
        ),
        # Coolant types
        (
            r"\b((?:Type\s+)?(?:2|II)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.88,
        ),
        (
            r"\b((?:50/50|pre-mixed)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.85,
        ),
        # Power steering fluid
        (
            r"\b(power\s+steering\s+fluid)\b",
            "power_steering",
            0.90,
        ),
    ]

    def extract_mileage_interval(self, text: str) -> Optional[MileageIntervalMatch]:
        """
        Extract mileage interval from text.

        Patterns are tried in ``MILEAGE_PATTERNS`` order. Within a pattern,
        every occurrence is examined, so an unusable early hit (a bare comma,
        or a number outside the accepted range) does not hide a valid later one.

        Args:
            text: Text to search for mileage intervals

        Returns:
            MileageIntervalMatch or None if no interval found
        """
        if not text:
            return None

        # Matching runs on the lowercased text, so raw_match is lowercase too.
        text_lower = text.lower()

        for pattern, name, confidence in self.MILEAGE_PATTERNS:
            for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                mileage = self._parse_count(match.group(1))
                if mileage is None or not self._is_reasonable_mileage(mileage):
                    continue

                return MileageIntervalMatch(
                    value=mileage,
                    raw_match=match.group(0),
                    confidence=confidence,
                    pattern_name=name,
                )

        return None

    def extract_time_interval(self, text: str) -> Optional[TimeIntervalMatch]:
        """
        Extract time interval from text.

        The result is always expressed in months: "annually" / "every year"
        become 12, the semi-annual wording becomes 6, and "every N years"
        becomes N * 12.

        Args:
            text: Text to search for time intervals

        Returns:
            TimeIntervalMatch or None if no interval found
        """
        if not text:
            return None

        # Matching runs on the lowercased text, so raw_match is lowercase too.
        text_lower = text.lower()

        for pattern, name, confidence in self.TIME_PATTERNS:
            for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                if name in self._FIXED_MONTHS:
                    months = self._FIXED_MONTHS[name]
                else:
                    count = self._parse_count(match.group(1))
                    if count is None:
                        continue
                    months = count * 12 if name == "every_years" else count

                if not self._is_reasonable_months(months):
                    continue

                return TimeIntervalMatch(
                    value=months,
                    raw_match=match.group(0),
                    confidence=confidence,
                    pattern_name=name,
                )

        return None

    def extract_fluid_spec(self, text: str) -> Optional[FluidSpecMatch]:
        """
        Extract fluid specification from text.

        Returns the first hit of the first pattern in ``FLUID_PATTERNS`` that
        matches anywhere in the text.

        Args:
            text: Text to search for fluid specs

        Returns:
            FluidSpecMatch or None if no spec found
        """
        if not text:
            return None

        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return FluidSpecMatch(
                    value=self._normalize_fluid_value(match.group(1), fluid_type),
                    fluid_type=fluid_type,
                    raw_match=match.group(0),
                    confidence=confidence,
                )

        return None

    def extract_all_fluid_specs(self, text: str) -> list[FluidSpecMatch]:
        """
        Extract all fluid specifications from text.

        Results are de-duplicated on their normalised value; the first
        occurrence (in pattern order, then text order) is kept.

        Args:
            text: Text to search for fluid specs

        Returns:
            List of FluidSpecMatch objects
        """
        results: list[FluidSpecMatch] = []
        if not text:
            return results

        seen_values: set[str] = set()

        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                value = self._normalize_fluid_value(match.group(1), fluid_type)
                if value in seen_values:
                    continue
                seen_values.add(value)
                results.append(
                    FluidSpecMatch(
                        value=value,
                        fluid_type=fluid_type,
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                )

        return results

    def extract_combined_interval(
        self, text: str
    ) -> tuple[Optional[MileageIntervalMatch], Optional[TimeIntervalMatch]]:
        """
        Extract both mileage and time intervals from a combined pattern.

        Many schedules use patterns like "every 5,000 miles or 6 months".
        The two searches are independent; either element may be None.

        Args:
            text: Text to search

        Returns:
            Tuple of (mileage_match, time_match)
        """
        mileage = self.extract_mileage_interval(text)
        months = self.extract_time_interval(text)
        return mileage, months

    @staticmethod
    def _parse_count(raw: str) -> Optional[int]:
        """Turn a captured number such as "5,000" into an int.

        Returns None when the capture has no usable digits. ``[\\d,]+`` can
        match a lone "," (e.g. in "tires, miles"), and ``int`` rejects both
        the resulting empty string and over-long digit strings.
        """
        try:
            return int(raw.replace(",", ""))
        except ValueError:
            return None

    @staticmethod
    def _normalize_fluid_value(raw: str, fluid_type: str) -> str:
        """Normalise a captured fluid spec for reporting and de-duplication.

        Surrounding whitespace is dropped (the Dexron/Mercon patterns can
        capture a trailing space). Every type except "coolant" is uppercased.
        """
        value = raw.strip()
        return value if fluid_type == "coolant" else value.upper()

    def _is_reasonable_mileage(self, mileage: int) -> bool:
        """Check if mileage interval is reasonable for maintenance."""
        # Accepted window is 500 to 150,000 miles inclusive.
        return 500 <= mileage <= 150000

    def _is_reasonable_months(self, months: int) -> bool:
        """Check if month interval is reasonable for maintenance."""
        # Accepted window is 1 to 120 months (10 years) inclusive.
        return 1 <= months <= 120
|
||
|
||
|
||
# Shared module-level instance, created once when this module is imported.
maintenance_matcher = MaintenancePatternMatcher()
|