feat: add owner's manual OCR pipeline (refs #71)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement async PDF processing for owner's manuals with maintenance schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:

1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
"""Pattern matching modules for receipt field extraction."""
|
||||
"""Pattern matching modules for receipt and manual field extraction."""
|
||||
from app.patterns.date_patterns import DatePatternMatcher, date_matcher
|
||||
from app.patterns.currency_patterns import CurrencyPatternMatcher, currency_matcher
|
||||
from app.patterns.fuel_patterns import FuelPatternMatcher, fuel_matcher
|
||||
from app.patterns.maintenance_patterns import MaintenancePatternMatcher, maintenance_matcher
|
||||
from app.patterns.service_mapping import ServiceMapper, service_mapper
|
||||
|
||||
__all__ = [
|
||||
"DatePatternMatcher",
|
||||
@@ -10,4 +12,8 @@ __all__ = [
|
||||
"currency_matcher",
|
||||
"FuelPatternMatcher",
|
||||
"fuel_matcher",
|
||||
"MaintenancePatternMatcher",
|
||||
"maintenance_matcher",
|
||||
"ServiceMapper",
|
||||
"service_mapper",
|
||||
]
|
||||
|
||||
335
ocr/app/patterns/maintenance_patterns.py
Normal file
335
ocr/app/patterns/maintenance_patterns.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Maintenance schedule pattern matching for owner's manual extraction."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class MileageIntervalMatch:
    """Result of mileage interval pattern matching."""

    value: int  # Interval in miles (thousands separators already removed)
    raw_match: str  # Substring of the searched text that the pattern matched
    confidence: float  # Confidence score attached to the matching pattern
    pattern_name: str  # Identifier of the pattern that produced the match
|
||||
|
||||
|
||||
@dataclass
class TimeIntervalMatch:
    """Result of time interval pattern matching."""

    value: int  # Interval in months
    raw_match: str  # Substring of the searched text that the pattern matched
    confidence: float  # Confidence score attached to the matching pattern
    pattern_name: str  # Identifier of the pattern that produced the match
|
||||
|
||||
|
||||
@dataclass
class FluidSpecMatch:
    """Result of fluid specification pattern matching."""

    value: str  # Specification text, e.g. "0W-20", "ATF-Z1", "DOT 4"
    fluid_type: str  # Fluid family, e.g. "oil", "transmission", "brake"
    raw_match: str  # Substring of the searched text that the pattern matched
    confidence: float  # Confidence score attached to the matching pattern
|
||||
|
||||
|
||||
class MaintenancePatternMatcher:
    """Extract maintenance-specific data from owner's manual text.

    Each pattern table is an ordered list of ``(regex, label, confidence)``
    tuples. Tables are scanned top to bottom and the first pattern that
    yields an acceptable value wins, so more specific patterns must precede
    the generic ones that would otherwise shadow them.
    """

    # Mileage interval patterns: (regex, pattern_name, confidence).
    # The numeric group is ``\d[\d,]*`` so it always contains at least one
    # digit; a bare "," (e.g. "tires, miles driven") can no longer be captured
    # and crash the int() conversion.
    MILEAGE_PATTERNS = [
        # "every 5,000 miles" or "every 5000 miles"
        (
            r"every\s+(\d[\d,]*)\s*(?:miles?|mi\.?)",
            "every_miles",
            0.95,
        ),
        # "every 5,000-7,500 miles" (range - take lower).
        # Must come before "miles_or"/"miles_slash"/"standalone_miles", which
        # would otherwise latch onto the upper bound of the range.
        (
            r"every\s+(\d[\d,]*)\s*[-–]\s*\d[\d,]*\s*(?:miles?|mi\.?)",
            "miles_range",
            0.88,
        ),
        # "at 30,000 mi" or "at 30000 miles"
        (
            r"at\s+(\d[\d,]*)\s*(?:miles?|mi\.?)",
            "at_miles",
            0.93,
        ),
        # "5,000 miles or" (interval before "or")
        (
            r"(\d[\d,]*)\s*(?:miles?|mi\.?)\s*(?:or|/)",
            "miles_or",
            0.90,
        ),
        # "7,500 mi/12 months" (interval with slash).
        # NOTE: "miles_or" above also accepts "/", so this entry only acts as
        # a fallback; it is kept so the set of pattern names stays stable.
        (
            r"(\d[\d,]*)\s*(?:miles?|mi\.?)\s*/",
            "miles_slash",
            0.87,
        ),
        # Standalone "X,XXX miles" in table context
        (
            r"(?<![0-9])(\d[\d,]*)\s*(?:miles?|mi\.?)(?![a-z])",
            "standalone_miles",
            0.75,
        ),
    ]

    # Time interval patterns: (regex, pattern_name, confidence).
    TIME_PATTERNS = [
        # "every 6 months"
        (
            r"every\s+(\d+)\s*months?",
            "every_months",
            0.95,
        ),
        # "6 months or" (interval before "or")
        (
            r"(\d+)\s*months?\s*(?:or|/)",
            "months_or",
            0.90,
        ),
        # "semi-annually" or "semi-annual" -> 6 months.
        # Listed before "annually": in "semi-annually" the hyphen creates a
        # word boundary, so r"\bannually\b" would match and report 12 months.
        (
            r"\bsemi-?annual(?:ly)?\b",
            "semi_annual",
            0.95,
        ),
        # "annually" -> 12 months
        (
            r"\bannually\b",
            "annually",
            0.95,
        ),
        # "every year" -> 12 months
        (
            r"every\s+year",
            "every_year",
            0.93,
        ),
        # "every 2 years" -> 24 months
        (
            r"every\s+(\d+)\s*years?",
            "every_years",
            0.93,
        ),
        # "12 mo/7,500 mi" or "12 months/"
        (
            r"(\d+)\s*(?:mo(?:nths?)?\.?)\s*/",
            "months_slash",
            0.87,
        ),
        # Standalone "X months" in table context
        (
            r"(?<![0-9])(\d+)\s*months?(?![a-z])",
            "standalone_months",
            0.75,
        ),
    ]

    # Fluid specification patterns: (regex, fluid_type, confidence).
    FLUID_PATTERNS = [
        # Oil viscosity: 0W-20, 5W-30, 10W-40
        (
            r"\b(\d+W-\d+)\b",
            "oil",
            0.95,
        ),
        # Full synthetic variants
        (
            r"(full\s+synthetic\s+\d+W-\d+)",
            "oil",
            0.93,
        ),
        # Transmission fluid: ATF-Z1, ATF+4, Dexron VI
        (
            r"\b(ATF[- ]?\w+)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Dexron\s*(?:VI|IV|III)?)\b",
            "transmission",
            0.90,
        ),
        (
            r"\b(Mercon\s*(?:V|LV|SP)?)\b",
            "transmission",
            0.90,
        ),
        # Brake fluid: DOT 3, DOT 4, DOT 5.1
        (
            r"\b(DOT\s*\d(?:\.\d)?)\b",
            "brake",
            0.95,
        ),
        # Coolant types
        (
            r"\b((?:Type\s+)?(?:2|II)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.88,
        ),
        (
            r"\b((?:50/50|pre-mixed)\s+(?:coolant|antifreeze))\b",
            "coolant",
            0.85,
        ),
        # Power steering fluid
        (
            r"\b(power\s+steering\s+fluid)\b",
            "power_steering",
            0.90,
        ),
    ]

    # Month counts for time patterns that have no numeric capture group.
    _FIXED_MONTHS = {"annually": 12, "semi_annual": 6, "every_year": 12}

    def extract_mileage_interval(self, text: str) -> "Optional[MileageIntervalMatch]":
        """
        Extract mileage interval from text.

        Args:
            text: Text to search for mileage intervals

        Returns:
            MileageIntervalMatch or None if no interval found. ``raw_match``
            is taken from the lowercased text.
        """
        found = self._find_mileage(text)
        if found is None:
            return None

        mileage, raw, confidence, name = found
        return MileageIntervalMatch(
            value=mileage,
            raw_match=raw,
            confidence=confidence,
            pattern_name=name,
        )

    def _find_mileage(self, text: str) -> Optional[tuple[int, str, float, str]]:
        """Return (miles, raw_match, confidence, pattern_name) for the first plausible hit."""
        lowered = text.lower()

        for pattern, name, confidence in self.MILEAGE_PATTERNS:
            # Walk every occurrence so an implausible first hit ("at 50 miles")
            # does not hide a plausible later one ("at 30,000 miles").
            for match in re.finditer(pattern, lowered):
                mileage = int(match.group(1).replace(",", ""))
                if self._is_reasonable_mileage(mileage):
                    return mileage, match.group(0), confidence, name

        return None

    def extract_time_interval(self, text: str) -> "Optional[TimeIntervalMatch]":
        """
        Extract time interval from text.

        Args:
            text: Text to search for time intervals

        Returns:
            TimeIntervalMatch (value in months) or None if no interval found.
            ``raw_match`` is taken from the lowercased text.
        """
        found = self._find_months(text)
        if found is None:
            return None

        months, raw, confidence, name = found
        return TimeIntervalMatch(
            value=months,
            raw_match=raw,
            confidence=confidence,
            pattern_name=name,
        )

    def _find_months(self, text: str) -> Optional[tuple[int, str, float, str]]:
        """Return (months, raw_match, confidence, pattern_name) for the first plausible hit."""
        lowered = text.lower()

        for pattern, name, confidence in self.TIME_PATTERNS:
            for match in re.finditer(pattern, lowered):
                if name in self._FIXED_MONTHS:
                    # Word-only patterns ("annually", "every year", ...)
                    months = self._FIXED_MONTHS[name]
                elif name == "every_years":
                    months = int(match.group(1)) * 12
                else:
                    months = int(match.group(1))

                if self._is_reasonable_months(months):
                    return months, match.group(0), confidence, name

        return None

    @staticmethod
    def _fluid_value(match: "re.Match[str]", fluid_type: str) -> str:
        """Normalize a captured spec: upper-case everything except coolant descriptions."""
        captured = match.group(1)
        return captured if fluid_type == "coolant" else captured.upper()

    def extract_fluid_spec(self, text: str) -> "Optional[FluidSpecMatch]":
        """
        Extract fluid specification from text.

        Patterns are tried in FLUID_PATTERNS order and the first one that
        matches anywhere in the text is returned.

        Args:
            text: Text to search for fluid specs

        Returns:
            FluidSpecMatch or None if no spec found
        """
        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return FluidSpecMatch(
                    value=self._fluid_value(match, fluid_type),
                    fluid_type=fluid_type,
                    raw_match=match.group(0),
                    confidence=confidence,
                )

        return None

    def extract_all_fluid_specs(self, text: str) -> "list[FluidSpecMatch]":
        """
        Extract all fluid specifications from text.

        Results are ordered by pattern (FLUID_PATTERNS order), then by position
        in the text; a normalized value is reported only once.

        Args:
            text: Text to search for fluid specs

        Returns:
            List of FluidSpecMatch objects
        """
        results = []
        seen_values: set[str] = set()

        for pattern, fluid_type, confidence in self.FLUID_PATTERNS:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                value = self._fluid_value(match, fluid_type)
                if value in seen_values:
                    continue
                seen_values.add(value)
                results.append(
                    FluidSpecMatch(
                        value=value,
                        fluid_type=fluid_type,
                        raw_match=match.group(0),
                        confidence=confidence,
                    )
                )

        return results

    def extract_combined_interval(
        self, text: str
    ) -> "tuple[Optional[MileageIntervalMatch], Optional[TimeIntervalMatch]]":
        """
        Extract both mileage and time intervals from a combined pattern.

        Many schedules use patterns like "every 5,000 miles or 6 months".
        The two searches are independent; either element may be None.

        Args:
            text: Text to search

        Returns:
            Tuple of (mileage_match, time_match)
        """
        mileage_match = self.extract_mileage_interval(text)
        time_match = self.extract_time_interval(text)
        return mileage_match, time_match

    def _is_reasonable_mileage(self, mileage: int) -> bool:
        """Check if mileage interval is reasonable for maintenance."""
        # Accepted range: 500 to 150,000 miles (inclusive)
        return 500 <= mileage <= 150000

    def _is_reasonable_months(self, months: int) -> bool:
        """Check if month interval is reasonable for maintenance."""
        # Accepted range: 1 to 120 months (10 years), inclusive
        return 1 <= months <= 120
|
||||
|
||||
|
||||
# Singleton instance
maintenance_matcher = MaintenancePatternMatcher()
|
||||
259
ocr/app/patterns/service_mapping.py
Normal file
259
ocr/app/patterns/service_mapping.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""Service name normalization and mapping to maintenance subtypes."""
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class ServiceMapping:
    """Mapping result from extracted text to maintenance subtypes."""

    normalized_name: str  # Standardized service name
    subtypes: list[str]  # Maintenance subtypes from the system
    category: str  # routine_maintenance, repair, performance_upgrade
    confidence: float  # Confidence score of the mapping
|
||||
|
||||
|
||||
# Maintenance subtypes from the system (must match exactly)
ROUTINE_MAINTENANCE_SUBTYPES = [
    "Accelerator Pedal",
    "Air Filter Element",
    "Brakes and Traction Control",
    "Cabin Air Filter / Purifier",
    "Coolant",
    "Doors",
    "Drive Belt",
    "Engine Oil",
    "Evaporative Emissions System",
    "Exhaust System",
    "Fluid - A/T",
    "Fluid - Differential",
    "Fluid - M/T",
    "Fluid Filter - A/T",
    "Fluids",
    "Fuel Delivery and Air Induction",
    "Hood Shock / Support",
    "Neutral Safety Switch",
    "Parking Brake System",
    "Restraints and Safety Systems",
    "Shift Interlock A/T",
    "Spark Plug",
    "Steering and Suspension",
    "Tires",
    "Trunk / Liftgate Shock / Support",
    "Washer Fluid",
    "Wiper Blade",
]
|
||||
|
||||
|
||||
class ServiceMapper:
    """Map extracted service names to maintenance subtypes."""

    # Mapping from common service terms to system subtypes
    # Keys are lowercase patterns, values are (normalized_name, subtypes, category, confidence)
    SERVICE_MAPPINGS: dict[str, tuple[str, list[str], str, float]] = {
        # Oil related
        "engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        "oil change": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        "motor oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.93),
        "oil and filter": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        "oil & filter": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        "change engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        "replace engine oil": ("Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        # Air filter
        "air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.90),
        "engine air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95),
        "air cleaner": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.88),
        "air cleaner element": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.93),
        "replace air filter": ("Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95),
        # Cabin filter
        "cabin air filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.95),
        "cabin filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.93),
        "a/c filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.88),
        "hvac filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.88),
        "interior air filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.90),
        "dust and pollen filter": ("Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.90),
        # Tires
        "tire rotation": ("Tire Rotation", ["Tires"], "routine_maintenance", 0.98),
        "rotate tires": ("Tire Rotation", ["Tires"], "routine_maintenance", 0.95),
        "tire inspection": ("Tire Inspection", ["Tires"], "routine_maintenance", 0.93),
        "inspect tires": ("Tire Inspection", ["Tires"], "routine_maintenance", 0.93),
        "check tire pressure": ("Tire Pressure Check", ["Tires"], "routine_maintenance", 0.90),
        "tire pressure": ("Tire Pressure Check", ["Tires"], "routine_maintenance", 0.85),
        # Brakes
        "brake inspection": ("Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.95),
        "inspect brakes": ("Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.93),
        "brake fluid": ("Brake Fluid Service", ["Brakes and Traction Control"], "routine_maintenance", 0.93),
        "brake pads": ("Brake Pad Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.90),
        "parking brake": ("Parking Brake Inspection", ["Parking Brake System"], "routine_maintenance", 0.93),
        # Coolant
        "coolant": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.90),
        "engine coolant": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.93),
        "antifreeze": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.90),
        "cooling system": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.88),
        "radiator fluid": ("Coolant Service", ["Coolant"], "routine_maintenance", 0.88),
        "replace coolant": ("Coolant Replacement", ["Coolant"], "routine_maintenance", 0.95),
        # Transmission
        "transmission fluid": ("Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93),
        "automatic transmission fluid": ("Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.95),
        "atf": ("Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.90),
        "manual transmission fluid": ("Manual Transmission Fluid", ["Fluid - M/T"], "routine_maintenance", 0.95),
        "cvt fluid": ("CVT Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93),
        "transmission filter": ("Transmission Filter", ["Fluid Filter - A/T"], "routine_maintenance", 0.93),
        # Differential
        "differential fluid": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.95),
        "rear differential": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.93),
        "front differential": ("Differential Fluid Service", ["Fluid - Differential"], "routine_maintenance", 0.93),
        "transfer case": ("Transfer Case Fluid", ["Fluid - Differential"], "routine_maintenance", 0.90),
        # Spark plugs
        "spark plug": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95),
        "spark plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95),
        "replace spark plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95),
        "ignition plugs": ("Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.88),
        # Drive belt
        "drive belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.93),
        "serpentine belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.93),
        "accessory belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.90),
        "timing belt": ("Timing Belt Service", ["Drive Belt"], "routine_maintenance", 0.90),
        "v-belt": ("Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.88),
        # Wipers
        "wiper blade": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.95),
        "wiper blades": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.95),
        "windshield wiper": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.93),
        "replace wipers": ("Wiper Blade Replacement", ["Wiper Blade"], "routine_maintenance", 0.93),
        # Washer fluid
        "washer fluid": ("Washer Fluid", ["Washer Fluid"], "routine_maintenance", 0.95),
        "windshield washer": ("Washer Fluid", ["Washer Fluid"], "routine_maintenance", 0.90),
        # Steering/Suspension
        "steering": ("Steering Inspection", ["Steering and Suspension"], "routine_maintenance", 0.85),
        "suspension": ("Suspension Inspection", ["Steering and Suspension"], "routine_maintenance", 0.85),
        "power steering": ("Power Steering Fluid", ["Steering and Suspension"], "routine_maintenance", 0.90),
        "power steering fluid": ("Power Steering Fluid", ["Steering and Suspension"], "routine_maintenance", 0.93),
        # Exhaust
        "exhaust": ("Exhaust System Inspection", ["Exhaust System"], "routine_maintenance", 0.88),
        "exhaust system": ("Exhaust System Inspection", ["Exhaust System"], "routine_maintenance", 0.93),
        # Fuel system
        "fuel filter": ("Fuel Filter Replacement", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.93),
        "fuel system": ("Fuel System Inspection", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.88),
        "fuel injection": ("Fuel Injection Service", ["Fuel Delivery and Air Induction"], "routine_maintenance", 0.88),
        # Emissions
        "evaporative emissions": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.93),
        "evap system": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.90),
        "emissions": ("Evaporative Emissions Inspection", ["Evaporative Emissions System"], "routine_maintenance", 0.80),
        # Safety systems
        "seat belt": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.90),
        "airbag": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.85),
        "restraint": ("Safety Systems Inspection", ["Restraints and Safety Systems"], "routine_maintenance", 0.85),
        # Miscellaneous
        "battery": ("Battery Inspection", ["Fluids"], "routine_maintenance", 0.80),
        "inspect battery": ("Battery Inspection", ["Fluids"], "routine_maintenance", 0.85),
        "door hinges": ("Door Lubrication", ["Doors"], "routine_maintenance", 0.85),
        "hood shock": ("Hood Shock Inspection", ["Hood Shock / Support"], "routine_maintenance", 0.90),
        "trunk shock": ("Trunk Shock Inspection", ["Trunk / Liftgate Shock / Support"], "routine_maintenance", 0.90),
        "liftgate": ("Liftgate Inspection", ["Trunk / Liftgate Shock / Support"], "routine_maintenance", 0.88),
    }

    # Pattern-based mappings, used when no SERVICE_MAPPINGS key occurs in the text
    SERVICE_PATTERNS: list[tuple[str, str, list[str], str, float]] = [
        # (regex_pattern, normalized_name, subtypes, category, confidence)
        (r"oil\s+(?:and|&)\s+filter", "Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.95),
        (r"(?:change|replace)\s+(?:the\s+)?oil", "Engine Oil Change", ["Engine Oil"], "routine_maintenance", 0.93),
        (r"(?:inspect|check)\s+(?:the\s+)?brakes?", "Brake Inspection", ["Brakes and Traction Control"], "routine_maintenance", 0.90),
        (r"(?:inspect|check)\s+(?:the\s+)?tires?", "Tire Inspection", ["Tires"], "routine_maintenance", 0.90),
        (r"(?:rotate|rotation)\s+(?:the\s+)?tires?", "Tire Rotation", ["Tires"], "routine_maintenance", 0.95),
        (r"(?:replace|change)\s+(?:the\s+)?(?:engine\s+)?air\s+filter", "Air Filter Replacement", ["Air Filter Element"], "routine_maintenance", 0.95),
        (r"(?:replace|change)\s+(?:the\s+)?cabin\s+(?:air\s+)?filter", "Cabin Air Filter Replacement", ["Cabin Air Filter / Purifier"], "routine_maintenance", 0.95),
        (r"(?:replace|change)\s+(?:the\s+)?spark\s+plugs?", "Spark Plug Replacement", ["Spark Plug"], "routine_maintenance", 0.95),
        (r"(?:replace|change)\s+(?:the\s+)?coolant", "Coolant Replacement", ["Coolant"], "routine_maintenance", 0.93),
        (r"(?:flush|drain)\s+(?:the\s+)?coolant", "Coolant Flush", ["Coolant"], "routine_maintenance", 0.93),
        (r"(?:replace|change)\s+(?:the\s+)?(?:a/?t|automatic\s+transmission)\s+fluid", "Transmission Fluid Service", ["Fluid - A/T"], "routine_maintenance", 0.93),
        (r"(?:inspect|check)\s+(?:the\s+)?(?:drive|serpentine|accessory)\s+belt", "Drive Belt Inspection", ["Drive Belt"], "routine_maintenance", 0.90),
    ]

    def map_service(self, service_text: str) -> "Optional[ServiceMapping]":
        """
        Map extracted service text to maintenance subtypes.

        Keyword lookup (substring match against SERVICE_MAPPINGS) is tried
        first, then the SERVICE_PATTERNS regexes.

        Args:
            service_text: Service name or description from the manual

        Returns:
            ServiceMapping or None if no mapping found
        """
        hit = self._find_mapping(service_text)
        if hit is None:
            return None

        name, subtypes, category, conf = hit
        return ServiceMapping(
            normalized_name=name,
            # Copy so callers cannot mutate the shared class-level table
            subtypes=list(subtypes),
            category=category,
            confidence=conf,
        )

    def _find_mapping(self, service_text: str) -> Optional[tuple[str, list[str], str, float]]:
        """Return the raw (name, subtypes, category, confidence) entry for the text, or None."""
        normalized_text = service_text.lower().strip()

        # Longest key first: a specific key such as "cabin air filter" or
        # "manual transmission fluid" must win over the shorter key it
        # contains ("air filter", "transmission fluid"). sorted() is stable,
        # so equal-length keys keep their declaration order.
        for key in sorted(self.SERVICE_MAPPINGS, key=len, reverse=True):
            if key in normalized_text:
                return self.SERVICE_MAPPINGS[key]

        # Fall back to regex patterns for phrasings the keyword table misses
        for pattern, name, subtypes, category, conf in self.SERVICE_PATTERNS:
            if re.search(pattern, normalized_text, re.IGNORECASE):
                return name, subtypes, category, conf

        return None

    def map_service_fuzzy(self, service_text: str, threshold: float = 0.6) -> "Optional[ServiceMapping]":
        """
        Map service text, falling back to word-overlap matching for variations.

        The fallback scores each SERVICE_MAPPINGS key by the Jaccard similarity
        (shared words / all distinct words) between the whitespace-split text
        and the key, and keeps the first key with the highest score that
        reaches ``threshold``.

        Args:
            service_text: Service name or description
            threshold: Minimum similarity threshold (0.0-1.0)

        Returns:
            ServiceMapping or None. For a fuzzy hit the confidence is the
            table confidence scaled by the similarity score.
        """
        # First try exact matching
        exact = self.map_service(service_text)
        if exact is not None:
            return exact

        # Fall back to word overlap matching
        words = set(service_text.lower().split())

        best_key: Optional[str] = None
        best_score = 0.0

        for key in self.SERVICE_MAPPINGS:
            key_words = set(key.split())
            union = words | key_words
            if not union:
                continue

            score = len(words & key_words) / len(union)
            if score > best_score and score >= threshold:
                best_score = score
                best_key = key

        if best_key is None:
            return None

        name, subtypes, category, conf = self.SERVICE_MAPPINGS[best_key]
        return ServiceMapping(
            normalized_name=name,
            subtypes=list(subtypes),
            category=category,
            confidence=conf * best_score,  # Reduce confidence by match quality
        )

    def get_all_service_keywords(self) -> list[str]:
        """Get all service keywords for table header detection."""
        keywords = list(self.SERVICE_MAPPINGS)
        # Add common header terms
        keywords.extend([
            "service", "maintenance", "item", "operation",
            "inspection", "replacement", "interval", "schedule",
        ])
        return keywords
|
||||
|
||||
|
||||
# Singleton instance
service_mapper = ServiceMapper()
|
||||
Reference in New Issue
Block a user