Files
motovaultpro/ocr/app/models/schemas.py
Eric Gullickson 3eb54211cb
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 3m1s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
feat: add owner's manual OCR pipeline (refs #71)
Implement async PDF processing for owner's manuals with maintenance
schedule extraction:

- Add PDF preprocessor with PyMuPDF for text/scanned PDF handling
- Add maintenance pattern matching (mileage, time, fluid specs)
- Add service name mapping to maintenance subtypes
- Add table detection and parsing for schedule tables
- Add manual extractor orchestrating the complete pipeline
- Add POST /extract/manual endpoint for async job submission
- Add Redis job queue support for manual extraction jobs
- Add progress tracking during processing

Processing pipeline:
1. Analyze PDF structure (text layer vs scanned)
2. Find maintenance schedule sections
3. Extract text or OCR scanned pages at 300 DPI
4. Detect and parse maintenance tables
5. Normalize service names and extract intervals
6. Return structured maintenance schedules with confidence scores

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 21:30:20 -06:00

172 lines
4.7 KiB
Python

"""Pydantic models for OCR API request/response validation."""
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class DocumentType(str, Enum):
    """Types of documents that can be processed."""

    # The ``str`` mixin makes each member an actual string, so a member
    # compares equal to its lowercase value (DocumentType.VIN == "vin").
    VIN = "vin"
    RECEIPT = "receipt"
    MANUAL = "manual"
    # Explicit member for documents whose type is not one of the above.
    UNKNOWN = "unknown"
class ExtractedField(BaseModel):
    """A single extracted field with confidence score."""

    # Extracted content, always carried as a string.
    value: str
    # Required; validation rejects values outside the closed range [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
class BoundingBox(BaseModel):
    """Bounding box for detected region."""

    # All four values are required integers with no range constraint.
    # NOTE(review): presumably pixel units with (x, y) as the top-left
    # corner -- this file does not establish either; confirm with producers.
    x: int
    y: int
    width: int
    height: int
class VinAlternative(BaseModel):
    """Alternative VIN candidate."""

    # Candidate VIN text; no format validation is applied at this layer.
    vin: str
    # Required; validation rejects values outside the closed range [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
class VinExtractionResponse(BaseModel):
    """Response from VIN extraction endpoint."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    success: bool
    # Defaults to None when no VIN is supplied.
    vin: str | None = None
    # Required; must lie within [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
    # Optional region for the detected VIN (alias "boundingBox").
    bounding_box: BoundingBox | None = Field(None, alias="boundingBox")
    # A fresh empty list is created per instance when omitted.
    alternatives: list[VinAlternative] = Field(default_factory=list)
    # Required (alias "processingTimeMs"); presumably milliseconds, per the name.
    processing_time_ms: int = Field(..., alias="processingTimeMs")
    # Optional error text; None by default.
    error: str | None = None
class OcrResponse(BaseModel):
    """Response from OCR extraction."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    success: bool
    # Required (alias "documentType").
    document_type: DocumentType = Field(..., alias="documentType")
    # Required (alias "rawText").
    raw_text: str = Field(..., alias="rawText")
    # Required; must lie within [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
    # Mapping of field name to extracted value (alias "extractedFields");
    # a fresh empty dict is created per instance when omitted.
    extracted_fields: dict[str, ExtractedField] = Field(
        default_factory=dict,
        alias="extractedFields",
    )
    # Required (alias "processingTimeMs"); presumably milliseconds, per the name.
    processing_time_ms: int = Field(..., alias="processingTimeMs")
class JobStatus(str, Enum):
    """Status of an async OCR job."""

    # The ``str`` mixin makes each member an actual string, so a member
    # compares equal to its lowercase value (JobStatus.FAILED == "failed").
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class JobResponse(BaseModel):
    """Response for async job status."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    # Required (alias "jobId").
    job_id: str = Field(..., alias="jobId")
    status: JobStatus
    # Optional; when provided it must be within 0..100 (presumably a percentage).
    progress: int | None = Field(None, ge=0, le=100)
    # Optional OCR payload; None by default.
    result: OcrResponse | None = None
    # Optional error text; None by default.
    error: str | None = None
class JobSubmitRequest(BaseModel):
    """Request to submit an async OCR job."""

    # The aliased field can be populated either through its camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    # Optional (alias "callbackUrl"); typed as a plain string, so no URL
    # validation happens in this model.
    callback_url: str | None = Field(None, alias="callbackUrl")
class ReceiptExtractedField(BaseModel):
    """A single extracted field from a receipt with confidence."""

    # Receipt values may be textual or numeric, hence the union type.
    value: str | float
    # Required; validation rejects values outside the closed range [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
class ReceiptExtractionResponse(BaseModel):
    """Response from receipt extraction endpoint."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    success: bool
    # Required free-form string (alias "receiptType"); not restricted to an enum.
    receipt_type: str = Field(..., alias="receiptType")
    # Mapping of field name to extracted value (alias "extractedFields");
    # a fresh empty dict is created per instance when omitted.
    extracted_fields: dict[str, ReceiptExtractedField] = Field(
        default_factory=dict,
        alias="extractedFields",
    )
    # Required (alias "rawText").
    raw_text: str = Field(..., alias="rawText")
    # Required (alias "processingTimeMs"); presumably milliseconds, per the name.
    processing_time_ms: int = Field(..., alias="processingTimeMs")
    # Optional error text; None by default.
    error: str | None = None
# Owner's-manual (PDF) extraction models
class ManualVehicleInfo(BaseModel):
    """Vehicle information extracted from manual."""

    # Every attribute is optional and defaults to None, so an instance can
    # be built with no arguments at all.
    make: str | None = None
    model: str | None = None
    year: int | None = None
class ManualMaintenanceSchedule(BaseModel):
    """A single maintenance schedule entry."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    # Required service name.
    service: str
    # Optional intervals (aliases "intervalMiles" / "intervalMonths"); either
    # or both may be None, and no range constraint is applied.
    interval_miles: int | None = Field(None, alias="intervalMiles")
    interval_months: int | None = Field(None, alias="intervalMonths")
    # Optional free-text detail; None by default.
    details: str | None = None
    # Required; must lie within [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
    # A fresh empty list is created per instance when omitted.
    subtypes: list[str] = Field(default_factory=list)
class ManualExtractionResponse(BaseModel):
    """Response from manual extraction endpoint."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    success: bool
    # Optional vehicle details (alias "vehicleInfo"); None by default.
    vehicle_info: ManualVehicleInfo | None = Field(None, alias="vehicleInfo")
    # Schedule entries (alias "maintenanceSchedules"); a fresh empty list is
    # created per instance when omitted.
    maintenance_schedules: list[ManualMaintenanceSchedule] = Field(
        default_factory=list,
        alias="maintenanceSchedules",
    )
    # Untyped dicts (alias "rawTables"); their key schema is not defined in
    # this module.
    raw_tables: list[dict] = Field(default_factory=list, alias="rawTables")
    # Required (alias "processingTimeMs"); presumably milliseconds, per the name.
    processing_time_ms: int = Field(..., alias="processingTimeMs")
    # Required page counters (aliases "totalPages" / "pagesProcessed").
    total_pages: int = Field(..., alias="totalPages")
    pages_processed: int = Field(..., alias="pagesProcessed")
    # Optional error text; None by default.
    error: str | None = None
class ManualJobResponse(BaseModel):
    """Response for async manual extraction job."""

    # Aliased fields can be populated either through their camelCase alias
    # or through the snake_case attribute name.
    model_config = {"populate_by_name": True}

    # Required (alias "jobId").
    job_id: str = Field(..., alias="jobId")
    status: JobStatus
    # Optional; when provided it must be within 0..100 (presumably a percentage).
    progress: int | None = Field(None, ge=0, le=100)
    # Optional (alias "estimatedSeconds"); None by default.
    estimated_seconds: int | None = Field(None, alias="estimatedSeconds")
    # Optional extraction payload; None by default.
    result: ManualExtractionResponse | None = None
    # Optional error text; None by default.
    error: str | None = None