Files
motovaultpro/ocr/app/extractors/vin_extractor.py
Eric Gullickson b9fe222f12
Some checks failed
Deploy to Staging / Build Images (pull_request) Failing after 4m14s
Deploy to Staging / Deploy to Staging (pull_request) Has been skipped
Deploy to Staging / Verify Staging (pull_request) Has been skipped
Deploy to Staging / Notify Staging Ready (pull_request) Has been skipped
Deploy to Staging / Notify Staging Failure (pull_request) Successful in 8s
fix: Build errors and tesseract removal
2026-02-07 12:12:04 -06:00

367 lines
13 KiB
Python

"""VIN-specific OCR extractor with preprocessing and validation."""
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import magic
from pillow_heif import register_heif_opener
from app.config import settings
from app.engines import OcrConfig, create_engine
from app.extractors.base import BaseExtractor
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
from app.validators.vin_validator import vin_validator
# Register the pillow_heif opener once at import time. VinExtractor lists
# "image/heic" and "image/heif" among its supported MIME types.
register_heif_opener()
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class VinAlternative:
    """A runner-up VIN reading paired with the score assigned to it."""

    # Candidate VIN text.
    vin: str
    # Score for this candidate (the extractor clamps scores to 0.0-1.0).
    confidence: float
@dataclass
class VinExtractionResult:
    """Outcome of one VIN extraction attempt.

    Only ``success`` is required; every other field has a default so that
    failure results can be built from just an error message and a timing.
    """

    # True when a VIN was selected, False on any failure path.
    success: bool
    # The selected VIN, or None when nothing was extracted.
    vin: Optional[str] = None
    # Score of the selected VIN.
    confidence: float = 0.0
    # Bounding box reported by the preprocessor, when available.
    bounding_box: Optional[BoundingBox] = None
    # Other candidates that were not selected (fresh list per instance).
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Wall-clock duration of the attempt, in milliseconds.
    processing_time_ms: int = 0
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # OCR text associated with the attempt, when available.
    raw_text: Optional[str] = None
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    Extraction is a cascade: the image is preprocessed three ways (default
    ``preprocess()``, ``preprocess(apply_threshold=False)`` and
    ``preprocess_otsu()``), and each variant is OCR'd in default,
    single-line and single-word mode. The cascade stops at the first pass
    that yields VIN candidates; those candidates are then validated,
    scored and ranked.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist (passed to engine for post-OCR filtering)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    # Maximum number of non-primary candidates reported per extraction
    MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor with engine from factory."""
        self._engine = create_engine()
        # Debug artifacts are only written when the configured log level is DEBUG.
        self._debug = settings.log_level.upper() == "DEBUG"

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since ``start_time`` (a time.time() value)."""
        return int((time.time() - start_time) * 1000)

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug.

        Debug artifacts are best-effort: a failed write is logged and
        swallowed so it cannot turn a successful extraction into an error.

        Args:
            session_dir: Directory returned by _create_debug_session()
            name: File name to create inside session_dir
            data: Encoded image bytes to write
        """
        if not self._debug:
            return
        path = os.path.join(session_dir, name)
        try:
            with open(path, "wb") as f:
                f.write(data)
        except OSError as exc:
            logger.warning("Could not save debug image %s: %s", path, exc)
            return
        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory. Returns path or None.

        None is returned when debug mode is off or when the directory
        cannot be created (the failure is logged, not raised).
        """
        if not self._debug:
            return None
        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        try:
            os.makedirs(session_dir, exist_ok=True)
        except OSError as exc:
            logger.warning(
                "Could not create debug directory %s: %s", session_dir, exc
            )
            return None
        return session_dir

    def _debug_dump(
        self, session_dir: Optional[str], name: str, data: bytes
    ) -> None:
        """Write a debug image only when a debug session directory exists."""
        if session_dir:
            self._save_debug_image(session_dir, name, data)

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        The primary result is the highest-scoring candidate that passes
        validation, or the highest-scoring candidate overall when none is
        valid. Every other candidate (up to MAX_ALTERNATIVES, best first)
        is reported as an alternative; the primary never appears among them.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Failures
            inside the OCR pipeline are reported as success=False with the
            error message rather than raised.
        """
        start_time = time.time()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()
            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )
            self._debug_dump(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            self._debug_dump(
                debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
            )

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
            logger.debug("Primary OCR raw text: '%s'", raw_text)
            logger.debug("Primary OCR word confidences: %s", word_confidences)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("Primary OCR candidates: %s", candidates)

            if not candidates:
                # No VIN candidates found - try alternate OCR configurations
                candidates = self._try_alternate_ocr(preprocessed_bytes)

            if not candidates:
                # Try grayscale-only (no thresholding) — OCR engines often
                # perform better on non-binarized input because they do
                # their own internal preprocessing.
                gray_result = vin_preprocessor.preprocess(
                    image_bytes, apply_threshold=False
                )
                logger.debug(
                    "Grayscale preprocessing steps: %s",
                    gray_result.preprocessing_applied,
                )
                self._debug_dump(
                    debug_session, "04_preprocessed_gray.png",
                    gray_result.image_bytes,
                )
                raw_text, word_confidences = self._perform_ocr(
                    gray_result.image_bytes
                )
                logger.debug("Gray primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Gray primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        gray_result.image_bytes, prefix="Gray"
                    )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                self._debug_dump(
                    debug_session, "03_preprocessed_otsu.png",
                    otsu_result.image_bytes,
                )
                raw_text, word_confidences = self._perform_ocr(
                    otsu_result.image_bytes
                )
                logger.debug("Otsu primary raw text: '%s'", raw_text)
                candidates = vin_validator.extract_candidates(raw_text)
                logger.debug("Otsu primary candidates: %s", candidates)
                if not candidates:
                    candidates = self._try_alternate_ocr(
                        otsu_result.image_bytes, prefix="Otsu"
                    )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # NOTE(review): when the candidates came from _try_alternate_ocr,
            # raw_text and word_confidences still belong to the preceding
            # default-mode pass on the same image, because that helper only
            # returns candidates. Confirm whether scoring should use the
            # confidences of the pass that actually produced the candidates.

            # The OCR-level confidence is identical for every candidate, so
            # compute it once instead of once per candidate.
            base_confidence = self._calculate_base_confidence(word_confidences)

            # Validate and score candidates (positions are not needed here)
            scored_candidates = []
            for vin, _start_pos, _end_pos in candidates:
                validation = vin_validator.validate(vin)
                adjusted_confidence = min(
                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
                )
                scored_candidates.append(
                    (validation.vin, adjusted_confidence, validation.is_valid)
                )

            # Sort by confidence, best first (stable for equal scores)
            scored_candidates.sort(key=lambda x: x[1], reverse=True)

            # Primary result is the highest confidence valid candidate; if
            # none is valid, fall back to the highest confidence one (index 0).
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list, excluding the primary by position.
            # Slicing [1:5] would be wrong whenever the primary is not the
            # top-ranked entry: it would repeat the primary as its own
            # alternative and drop the top-ranked candidate.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][: self.MAX_ALTERNATIVES]

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )
        except Exception as e:
            # Top-level boundary: report the failure in the result object
            # instead of propagating, and log the traceback.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when detection returns
        an empty value.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self,
        image_bytes: bytes,
        single_line: bool = False,
        single_word: bool = False,
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings via engine abstraction.

        Args:
            image_bytes: Preprocessed image bytes
            single_line: Treat image as a single text line
            single_word: Treat image as a single word

        Returns:
            Tuple of (raw_text, word_confidences), where word_confidences
            holds one confidence per word box reported by the engine
        """
        config = OcrConfig(
            char_whitelist=self.VIN_WHITELIST,
            single_line=single_line,
            single_word=single_word,
            use_angle_cls=True,
        )
        result = self._engine.recognize(image_bytes, config)
        word_confidences = [wb.confidence for wb in result.word_boxes]
        return result.text, word_confidences

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        Modes tried, in order:
            single-line - Treat as a single text line
            single-word - Treat as a single word

        PaddleOCR angle classification handles rotated/angled text
        inherently, so no PSM mode fallbacks are needed.

        Args:
            image_bytes: Preprocessed image bytes to re-run OCR on
            prefix: Optional label prepended to the debug log lines

        Returns:
            List of VIN candidates from the first mode that finds any,
            or an empty list when neither mode does
        """
        tag = f"{prefix} " if prefix else ""
        for mode_name, kwargs in [
            ("single-line", {"single_line": True}),
            ("single-word", {"single_word": True}),
        ]:
            raw_text, _ = self._perform_ocr(image_bytes, **kwargs)
            logger.debug("%s%s raw text: '%s'", tag, mode_name, raw_text)
            candidates = vin_validator.extract_candidates(raw_text)
            logger.debug("%s%s candidates: %s", tag, mode_name, candidates)
            if candidates:
                return candidates
        return []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns 0.5 when the engine reported no word confidences; otherwise
        a blend of 70% average and 30% minimum, so a single weak word pulls
        the score down more than a plain mean would.
        """
        if not word_confidences:
            return 0.5
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)
        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
# Singleton instance shared by importers of this module. It is constructed at
# import time, so VinExtractor.__init__ (which calls create_engine()) runs on
# first import.
vin_extractor = VinExtractor()