All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 36s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 21s
Deploy to Staging / Verify Staging (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Save the original, adaptive, and Otsu preprocessed images to
/tmp/vin-debug/{timestamp}/ when LOG_LEVEL is set to debug.
No images are saved at info level. A volume mount was added for access.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
357 lines
12 KiB
Python
357 lines
12 KiB
Python
"""VIN-specific OCR extractor with preprocessing and validation."""
|
|
import io
|
|
import logging
|
|
import os
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
import magic
|
|
import pytesseract
|
|
from PIL import Image
|
|
from pillow_heif import register_heif_opener
|
|
|
|
from app.config import settings
|
|
from app.extractors.base import BaseExtractor
|
|
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
|
from app.validators.vin_validator import vin_validator
|
|
|
|
# Register the HEIF/HEIC opener from pillow_heif at import time, before any
# image is opened by the extractor below.
register_heif_opener()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class VinAlternative:
    """A runner-up VIN reading together with the score it was given."""

    # Text of the candidate VIN.
    vin: str
    # Confidence score attached to this candidate.
    confidence: float
|
|
|
|
|
|
@dataclass
class VinExtractionResult:
    """Outcome of one VIN extraction attempt.

    Only ``success`` is required; every other field has a default so that
    failure results can be built from just ``success`` and ``error``.
    """

    # Whether the attempt produced a VIN.
    success: bool
    # The selected VIN, if any.
    vin: Optional[str] = None
    # Confidence score attached to ``vin``.
    confidence: float = 0.0
    # Bounding box reported by the preprocessing step, if any.
    bounding_box: Optional[BoundingBox] = None
    # Other candidates; a fresh list is created for every instance.
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Duration of the attempt in milliseconds.
    processing_time_ms: int = 0
    # Failure description, if any.
    error: Optional[str] = None
    # OCR text associated with the attempt, if any.
    raw_text: Optional[str] = None
|
|
|
|
|
|
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    The extraction pipeline is: detect/validate the MIME type, preprocess the
    image, OCR it with Tesseract (PSM 6 first, then the modes listed in
    ``_ALTERNATE_PSMS``), and, if nothing is found, repeat the OCR passes on
    an Otsu-thresholded version of the image. Candidates are then validated
    and scored.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (the letters I, O and Q are not
    # part of this set)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Fixed debug output directory (inside container)
    DEBUG_DIR = "/tmp/vin-debug"

    # Page segmentation modes tried, in order, when PSM 6 yields no candidate
    _ALTERNATE_PSMS = (7, 8, 11, 13)

    # Maximum number of runner-up candidates reported next to the primary VIN
    _MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor.

        Points pytesseract at the configured Tesseract binary (a module-wide
        pytesseract setting) and enables debug image dumps when the
        configured log level is "debug" (case-insensitive).
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
        self._debug = settings.log_level.upper() == "DEBUG"

    @staticmethod
    def _elapsed_ms(start: float) -> int:
        """Return whole milliseconds elapsed since ``start``.

        ``start`` must come from ``time.perf_counter()``; a monotonic clock
        is used so that wall-clock adjustments cannot distort the duration.
        """
        return int((time.perf_counter() - start) * 1000)

    def _save_debug_image(self, session_dir: str, name: str, data: bytes) -> None:
        """Save image bytes to the debug session directory when LOG_LEVEL=debug.

        Writing is best-effort: an I/O failure is logged as a warning and
        never propagates, so a debugging aid cannot fail an extraction.

        Args:
            session_dir: Directory returned by ``_create_debug_session``
            name: File name to create inside ``session_dir``
            data: Raw bytes to write
        """
        if not self._debug:
            return

        path = os.path.join(session_dir, name)
        try:
            with open(path, "wb") as f:
                f.write(data)
        except OSError as exc:
            logger.warning("Could not save debug image %s: %s", path, exc)
            return

        logger.debug("Saved debug image: %s (%d bytes)", name, len(data))

    def _create_debug_session(self) -> Optional[str]:
        """Create a timestamped debug directory. Returns path or None.

        None is returned when debug mode is off, or when the directory
        cannot be created (the failure is logged as a warning).
        """
        if not self._debug:
            return None

        ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        session_dir = os.path.join(self.DEBUG_DIR, ts)
        try:
            os.makedirs(session_dir, exist_ok=True)
        except OSError as exc:
            logger.warning(
                "Could not create debug directory %s: %s", session_dir, exc
            )
            return None
        return session_dir

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. Failures
            (unsupported type, no VIN pattern, or any exception raised while
            processing) are reported through ``success=False`` and ``error``
            rather than raised.
        """
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            debug_session = self._create_debug_session()

            logger.debug(
                "VIN extraction input: %d bytes, content_type=%s",
                len(image_bytes), content_type,
            )
            if debug_session:
                self._save_debug_image(debug_session, "01_original.jpg", image_bytes)

            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            logger.debug(
                "Preprocessing steps: %s", preprocessing_result.preprocessing_applied
            )
            if debug_session:
                self._save_debug_image(
                    debug_session, "02_preprocessed_adaptive.png", preprocessed_bytes
                )

            # OCR the preprocessed image: PSM 6 first, then alternate modes.
            # raw_text / word_confidences always belong to the pass that
            # produced the candidates, so the score reflects that pass.
            candidates, raw_text, word_confidences = self._find_candidates(
                preprocessed_bytes
            )

            if not candidates:
                # Try alternative preprocessing (Otsu's thresholding)
                otsu_result = vin_preprocessor.preprocess_otsu(image_bytes)
                logger.debug(
                    "Otsu preprocessing steps: %s",
                    otsu_result.preprocessing_applied,
                )
                if debug_session:
                    self._save_debug_image(
                        debug_session, "03_preprocessed_otsu.png",
                        otsu_result.image_bytes,
                    )

                candidates, raw_text, word_confidences = self._find_candidates(
                    otsu_result.image_bytes, prefix="Otsu"
                )

            if not candidates:
                logger.debug("No VIN pattern found in any OCR attempt")
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # Validate and score candidates, best score first
            scored_candidates = self._score_candidates(candidates, word_confidences)

            # Primary result is the highest confidence valid candidate; if no
            # candidate is valid, fall back to the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list (excluding primary). The primary is
            # skipped by position: it is not necessarily the first entry.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][: self._MAX_ALTERNATIVES]

            processing_time_ms = self._elapsed_ms(start_time)

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin, primary_confidence * 100, processing_time_ms,
            )
            logger.debug(
                "VIN alternatives: %s",
                [(a.vin, a.confidence) for a in alternatives],
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Boundary handler: report the failure in the result instead of
            # raising; the traceback is logged.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    def _score_candidates(
        self,
        candidates: list[tuple[str, int, int]],
        word_confidences: list[float],
    ) -> list[tuple[str, float, bool]]:
        """Validate each candidate and return them sorted by confidence.

        Args:
            candidates: Candidate tuples whose first element is the VIN text
            word_confidences: Word confidences of the OCR pass that produced
                the candidates

        Returns:
            List of (vin, confidence, is_valid), highest confidence first.
            The confidence is the base OCR confidence plus the validator's
            adjustment, clamped to [0.0, 1.0]. The sort is stable, so equal
            scores keep their original order.
        """
        # The base confidence is the same for every candidate of one pass.
        base_confidence = self._calculate_base_confidence(word_confidences)

        scored = []
        for vin, _start_pos, _end_pos in candidates:
            validation = vin_validator.validate(vin)
            adjusted_confidence = min(
                1.0, max(0.0, base_confidence + validation.confidence_adjustment)
            )
            scored.append((validation.vin, adjusted_confidence, validation.is_valid))

        scored.sort(key=lambda item: item[1], reverse=True)
        return scored

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when detection returns an
        empty value.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                6 = Uniform block of text
                7 = Single text line
                8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). ``raw_text`` is the
            recognized words joined by single spaces; ``word_confidences``
            holds one value per kept word, scaled from Tesseract's 0-100
            range to 0.0-1.0. Blank words and words whose confidence is not
            positive are dropped.
        """
        # Configure Tesseract for VIN extraction
        # OEM 1 = LSTM neural network engine (best accuracy)
        # Disable dictionaries since VINs are not dictionary words
        config = " ".join(
            (
                f"--psm {psm}",
                "--oem 1",
                f"-c tessedit_char_whitelist={self.VIN_WHITELIST}",
                "-c load_system_dawg=false",
                "-c load_freq_dawg=false",
            )
        )

        # Get detailed OCR data; the image is closed once OCR has finished
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []

        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            word = text.strip()
            if not word:
                continue
            # The confidence may arrive as an int or as a (possibly
            # float-formatted) string such as "96.53"; int() alone would
            # raise on the latter.
            conf = int(float(raw_conf))
            if conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _ocr_candidates(
        self, image_bytes: bytes, psm: int, tag: str = ""
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """Run one OCR pass and extract VIN candidates from its text.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode for this pass
            tag: Prefix (including trailing space) for debug log lines

        Returns:
            Tuple of (candidates, raw_text, word_confidences) for this pass
        """
        raw_text, word_confidences = self._perform_ocr(image_bytes, psm=psm)
        logger.debug("%sPSM %d raw text: '%s'", tag, psm, raw_text)
        logger.debug("%sPSM %d word confidences: %s", tag, psm, word_confidences)

        candidates = vin_validator.extract_candidates(raw_text)
        logger.debug("%sPSM %d candidates: %s", tag, psm, candidates)
        return candidates, raw_text, word_confidences

    def _scan_alternate_psms(
        self, image_bytes: bytes, prefix: str = ""
    ) -> Optional[tuple[list[tuple[str, int, int]], str, list[float]]]:
        """Try the alternate PSM modes in order; stop at the first hit.

        Returns:
            (candidates, raw_text, word_confidences) of the first mode that
            yields candidates, or None if no mode does
        """
        tag = f"{prefix} " if prefix else ""
        for psm in self._ALTERNATE_PSMS:
            outcome = self._ocr_candidates(image_bytes, psm, tag)
            if outcome[0]:
                return outcome
        return None

    def _find_candidates(
        self, image_bytes: bytes, prefix: str = ""
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """Run PSM 6, then the alternate modes, on one preprocessed image.

        Returns:
            (candidates, raw_text, word_confidences) from the first pass that
            yields candidates. When no pass does, the candidate list is empty
            and the text/confidences are those of the PSM 6 pass.
        """
        tag = f"{prefix} " if prefix else ""
        block_pass = self._ocr_candidates(image_bytes, 6, tag)
        if block_pass[0]:
            return block_pass

        # No VIN candidates found - try with different PSM modes
        alternate_pass = self._scan_alternate_psms(image_bytes, prefix)
        return alternate_pass if alternate_pass is not None else block_pass

    def _try_alternate_ocr(
        self,
        image_bytes: bytes,
        prefix: str = "",
    ) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        PSM modes tried in order:
            7 - Single text line
            8 - Single word
            11 - Sparse text (finds text in any order, good for angled photos)
            13 - Raw line (no Tesseract heuristics, good for clean VIN plates)

        Args:
            image_bytes: Preprocessed image bytes
            prefix: Optional label prepended to the debug log lines

        Returns:
            List of VIN candidates (empty if no mode finds any)
        """
        outcome = self._scan_alternate_psms(image_bytes, prefix)
        return outcome[0] if outcome is not None else []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns 0.5 when there are no word confidences; otherwise a blend of
        70% average and 30% minimum, so one poorly recognized word lowers the
        score more than a plain average would.
        """
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
|
|
|
|
|
|
# Shared module-level instance, constructed once when this module is imported.
vin_extractor = VinExtractor()
|