feat: add VIN photo OCR pipeline (refs #67)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 31s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Implement VIN-specific OCR extraction with optimized preprocessing:

- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
10
ocr/app/extractors/__init__.py
Normal file
10
ocr/app/extractors/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Extractors package for domain-specific OCR extraction."""
|
||||
from app.extractors.base import BaseExtractor, ExtractionResult
|
||||
from app.extractors.vin_extractor import VinExtractor, vin_extractor
|
||||
|
||||
__all__ = [
|
||||
"BaseExtractor",
|
||||
"ExtractionResult",
|
||||
"VinExtractor",
|
||||
"vin_extractor",
|
||||
]
|
||||
47
ocr/app/extractors/base.py
Normal file
47
ocr/app/extractors/base.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Base extractor class for domain-specific OCR extraction."""
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Generic outcome record shared by extraction operations.

    The first four fields are required (and may be passed positionally in
    this order); the last two are optional.
    """

    # True when the extraction is reported as successful.
    success: bool
    # Confidence score reported for the extraction.
    confidence: float
    # Raw text associated with the extraction.
    raw_text: str
    # Processing time, in milliseconds.
    processing_time_ms: int
    # Free-form payload; every instance gets its own fresh dict by default.
    extracted_data: dict[str, Any] = field(default_factory=dict)
    # Error description, or None when no error was recorded.
    error: Optional[str] = field(default=None)
|
||||
|
||||
|
||||
class BaseExtractor(ABC):
    """Interface that every domain-specific extractor implements.

    Both methods are abstract, so a subclass must override ``extract`` and
    ``validate`` before it can be instantiated.
    """

    @abstractmethod
    def extract(
        self,
        image_bytes: bytes,
        content_type: Optional[str] = None,
    ) -> ExtractionResult:
        """
        Pull domain-specific data out of an image.

        Args:
            image_bytes: Raw bytes of the image
            content_type: MIME type of the image, if known

        Returns:
            ExtractionResult describing what was extracted
        """

    @abstractmethod
    def validate(self, data: Any) -> bool:
        """
        Check whether extracted data is valid.

        Args:
            data: The extracted data to check

        Returns:
            True when the data is valid
        """
|
||||
275
ocr/app/extractors/vin_extractor.py
Normal file
275
ocr/app/extractors/vin_extractor.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""VIN-specific OCR extractor with preprocessing and validation."""
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import magic
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
from pillow_heif import register_heif_opener
|
||||
|
||||
from app.config import settings
|
||||
from app.extractors.base import BaseExtractor
|
||||
from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
|
||||
from app.validators.vin_validator import vin_validator
|
||||
|
||||
# Register HEIF/HEIC opener
|
||||
register_heif_opener()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VinAlternative:
    """A secondary VIN candidate paired with its confidence score."""

    # The candidate VIN string.
    vin: str
    # Confidence score assigned to this candidate.
    confidence: float
|
||||
|
||||
|
||||
@dataclass
class VinExtractionResult:
    """Outcome of a single VIN extraction attempt.

    Only ``success`` is required; every other field has a default so that
    failure results can be built from just ``success`` and ``error``.
    """

    # True when a VIN was produced, False on failure.
    success: bool
    # The selected VIN, or None when nothing was extracted.
    vin: Optional[str] = field(default=None)
    # Confidence score for ``vin``.
    confidence: float = field(default=0.0)
    # Bounding box reported by the preprocessor, when available.
    bounding_box: Optional[BoundingBox] = field(default=None)
    # Other candidates; each instance gets its own fresh list by default.
    alternatives: list[VinAlternative] = field(default_factory=list)
    # Processing time, in milliseconds.
    processing_time_ms: int = field(default=0)
    # Error description, or None when no error was recorded.
    error: Optional[str] = field(default=None)
    # Text returned by OCR, when it was captured.
    raw_text: Optional[str] = field(default=None)
|
||||
|
||||
|
||||
class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers.

    Pipeline: detect/verify the MIME type, run the VIN preprocessor, OCR the
    result with Tesseract (PSM 6, falling back to PSM 7 and 8), pull VIN
    candidates out of the recognized text, then validate and score them.
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }

    # VIN character whitelist for Tesseract (I, O and Q are left out)
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"

    # Page segmentation modes tried, in order, when the default pass finds nothing
    FALLBACK_PSM_MODES = (7, 8)

    # Upper bound on the number of non-primary candidates reported
    MAX_ALTERNATIVES = 4

    def __init__(self) -> None:
        """Initialize VIN extractor.

        Points pytesseract at the Tesseract binary configured in settings.
        Note that this is a process-wide pytesseract setting, not per-instance.
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided)

        Returns:
            VinExtractionResult with extracted VIN and metadata. The result is
            a failure when the type is unsupported, no VIN pattern is found,
            or processing raises. When candidates exist but none passes
            validation, the highest-confidence candidate is still returned
            with ``success=True``.
        """
        # Monotonic clock: elapsed time cannot go negative on wall-clock changes.
        start_time = time.perf_counter()

        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)

        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=self._elapsed_ms(start_time),
            )

        try:
            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes

            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)

            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)

            if not candidates:
                # No VIN candidates found - try with different PSM modes
                alt_candidates, alt_text, alt_confidences = (
                    self._try_alternate_ocr_detailed(preprocessed_bytes)
                )
                if alt_candidates:
                    # Score against the pass that actually produced the
                    # candidates, not the first pass that found nothing.
                    candidates = alt_candidates
                    raw_text = alt_text
                    word_confidences = alt_confidences

            if not candidates:
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=self._elapsed_ms(start_time),
                )

            # The OCR-level confidence is the same for every candidate.
            base_confidence = self._calculate_base_confidence(word_confidences)

            # Validate and score candidates
            scored_candidates = []
            for candidate, _start_pos, _end_pos in candidates:
                validation = vin_validator.validate(candidate)
                adjusted_confidence = min(
                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
                )
                scored_candidates.append(
                    (validation.vin, adjusted_confidence, validation.is_valid)
                )

            # Sort by confidence (stable, so ties keep extraction order)
            scored_candidates.sort(key=lambda item: item[1], reverse=True)

            # Primary result is the highest confidence valid candidate;
            # if no candidate is valid, use the highest confidence one.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]

            # Build alternatives list. The primary is not necessarily the
            # first entry, so exclude it by value and skip repeated VINs.
            alternatives = []
            seen_vins = {primary_vin}
            for vin, confidence, _ in scored_candidates:
                if vin in seen_vins:
                    continue
                seen_vins.add(vin)
                alternatives.append(VinAlternative(vin=vin, confidence=confidence))
                if len(alternatives) >= self.MAX_ALTERNATIVES:
                    break

            processing_time_ms = self._elapsed_ms(start_time)

            logger.info(
                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
                primary_vin,
                primary_confidence * 100,
                processing_time_ms,
            )

            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )

        except Exception as e:
            # Boundary handler: report the failure in the result instead of
            # raising, and keep the traceback in the log.
            logger.exception("VIN extraction failed: %s", e)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=self._elapsed_ms(start_time),
            )

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since a perf_counter reading."""
        return int((time.perf_counter() - start_time) * 1000)

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to "application/octet-stream" when nothing is detected.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                6 = Uniform block of text
                7 = Single text line
                8 = Single word

        Returns:
            Tuple of (raw_text, word_confidences). raw_text is the kept words
            joined by single spaces; word_confidences holds one value in the
            0-1 range per kept word.
        """
        # Configure Tesseract for VIN extraction
        # Use character whitelist to exclude I, O, Q
        config = (
            f"--psm {psm} "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
        )

        # Get detailed OCR data; the image is closed once OCR has run.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )

        # Extract words and confidences
        words = []
        confidences = []

        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            try:
                # Confidence may arrive as an int, a float, or a string such
                # as "96.5"; go through float() so none of those raises.
                conf = int(float(raw_conf))
            except (TypeError, ValueError, OverflowError):
                continue

            word = text.strip()
            # Keep only non-blank entries with a positive confidence.
            if word and conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)

        raw_text = " ".join(words)
        return raw_text, confidences

    def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]:
        """
        Try alternate OCR configurations when initial extraction fails.

        Returns:
            List of VIN candidates (empty when no fallback mode finds any)
        """
        candidates, _, _ = self._try_alternate_ocr_detailed(image_bytes)
        return candidates

    def _try_alternate_ocr_detailed(
        self, image_bytes: bytes
    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
        """
        Run the fallback PSM modes and keep the OCR output that matched.

        Modes are tried in FALLBACK_PSM_MODES order (7 = single text line,
        then 8 = single word); the first mode yielding candidates wins.

        Returns:
            Tuple of (candidates, raw_text, word_confidences) from the first
            successful mode, or ([], "", []) when every mode comes up empty.
        """
        for psm in self.FALLBACK_PSM_MODES:
            raw_text, confidences = self._perform_ocr(image_bytes, psm=psm)
            candidates = vin_validator.extract_candidates(raw_text)
            if candidates:
                return candidates, raw_text, confidences

        return [], "", []

    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences.

        Returns 0.5 when there are no word confidences to work from.
        """
        if not word_confidences:
            return 0.5

        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)

        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf

    def validate(self, data: str) -> bool:
        """
        Validate a VIN string.

        Args:
            data: VIN string to validate

        Returns:
            True if VIN is valid
        """
        result = vin_validator.validate(data)
        return result.is_valid
|
||||
|
||||
|
||||
# Singleton instance
|
||||
# Created once at import time; VinExtractor.__init__ sets
# pytesseract's tesseract_cmd from settings as a side effect.
vin_extractor = VinExtractor()
|
||||
Reference in New Issue
Block a user