feat: add VIN photo OCR pipeline (refs #67)

Implement VIN-specific OCR extraction with optimized preprocessing:

- Add POST /extract/vin endpoint for VIN extraction
- VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold
- VIN validator: check digit validation, OCR error correction (I->1, O->0)
- VIN extractor: PSM modes 6/7/8, character whitelist, alternatives
- Response includes confidence, bounding box, and alternatives
- Unit tests for validator and preprocessor
- Integration tests for VIN extraction endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 19:31:36 -06:00
parent 004940b013
commit 54cbd49171
14 changed files with 1694 additions and 1 deletions
--- a/ocr/app/extractors/__init__.py
+++ b/ocr/app/extractors/__init__.py
@@ -0,0 +1,10 @@
+"""Extractors package for domain-specific OCR extraction."""
+from app.extractors.base import BaseExtractor, ExtractionResult
+from app.extractors.vin_extractor import VinExtractor, vin_extractor
+
+__all__ = [  # public names of the package, re-exported from the imports above
+    "BaseExtractor",
+    "ExtractionResult",
+    "VinExtractor",
+    "vin_extractor",  # the shared module-level VinExtractor instance
+]
--- a/ocr/app/extractors/base.py
+++ b/ocr/app/extractors/base.py
@@ -0,0 +1,47 @@
+"""Base extractor class for domain-specific OCR extraction."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+
+@dataclass
+class ExtractionResult:
+    """Base result for extraction operations."""
+
+    success: bool  # presumably True when extraction produced usable data -- confirm with implementers
+    confidence: float  # NOTE(review): scale is not fixed here; confirm whether 0.0-1.0 is intended
+    raw_text: str  # required (no default), unlike the optional fields below
+    processing_time_ms: int  # presumably elapsed extraction time in milliseconds, per the name
+    extracted_data: dict[str, Any] = field(default_factory=dict)  # fresh dict per instance
+    error: Optional[str] = None  # None unless an error description is supplied
+
+
+class BaseExtractor(ABC):
+    """Common interface implemented by every domain-specific OCR extractor."""
+
+    @abstractmethod
+    def extract(self, image_bytes: bytes, content_type: Optional[str] = None) -> ExtractionResult:
+        """
+        Pull domain-specific data out of a single image.
+
+        Args:
+            image_bytes: The raw bytes of the image
+            content_type: The image's MIME type, when the caller knows it
+
+        Returns:
+            An ExtractionResult carrying the extracted data
+        """
+
+    @abstractmethod
+    def validate(self, data: Any) -> bool:
+        """
+        Check a piece of extracted data.
+
+        Args:
+            data: The extracted value to check
+
+        Returns:
+            True when the value is valid
+        """
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -0,0 +1,275 @@
+"""VIN-specific OCR extractor with preprocessing and validation."""
+import io
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+import magic
+import pytesseract
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+from app.config import settings
+from app.extractors.base import BaseExtractor
+from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
+from app.validators.vin_validator import vin_validator
+
+# Register the pillow_heif opener once at import time so Pillow can decode HEIF/HEIC images
+register_heif_opener()
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class VinAlternative:
+    """Alternative VIN candidate with confidence."""
+
+    vin: str  # candidate VIN as returned by vin_validator.validate(...).vin
+    confidence: float  # clamped to 0.0-1.0 where VinExtractor.extract computes it
+
+
+@dataclass
+class VinExtractionResult:
+    """Result of VIN extraction."""
+
+    success: bool  # True whenever a candidate was found, even one that failed validation
+    vin: Optional[str] = None  # chosen candidate; left as None on failure
+    confidence: float = 0.0  # OCR confidence plus validator adjustment, clamped to 0.0-1.0
+    bounding_box: Optional[BoundingBox] = None  # copied from the preprocessor result on success
+    alternatives: list[VinAlternative] = field(default_factory=list)  # other candidates, at most 4
+    processing_time_ms: int = 0  # elapsed time of extract(), in whole milliseconds
+    error: Optional[str] = None  # failure reason; None on success
+    raw_text: Optional[str] = None  # space-joined OCR words; unset for unsupported types/exceptions
+
+
+class VinExtractor(BaseExtractor):
+    """
+    VIN-specific OCR extractor optimized for VIN plates and stickers.
+
+    Pipeline: MIME type check -> vin_preprocessor -> Tesseract (PSM 6, then
+    PSM 7 and 8 as fallbacks) -> vin_validator candidate extraction, scoring
+    and selection of a primary VIN plus alternatives.
+    """
+
+    # Supported MIME types
+    SUPPORTED_TYPES = {
+        "image/jpeg",
+        "image/png",
+        "image/heic",
+        "image/heif",
+    }
+
+    # VIN character whitelist for Tesseract (omits I, O and Q)
+    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
+
+    # Page segmentation modes tried, in order, when the default PSM 6 pass
+    # yields no VIN candidate (7 = single text line, 8 = single word)
+    _FALLBACK_PSM_MODES = (7, 8)
+
+    # Upper bound on the number of non-primary candidates reported
+    _MAX_ALTERNATIVES = 4
+
+    def __init__(self) -> None:
+        """Initialize VIN extractor and point pytesseract at the configured binary."""
+        # NOTE: this sets module-level pytesseract state, not per-instance state
+        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+
+    def extract(
+        self, image_bytes: bytes, content_type: Optional[str] = None
+    ) -> VinExtractionResult:
+        """
+        Extract VIN from an image using optimized preprocessing and OCR.
+
+        Failures (unsupported type, no VIN pattern, any exception raised by
+        preprocessing/OCR/validation) are reported through ``success=False``
+        and ``error`` rather than raised. A result is ``success=True`` as soon
+        as at least one candidate exists; if none passes validation, the
+        highest-confidence candidate is returned as the primary.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+            content_type: MIME type (auto-detected if not provided)
+
+        Returns:
+            VinExtractionResult with extracted VIN and metadata
+        """
+        # perf_counter is monotonic, so the elapsed time cannot go negative
+        # or jump if the system clock is adjusted mid-request
+        start_time = time.perf_counter()
+
+        # Detect content type if not provided
+        if not content_type:
+            content_type = self._detect_mime_type(image_bytes)
+
+        # Validate content type
+        if content_type not in self.SUPPORTED_TYPES:
+            return VinExtractionResult(
+                success=False,
+                error=f"Unsupported file type: {content_type}",
+                processing_time_ms=self._elapsed_ms(start_time),
+            )
+
+        try:
+            # Apply VIN-optimized preprocessing
+            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
+            preprocessed_bytes = preprocessing_result.image_bytes
+
+            # Perform OCR with VIN-optimized settings
+            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
+
+            # Extract VIN candidates from raw text
+            candidates = vin_validator.extract_candidates(raw_text)
+
+            if not candidates:
+                # No VIN candidates found - try with different PSM modes
+                candidates, alt_text, alt_confidences = self._run_fallback_ocr(
+                    preprocessed_bytes
+                )
+                if candidates:
+                    # Score and report against the pass that actually produced
+                    # the candidates, not the PSM 6 pass that found nothing
+                    raw_text, word_confidences = alt_text, alt_confidences
+
+            if not candidates:
+                return VinExtractionResult(
+                    success=False,
+                    error="No VIN pattern found in image",
+                    raw_text=raw_text,
+                    processing_time_ms=self._elapsed_ms(start_time),
+                )
+
+            # The OCR-level confidence is the same for every candidate, so
+            # compute it once instead of once per candidate
+            base_confidence = self._calculate_base_confidence(word_confidences)
+
+            # Validate and score candidates (match positions are not needed)
+            scored_candidates = []
+            for vin, _start_pos, _end_pos in candidates:
+                validation = vin_validator.validate(vin)
+                adjusted_confidence = min(
+                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
+                )
+                scored_candidates.append(
+                    (validation.vin, adjusted_confidence, validation.is_valid)
+                )
+
+            # Sort by confidence
+            scored_candidates.sort(key=lambda x: x[1], reverse=True)
+
+            # Primary result is the highest confidence valid candidate; if no
+            # candidate is valid, fall back to the highest confidence one
+            primary_index = next(
+                (
+                    index
+                    for index, (_, _, is_valid) in enumerate(scored_candidates)
+                    if is_valid
+                ),
+                0,
+            )
+            primary_vin, primary_confidence, _ = scored_candidates[primary_index]
+
+            # Build alternatives list (excluding primary). The primary is not
+            # necessarily at index 0, so exclude it by position rather than
+            # slicing off the first entry.
+            alternatives = [
+                VinAlternative(vin=vin, confidence=conf)
+                for index, (vin, conf, _) in enumerate(scored_candidates)
+                if index != primary_index
+            ][: self._MAX_ALTERNATIVES]
+
+            processing_time_ms = self._elapsed_ms(start_time)
+
+            logger.info(
+                f"VIN extraction: {primary_vin}, confidence={primary_confidence:.2%}, "
+                f"time={processing_time_ms}ms"
+            )
+
+            return VinExtractionResult(
+                success=True,
+                vin=primary_vin,
+                confidence=primary_confidence,
+                bounding_box=preprocessing_result.bounding_box,
+                alternatives=alternatives,
+                processing_time_ms=processing_time_ms,
+                raw_text=raw_text,
+            )
+
+        except Exception as e:
+            # Boundary handler: callers get a failed result instead of an exception
+            logger.exception("VIN extraction failed: %s", e)
+            return VinExtractionResult(
+                success=False,
+                error=str(e),
+                processing_time_ms=self._elapsed_ms(start_time),
+            )
+
+    @staticmethod
+    def _elapsed_ms(start_time: float) -> int:
+        """Return whole milliseconds elapsed since a time.perf_counter() reading."""
+        return int((time.perf_counter() - start_time) * 1000)
+
+    def _detect_mime_type(self, file_bytes: bytes) -> str:
+        """Detect MIME type using python-magic, defaulting to application/octet-stream."""
+        mime = magic.Magic(mime=True)
+        detected = mime.from_buffer(file_bytes)
+        return detected or "application/octet-stream"
+
+    def _perform_ocr(
+        self, image_bytes: bytes, psm: int = 6
+    ) -> tuple[str, list[float]]:
+        """
+        Perform OCR with VIN-optimized settings.
+
+        Args:
+            image_bytes: Preprocessed image bytes
+            psm: Tesseract page segmentation mode
+                 6 = Uniform block of text
+                 7 = Single text line
+                 8 = Single word
+
+        Returns:
+            Tuple of (raw_text, word_confidences): the recognized words joined
+            by single spaces, and one confidence in (0.0, 1.0] per kept word
+        """
+        # Configure Tesseract for VIN extraction
+        # Use character whitelist to exclude I, O, Q
+        config = (
+            f"--psm {psm} "
+            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
+        )
+
+        # Get detailed OCR data; the context manager releases the image once
+        # Tesseract has consumed it
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            ocr_data = pytesseract.image_to_data(
+                image, config=config, output_type=pytesseract.Output.DICT
+            )
+
+        # Extract words and confidences, skipping blank entries and entries
+        # with non-positive confidence
+        words = []
+        confidences = []
+
+        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
+            # float() rather than int(): the value may be an int, a float or
+            # a float-formatted string such as "96.5", which int() rejects
+            conf = float(raw_conf)
+            word = text.strip()
+            if word and conf > 0:
+                words.append(word)
+                confidences.append(conf / 100.0)
+
+        raw_text = " ".join(words)
+        return raw_text, confidences
+
+    def _run_fallback_ocr(
+        self, image_bytes: bytes
+    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
+        """
+        Re-run OCR with each fallback PSM mode until one yields VIN candidates.
+
+        Returns:
+            Tuple of (candidates, raw_text, word_confidences) from the first
+            mode that produced candidates, or ([], "", []) if none did
+        """
+        for psm in self._FALLBACK_PSM_MODES:
+            raw_text, confidences = self._perform_ocr(image_bytes, psm=psm)
+            candidates = vin_validator.extract_candidates(raw_text)
+            if candidates:
+                return candidates, raw_text, confidences
+
+        return [], "", []
+
+    def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]:
+        """
+        Try alternate OCR configurations when initial extraction fails.
+
+        Kept as a candidates-only wrapper around _run_fallback_ocr.
+
+        Returns:
+            List of VIN candidates (empty if no fallback mode found any)
+        """
+        candidates, _, _ = self._run_fallback_ocr(image_bytes)
+        return candidates
+
+    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
+        """
+        Calculate base confidence from word confidences.
+
+        Returns 0.5 when there are no word confidences; otherwise a blend of
+        70% mean and 30% minimum, so one poorly recognized word pulls the
+        score down.
+        """
+        if not word_confidences:
+            return 0.5
+
+        # Use average confidence, weighted slightly toward minimum
+        avg_conf = sum(word_confidences) / len(word_confidences)
+        min_conf = min(word_confidences)
+
+        # Blend: 70% average, 30% minimum
+        return 0.7 * avg_conf + 0.3 * min_conf
+
+    def validate(self, data: str) -> bool:
+        """
+        Validate a VIN string.
+
+        Args:
+            data: VIN string to validate
+
+        Returns:
+            True if vin_validator reports the VIN as valid
+        """
+        result = vin_validator.validate(data)
+        return result.is_valid
+
+
+# Shared module-level instance; constructing it runs VinExtractor.__init__ at import time
+vin_extractor = VinExtractor()