2026-02-02 01:36:27 +00:00
14 changed files with 1694 additions and 1 deletions
--- a/ocr/app/extractors/init.py
+++ b/ocr/app/extractors/init.py
@@ -0,0 +1,10 @@
 """Extractors package for domain-specific OCR extraction."""
 from app.extractors.base import BaseExtractor, ExtractionResult
 from app.extractors.vin_extractor import VinExtractor, vin_extractor
 __all__ = [
    "BaseExtractor",
    "ExtractionResult",
    "VinExtractor",
    "vin_extractor",
 ]
--- a/ocr/app/extractors/base.py
+++ b/ocr/app/extractors/base.py
@@ -0,0 +1,47 @@
 """Base extractor class for domain-specific OCR extraction."""
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Optional
@dataclass
 class ExtractionResult:
    """Base result for extraction operations.

    The first four fields are required. ``extracted_data`` defaults to a fresh
    empty dict per instance and ``error`` defaults to ``None``.
    """
    success: bool  # whether the extraction succeeded
    confidence: float  # presumably a 0.0-1.0 score -- TODO confirm with producers
    raw_text: str  # text the result was derived from
    processing_time_ms: int  # elapsed processing time, in milliseconds
    extracted_data: dict[str, Any] = field(default_factory=dict)  # extractor-specific payload
    error: Optional[str] = None  # failure description; None when none was recorded
 class BaseExtractor(ABC):
    """Contract that every domain-specific extractor has to implement.

    Both methods are abstract, so the class cannot be instantiated until a
    subclass overrides ``extract`` and ``validate``.
    """
    @abstractmethod
    def extract(self, image_bytes: bytes, content_type: Optional[str] = None) -> ExtractionResult:
        """
        Pull domain-specific data out of an image.
        Args:
            image_bytes: Raw image bytes
            content_type: MIME type of the image
        Returns:
            ExtractionResult describing what was extracted
        """
        ...
    @abstractmethod
    def validate(self, data: Any) -> bool:
        """
        Decide whether a piece of extracted data is acceptable.
        Args:
            data: Extracted data to validate
        Returns:
            True if data is valid
        """
        ...
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -0,0 +1,275 @@
 """VIN-specific OCR extractor with preprocessing and validation."""
 import io
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Optional
 import magic
 import pytesseract
 from PIL import Image
 from pillow_heif import register_heif_opener
 from app.config import settings
 from app.extractors.base import BaseExtractor
 from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
 from app.validators.vin_validator import vin_validator
 # Register HEIF/HEIC opener
 register_heif_opener()
 logger = logging.getLogger(__name__)
@dataclass
 class VinAlternative:
    """Alternative VIN candidate with confidence.

    Internal (non-API) record; VinExtractionResult.alternatives holds a list
    of these.
    """
    vin: str  # candidate VIN text
    confidence: float  # score assigned to this candidate
@dataclass
 class VinExtractionResult:
    """Result of VIN extraction.

    Only ``success`` is required; every other field has a default, so a
    failure result can be built from ``success``, ``error`` and timing alone.
    """
    success: bool  # False for unsupported input, no VIN found, or an exception
    vin: Optional[str] = None  # chosen VIN; None when nothing was extracted
    confidence: float = 0.0  # score for ``vin``
    bounding_box: Optional[BoundingBox] = None  # region reported by preprocessing, if any
    alternatives: list[VinAlternative] = field(default_factory=list)  # other ranked candidates
    processing_time_ms: int = 0  # elapsed time of the extraction call, in milliseconds
    error: Optional[str] = None  # human-readable failure reason
    raw_text: Optional[str] = None  # OCR text the candidates were taken from
 class VinExtractor(BaseExtractor):
    """VIN-specific OCR extractor optimized for VIN plates and stickers."""
    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
    }
    # VIN character whitelist for Tesseract
    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
    def __init__(self) -> None:
        """Initialize VIN extractor.

        Points pytesseract at the Tesseract executable named by
        ``settings.tesseract_cmd``. The assignment targets a module-level
        attribute of ``pytesseract``, so it affects every pytesseract caller
        in the process, not just this instance.
        """
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
    def extract(
        self, image_bytes: bytes, content_type: Optional[str] = None
    ) -> VinExtractionResult:
        """
        Extract VIN from an image using optimized preprocessing and OCR.

        Flow: resolve and check the MIME type, preprocess the image, run OCR
        (retrying with alternate page-segmentation modes when the first pass
        yields no candidate), then validate, score and rank the candidates.
        The primary VIN is the highest-confidence candidate the validator
        marks valid, or the overall highest-confidence one when none is.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            content_type: MIME type (auto-detected if not provided). Any
                parameters after ";" and letter case are ignored.
        Returns:
            VinExtractionResult with extracted VIN and metadata. Failures
            (unsupported type, no VIN pattern, any exception raised while
            processing) are reported through success=False and error; they
            are not raised to the caller.
        """
        # perf_counter is monotonic, so the reported duration cannot go
        # negative or jump when the system clock is adjusted.
        started = time.perf_counter()

        def elapsed_ms() -> int:
            return int((time.perf_counter() - started) * 1000)

        # "image/JPEG; charset=binary" should be treated like "image/jpeg".
        if content_type:
            content_type = content_type.split(";", 1)[0].strip().lower()
        # Detect content type if not provided
        if not content_type:
            content_type = self._detect_mime_type(image_bytes)
        # Validate content type
        if content_type not in self.SUPPORTED_TYPES:
            return VinExtractionResult(
                success=False,
                error=f"Unsupported file type: {content_type}",
                processing_time_ms=elapsed_ms(),
            )
        try:
            # Apply VIN-optimized preprocessing
            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
            preprocessed_bytes = preprocessing_result.image_bytes
            # Perform OCR with VIN-optimized settings
            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
            # Extract VIN candidates from raw text
            candidates = vin_validator.extract_candidates(raw_text)
            if not candidates:
                # No VIN candidates found - try with different PSM modes.
                # NOTE(review): raw_text and word_confidences still come from
                # the first OCR pass; the alternate pass only returns candidates.
                candidates = self._try_alternate_ocr(preprocessed_bytes)
            if not candidates:
                return VinExtractionResult(
                    success=False,
                    error="No VIN pattern found in image",
                    raw_text=raw_text,
                    processing_time_ms=elapsed_ms(),
                )
            # The OCR-derived base score is the same for every candidate, so
            # compute it once instead of once per loop iteration.
            base_confidence = self._calculate_base_confidence(word_confidences)
            scored_candidates = []
            for vin, _start_pos, _end_pos in candidates:
                validation = vin_validator.validate(vin)
                adjusted_confidence = min(
                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
                )
                scored_candidates.append(
                    (validation.vin, adjusted_confidence, validation.is_valid)
                )
            # Sort by confidence (stable, so ties keep the validator's order)
            scored_candidates.sort(key=lambda item: item[1], reverse=True)
            # Primary result is the highest-confidence valid candidate; fall
            # back to the top-ranked one when no candidate is valid.
            primary_index = next(
                (
                    index
                    for index, (_, _, is_valid) in enumerate(scored_candidates)
                    if is_valid
                ),
                0,
            )
            primary_vin, primary_confidence, _ = scored_candidates[primary_index]
            # Alternatives are every *other* candidate, at most four. Skipping
            # by index (rather than slicing [1:5]) keeps the primary out of
            # the list even when it is not the first entry.
            alternatives = [
                VinAlternative(vin=vin, confidence=conf)
                for index, (vin, conf, _) in enumerate(scored_candidates)
                if index != primary_index
            ][:4]
            processing_time_ms = elapsed_ms()
            logger.info(
                f"VIN extraction: {primary_vin}, confidence={primary_confidence:.2%}, "
                f"time={processing_time_ms}ms"
            )
            return VinExtractionResult(
                success=True,
                vin=primary_vin,
                confidence=primary_confidence,
                bounding_box=preprocessing_result.bounding_box,
                alternatives=alternatives,
                processing_time_ms=processing_time_ms,
                raw_text=raw_text,
            )
        except Exception as e:
            # Boundary handler: callers get a failure result, never an exception.
            logger.error(f"VIN extraction failed: {e}", exc_info=True)
            return VinExtractionResult(
                success=False,
                error=str(e),
                processing_time_ms=elapsed_ms(),
            )
    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Sniff the MIME type of a byte buffer with python-magic.

        Returns "application/octet-stream" when detection gives a falsy value.
        """
        sniffer = magic.Magic(mime=True)
        mime_type = sniffer.from_buffer(file_bytes)
        if mime_type:
            return mime_type
        return "application/octet-stream"
    def _perform_ocr(
        self, image_bytes: bytes, psm: int = 6
    ) -> tuple[str, list[float]]:
        """
        Perform OCR with VIN-optimized settings.

        Only words with non-blank text and a confidence above zero are kept.

        Args:
            image_bytes: Preprocessed image bytes
            psm: Tesseract page segmentation mode
                 6 = Uniform block of text
                 7 = Single text line
                 8 = Single word
        Returns:
            Tuple of (raw_text, word_confidences): the kept words joined by
            single spaces, and their confidences scaled to 0.0-1.0 in the
            same order.
        """
        # Configure Tesseract for VIN extraction
        # Use character whitelist to exclude I, O, Q
        config = (
            f"--psm {psm} "
            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
        )
        # The context manager releases the decoded image as soon as OCR is
        # done instead of leaving that to garbage collection.
        with Image.open(io.BytesIO(image_bytes)) as image:
            ocr_data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )
        words = []
        confidences = []
        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            # float() accepts ints as well as strings such as "96.5", which
            # int() would reject with ValueError.
            conf = float(raw_conf)
            word = text.strip()
            if word and conf > 0:
                words.append(word)
                confidences.append(conf / 100.0)
        raw_text = " ".join(words)
        return raw_text, confidences
    def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]:
        """
        Re-run OCR under other page-segmentation modes after a failed first pass.

        Modes are tried in order and the first one that yields any VIN
        candidate wins.

        Returns:
            List of VIN candidates, empty when no mode produced one
        """
        # PSM 7 = single text line, PSM 8 = single word
        for mode in (7, 8):
            text, _ = self._perform_ocr(image_bytes, psm=mode)
            found = vin_validator.extract_candidates(text)
            if found:
                return found
        return []
    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
        """Calculate base confidence from word confidences."""
        if not word_confidences:
            return 0.5
        # Use average confidence, weighted slightly toward minimum
        avg_conf = sum(word_confidences) / len(word_confidences)
        min_conf = min(word_confidences)
        # Blend: 70% average, 30% minimum
        return 0.7 * avg_conf + 0.3 * min_conf
    def validate(self, data: str) -> bool:
        """
        Report whether a string is accepted as a VIN.

        Delegates to the shared ``vin_validator`` with its default options.

        Args:
            data: VIN string to validate
        Returns:
            True if VIN is valid
        """
        return vin_validator.validate(data).is_valid
 # Singleton instance
 vin_extractor = VinExtractor()
--- a/ocr/app/main.py
+++ b/ocr/app/main.py
@@ -55,6 +55,7 @@ async def root() -> dict:
        "log_level": settings.log_level,
        "endpoints": [
            "POST /extract - Synchronous OCR extraction",
            "POST /extract/vin - VIN-specific extraction with validation",
            "POST /jobs - Submit async OCR job",
            "GET /jobs/{job_id} - Get async job status",
        ],
--- a/ocr/app/models/init.py
+++ b/ocr/app/models/init.py
@@ -1,18 +1,24 @@
 """Pydantic models for OCR service."""
 from .schemas import (
    BoundingBox,
    DocumentType,
    ExtractedField,
    JobResponse,
    JobStatus,
    JobSubmitRequest,
    OcrResponse,
    VinAlternative,
    VinExtractionResponse,
 )
 __all__ = [
    "BoundingBox",
    "DocumentType",
    "ExtractedField",
    "JobResponse",
    "JobStatus",
    "JobSubmitRequest",
    "OcrResponse",
    "VinAlternative",
    "VinExtractionResponse",
 ]
--- a/ocr/app/models/schemas.py
+++ b/ocr/app/models/schemas.py
@@ -21,6 +21,36 @@ class ExtractedField(BaseModel):
    confidence: float = Field(ge=0.0, le=1.0)
 class BoundingBox(BaseModel):
    """Bounding box for detected region.

    API-facing counterpart of the preprocessor's BoundingBox dataclass; the
    /vin route copies the four fields across one by one.
    """
    x: int  # horizontal position of the box origin (presumably pixels -- TODO confirm)
    y: int  # vertical position of the box origin
    width: int  # horizontal extent
    height: int  # vertical extent
 class VinAlternative(BaseModel):
    """Alternative VIN candidate."""
    vin: str  # candidate VIN text
    confidence: float = Field(ge=0.0, le=1.0)  # required; validated to lie in [0.0, 1.0]
 class VinExtractionResponse(BaseModel):
    """Response from VIN extraction endpoint.

    ``bounding_box`` and ``processing_time_ms`` carry camelCase aliases
    ("boundingBox", "processingTimeMs"). ``populate_by_name`` lets the model
    be constructed with either the alias or the Python field name.
    """
    success: bool
    vin: Optional[str] = None  # None when no VIN was extracted
    confidence: float = Field(ge=0.0, le=1.0)  # required (no default); must lie in [0.0, 1.0]
    bounding_box: Optional[BoundingBox] = Field(default=None, alias="boundingBox")
    alternatives: list[VinAlternative] = Field(default_factory=list)  # fresh empty list per instance
    processing_time_ms: int = Field(alias="processingTimeMs")  # required (no default)
    error: Optional[str] = None  # failure description, if any
    # Accept both field names and aliases as constructor keywords.
    model_config = {"populate_by_name": True}
 class OcrResponse(BaseModel):
    """Response from OCR extraction."""
--- a/ocr/app/preprocessors/init.py
+++ b/ocr/app/preprocessors/init.py
@@ -0,0 +1,10 @@
 """Image preprocessors for OCR optimization."""
 from app.services.preprocessor import ImagePreprocessor, preprocessor
 from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
 __all__ = [
    "ImagePreprocessor",
    "preprocessor",
    "VinPreprocessor",
    "vin_preprocessor",
 ]
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -0,0 +1,309 @@
 """VIN-optimized image preprocessing pipeline."""
 import io
 import logging
 from dataclasses import dataclass
 from typing import Optional
 import cv2
 import numpy as np
 from PIL import Image
 from pillow_heif import register_heif_opener
 # Register HEIF/HEIC opener
 register_heif_opener()
 logger = logging.getLogger(__name__)
@dataclass
 class BoundingBox:
    """Represents a region in an image.

    detect_vin_region fills the fields from cv2.boundingRect output.
    """
    x: int  # x of the rectangle origin
    y: int  # y of the rectangle origin
    width: int  # rectangle width
    height: int  # rectangle height
@dataclass
 class PreprocessingResult:
    """Result of VIN preprocessing.

    ``preprocessing_applied`` is always a list after construction:
    ``__post_init__`` replaces a ``None`` default with a new empty list so
    instances never share one.
    """
    image_bytes: bytes  # encoded output image
    bounding_box: Optional[BoundingBox] = None  # detected region, when one was supplied
    # NOTE(review): annotated list[str] but defaults to None; __post_init__
    # normalises it. dataclasses.field(default_factory=list) would say the
    # same thing without the None detour.
    preprocessing_applied: list[str] = None
    def __post_init__(self) -> None:
        # Swap the None placeholder for a per-instance list.
        if self.preprocessing_applied is None:
            self.preprocessing_applied = []
 class VinPreprocessor:
    """VIN-optimized image preprocessing for improved OCR accuracy."""
    def preprocess(
        self,
        image_bytes: bytes,
        apply_clahe: bool = True,
        apply_deskew: bool = True,
        apply_denoise: bool = True,
        apply_threshold: bool = True,
    ) -> PreprocessingResult:
        """
        Apply VIN-optimized preprocessing pipeline.

        Pipeline:
        1. HEIC conversion (if needed)
        2. Grayscale conversion
        3. Deskew (correct rotation/tilt)
        4. Contrast enhancement (CLAHE)
        5. Noise reduction (fastNlMeansDenoising)
        6. Adaptive thresholding

        Steps 3-6 each run only when their flag is set, always in the order
        listed. The returned result does not carry a bounding box; this
        method never sets one.

        Args:
            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
            apply_clahe: Apply CLAHE contrast enhancement
            apply_deskew: Apply deskew correction
            apply_denoise: Apply noise reduction
            apply_threshold: Apply adaptive thresholding
        Returns:
            PreprocessingResult with the processed image encoded as PNG and
            the names of the steps that ran
        """
        steps_applied = []
        # Load image with PIL (handles HEIC via pillow-heif). The context
        # manager releases the decoder as soon as the pixels are copied out.
        with Image.open(io.BytesIO(image_bytes)) as source:
            steps_applied.append("loaded")
            if source.mode in ("RGB", "L"):
                pixels = np.array(source)
            else:
                # Palette, alpha, CMYK, ... are normalised to plain RGB.
                pixels = np.array(source.convert("RGB"))
                steps_applied.append("convert_rgb")
        # PIL hands back RGB channel order, so convert to gray directly
        # instead of going RGB -> BGR -> gray.
        if pixels.ndim == 3:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
        else:
            gray = pixels
        steps_applied.append("grayscale")
        # Optional stages, in pipeline order: (enabled, operation, step name).
        stages = (
            (apply_deskew, self._deskew, "deskew"),
            (apply_clahe, self._apply_clahe, "clahe"),
            (apply_denoise, self._denoise, "denoise"),
            (apply_threshold, self._adaptive_threshold, "threshold"),
        )
        for enabled, operation, step_name in stages:
            if enabled:
                gray = operation(gray)
                steps_applied.append(step_name)
        # Convert back to PNG bytes
        buffer = io.BytesIO()
        Image.fromarray(gray).save(buffer, format="PNG")
        return PreprocessingResult(
            image_bytes=buffer.getvalue(),
            preprocessing_applied=steps_applied,
        )
    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
        """
        Boost local contrast with CLAHE (clip limit 2.0, 8x8 tile grid).

        Best-effort: if OpenCV raises, a warning is logged and the input
        image is returned unchanged.
        """
        try:
            equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = equalizer.apply(image)
        except cv2.error as err:
            logger.warning(f"CLAHE failed: {err}")
            return image
        return enhanced
    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """
        Straighten a tilted image using line segments found by a Hough transform.

        The median angle of the near-horizontal segments is taken as the
        skew. Nothing is rotated when no usable segment is found or when the
        skew is below 0.5 or above 20 degrees. Best-effort: any exception is
        logged and the input image is returned unchanged.
        """
        try:
            edge_map = cv2.Canny(image, 50, 150, apertureSize=3)
            segments = cv2.HoughLinesP(
                edge_map,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if segments is None:
                return image
            # Collect the tilt of every segment within 45 degrees of horizontal.
            tilts = []
            for segment in segments:
                x1, y1, x2, y2 = segment[0]
                if x2 - x1 == 0:
                    continue  # vertical segment
                tilt = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                if -45 < tilt < 45:
                    tilts.append(tilt)
            if not tilts:
                return image
            # The median is robust against a few stray segments.
            skew = np.median(tilts)
            if abs(skew) < 0.5 or abs(skew) > 20:
                return image
            rows, cols = image.shape[:2]
            pivot = (cols // 2, rows // 2)
            transform = cv2.getRotationMatrix2D(pivot, skew, 1.0)
            # Enlarge the canvas so the rotated content is not cropped, then
            # shift the transform to keep the content centred.
            cos_abs = abs(transform[0, 0])
            sin_abs = abs(transform[0, 1])
            out_cols = int(rows * sin_abs + cols * cos_abs)
            out_rows = int(rows * cos_abs + cols * sin_abs)
            transform[0, 2] += (out_cols - cols) / 2
            transform[1, 2] += (out_rows - rows) / 2
            straightened = cv2.warpAffine(
                image,
                transform,
                (out_cols, out_rows),
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Deskewed by {skew:.2f} degrees")
            return straightened
        except Exception as err:
            logger.warning(f"Deskew failed: {err}")
            return image
    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """
        Run OpenCV non-local means denoising (h=10, 7px template, 21px search).

        Best-effort: if OpenCV raises, a warning is logged and the input
        image is returned unchanged.
        """
        try:
            cleaned = cv2.fastNlMeansDenoising(
                image, h=10, templateWindowSize=7, searchWindowSize=21
            )
        except cv2.error as err:
            logger.warning(f"Denoising failed: {err}")
            return image
        return cleaned
    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
        """
        Binarize the image with Gaussian adaptive thresholding.

        Uses an 11-pixel neighbourhood and a constant offset of 2; output
        pixels are 0 or 255. Best-effort: if OpenCV raises, a warning is
        logged and the input image is returned unchanged.
        """
        try:
            binary = cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as err:
            logger.warning(f"Adaptive threshold failed: {err}")
            return image
        return binary
    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
        """
        Guess where the VIN sits in an image.

        Finds external contours on a blurred edge map and keeps the bounding
        rectangles that are wide and flat (aspect ratio 4:1 to 12:1) and
        larger than 1000 px in area. The largest such rectangle wins.
        Best-effort: any exception is logged and None is returned.

        Args:
            image_bytes: Raw image bytes
        Returns:
            BoundingBox of detected VIN region, or None if not found
        """
        try:
            loaded = Image.open(io.BytesIO(image_bytes))
            grayscale = loaded if loaded.mode == "L" else loaded.convert("L")
            pixels = np.array(grayscale)
            # Blur first so edge detection is not dominated by noise.
            smoothed = cv2.GaussianBlur(pixels, (5, 5), 0)
            edge_map = cv2.Canny(smoothed, 50, 150)
            contours, _ = cv2.findContours(
                edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )
            if not contours:
                return None
            rectangles = (cv2.boundingRect(contour) for contour in contours)
            # A 17-character VIN is a wide, flat strip: accept 4:1 to 12:1
            # rectangles and drop tiny ones as noise.
            plausible = [
                (x, y, w, h)
                for x, y, w, h in rectangles
                if h != 0 and 4 <= w / h <= 12 and w * h > 1000
            ]
            if not plausible:
                return None
            # Largest area wins; on a tie the earliest contour is kept.
            x, y, w, h = max(plausible, key=lambda rect: rect[2] * rect[3])
            return BoundingBox(x=x, y=y, width=w, height=h)
        except Exception as err:
            logger.warning(f"VIN region detection failed: {err}")
            return None
 # Singleton instance
 vin_preprocessor = VinPreprocessor()
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -3,7 +3,8 @@ import logging
 from fastapi import APIRouter, File, HTTPException, Query, UploadFile
-from app.models import OcrResponse
+from app.extractors.vin_extractor import vin_extractor
 from app.models import BoundingBox, OcrResponse, VinAlternative, VinExtractionResponse
 from app.services import ocr_service
 logger = logging.getLogger(__name__)
@@ -67,3 +68,89 @@ async def extract_text(
        )
    return result
@router.post("/vin", response_model=VinExtractionResponse)
async def extract_vin(
    file: UploadFile = File(..., description="Image file containing VIN"),
) -> VinExtractionResponse:
    """
    Extract VIN (Vehicle Identification Number) from an uploaded image.
    Uses VIN-optimized preprocessing and pattern matching:
    - HEIC conversion (if needed)
    - Grayscale conversion
    - Deskew correction
    - CLAHE contrast enhancement
    - Noise reduction
    - Adaptive thresholding
    - VIN pattern matching (17 chars, excludes I/O/Q)
    - Check digit validation
    - Common OCR error correction (I->1, O->0, Q->0)
    Supports HEIC, JPEG, PNG formats.
    Processing time target: <3 seconds.
    - **file**: Image file (max 10MB)
    Returns:
    - **vin**: Extracted VIN (17 alphanumeric characters)
    - **confidence**: Confidence score (0.0-1.0)
    - **boundingBox**: Location of VIN in image (if detected)
    - **alternatives**: Other VIN candidates with confidence scores
    - **processingTimeMs**: Processing time in milliseconds
    """
    # Validate file presence
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")
    # Read at most one byte past the limit: enough to tell that the upload
    # is oversized without pulling an arbitrarily large body into memory.
    content = await file.read(MAX_SYNC_SIZE + 1)
    file_size = len(content)
    # Validate file size
    if file_size > MAX_SYNC_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
        )
    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file provided")
    logger.info(
        f"VIN extraction: {file.filename}, "
        f"size: {file_size} bytes, "
        f"content_type: {file.content_type}"
    )
    # Perform VIN extraction.
    # NOTE(review): extract() is synchronous, so this coroutine holds the
    # event loop while OCR runs -- consider fastapi.concurrency.run_in_threadpool.
    result = vin_extractor.extract(
        image_bytes=content,
        content_type=file.content_type,
    )
    # Convert the extractor's internal dataclasses into the API models.
    bounding_box = None
    if result.bounding_box:
        bounding_box = BoundingBox(
            x=result.bounding_box.x,
            y=result.bounding_box.y,
            width=result.bounding_box.width,
            height=result.bounding_box.height,
        )
    alternatives = [
        VinAlternative(vin=alt.vin, confidence=alt.confidence)
        for alt in result.alternatives
    ]
    # Extraction failures are returned as a normal response with
    # success=False and error set; only input problems raise HTTP errors.
    return VinExtractionResponse(
        success=result.success,
        vin=result.vin,
        confidence=result.confidence,
        boundingBox=bounding_box,
        alternatives=alternatives,
        processingTimeMs=result.processing_time_ms,
        error=result.error,
    )
--- a/ocr/app/validators/init.py
+++ b/ocr/app/validators/init.py
@@ -0,0 +1,4 @@
 """Validators package for OCR data validation."""
 from app.validators.vin_validator import VinValidator, vin_validator
 __all__ = ["VinValidator", "vin_validator"]
--- a/ocr/app/validators/vin_validator.py
+++ b/ocr/app/validators/vin_validator.py
@@ -0,0 +1,259 @@
 """VIN validation with check digit verification and OCR error correction."""
 import re
 from dataclasses import dataclass
 from typing import Optional
@dataclass
 class VinValidationResult:
    """Result of VIN validation."""
    is_valid: bool  # whether the VIN was accepted
    vin: str  # the VIN as evaluated (after any correction); "" for empty input
    confidence_adjustment: float  # delta the caller adds to its OCR confidence
    error: Optional[str] = None  # reason for rejection or reduced confidence, if any
 class VinValidator:
    """Validates and corrects VIN strings."""
    # VIN character set (excludes I, O, Q)
    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
    # Common OCR misreads and their corrections.
    # Only characters that can never appear in a VIN are mapped. L, B and S
    # are legal VIN characters, so they must not be rewritten to 1, 8 and 5:
    # doing so corrupts every genuine VIN that contains them.
    # The lowercase keys only matter to callers that do not uppercase first.
    TRANSLITERATION = {
        "I": "1",
        "O": "0",
        "Q": "0",
        "i": "1",
        "o": "0",
        "q": "0",
        "l": "1",
    }
    # Weights for check digit calculation (positions 1-17)
    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
    # Character to value mapping for check digit
    CHAR_VALUES = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }
    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")
    # Pre-1981 VIN pattern: 11-17 characters
    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")
    def correct_ocr_errors(self, vin: str) -> str:
        """
        Apply common OCR error corrections to a VIN string.
        Args:
            vin: Raw VIN string from OCR
        Returns:
            Corrected VIN string
        """
        corrected = vin.upper().strip()
        # Remove any spaces or dashes (common in formatted VINs)
        corrected = corrected.replace(" ", "").replace("-", "")
        # Apply transliteration for common OCR errors
        result = []
        for char in corrected:
            if char in self.TRANSLITERATION:
                result.append(self.TRANSLITERATION[char])
            else:
                result.append(char)
        return "".join(result)
    def calculate_check_digit(self, vin: str) -> Optional[str]:
        """
        Calculate the check digit (position 9) for a VIN.
        Args:
            vin: 17-character VIN string
        Returns:
            Expected check digit character, or None if calculation fails
        """
        if len(vin) != 17:
            return None
        try:
            total = 0
            for i, char in enumerate(vin.upper()):
                if i == 8:  # Skip check digit position
                    continue
                value = self.CHAR_VALUES.get(char)
                if value is None:
                    return None
                total += value * self.CHECK_WEIGHTS[i]
            remainder = total % 11
            if remainder == 10:
                return "X"
            return str(remainder)
        except (KeyError, ValueError):
            return None
    def validate_check_digit(self, vin: str) -> bool:
        """
        Validate the check digit of a VIN.
        Args:
            vin: 17-character VIN string
        Returns:
            True if check digit is valid
        """
        if len(vin) != 17:
            return False
        expected = self.calculate_check_digit(vin)
        if expected is None:
            return False
        return vin[8].upper() == expected
    def validate(
        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
    ) -> VinValidationResult:
        """
        Validate a VIN string and optionally correct OCR errors.

        Checks run in order: empty input, length, character set, check digit.
        A 17-character VIN that only fails the check digit is still reported
        as valid, with a negative confidence adjustment and an error note.

        Args:
            vin: VIN string to validate
            correct_errors: Whether to apply OCR error corrections
            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)
        Returns:
            VinValidationResult with validation status and corrected VIN
        """
        if not vin:
            return VinValidationResult(
                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
            )
        candidate = self.correct_ocr_errors(vin) if correct_errors else vin.upper()
        length = len(candidate)
        if length != 17:
            # Shorter VINs are only acceptable as legacy (pre-1981) numbers.
            if (
                allow_legacy
                and 11 <= length <= 17
                and self.LEGACY_VIN_PATTERN.match(candidate)
            ):
                return VinValidationResult(
                    is_valid=True,
                    vin=candidate,
                    confidence_adjustment=-0.2,
                )
            return VinValidationResult(
                is_valid=False,
                vin=candidate,
                confidence_adjustment=-0.5,
                error=f"Invalid length: {length} (expected 17)",
            )
        if not self.MODERN_VIN_PATTERN.match(candidate):
            invalid_chars = [c for c in candidate if c not in self.VALID_CHARS]
            return VinValidationResult(
                is_valid=False,
                vin=candidate,
                confidence_adjustment=-0.3,
                error=f"Invalid characters: {invalid_chars}",
            )
        if not self.validate_check_digit(candidate):
            # Could be an OCR error or an old VIN: keep it, but trust it less.
            return VinValidationResult(
                is_valid=True,
                vin=candidate,
                confidence_adjustment=-0.15,
                error="Check digit validation failed",
            )
        # Matching check digit: boost confidence.
        return VinValidationResult(
            is_valid=True, vin=candidate, confidence_adjustment=0.1
        )
    def extract_candidates(
        self, text: str, max_candidates: int = 5
    ) -> list[tuple[str, int, int]]:
        """
        Extract VIN candidates from raw OCR text.
        Args:
            text: Raw OCR text
            max_candidates: Maximum number of candidates to return
        Returns:
            List of (vin, start_pos, end_pos) tuples
        """
        # Pattern to find potential VIN sequences
        # Allow some flexibility for OCR errors (include I, O, Q for correction later)
        potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
        candidates = []
        for match in potential_vin_pattern.finditer(text.upper()):
            candidate = match.group()
            corrected = self.correct_ocr_errors(candidate)
            # Only include if it could be a valid VIN after correction
            if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
                candidates.append((corrected, match.start(), match.end()))
        # Sort by likelihood of being valid (check digit validation)
        def score_candidate(c: tuple[str, int, int]) -> int:
            vin = c[0]
            if self.validate_check_digit(vin):
                return 0  # Best score
            return 1
        candidates.sort(key=score_candidate)
        return candidates[:max_candidates]
 # Singleton instance
 vin_validator = VinValidator()
--- a/ocr/tests/test_vin_extraction.py
+++ b/ocr/tests/test_vin_extraction.py
@@ -0,0 +1,242 @@
 """Integration tests for VIN extraction endpoint."""
 import io
 from unittest.mock import patch, MagicMock
 import pytest
 from fastapi.testclient import TestClient
 from PIL import Image, ImageDraw, ImageFont
 from app.main import app
@pytest.fixture
def client() -> TestClient:
    """Provide a TestClient wrapping the application under test."""
    test_client = TestClient(app)
    return test_client
 def create_vin_image(vin: str = "1HGBH41JXMN109186") -> bytes:
    """Render ``vin`` in black on a white 400x100 canvas and return it as PNG bytes."""
    white, black = (255, 255, 255), (0, 0, 0)
    canvas = Image.new("RGB", (400, 100), white)
    # No font argument is passed, so the drawing library's default font is used
    ImageDraw.Draw(canvas).text((50, 40), vin, fill=black)
    with io.BytesIO() as png_stream:
        canvas.save(png_stream, format="PNG")
        return png_stream.getvalue()
 def create_empty_image() -> bytes:
    """Return PNG bytes for a blank white 400x100 image with nothing drawn on it."""
    blank = Image.new("RGB", (400, 100), (255, 255, 255))
    with io.BytesIO() as png_stream:
        blank.save(png_stream, format="PNG")
        return png_stream.getvalue()
 class TestVinExtractionEndpoint:
    """Endpoint-level tests for POST /extract/vin."""
    @staticmethod
    def _upload(client: TestClient, name: str, content: bytes, mime: str):
        """POST ``content`` to /extract/vin as the multipart field ``file``."""
        return client.post("/extract/vin", files={"file": (name, content, mime)})
    def test_endpoint_exists(self, client: TestClient) -> None:
        """The root listing advertises at least one endpoint mentioning "vin"."""
        root = client.get("/")
        assert root.status_code == 200
        listed = root.json().get("endpoints", [])
        assert any("vin" in entry.lower() for entry in listed)
    def test_extract_vin_no_file(self, client: TestClient) -> None:
        """A request without the file part is rejected."""
        resp = client.post("/extract/vin")
        assert resp.status_code == 422  # Validation error
    def test_extract_vin_empty_file(self, client: TestClient) -> None:
        """A zero-byte upload yields 400 with a detail message mentioning "empty"."""
        resp = self._upload(client, "empty.png", b"", "image/png")
        assert resp.status_code == 400
        detail = resp.json()["detail"]
        assert "empty" in detail.lower()
    def test_extract_vin_large_file(self, client: TestClient) -> None:
        """An oversized upload yields 413."""
        # 11 MiB payload, i.e. larger than 10MB
        oversized = b"x" * (11 * 1024 * 1024)
        resp = self._upload(client, "large.png", oversized, "image/png")
        assert resp.status_code == 413
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_extract_vin_success(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """A successful extraction is relayed with VIN, confidence and timing."""
        from app.extractors.vin_extractor import VinExtractionResult
        expected_vin = "1HGBH41JXMN109186"
        mock_extract.return_value = VinExtractionResult(
            success=True,
            vin=expected_vin,
            confidence=0.94,
            bounding_box=None,
            alternatives=[],
            processing_time_ms=500,
        )
        resp = self._upload(client, "vin.png", create_vin_image(), "image/png")
        assert resp.status_code == 200
        payload = resp.json()
        assert payload["success"] is True
        assert payload["vin"] == expected_vin
        assert payload["confidence"] == 0.94
        assert "processingTimeMs" in payload
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_extract_vin_not_found(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """When no VIN is found the endpoint still answers 200 with success=false."""
        from app.extractors.vin_extractor import VinExtractionResult
        failure_message = "No VIN pattern found in image"
        mock_extract.return_value = VinExtractionResult(
            success=False,
            vin=None,
            confidence=0.0,
            error=failure_message,
            processing_time_ms=300,
        )
        resp = self._upload(client, "empty.png", create_empty_image(), "image/png")
        assert resp.status_code == 200
        payload = resp.json()
        assert payload["success"] is False
        assert payload["vin"] is None
        assert payload["error"] == failure_message
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_extract_vin_with_alternatives(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """Alternative readings from the extractor appear in the response."""
        from app.extractors.vin_extractor import VinExtractionResult, VinAlternative
        primary_vin = "1HGBH41JXMN109186"
        mock_extract.return_value = VinExtractionResult(
            success=True,
            vin=primary_vin,
            confidence=0.94,
            bounding_box=None,
            alternatives=[VinAlternative(vin=primary_vin, confidence=0.72)],
            processing_time_ms=600,
        )
        resp = self._upload(client, "vin.png", create_vin_image(), "image/png")
        assert resp.status_code == 200
        payload = resp.json()
        assert payload["success"] is True
        alternatives = payload["alternatives"]
        assert len(alternatives) == 1
        assert alternatives[0]["confidence"] == 0.72
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_extract_vin_with_bounding_box(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """The bounding box from the extractor is serialized under "boundingBox"."""
        from app.extractors.vin_extractor import VinExtractionResult
        from app.preprocessors.vin_preprocessor import BoundingBox
        located = BoundingBox(x=50, y=40, width=300, height=20)
        mock_extract.return_value = VinExtractionResult(
            success=True,
            vin="1HGBH41JXMN109186",
            confidence=0.94,
            bounding_box=located,
            alternatives=[],
            processing_time_ms=500,
        )
        resp = self._upload(client, "vin.png", create_vin_image(), "image/png")
        assert resp.status_code == 200
        returned_box = resp.json()["boundingBox"]
        assert returned_box is not None
        # Each coordinate must come back exactly as the extractor reported it
        for key, expected in (("x", 50), ("y", 40), ("width", 300), ("height", 20)):
            assert returned_box[key] == expected
 class TestVinExtractionContentTypes:
    """Checks that the VIN endpoint accepts JPEG and PNG uploads."""
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_accepts_jpeg(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """A JPEG upload is answered with HTTP 200."""
        from app.extractors.vin_extractor import VinExtractionResult
        mock_extract.return_value = VinExtractionResult(
            success=True,
            vin="1HGBH41JXMN109186",
            confidence=0.9,
            processing_time_ms=400,
        )
        # Encode a blank white canvas as JPEG in memory
        jpeg_stream = io.BytesIO()
        Image.new("RGB", (400, 100), (255, 255, 255)).save(jpeg_stream, format="JPEG")
        upload = {"file": ("vin.jpg", jpeg_stream.getvalue(), "image/jpeg")}
        resp = client.post("/extract/vin", files=upload)
        assert resp.status_code == 200
    @patch("app.extractors.vin_extractor.vin_extractor.extract")
    def test_accepts_png(
        self, mock_extract: MagicMock, client: TestClient
    ) -> None:
        """A PNG upload is answered with HTTP 200."""
        from app.extractors.vin_extractor import VinExtractionResult
        mock_extract.return_value = VinExtractionResult(
            success=True,
            vin="1HGBH41JXMN109186",
            confidence=0.9,
            processing_time_ms=400,
        )
        upload = {"file": ("vin.png", create_vin_image(), "image/png")}
        resp = client.post("/extract/vin", files=upload)
        assert resp.status_code == 200
--- a/ocr/tests/test_vin_preprocessor.py
+++ b/ocr/tests/test_vin_preprocessor.py
@@ -0,0 +1,202 @@
 """Unit tests for VIN preprocessor."""
 import io
 from unittest.mock import patch, MagicMock
 import numpy as np
 import pytest
 from PIL import Image
 from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
 def create_test_image(width: int = 400, height: int = 100, color: int = 128) -> bytes:
    """Return PNG bytes for a ``width`` x ``height`` RGB image filled with one gray level."""
    gray_fill = (color, color, color)
    solid = Image.new("RGB", (width, height), gray_fill)
    with io.BytesIO() as png_stream:
        solid.save(png_stream, format="PNG")
        return png_stream.getvalue()
 def create_grayscale_test_image(width: int = 400, height: int = 100) -> bytes:
    """Return PNG bytes for a single-channel ("L" mode) image filled with value 128."""
    mono = Image.new("L", (width, height), 128)
    with io.BytesIO() as png_stream:
        mono.save(png_stream, format="PNG")
        return png_stream.getvalue()
 class TestVinPreprocessor:
    """Tests for the VIN-optimized preprocessing pipeline."""
    def test_preprocess_returns_result(self) -> None:
        """Default preprocessing yields non-empty bytes and records "grayscale"."""
        processed = VinPreprocessor().preprocess(create_test_image())
        assert processed.image_bytes is not None
        assert len(processed.image_bytes) > 0
        assert "grayscale" in processed.preprocessing_applied
    def test_preprocess_applies_all_steps(self) -> None:
        """Every step that is switched on is recorded in preprocessing_applied."""
        processed = VinPreprocessor().preprocess(
            create_test_image(),
            apply_clahe=True,
            apply_deskew=True,
            apply_denoise=True,
            apply_threshold=True,
        )
        for step in ("grayscale", "clahe", "deskew", "denoise", "threshold"):
            assert step in processed.preprocessing_applied
    def test_preprocess_skips_disabled_steps(self) -> None:
        """Steps that are switched off are absent from preprocessing_applied."""
        processed = VinPreprocessor().preprocess(
            create_test_image(),
            apply_clahe=False,
            apply_deskew=False,
            apply_denoise=False,
            apply_threshold=False,
        )
        for step in ("clahe", "deskew", "denoise", "threshold"):
            assert step not in processed.preprocessing_applied
    def test_preprocess_output_is_valid_image(self) -> None:
        """The processed bytes can be reopened as an image and are PNG-encoded."""
        processed = VinPreprocessor().preprocess(create_test_image())
        # Reopening the bytes proves they form a decodable image
        reopened = Image.open(io.BytesIO(processed.image_bytes))
        assert reopened is not None
        assert reopened.format == "PNG"
    def test_preprocess_handles_grayscale_input(self) -> None:
        """A single-channel input image is processed into non-empty output."""
        processed = VinPreprocessor().preprocess(create_grayscale_test_image())
        assert processed.image_bytes is not None
        assert len(processed.image_bytes) > 0
    def test_preprocess_handles_rgba_input(self) -> None:
        """An RGBA input is processed and the "convert_rgb" step is recorded."""
        rgba = Image.new("RGBA", (400, 100), (128, 128, 128, 255))
        stream = io.BytesIO()
        rgba.save(stream, format="PNG")
        processed = VinPreprocessor().preprocess(stream.getvalue())
        assert processed.image_bytes is not None
        assert "convert_rgb" in processed.preprocessing_applied
    def test_singleton_instance(self) -> None:
        """The module-level vin_preprocessor exists and is a VinPreprocessor."""
        assert vin_preprocessor is not None
        assert isinstance(vin_preprocessor, VinPreprocessor)
 class TestVinPreprocessorDeskew:
    """Tests for the deskew step."""
    def test_deskew_no_change_for_straight_image(self) -> None:
        """Deskewing an image with a level horizontal line returns a non-empty array."""
        # Black canvas with one white horizontal line, i.e. no skew
        canvas = np.zeros((100, 400), dtype=np.uint8)
        canvas[50, 50:350] = 255
        straightened = VinPreprocessor()._deskew(canvas)
        # Only non-degenerate dimensions are required; the exact shape may vary
        rows, cols = straightened.shape[0], straightened.shape[1]
        assert rows > 0
        assert cols > 0
 class TestVinPreprocessorCLAHE:
    """Tests for the CLAHE contrast enhancement step."""
    def test_clahe_improves_contrast(self) -> None:
        """CLAHE returns an array with the same shape as its input."""
        # Uniform mid-gray image, i.e. minimal contrast
        flat_gray = np.full((100, 400), 128, dtype=np.uint8)
        enhanced = VinPreprocessor()._apply_clahe(flat_gray)
        assert enhanced.shape == flat_gray.shape
 class TestVinPreprocessorDenoise:
    """Tests for the denoising step."""
    def test_denoise_reduces_noise(self) -> None:
        """Denoising a random-noise image returns an array of the same shape."""
        # Every pixel drawn uniformly from 0..255
        noisy = np.random.randint(0, 256, (100, 400), dtype=np.uint8)
        cleaned = VinPreprocessor()._denoise(noisy)
        assert cleaned.shape == noisy.shape
 class TestVinPreprocessorThreshold:
    """Tests for the adaptive thresholding step."""
    def test_threshold_creates_binary_image(self) -> None:
        """Thresholding a uniform gray image yields at most two distinct pixel values."""
        flat_gray = np.full((100, 400), 128, dtype=np.uint8)
        binarized = VinPreprocessor()._adaptive_threshold(flat_gray)
        # A binary image cannot contain more than two distinct levels
        distinct_levels = np.unique(binarized)
        assert len(distinct_levels) <= 2
 class TestVinRegionDetection:
    """Tests for VIN region detection."""
    def test_detect_vin_region_returns_none_for_empty(self) -> None:
        """A uniform image gives either no region or a region with positive width."""
        # Solid gray image: there is nothing distinctive to detect
        uniform_png = create_test_image(color=128)
        region = VinPreprocessor().detect_vin_region(uniform_png)
        # None is the expected outcome, but any returned box must be non-degenerate
        assert region is None or region.width > 0
--- a/ocr/tests/test_vin_validator.py
+++ b/ocr/tests/test_vin_validator.py
@@ -0,0 +1,211 @@
 """Unit tests for VIN validator."""
 import pytest
 from app.validators.vin_validator import VinValidator, vin_validator
 class TestVinValidator:
    """Tests for VIN validation logic: OCR correction, check digit, validate(), candidates."""
    def test_correct_ocr_errors_basic(self) -> None:
        """Test basic OCR error correction (I -> 1, O -> 0, Q -> 0)."""
        validator = VinValidator()
        # I -> 1
        assert validator.correct_ocr_errors("IHGBH41JXMN109186") == "1HGBH41JXMN109186"
        # O -> 0
        assert validator.correct_ocr_errors("1HGBH41JXMN1O9186") == "1HGBH41JXMN109186"
        # Q -> 0
        assert validator.correct_ocr_errors("1HGBH41JXMN1Q9186") == "1HGBH41JXMN109186"
    def test_correct_ocr_errors_lowercase(self) -> None:
        """Test OCR error correction upper-cases lowercase input."""
        validator = VinValidator()
        result = validator.correct_ocr_errors("1hgbh41jxmn109186")
        assert result == "1HGBH41JXMN109186"
    def test_correct_ocr_errors_strips_spaces(self) -> None:
        """Test OCR error correction removes spaces and dashes."""
        validator = VinValidator()
        assert validator.correct_ocr_errors("1HG BH41 JXMN 109186") == "1HGBH41JXMN109186"
        assert validator.correct_ocr_errors("1HG-BH41-JXMN-109186") == "1HGBH41JXMN109186"
    def test_calculate_check_digit(self) -> None:
        """Test check digit calculation against hand-computed values."""
        validator = VinValidator()
        # 1HGBH41JXMN109186: the standard weighted sum is 340 and 340 % 11 == 10,
        # which is written "X" and matches the character at position 9.
        result = validator.calculate_check_digit("1HGBH41JXMN109186")
        assert result == "X"
        # 5YJSA1E28HF123456 is a made-up VIN: position 9 holds "8", but position 9
        # has weight 0, the weighted sum is 346 and 346 % 11 == 5, so the
        # calculated digit is "5" (this VIN does not pass check digit validation).
        result = validator.calculate_check_digit("5YJSA1E28HF123456")
        assert result == "5"
    def test_validate_check_digit_valid(self) -> None:
        """Test check digit validation with valid VIN."""
        validator = VinValidator()
        # This VIN has a valid check digit
        assert validator.validate_check_digit("1HGBH41JXMN109186") is True
    def test_validate_check_digit_invalid(self) -> None:
        """Test check digit validation with invalid VIN."""
        validator = VinValidator()
        # Same VIN with the check digit (position 9) changed from X to 1
        assert validator.validate_check_digit("1HGBH41J1MN109186") is False
    def test_validate_modern_vin_valid(self) -> None:
        """Test validation of valid modern VIN."""
        validator = VinValidator()
        result = validator.validate("1HGBH41JXMN109186")
        assert result.is_valid is True
        assert result.vin == "1HGBH41JXMN109186"
        assert result.confidence_adjustment > 0  # Check digit valid = boost
    def test_validate_modern_vin_with_ocr_errors(self) -> None:
        """Test validation corrects OCR errors."""
        validator = VinValidator()
        # I at start should be corrected to 1
        result = validator.validate("IHGBH41JXMN109186")
        assert result.is_valid is True
        assert result.vin == "1HGBH41JXMN109186"
    def test_validate_short_vin(self) -> None:
        """Test validation rejects short VIN with a length-related error."""
        validator = VinValidator()
        result = validator.validate("1HGBH41JX")
        assert result.is_valid is False
        assert "length" in result.error.lower()
    def test_validate_long_vin(self) -> None:
        """Test validation rejects long VIN with a length-related error."""
        validator = VinValidator()
        result = validator.validate("1HGBH41JXMN109186XX")
        assert result.is_valid is False
        assert "length" in result.error.lower()
    def test_validate_empty_vin(self) -> None:
        """Test validation handles empty VIN."""
        validator = VinValidator()
        result = validator.validate("")
        assert result.is_valid is False
        assert "empty" in result.error.lower()
    def test_validate_invalid_characters(self) -> None:
        """Test validation rejects invalid characters when correction is disabled."""
        validator = VinValidator()
        # Contains characters not in VIN alphabet
        result = validator.validate("1HGBH41JXMN!@#186", correct_errors=False)
        assert result.is_valid is False
        assert "character" in result.error.lower()
    def test_validate_legacy_vin_allowed(self) -> None:
        """Test validation allows legacy VINs when enabled."""
        validator = VinValidator()
        # 13-character VIN (pre-1981)
        result = validator.validate("ABCD123456789", allow_legacy=True)
        assert result.is_valid is True
        assert result.confidence_adjustment < 0  # Reduced confidence for legacy
    def test_validate_legacy_vin_rejected(self) -> None:
        """Test validation rejects legacy VINs when allow_legacy is False."""
        validator = VinValidator()
        result = validator.validate("ABCD123456789", allow_legacy=False)
        assert result.is_valid is False
    def test_extract_candidates_finds_vin(self) -> None:
        """Test candidate extraction from text."""
        validator = VinValidator()
        text = "VIN: 1HGBH41JXMN109186 is shown here"
        candidates = validator.extract_candidates(text)
        assert len(candidates) >= 1
        assert candidates[0][0] == "1HGBH41JXMN109186"
    def test_extract_candidates_multiple_vins(self) -> None:
        """Test candidate extraction with multiple VINs."""
        validator = VinValidator()
        text = "First VIN: 1HGBH41JXMN109186 Second VIN: 5YJSA1E28HF123456"
        candidates = validator.extract_candidates(text)
        assert len(candidates) >= 2
        vins = [c[0] for c in candidates]
        assert "1HGBH41JXMN109186" in vins
        assert "5YJSA1E28HF123456" in vins
    def test_extract_candidates_with_ocr_errors(self) -> None:
        """Test candidate extraction corrects OCR errors."""
        validator = VinValidator()
        # Contains O instead of 0
        text = "VIN: 1HGBH41JXMN1O9186"
        candidates = validator.extract_candidates(text)
        assert len(candidates) >= 1
        assert candidates[0][0] == "1HGBH41JXMN109186"
    def test_extract_candidates_no_vin(self) -> None:
        """Test candidate extraction with no VIN."""
        validator = VinValidator()
        text = "This text contains no VIN numbers"
        candidates = validator.extract_candidates(text)
        assert len(candidates) == 0
    def test_singleton_instance(self) -> None:
        """Test singleton instance is available."""
        assert vin_validator is not None
        assert isinstance(vin_validator, VinValidator)
 class TestVinValidatorEdgeCases:
    """Edge-case tests for the VIN validator."""
    def test_all_zeros_vin(self) -> None:
        """An all-zero VIN is accepted and comes back 17 characters long."""
        outcome = VinValidator().validate("00000000000000000")
        assert outcome.is_valid is True
        assert len(outcome.vin) == 17
    def test_mixed_case_vin(self) -> None:
        """Mixed-case input is accepted and returned upper-cased."""
        outcome = VinValidator().validate("1hGbH41jXmN109186")
        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"
    def test_vin_with_leading_trailing_whitespace(self) -> None:
        """Surrounding whitespace is ignored and not part of the returned VIN."""
        outcome = VinValidator().validate("  1HGBH41JXMN109186  ")
        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"
    def test_check_digit_x(self) -> None:
        """A VIN whose check digit is the letter X passes check digit validation."""
        # Position 9 of this VIN is "X"
        vin_with_x = "1HGBH41JXMN109186"
        assert VinValidator().validate_check_digit(vin_with_x) is True