Merge pull request 'feat: VIN Photo OCR Pipeline (#67)' (#75) from issue-67-vin-ocr-pipeline into main

Reviewed-on: #75
2026-02-02 01:36:25 +00:00
parent 004940b013 54cbd49171
commit 944a5963ab
14 changed files with 1694 additions and 1 deletions
--- a/ocr/app/extractors/init.py
+++ b/ocr/app/extractors/init.py
@@ -0,0 +1,10 @@
+"""Extractors package for domain-specific OCR extraction."""
+from app.extractors.base import BaseExtractor, ExtractionResult
+from app.extractors.vin_extractor import VinExtractor, vin_extractor
+
+__all__ = [
+    "BaseExtractor",
+    "ExtractionResult",
+    "VinExtractor",
+    "vin_extractor",
+]
--- a/ocr/app/extractors/base.py
+++ b/ocr/app/extractors/base.py
@@ -0,0 +1,47 @@
+"""Base extractor class for domain-specific OCR extraction."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+
+@dataclass
+class ExtractionResult:
+    """
+    Base result for extraction operations.
+
+    The first four fields are required; ``extracted_data`` and ``error`` are
+    optional and default to "nothing extracted" / "no error".
+    """
+
+    # Whether the extraction produced a usable result.
+    success: bool
+    # Confidence score for the result. NOTE(review): presumably 0.0-1.0 like
+    # the API schemas; nothing in this class enforces a range.
+    confidence: float
+    # Text as returned by the OCR step, before any domain-specific parsing.
+    raw_text: str
+    # Processing duration in milliseconds.
+    processing_time_ms: int
+    # Domain-specific payload; default_factory gives every instance its own
+    # dict instead of sharing one mutable default.
+    extracted_data: dict[str, Any] = field(default_factory=dict)
+    # Error description; None when no error was recorded.
+    error: Optional[str] = None
+
+
+class BaseExtractor(ABC):
+    """Contract that every domain-specific extractor has to implement."""
+
+    @abstractmethod
+    def extract(self, image_bytes: bytes, content_type: Optional[str] = None) -> ExtractionResult:
+        """
+        Pull domain-specific data out of an image.
+
+        Args:
+            image_bytes: Raw bytes of the image
+            content_type: MIME type of the image, if the caller knows it
+
+        Returns:
+            ExtractionResult describing what was extracted
+        """
+
+    @abstractmethod
+    def validate(self, data: Any) -> bool:
+        """
+        Check a piece of extracted data.
+
+        Args:
+            data: Extracted data to check
+
+        Returns:
+            True when the data is valid
+        """
--- a/ocr/app/extractors/vin_extractor.py
+++ b/ocr/app/extractors/vin_extractor.py
@@ -0,0 +1,275 @@
+"""VIN-specific OCR extractor with preprocessing and validation."""
+import io
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+import magic
+import pytesseract
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+from app.config import settings
+from app.extractors.base import BaseExtractor
+from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox
+from app.validators.vin_validator import vin_validator
+
+# Register HEIF/HEIC opener
+register_heif_opener()
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class VinAlternative:
+    """Alternative VIN candidate with confidence."""
+
+    # Candidate VIN string as returned by the validator (after OCR-error
+    # correction).
+    vin: str
+    # Score for this candidate; the extractor clamps it to 0.0-1.0 before
+    # building alternatives.
+    confidence: float
+
+
+@dataclass
+class VinExtractionResult:
+    """
+    Result of VIN extraction.
+
+    Only ``success`` is required; every other field has a default so failure
+    results can be built from just an error message and a duration.
+    """
+
+    # True when a primary VIN was selected.
+    success: bool
+    # The selected (primary) VIN; None on failure.
+    vin: Optional[str] = None
+    # Confidence of the primary VIN, clamped to 0.0-1.0 by the extractor.
+    confidence: float = 0.0
+    # Region reported by the preprocessor, if it reported one.
+    bounding_box: Optional[BoundingBox] = None
+    # Other scored candidates; the extractor reports at most four.
+    alternatives: list[VinAlternative] = field(default_factory=list)
+    # Duration of the extract() call in milliseconds.
+    processing_time_ms: int = 0
+    # Failure description; None on success.
+    error: Optional[str] = None
+    # OCR text the candidates were searched in; set on success and on the
+    # "No VIN pattern found" failure, left None on other failures.
+    raw_text: Optional[str] = None
+
+
+class VinExtractor(BaseExtractor):
+    """
+    VIN-specific OCR extractor optimized for VIN plates and stickers.
+
+    Pipeline: MIME check -> VIN preprocessing -> Tesseract OCR (PSM 6, falling
+    back to the modes in ``ALTERNATE_PSMS``) -> candidate extraction ->
+    validation and confidence scoring.
+    """
+
+    # Supported MIME types
+    SUPPORTED_TYPES = {
+        "image/jpeg",
+        "image/png",
+        "image/heic",
+        "image/heif",
+    }
+
+    # VIN character whitelist for Tesseract (I, O and Q never occur in a VIN)
+    VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789"
+
+    # Page segmentation modes retried, in order, when the default pass yields
+    # no VIN candidate: 7 = single text line, 8 = single word.
+    ALTERNATE_PSMS = (7, 8)
+
+    # Upper bound on the number of non-primary candidates reported.
+    MAX_ALTERNATIVES = 4
+
+    def __init__(self) -> None:
+        """Point pytesseract at the configured Tesseract binary."""
+        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
+
+    def extract(
+        self, image_bytes: bytes, content_type: Optional[str] = None
+    ) -> VinExtractionResult:
+        """
+        Extract VIN from an image using optimized preprocessing and OCR.
+
+        Never raises: any exception from preprocessing, OCR or validation is
+        logged and reported through ``success=False`` / ``error``.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+            content_type: MIME type (auto-detected if not provided)
+
+        Returns:
+            VinExtractionResult with extracted VIN and metadata
+        """
+        # perf_counter is monotonic, so the reported duration cannot go
+        # negative or jump when the wall clock is adjusted.
+        started = time.perf_counter()
+
+        def elapsed_ms() -> int:
+            return int((time.perf_counter() - started) * 1000)
+
+        # Detect content type if not provided
+        if not content_type:
+            content_type = self._detect_mime_type(image_bytes)
+
+        # Validate content type
+        if content_type not in self.SUPPORTED_TYPES:
+            return VinExtractionResult(
+                success=False,
+                error=f"Unsupported file type: {content_type}",
+                processing_time_ms=elapsed_ms(),
+            )
+
+        try:
+            # Apply VIN-optimized preprocessing
+            preprocessing_result = vin_preprocessor.preprocess(image_bytes)
+            preprocessed_bytes = preprocessing_result.image_bytes
+
+            # Perform OCR with VIN-optimized settings
+            raw_text, word_confidences = self._perform_ocr(preprocessed_bytes)
+
+            # Extract VIN candidates from raw text
+            candidates = vin_validator.extract_candidates(raw_text)
+
+            if not candidates:
+                # Retry with other PSM modes. When one succeeds, its text and
+                # confidences replace the first pass's values so the score and
+                # raw_text describe the pass that actually produced the VIN.
+                candidates, alt_text, alt_confidences = self._ocr_alternate_passes(
+                    preprocessed_bytes
+                )
+                if candidates:
+                    raw_text, word_confidences = alt_text, alt_confidences
+
+            if not candidates:
+                return VinExtractionResult(
+                    success=False,
+                    error="No VIN pattern found in image",
+                    raw_text=raw_text,
+                    processing_time_ms=elapsed_ms(),
+                )
+
+            # The OCR-level confidence is the same for every candidate, so it
+            # is computed once; only the validation adjustment differs.
+            base_confidence = self._calculate_base_confidence(word_confidences)
+
+            scored_candidates = []
+            for candidate_vin, _start, _end in candidates:
+                validation = vin_validator.validate(candidate_vin)
+                adjusted_confidence = min(
+                    1.0, max(0.0, base_confidence + validation.confidence_adjustment)
+                )
+                scored_candidates.append(
+                    (validation.vin, adjusted_confidence, validation.is_valid)
+                )
+
+            # Sort by confidence (stable, so validator ordering breaks ties)
+            scored_candidates.sort(key=lambda scored: scored[1], reverse=True)
+
+            # Primary result is the highest-confidence valid candidate; if none
+            # is valid, fall back to the highest-confidence one (index 0).
+            primary_index = next(
+                (
+                    index
+                    for index, (_, _, is_valid) in enumerate(scored_candidates)
+                    if is_valid
+                ),
+                0,
+            )
+            primary_vin, primary_confidence, _ = scored_candidates[primary_index]
+
+            # Alternatives are every other distinct VIN, best first. Skipping
+            # by index (not by slicing off element 0) keeps this correct when
+            # the primary is not the top-scored entry, and the seen-set stops
+            # the same VIN from being listed twice.
+            seen_vins = {primary_vin}
+            alternatives = []
+            for index, (alt_vin, alt_confidence, _) in enumerate(scored_candidates):
+                if index == primary_index or alt_vin in seen_vins:
+                    continue
+                seen_vins.add(alt_vin)
+                alternatives.append(
+                    VinAlternative(vin=alt_vin, confidence=alt_confidence)
+                )
+                if len(alternatives) >= self.MAX_ALTERNATIVES:
+                    break
+
+            processing_time_ms = elapsed_ms()
+
+            logger.info(
+                "VIN extraction: %s, confidence=%.2f%%, time=%dms",
+                primary_vin,
+                primary_confidence * 100,
+                processing_time_ms,
+            )
+
+            return VinExtractionResult(
+                success=True,
+                vin=primary_vin,
+                confidence=primary_confidence,
+                bounding_box=preprocessing_result.bounding_box,
+                alternatives=alternatives,
+                processing_time_ms=processing_time_ms,
+                raw_text=raw_text,
+            )
+
+        except Exception as e:
+            # Boundary handler: callers receive a failure result, not a raise.
+            logger.error(f"VIN extraction failed: {e}", exc_info=True)
+            return VinExtractionResult(
+                success=False,
+                error=str(e),
+                processing_time_ms=elapsed_ms(),
+            )
+
+    def _detect_mime_type(self, file_bytes: bytes) -> str:
+        """Detect MIME type using python-magic; falls back to octet-stream."""
+        mime = magic.Magic(mime=True)
+        detected = mime.from_buffer(file_bytes)
+        return detected or "application/octet-stream"
+
+    def _perform_ocr(
+        self, image_bytes: bytes, psm: int = 6
+    ) -> tuple[str, list[float]]:
+        """
+        Perform OCR with VIN-optimized settings.
+
+        Args:
+            image_bytes: Preprocessed image bytes
+            psm: Tesseract page segmentation mode
+                 6 = Uniform block of text
+                 7 = Single text line
+                 8 = Single word
+
+        Returns:
+            Tuple of (raw_text, word_confidences); raw_text is the recognised
+            words joined by single spaces, word_confidences are scaled to 0-1.
+        """
+        # Configure Tesseract for VIN extraction
+        # Use character whitelist to exclude I, O, Q
+        config = (
+            f"--psm {psm} "
+            f"-c tessedit_char_whitelist={self.VIN_WHITELIST}"
+        )
+
+        # The context manager releases the decoder as soon as OCR is done.
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            ocr_data = pytesseract.image_to_data(
+                image, config=config, output_type=pytesseract.Output.DICT
+            )
+
+        words = []
+        confidences = []
+        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
+            # Depending on the Tesseract/pytesseract versions, conf arrives as
+            # an int, a float, or a string such as "95.73"; int() rejects the
+            # latter, so parse as float and skip anything unparseable.
+            try:
+                conf = float(raw_conf)
+            except (TypeError, ValueError):
+                continue
+            word = text.strip()
+            # Non-word layout rows carry conf -1 and empty text.
+            if word and conf > 0:
+                words.append(word)
+                confidences.append(conf / 100.0)
+
+        return " ".join(words), confidences
+
+    def _ocr_alternate_passes(
+        self, image_bytes: bytes
+    ) -> tuple[list[tuple[str, int, int]], str, list[float]]:
+        """
+        Re-run OCR with each mode in ALTERNATE_PSMS until one yields candidates.
+
+        Returns:
+            (candidates, raw_text, word_confidences) of the first pass that
+            produced candidates, or ([], "", []) when none did.
+        """
+        for psm in self.ALTERNATE_PSMS:
+            raw_text, confidences = self._perform_ocr(image_bytes, psm=psm)
+            candidates = vin_validator.extract_candidates(raw_text)
+            if candidates:
+                return candidates, raw_text, confidences
+        return [], "", []
+
+    def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]:
+        """
+        Try alternate OCR configurations when initial extraction fails.
+
+        Kept for callers that only need the candidates; extract() uses
+        _ocr_alternate_passes to also obtain the matching text/confidences.
+
+        Returns:
+            List of VIN candidates
+        """
+        candidates, _, _ = self._ocr_alternate_passes(image_bytes)
+        return candidates
+
+    def _calculate_base_confidence(self, word_confidences: list[float]) -> float:
+        """
+        Calculate base confidence from word confidences.
+
+        Returns 0.5 when there are no word confidences; otherwise a blend of
+        70% mean and 30% minimum, so one badly-read word pulls the score down.
+        """
+        if not word_confidences:
+            return 0.5
+
+        avg_conf = sum(word_confidences) / len(word_confidences)
+        min_conf = min(word_confidences)
+
+        # Blend: 70% average, 30% minimum
+        return 0.7 * avg_conf + 0.3 * min_conf
+
+    def validate(self, data: str) -> bool:
+        """
+        Validate a VIN string.
+
+        Args:
+            data: VIN string to validate
+
+        Returns:
+            True if VIN is valid
+        """
+        return vin_validator.validate(data).is_valid
+
+
+# Module-level shared instance, created at import time. Constructing it runs
+# VinExtractor.__init__, which points pytesseract at settings.tesseract_cmd.
+vin_extractor = VinExtractor()
--- a/ocr/app/main.py
+++ b/ocr/app/main.py
@@ -55,6 +55,7 @@ async def root() -> dict:
        "log_level": settings.log_level,
        "endpoints": [
            "POST /extract - Synchronous OCR extraction",
+            "POST /extract/vin - VIN-specific extraction with validation",
            "POST /jobs - Submit async OCR job",
            "GET /jobs/{job_id} - Get async job status",
        ],
--- a/ocr/app/models/init.py
+++ b/ocr/app/models/init.py
@@ -1,18 +1,24 @@
 """Pydantic models for OCR service."""
 from .schemas import (
+    BoundingBox,
    DocumentType,
    ExtractedField,
    JobResponse,
    JobStatus,
    JobSubmitRequest,
    OcrResponse,
+    VinAlternative,
+    VinExtractionResponse,
 )

 __all__ = [
+    "BoundingBox",
    "DocumentType",
    "ExtractedField",
    "JobResponse",
    "JobStatus",
    "JobSubmitRequest",
    "OcrResponse",
+    "VinAlternative",
+    "VinExtractionResponse",
 ]
--- a/ocr/app/models/schemas.py
+++ b/ocr/app/models/schemas.py
@@ -21,6 +21,36 @@ class ExtractedField(BaseModel):
    confidence: float = Field(ge=0.0, le=1.0)


+class BoundingBox(BaseModel):
+    """Bounding box for detected region."""
+
+    # Four integer fields with the same names as the preprocessor's BoundingBox
+    # dataclass; the /vin route copies them across one by one.
+    x: int
+    y: int
+    width: int
+    height: int
+
+
+class VinAlternative(BaseModel):
+    """Alternative VIN candidate."""
+
+    # Candidate VIN string.
+    vin: str
+    # Candidate score; validation rejects values outside 0.0-1.0.
+    confidence: float = Field(ge=0.0, le=1.0)
+
+
+class VinExtractionResponse(BaseModel):
+    """
+    Response from VIN extraction endpoint.
+
+    ``bounding_box`` and ``processing_time_ms`` carry camelCase aliases
+    (``boundingBox``, ``processingTimeMs``).
+    """
+
+    # Whether a VIN was extracted.
+    success: bool
+    # Extracted VIN; None when extraction failed.
+    vin: Optional[str] = None
+    # Required score; validation rejects values outside 0.0-1.0.
+    confidence: float = Field(ge=0.0, le=1.0)
+    # Detected region, if any.
+    bounding_box: Optional[BoundingBox] = Field(default=None, alias="boundingBox")
+    # Other candidates; each instance gets its own list.
+    alternatives: list[VinAlternative] = Field(default_factory=list)
+    # Required processing duration in milliseconds.
+    processing_time_ms: int = Field(alias="processingTimeMs")
+    # Failure description; None when there was no error.
+    error: Optional[str] = None
+
+    # Allow construction by field name as well as by alias.
+    model_config = {"populate_by_name": True}
+
+
 class OcrResponse(BaseModel):
    """Response from OCR extraction."""

--- a/ocr/app/preprocessors/init.py
+++ b/ocr/app/preprocessors/init.py
@@ -0,0 +1,10 @@
+"""Image preprocessors for OCR optimization."""
+from app.services.preprocessor import ImagePreprocessor, preprocessor
+from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
+
+__all__ = [
+    "ImagePreprocessor",
+    "preprocessor",
+    "VinPreprocessor",
+    "vin_preprocessor",
+]
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -0,0 +1,309 @@
+"""VIN-optimized image preprocessing pipeline."""
+import io
+import logging
+from dataclasses import dataclass
+from typing import Optional
+
+import cv2
+import numpy as np
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+# Register HEIF/HEIC opener
+register_heif_opener()
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BoundingBox:
+    """Represents a region in an image."""
+
+    # detect_vin_region fills these from cv2.boundingRect: (x, y) is the
+    # rectangle's top-left corner and width/height its size, all in pixels.
+    x: int
+    y: int
+    width: int
+    height: int
+
+
+@dataclass
+class PreprocessingResult:
+    """Result of VIN preprocessing."""
+
+    # Processed image; preprocess() encodes it as PNG.
+    image_bytes: bytes
+    # Region of interest; preprocess() does not set it, so it stays None
+    # unless a caller supplies one.
+    bounding_box: Optional[BoundingBox] = None
+    # Names of the pipeline steps that ran, in order. None is only a sentinel
+    # for "not given": __post_init__ swaps it for a fresh list, so readers
+    # always see a list after construction.
+    preprocessing_applied: list[str] = None
+
+    def __post_init__(self) -> None:
+        """Replace the None sentinel with a new, unshared list."""
+        if self.preprocessing_applied is None:
+            self.preprocessing_applied = []
+
+
+class VinPreprocessor:
+    """VIN-optimized image preprocessing for improved OCR accuracy."""
+
+    # EXIF tag id of the camera orientation. Values 2-8 mean the stored pixels
+    # must be flipped and/or rotated to appear upright; 1 means "as stored".
+    _EXIF_ORIENTATION_TAG = 0x0112
+
+    def _load_image(self, image_bytes: bytes) -> tuple[Image.Image, bool]:
+        """
+        Decode image bytes and apply the EXIF orientation, if any.
+
+        Image.open does not rotate pixels according to EXIF, so a phone photo
+        stored sideways would reach OCR sideways; _deskew only corrects small
+        tilts and cannot recover from that.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+
+        Returns:
+            (image, transposed): a fully loaded image that is independent of
+            the source buffer, and whether an EXIF transpose was applied.
+        """
+        # Local import: the module-level import only brings in PIL.Image.
+        from PIL import ImageOps
+
+        with Image.open(io.BytesIO(image_bytes)) as opened:
+            try:
+                orientation = opened.getexif().get(self._EXIF_ORIENTATION_TAG, 1)
+                if orientation in range(2, 9):
+                    return ImageOps.exif_transpose(opened), True
+            except Exception as e:
+                # Malformed EXIF must not prevent OCR; continue unrotated.
+                logger.warning(f"EXIF orientation handling failed: {e}")
+            # copy() forces the pixel data to load before the file is closed.
+            return opened.copy(), False
+
+    def preprocess(
+        self,
+        image_bytes: bytes,
+        apply_clahe: bool = True,
+        apply_deskew: bool = True,
+        apply_denoise: bool = True,
+        apply_threshold: bool = True,
+    ) -> PreprocessingResult:
+        """
+        Apply VIN-optimized preprocessing pipeline.
+
+        Pipeline:
+        1. Decode (HEIC handled via pillow-heif) and apply EXIF orientation
+        2. Grayscale conversion
+        3. Deskew (correct rotation/tilt)
+        4. Contrast enhancement (CLAHE)
+        5. Noise reduction (fastNlMeansDenoising)
+        6. Adaptive thresholding
+
+        Steps 3-6 are best-effort: each helper logs a warning and returns its
+        input unchanged if OpenCV fails. Decoding errors propagate.
+
+        Args:
+            image_bytes: Raw image bytes (HEIC, JPEG, PNG)
+            apply_clahe: Apply CLAHE contrast enhancement
+            apply_deskew: Apply deskew correction
+            apply_denoise: Apply noise reduction
+            apply_threshold: Apply adaptive thresholding
+
+        Returns:
+            PreprocessingResult with PNG-encoded image bytes and the list of
+            steps applied (bounding_box is not populated here)
+        """
+        steps_applied = []
+
+        # Load image with PIL (handles HEIC via pillow-heif)
+        pil_image, transposed = self._load_image(image_bytes)
+        steps_applied.append("loaded")
+        if transposed:
+            steps_applied.append("exif_transpose")
+
+        # Normalise palette/alpha/CMYK/etc. to RGB; "L" is already grayscale
+        if pil_image.mode not in ("RGB", "L"):
+            pil_image = pil_image.convert("RGB")
+            steps_applied.append("convert_rgb")
+
+        # Convert to a single-channel array. PIL arrays are RGB-ordered, so
+        # RGB2GRAY is used directly instead of a round trip through BGR.
+        pixels = np.array(pil_image)
+        if pixels.ndim == 3:
+            gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = pixels
+        steps_applied.append("grayscale")
+
+        # Apply deskew
+        if apply_deskew:
+            gray = self._deskew(gray)
+            steps_applied.append("deskew")
+
+        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+        if apply_clahe:
+            gray = self._apply_clahe(gray)
+            steps_applied.append("clahe")
+
+        # Apply denoising
+        if apply_denoise:
+            gray = self._denoise(gray)
+            steps_applied.append("denoise")
+
+        # Apply adaptive thresholding
+        if apply_threshold:
+            gray = self._adaptive_threshold(gray)
+            steps_applied.append("threshold")
+
+        # Convert back to PNG bytes
+        buffer = io.BytesIO()
+        Image.fromarray(gray).save(buffer, format="PNG")
+
+        return PreprocessingResult(
+            image_bytes=buffer.getvalue(),
+            preprocessing_applied=steps_applied,
+        )
+
+    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply CLAHE (Contrast Limited Adaptive Histogram Equalization).
+
+        CLAHE improves contrast in images with varying illumination,
+        which is common in VIN photos taken in different lighting conditions.
+        Returns the input unchanged if OpenCV raises.
+        """
+        try:
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            return clahe.apply(image)
+        except cv2.error as e:
+            logger.warning(f"CLAHE failed: {e}")
+            return image
+
+    def _deskew(self, image: np.ndarray) -> np.ndarray:
+        """
+        Correct image rotation using Hough transform line detection.
+
+        VIN plates/stickers are often photographed at slight angles. The
+        median angle of near-horizontal line segments is taken as the skew;
+        skews under 0.5 or over 20 degrees are left alone. Returns the input
+        unchanged if no usable lines are found or anything fails.
+        """
+        try:
+            # Detect edges
+            edges = cv2.Canny(image, 50, 150, apertureSize=3)
+
+            # Detect line segments
+            lines = cv2.HoughLinesP(
+                edges,
+                rho=1,
+                theta=np.pi / 180,
+                threshold=100,
+                minLineLength=100,
+                maxLineGap=10,
+            )
+
+            if lines is None:
+                return image
+
+            # Calculate angles of detected lines
+            angles = []
+            for line in lines:
+                x1, y1, x2, y2 = line[0]
+                if x2 - x1 != 0:
+                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
+                    # Only consider nearly horizontal lines
+                    if -45 < angle < 45:
+                        angles.append(angle)
+
+            if not angles:
+                return image
+
+            # Use median angle to avoid outliers
+            median_angle = np.median(angles)
+
+            # Only correct if skew is significant but not extreme
+            if abs(median_angle) < 0.5 or abs(median_angle) > 20:
+                return image
+
+            # Rotate about the centre; a positive angle here rotates
+            # counter-clockwise, which levels a line measured at +angle.
+            height, width = image.shape[:2]
+            center = (width // 2, height // 2)
+            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+
+            # Grow the canvas so the rotated corners are not cropped
+            cos_val = abs(rotation_matrix[0, 0])
+            sin_val = abs(rotation_matrix[0, 1])
+            new_width = int(height * sin_val + width * cos_val)
+            new_height = int(height * cos_val + width * sin_val)
+
+            # Shift so the rotated image is centred in the larger canvas
+            rotation_matrix[0, 2] += (new_width - width) / 2
+            rotation_matrix[1, 2] += (new_height - height) / 2
+
+            rotated = cv2.warpAffine(
+                image,
+                rotation_matrix,
+                (new_width, new_height),
+                borderMode=cv2.BORDER_REPLICATE,
+            )
+
+            logger.debug(f"Deskewed by {median_angle:.2f} degrees")
+            return rotated
+
+        except Exception as e:
+            logger.warning(f"Deskew failed: {e}")
+            return image
+
+    def _denoise(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply non-local means denoising.
+
+        This helps remove noise while preserving VIN character edges.
+        Returns the input unchanged if OpenCV raises.
+        """
+        try:
+            return cv2.fastNlMeansDenoising(
+                image, h=10, templateWindowSize=7, searchWindowSize=21
+            )
+        except cv2.error as e:
+            logger.warning(f"Denoising failed: {e}")
+            return image
+
+    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
+        """
+        Apply adaptive thresholding for binarization.
+
+        Adaptive thresholding handles varying illumination across the image,
+        which is common in VIN photos. Returns the input unchanged if OpenCV
+        raises.
+        """
+        try:
+            return cv2.adaptiveThreshold(
+                image,
+                255,
+                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY,
+                blockSize=11,
+                C=2,
+            )
+        except cv2.error as e:
+            logger.warning(f"Adaptive threshold failed: {e}")
+            return image
+
+    def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]:
+        """
+        Attempt to detect the VIN region in an image.
+
+        Uses contour detection to find rectangular regions that might contain
+        VINs. Coordinates refer to the image after EXIF orientation has been
+        applied, the same frame preprocess() works in.
+
+        Args:
+            image_bytes: Raw image bytes
+
+        Returns:
+            BoundingBox of detected VIN region, or None if not found or if
+            detection fails for any reason
+        """
+        try:
+            pil_image, _ = self._load_image(image_bytes)
+            if pil_image.mode != "L":
+                pil_image = pil_image.convert("L")
+
+            cv_image = np.array(pil_image)
+
+            # Blur before edge detection to suppress texture noise
+            blurred = cv2.GaussianBlur(cv_image, (5, 5), 0)
+            edges = cv2.Canny(blurred, 50, 150)
+
+            # Find outer contours only
+            contours, _ = cv2.findContours(
+                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            )
+
+            if not contours:
+                return None
+
+            # Keep bounding rectangles whose shape could hold a 17-character
+            # line: aspect ratio between 4:1 and 12:1, and an area above 1000
+            # px^2 to filter out noise.
+            vin_candidates = []
+
+            for contour in contours:
+                x, y, w, h = cv2.boundingRect(contour)
+                if h == 0:
+                    continue
+
+                aspect_ratio = w / h
+                area = w * h
+
+                if 4 <= aspect_ratio <= 12 and area > 1000:
+                    vin_candidates.append((x, y, w, h, area))
+
+            if not vin_candidates:
+                return None
+
+            # Return the largest candidate
+            x, y, w, h, _ = max(vin_candidates, key=lambda c: c[4])
+
+            return BoundingBox(x=x, y=y, width=w, height=h)
+
+        except Exception as e:
+            logger.warning(f"VIN region detection failed: {e}")
+            return None
+
+
+# Module-level shared instance. VinPreprocessor defines no __init__ and none
+# of its methods assign to self, so one instance serves all callers.
+vin_preprocessor = VinPreprocessor()
--- a/ocr/app/routers/extract.py
+++ b/ocr/app/routers/extract.py
@@ -3,7 +3,8 @@ import logging

 from fastapi import APIRouter, File, HTTPException, Query, UploadFile

-from app.models import OcrResponse
+from app.extractors.vin_extractor import vin_extractor
+from app.models import BoundingBox, OcrResponse, VinAlternative, VinExtractionResponse
 from app.services import ocr_service

 logger = logging.getLogger(__name__)
@@ -67,3 +68,89 @@ async def extract_text(
        )

    return result
+
+
+@router.post("/vin", response_model=VinExtractionResponse)
+async def extract_vin(
+    file: UploadFile = File(..., description="Image file containing VIN"),
+) -> VinExtractionResponse:
+    """
+    Extract VIN (Vehicle Identification Number) from an uploaded image.
+
+    Uses VIN-optimized preprocessing and pattern matching:
+    - HEIC conversion (if needed)
+    - Grayscale conversion
+    - Deskew correction
+    - CLAHE contrast enhancement
+    - Noise reduction
+    - Adaptive thresholding
+    - VIN pattern matching (17 chars, excludes I/O/Q)
+    - Check digit validation
+    - Common OCR error correction (I->1, O->0, Q->0)
+
+    Supports HEIC, JPEG, PNG formats.
+    Processing time target: <3 seconds.
+
+    - **file**: Image file (max 10MB)
+
+    Returns:
+    - **vin**: Extracted VIN (17 alphanumeric characters)
+    - **confidence**: Confidence score (0.0-1.0)
+    - **boundingBox**: Location of VIN in image (if detected)
+    - **alternatives**: Other VIN candidates with confidence scores
+    - **processingTimeMs**: Processing time in milliseconds
+    """
+    # Local import keeps this handler self-contained; asyncio is stdlib.
+    import asyncio
+
+    # Validate file presence
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file provided")
+
+    # Read file content
+    content = await file.read()
+    file_size = len(content)
+
+    # Validate file size
+    if file_size > MAX_SYNC_SIZE:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB",
+        )
+
+    if file_size == 0:
+        raise HTTPException(status_code=400, detail="Empty file provided")
+
+    logger.info(
+        f"VIN extraction: {file.filename}, "
+        f"size: {file_size} bytes, "
+        f"content_type: {file.content_type}"
+    )
+
+    # Extraction is synchronous, CPU-bound work (OpenCV + Tesseract). Calling
+    # it directly inside this coroutine would stall the event loop, and with
+    # it every other request, for the whole OCR run, so it is handed to a
+    # worker thread. extract() reports failures via the result, not by raising.
+    result = await asyncio.to_thread(
+        vin_extractor.extract,
+        image_bytes=content,
+        content_type=file.content_type,
+    )
+
+    # Convert internal result to API response. "is not None" rather than
+    # truthiness: the check is about presence, not about the box's contents.
+    bounding_box = None
+    if result.bounding_box is not None:
+        bounding_box = BoundingBox(
+            x=result.bounding_box.x,
+            y=result.bounding_box.y,
+            width=result.bounding_box.width,
+            height=result.bounding_box.height,
+        )
+
+    alternatives = [
+        VinAlternative(vin=alt.vin, confidence=alt.confidence)
+        for alt in result.alternatives
+    ]
+
+    return VinExtractionResponse(
+        success=result.success,
+        vin=result.vin,
+        confidence=result.confidence,
+        boundingBox=bounding_box,
+        alternatives=alternatives,
+        processingTimeMs=result.processing_time_ms,
+        error=result.error,
+    )
--- a/ocr/app/validators/init.py
+++ b/ocr/app/validators/init.py
@@ -0,0 +1,4 @@
+"""Validators package for OCR data validation."""
+from app.validators.vin_validator import VinValidator, vin_validator
+
+__all__ = ["VinValidator", "vin_validator"]
--- a/ocr/app/validators/vin_validator.py
+++ b/ocr/app/validators/vin_validator.py
@@ -0,0 +1,259 @@
+"""VIN validation with check digit verification and OCR error correction."""
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class VinValidationResult:
+    """Result of VIN validation."""
+
+    # Whether the VIN is accepted. VinValidator.validate also sets this to
+    # True when only the check digit fails, so inspect `error` as well.
+    is_valid: bool
+    # The VIN after normalisation/correction ("" for empty input).
+    vin: str
+    # Delta the extractor adds to the OCR confidence: positive for a
+    # confirmed check digit, negative for every kind of doubt or failure.
+    confidence_adjustment: float
+    # Description of the problem found; None when nothing was wrong.
+    error: Optional[str] = None
+
+
+class VinValidator:
+    """Validates and corrects VIN strings."""
+
+    # VIN character set (excludes I, O, Q)
+    VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
+
+    # OCR corrections that are always safe: I, O and Q can never occur in a
+    # VIN, so seeing one means the digit it resembles was misread.
+    #
+    # Substitutions for characters that ARE legal in a VIN (B->8, S->5, L->1)
+    # are deliberately absent. Applied unconditionally they rewrite correctly
+    # read characters: "1HGBH41JXMN109186" would become "1HG8H41JXMN109186"
+    # and fail its check digit. Keys are uppercase only because the input is
+    # uppercased before the mapping is applied.
+    TRANSLITERATION = {
+        "I": "1",
+        "O": "0",
+        "Q": "0",
+    }
+
+    # Weights for check digit calculation (positions 1-17); position 9 is the
+    # check digit itself and therefore weighted 0.
+    CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
+
+    # Character to value mapping for check digit
+    CHAR_VALUES = {
+        "A": 1,
+        "B": 2,
+        "C": 3,
+        "D": 4,
+        "E": 5,
+        "F": 6,
+        "G": 7,
+        "H": 8,
+        "J": 1,
+        "K": 2,
+        "L": 3,
+        "M": 4,
+        "N": 5,
+        "P": 7,
+        "R": 9,
+        "S": 2,
+        "T": 3,
+        "U": 4,
+        "V": 5,
+        "W": 6,
+        "X": 7,
+        "Y": 8,
+        "Z": 9,
+        "0": 0,
+        "1": 1,
+        "2": 2,
+        "3": 3,
+        "4": 4,
+        "5": 5,
+        "6": 6,
+        "7": 7,
+        "8": 8,
+        "9": 9,
+    }
+
+    # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q
+    MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$")
+
+    # Pre-1981 VIN pattern: 11-17 characters
+    LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$")
+
+    # Runs of 11-17 alphanumerics in OCR text. I, O and Q are matched on
+    # purpose (they fall inside A-Z) so they can be corrected afterwards.
+    # Compiled once here instead of on every extract_candidates call.
+    CANDIDATE_PATTERN = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE)
+
+    def correct_ocr_errors(self, vin: str) -> str:
+        """
+        Apply common OCR error corrections to a VIN string.
+
+        Uppercases, removes surrounding whitespace plus any spaces and dashes,
+        then replaces the characters in TRANSLITERATION. Characters that are
+        legal in a VIN are never altered.
+
+        Args:
+            vin: Raw VIN string from OCR
+
+        Returns:
+            Corrected VIN string
+        """
+        # Remove any spaces or dashes (common in formatted VINs)
+        cleaned = vin.upper().strip().replace(" ", "").replace("-", "")
+        return cleaned.translate(str.maketrans(self.TRANSLITERATION))
+
+    def calculate_check_digit(self, vin: str) -> Optional[str]:
+        """
+        Calculate the check digit (position 9) for a VIN.
+
+        Args:
+            vin: 17-character VIN string
+
+        Returns:
+            Expected check digit character ("0"-"9" or "X"), or None if the
+            input is not 17 characters or contains a character without a value
+        """
+        normalized = vin.upper()
+        # Checked after uppercasing: upper() can lengthen some non-ASCII
+        # input, which would otherwise index past CHECK_WEIGHTS.
+        if len(vin) != 17 or len(normalized) != 17:
+            return None
+
+        total = 0
+        for position, char in enumerate(normalized):
+            if position == 8:  # Skip check digit position
+                continue
+            value = self.CHAR_VALUES.get(char)
+            if value is None:
+                return None
+            total += value * self.CHECK_WEIGHTS[position]
+
+        remainder = total % 11
+        return "X" if remainder == 10 else str(remainder)
+
+    def validate_check_digit(self, vin: str) -> bool:
+        """
+        Validate the check digit of a VIN.
+
+        Args:
+            vin: 17-character VIN string
+
+        Returns:
+            True if check digit is valid
+        """
+        if len(vin) != 17:
+            return False
+
+        expected = self.calculate_check_digit(vin)
+        if expected is None:
+            return False
+
+        return vin[8].upper() == expected
+
+    def validate(
+        self, vin: str, correct_errors: bool = True, allow_legacy: bool = False
+    ) -> VinValidationResult:
+        """
+        Validate a VIN string and optionally correct OCR errors.
+
+        A 17-character VIN with a legal character set is reported as valid
+        even when its check digit fails; that case carries a negative
+        confidence adjustment and an ``error`` message instead.
+
+        Args:
+            vin: VIN string to validate
+            correct_errors: Whether to apply OCR error corrections
+            allow_legacy: Whether to allow pre-1981 VINs (11-17 chars)
+
+        Returns:
+            VinValidationResult with validation status and corrected VIN
+        """
+        if not vin:
+            return VinValidationResult(
+                is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN"
+            )
+
+        # Apply error corrections if enabled
+        corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper()
+        length = len(corrected_vin)
+
+        # Check length
+        if length != 17:
+            if (
+                allow_legacy
+                and 11 <= length <= 17
+                and self.LEGACY_VIN_PATTERN.match(corrected_vin)
+            ):
+                # Legacy VIN - reduced confidence
+                return VinValidationResult(
+                    is_valid=True,
+                    vin=corrected_vin,
+                    confidence_adjustment=-0.2,
+                )
+            return VinValidationResult(
+                is_valid=False,
+                vin=corrected_vin,
+                confidence_adjustment=-0.5,
+                error=f"Invalid length: {length} (expected 17)",
+            )
+
+        # Check character set
+        if not self.MODERN_VIN_PATTERN.match(corrected_vin):
+            invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS]
+            return VinValidationResult(
+                is_valid=False,
+                vin=corrected_vin,
+                confidence_adjustment=-0.3,
+                error=f"Invalid characters: {invalid_chars}",
+            )
+
+        # Validate check digit
+        if self.validate_check_digit(corrected_vin):
+            # Valid check digit - boost confidence
+            return VinValidationResult(
+                is_valid=True, vin=corrected_vin, confidence_adjustment=0.1
+            )
+
+        # Invalid check digit - could be OCR error or old VIN
+        return VinValidationResult(
+            is_valid=True,  # Still return as valid but with reduced confidence
+            vin=corrected_vin,
+            confidence_adjustment=-0.15,
+            error="Check digit validation failed",
+        )
+
+    def extract_candidates(
+        self, text: str, max_candidates: int = 5
+    ) -> list[tuple[str, int, int]]:
+        """
+        Extract VIN candidates from raw OCR text.
+
+        Args:
+            text: Raw OCR text
+            max_candidates: Maximum number of candidates to return
+
+        Returns:
+            List of (vin, start_pos, end_pos) tuples; vin is the corrected
+            string, the positions index into ``text``. Candidates whose check
+            digit verifies come first, otherwise text order is kept.
+        """
+        if not text:
+            return []
+
+        candidates = []
+        for match in self.CANDIDATE_PATTERN.finditer(text.upper()):
+            corrected = self.correct_ocr_errors(match.group())
+
+            # Only include if it could be a valid VIN after correction
+            if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected):
+                candidates.append((corrected, match.start(), match.end()))
+
+        # Stable sort: False (check digit OK) orders before True (not OK)
+        candidates.sort(key=lambda c: not self.validate_check_digit(c[0]))
+        return candidates[:max_candidates]
+
+
+# Module-level shared instance. VinValidator holds only class-level constants
+# and defines no __init__, so one instance serves all callers.
+vin_validator = VinValidator()
--- a/ocr/tests/test_vin_extraction.py
+++ b/ocr/tests/test_vin_extraction.py
@@ -0,0 +1,242 @@
+"""Integration tests for VIN extraction endpoint."""
+import io
+from unittest.mock import patch, MagicMock
+
+import pytest
+from fastapi.testclient import TestClient
+from PIL import Image, ImageDraw, ImageFont
+
+from app.main import app
+
+
+@pytest.fixture
+def client() -> TestClient:
+    """Provide a TestClient bound to the FastAPI ``app`` under test."""
+    return TestClient(app=app)
+
+
+def create_vin_image(vin: str = "1HGBH41JXMN109186") -> bytes:
+    """Render ``vin`` in black on a white 400x100 canvas; return PNG bytes."""
+    white, black = (255, 255, 255), (0, 0, 0)
+    canvas = Image.new("RGB", (400, 100), white)
+
+    # No font argument is passed, so Pillow falls back to its default font.
+    ImageDraw.Draw(canvas).text((50, 40), vin, fill=black)
+
+    # Encode in memory so the helper never touches the filesystem.
+    png = io.BytesIO()
+    canvas.save(png, format="PNG")
+    return png.getvalue()
+
+
+def create_empty_image() -> bytes:
+    """Return PNG bytes for a blank white 400x100 image (no text drawn)."""
+    blank = Image.new("RGB", (400, 100), (255, 255, 255))
+    png = io.BytesIO()
+    blank.save(png, format="PNG")
+    return png.getvalue()
+
+
+class TestVinExtractionEndpoint:
+    """Exercise POST /extract/vin: request validation and response mapping."""
+
+    def test_endpoint_exists(self, client: TestClient) -> None:
+        """The root listing should advertise at least one VIN endpoint."""
+        resp = client.get("/")
+        assert resp.status_code == 200
+        listed = resp.json().get("endpoints", [])
+        assert any("vin" in path.lower() for path in listed)
+
+    def test_extract_vin_no_file(self, client: TestClient) -> None:
+        """Posting without a ``file`` part is rejected with 422."""
+        resp = client.post("/extract/vin")
+        assert resp.status_code == 422  # request validation failure
+
+    def test_extract_vin_empty_file(self, client: TestClient) -> None:
+        """A zero-byte upload yields 400 with 'empty' in the detail text."""
+        upload = {"file": ("empty.png", b"", "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 400
+        detail = resp.json()["detail"]
+        assert "empty" in detail.lower()
+
+    def test_extract_vin_large_file(self, client: TestClient) -> None:
+        """An oversized upload is rejected with 413."""
+        # 11 MiB of filler bytes, i.e. larger than 10MB
+        oversized = b"x" * (11 * 1024 * 1024)
+        upload = {"file": ("large.png", oversized, "image/png")}
+
+        resp = client.post("/extract/vin", files=upload)
+
+        # 413 is the HTTP "payload too large" status
+        assert resp.status_code == 413
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_extract_vin_success(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """A successful extraction is returned as 200 with the VIN fields."""
+        from app.extractors.vin_extractor import VinExtractionResult
+
+        # Canned extractor output: ``extract`` is patched for this test.
+        stubbed = VinExtractionResult(
+            success=True,
+            vin="1HGBH41JXMN109186",
+            confidence=0.94,
+            bounding_box=None,
+            alternatives=[],
+            processing_time_ms=500,
+        )
+        mock_extract.return_value = stubbed
+
+        upload = {"file": ("vin.png", create_vin_image(), "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
+        payload = resp.json()
+        # The response is checked through its camelCase key for timing.
+        assert payload["success"] is True
+        assert payload["vin"] == "1HGBH41JXMN109186"
+        assert payload["confidence"] == 0.94
+        assert "processingTimeMs" in payload
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_extract_vin_not_found(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """A failed extraction still answers 200, with success=false."""
+        from app.extractors.vin_extractor import VinExtractionResult
+
+        # Canned "nothing found" result returned by the patched extractor.
+        stubbed = VinExtractionResult(
+            success=False,
+            vin=None,
+            confidence=0.0,
+            error="No VIN pattern found in image",
+            processing_time_ms=300,
+        )
+        mock_extract.return_value = stubbed
+
+        # A blank image is uploaded; the patched extractor ignores its content.
+        upload = {"file": ("empty.png", create_empty_image(), "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
+        payload = resp.json()
+        assert payload["success"] is False
+        assert payload["vin"] is None
+        assert payload["error"] == "No VIN pattern found in image"
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_extract_vin_with_alternatives(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """Alternatives from the extractor are passed through in the response."""
+        from app.extractors.vin_extractor import VinExtractionResult, VinAlternative
+
+        # The single alternative reuses the primary VIN text at lower confidence.
+        runner_up = VinAlternative(vin="1HGBH41JXMN109186", confidence=0.72)
+        stubbed = VinExtractionResult(
+            success=True,
+            vin="1HGBH41JXMN109186",
+            confidence=0.94,
+            bounding_box=None,
+            alternatives=[runner_up],
+            processing_time_ms=600,
+        )
+        mock_extract.return_value = stubbed
+
+        upload = {"file": ("vin.png", create_vin_image(), "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
+        payload = resp.json()
+        assert payload["success"] is True
+        # Exactly one alternative, carrying the stubbed confidence.
+        alternatives = payload["alternatives"]
+        assert len(alternatives) == 1
+        assert alternatives[0]["confidence"] == 0.72
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_extract_vin_with_bounding_box(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """A bounding box from the extractor appears under ``boundingBox``."""
+        from app.extractors.vin_extractor import VinExtractionResult
+        from app.preprocessors.vin_preprocessor import BoundingBox
+
+        region = BoundingBox(x=50, y=40, width=300, height=20)
+        stubbed = VinExtractionResult(
+            success=True,
+            vin="1HGBH41JXMN109186",
+            confidence=0.94,
+            bounding_box=region,
+            alternatives=[],
+            processing_time_ms=500,
+        )
+        mock_extract.return_value = stubbed
+
+        upload = {"file": ("vin.png", create_vin_image(), "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
+        box = resp.json()["boundingBox"]
+        assert box is not None
+        # Each coordinate must round-trip unchanged from the stubbed region.
+        assert box["x"] == 50
+        assert box["y"] == 40
+        assert box["width"] == 300
+        assert box["height"] == 20
+
+
+class TestVinExtractionContentTypes:
+    """Check that /extract/vin accepts the supported image MIME types."""
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_accepts_jpeg(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """A JPEG upload declared as image/jpeg is answered with 200."""
+        from app.extractors.vin_extractor import VinExtractionResult
+
+        # Canned extractor output: ``extract`` is patched for this test.
+        stubbed = VinExtractionResult(
+            success=True,
+            vin="1HGBH41JXMN109186",
+            confidence=0.9,
+            processing_time_ms=400,
+        )
+        mock_extract.return_value = stubbed
+
+        # Encode a blank white canvas as JPEG in memory.
+        canvas = Image.new("RGB", (400, 100), (255, 255, 255))
+        encoded = io.BytesIO()
+        canvas.save(encoded, format="JPEG")
+        upload = {"file": ("vin.jpg", encoded.getvalue(), "image/jpeg")}
+
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
+
+    @patch("app.extractors.vin_extractor.vin_extractor.extract")
+    def test_accepts_png(
+        self, mock_extract: MagicMock, client: TestClient
+    ) -> None:
+        """A PNG upload declared as image/png is answered with 200."""
+        from app.extractors.vin_extractor import VinExtractionResult
+
+        # Canned extractor output: ``extract`` is patched for this test.
+        stubbed = VinExtractionResult(
+            success=True,
+            vin="1HGBH41JXMN109186",
+            confidence=0.9,
+            processing_time_ms=400,
+        )
+        mock_extract.return_value = stubbed
+
+        # create_vin_image() already returns PNG-encoded bytes.
+        upload = {"file": ("vin.png", create_vin_image(), "image/png")}
+        resp = client.post("/extract/vin", files=upload)
+
+        assert resp.status_code == 200
--- a/ocr/tests/test_vin_preprocessor.py
+++ b/ocr/tests/test_vin_preprocessor.py
@@ -0,0 +1,202 @@
+"""Unit tests for VIN preprocessor."""
+import io
+from unittest.mock import patch, MagicMock
+
+import numpy as np
+import pytest
+from PIL import Image
+
+from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor
+
+
+def create_test_image(width: int = 400, height: int = 100, color: int = 128) -> bytes:
+    """Return PNG bytes of a solid RGB image whose three channels equal ``color``."""
+    solid = Image.new("RGB", (width, height), (color,) * 3)
+    png = io.BytesIO()
+    solid.save(png, format="PNG")
+    return png.getvalue()
+
+
+def create_grayscale_test_image(width: int = 400, height: int = 100) -> bytes:
+    """Return PNG bytes of a single-channel ("L" mode) image filled with 128."""
+    gray = Image.new("L", (width, height), 128)
+    png = io.BytesIO()
+    gray.save(png, format="PNG")
+    return png.getvalue()
+
+
+class TestVinPreprocessor:
+    """Behavioural checks for ``VinPreprocessor.preprocess`` and its singleton."""
+
+    def test_preprocess_returns_result(self) -> None:
+        """Default preprocessing yields non-empty bytes and records grayscale."""
+        source_png = create_test_image()
+        pipeline = VinPreprocessor()
+
+        outcome = pipeline.preprocess(source_png)
+
+        assert outcome.image_bytes is not None
+        assert len(outcome.image_bytes) > 0
+        assert "grayscale" in outcome.preprocessing_applied
+
+    def test_preprocess_applies_all_steps(self) -> None:
+        """With every optional flag enabled, each step is reported as applied."""
+        source_png = create_test_image()
+        pipeline = VinPreprocessor()
+
+        outcome = pipeline.preprocess(
+            source_png,
+            apply_clahe=True,
+            apply_deskew=True,
+            apply_denoise=True,
+            apply_threshold=True,
+        )
+
+        # "grayscale" is expected too, although no flag is passed for it here.
+        applied = outcome.preprocessing_applied
+        expected_steps = ("grayscale", "clahe", "deskew", "denoise", "threshold")
+        for step in expected_steps:
+            assert step in applied
+
+    def test_preprocess_skips_disabled_steps(self) -> None:
+        """With every optional flag disabled, none of those steps is reported."""
+        source_png = create_test_image()
+        pipeline = VinPreprocessor()
+
+        outcome = pipeline.preprocess(
+            source_png,
+            apply_clahe=False,
+            apply_deskew=False,
+            apply_denoise=False,
+            apply_threshold=False,
+        )
+
+        applied = outcome.preprocessing_applied
+        switched_off = ("clahe", "deskew", "denoise", "threshold")
+        for step in switched_off:
+            assert step not in applied
+
+    def test_preprocess_output_is_valid_image(self) -> None:
+        """The bytes produced by preprocessing decode as a PNG image."""
+        source_png = create_test_image()
+        pipeline = VinPreprocessor()
+
+        outcome = pipeline.preprocess(source_png)
+
+        # Pillow must be able to parse the output and identify its format.
+        decoded = Image.open(io.BytesIO(outcome.image_bytes))
+        assert decoded is not None
+        assert decoded.format == "PNG"
+
+    def test_preprocess_handles_grayscale_input(self) -> None:
+        """Single-channel input is processed into non-empty output bytes."""
+        source_png = create_grayscale_test_image()
+        pipeline = VinPreprocessor()
+
+        outcome = pipeline.preprocess(source_png)
+
+        assert outcome.image_bytes is not None
+        assert len(outcome.image_bytes) > 0
+
+    def test_preprocess_handles_rgba_input(self) -> None:
+        """RGBA input is processed and a ``convert_rgb`` step is recorded."""
+        pipeline = VinPreprocessor()
+
+        # Opaque mid-gray image that carries an alpha channel.
+        rgba = Image.new("RGBA", (400, 100), (128, 128, 128, 255))
+        encoded = io.BytesIO()
+        rgba.save(encoded, format="PNG")
+
+        outcome = pipeline.preprocess(encoded.getvalue())
+
+        assert outcome.image_bytes is not None
+        assert "convert_rgb" in outcome.preprocessing_applied
+
+    def test_singleton_instance(self) -> None:
+        """The module exposes a ready-made ``VinPreprocessor`` instance."""
+        assert vin_preprocessor is not None
+        assert isinstance(vin_preprocessor, VinPreprocessor)
+
+
+class TestVinPreprocessorDeskew:
+    """Checks for the private ``_deskew`` step."""
+
+    def test_deskew_no_change_for_straight_image(self) -> None:
+        """Deskewing an unskewed image yields non-zero height and width."""
+        pipeline = VinPreprocessor()
+
+        # Black 100x400 canvas with one white horizontal stroke on row 50.
+        canvas = np.zeros((100, 400), dtype=np.uint8)
+        canvas[50, 50:350] = 255
+
+        deskewed = pipeline._deskew(canvas)
+
+        # Only non-emptiness is checked; exact dimensions are not pinned.
+        assert deskewed.shape[0] > 0
+        assert deskewed.shape[1] > 0
+
+
+class TestVinPreprocessorCLAHE:
+    """Checks for the private ``_apply_clahe`` step."""
+
+    def test_clahe_improves_contrast(self) -> None:
+        """CLAHE output keeps the shape of its input array."""
+        pipeline = VinPreprocessor()
+
+        # Uniform mid-gray array, i.e. an image with no contrast at all.
+        flat_gray = np.full((100, 400), 128, dtype=np.uint8)
+
+        enhanced = pipeline._apply_clahe(flat_gray)
+
+        # Only the shape is verified; pixel values are not inspected.
+        assert enhanced.shape == flat_gray.shape
+
+
+class TestVinPreprocessorDenoise:
+    """Checks for the private ``_denoise`` step."""
+
+    def test_denoise_reduces_noise(self) -> None:
+        """Denoising random input returns an array of the same shape."""
+        pipeline = VinPreprocessor()
+
+        # Uniformly random 8-bit pixels stand in for a noisy photo.
+        noisy = np.random.randint(0, 256, (100, 400), dtype=np.uint8)
+
+        cleaned = pipeline._denoise(noisy)
+
+        # Only the shape is verified; the noise level is not measured.
+        assert cleaned.shape == noisy.shape
+
+
+class TestVinPreprocessorThreshold:
+    """Checks for the private ``_adaptive_threshold`` step."""
+
+    def test_threshold_creates_binary_image(self) -> None:
+        """Thresholded output contains at most two distinct pixel values."""
+        pipeline = VinPreprocessor()
+
+        # Uniform mid-gray input array.
+        flat_gray = np.full((100, 400), 128, dtype=np.uint8)
+
+        binary = pipeline._adaptive_threshold(flat_gray)
+
+        # A binary image has no more than two levels (the levels themselves
+        # are not asserted here).
+        assert np.unique(binary).size <= 2
+
+
+class TestVinRegionDetection:
+    """Checks for ``VinPreprocessor.detect_vin_region``."""
+
+    def test_detect_vin_region_returns_none_for_empty(self) -> None:
+        """A featureless image gives either no region or one with width > 0."""
+        pipeline = VinPreprocessor()
+
+        # Solid mid-gray picture: nothing in it resembles text.
+        featureless_png = create_test_image(color=128)
+
+        region = pipeline.detect_vin_region(featureless_png)
+
+        # None is the expected answer; a returned region must at least be sane.
+        if region is not None:
+            assert region.width > 0
--- a/ocr/tests/test_vin_validator.py
+++ b/ocr/tests/test_vin_validator.py
@@ -0,0 +1,211 @@
+"""Unit tests for VIN validator."""
+import pytest
+
+from app.validators.vin_validator import VinValidator, vin_validator
+
+
+class TestVinValidator:
+    """Tests for VIN validation logic."""
+
+    def test_correct_ocr_errors_basic(self) -> None:
+        """Test basic OCR error correction (I, O and Q become digits)."""
+        validator = VinValidator()
+
+        # I -> 1
+        assert validator.correct_ocr_errors("IHGBH41JXMN109186") == "1HGBH41JXMN109186"
+
+        # O -> 0
+        assert validator.correct_ocr_errors("1HGBH41JXMN1O9186") == "1HGBH41JXMN109186"
+
+        # Q -> 0
+        assert validator.correct_ocr_errors("1HGBH41JXMN1Q9186") == "1HGBH41JXMN109186"
+
+    def test_correct_ocr_errors_lowercase(self) -> None:
+        """Test OCR error correction upper-cases lowercase input."""
+        validator = VinValidator()
+
+        result = validator.correct_ocr_errors("1hgbh41jxmn109186")
+        assert result == "1HGBH41JXMN109186"
+
+    def test_correct_ocr_errors_strips_spaces(self) -> None:
+        """Test OCR error correction removes spaces and dashes."""
+        validator = VinValidator()
+
+        assert validator.correct_ocr_errors("1HG BH41 JXMN 109186") == "1HGBH41JXMN109186"
+        assert validator.correct_ocr_errors("1HG-BH41-JXMN-109186") == "1HGBH41JXMN109186"
+
+    def test_calculate_check_digit(self) -> None:
+        """Test check digit calculation (weighted sum modulo 11)."""
+        validator = VinValidator()
+
+        # 1HGBH41JXMN109186: weighted sum 340, 340 % 11 == 10 -> "X" (matches position 9)
+        result = validator.calculate_check_digit("1HGBH41JXMN109186")
+        assert result == "X"
+
+        # 5YJSA1E28HF123456 carries "8" at position 9, but its weighted sum is
+        # 346 and 346 % 11 == 5, so the computed digit is "5", not "8"
+        result = validator.calculate_check_digit("5YJSA1E28HF123456")
+        assert result == "5"
+
+    def test_validate_check_digit_valid(self) -> None:
+        """Test check digit validation with valid VIN."""
+        validator = VinValidator()
+
+        # This VIN has a valid check digit
+        assert validator.validate_check_digit("1HGBH41JXMN109186") is True
+
+    def test_validate_check_digit_invalid(self) -> None:
+        """Test check digit validation with invalid VIN."""
+        validator = VinValidator()
+
+        # Position 9 changed from X to 1, so it no longer matches
+        assert validator.validate_check_digit("1HGBH41J1MN109186") is False
+
+    def test_validate_modern_vin_valid(self) -> None:
+        """Test validation of valid modern VIN."""
+        validator = VinValidator()
+
+        result = validator.validate("1HGBH41JXMN109186")
+        assert result.is_valid is True
+        assert result.vin == "1HGBH41JXMN109186"
+        assert result.confidence_adjustment > 0  # Check digit valid = boost
+
+    def test_validate_modern_vin_with_ocr_errors(self) -> None:
+        """Test validation corrects OCR errors."""
+        validator = VinValidator()
+
+        # I at start should be corrected to 1
+        result = validator.validate("IHGBH41JXMN109186")
+        assert result.is_valid is True
+        assert result.vin == "1HGBH41JXMN109186"
+
+    def test_validate_short_vin(self) -> None:
+        """Test validation rejects short VIN."""
+        validator = VinValidator()
+
+        result = validator.validate("1HGBH41JX")
+        assert result.is_valid is False
+        assert "length" in result.error.lower()
+
+    def test_validate_long_vin(self) -> None:
+        """Test validation rejects long VIN."""
+        validator = VinValidator()
+
+        result = validator.validate("1HGBH41JXMN109186XX")
+        assert result.is_valid is False
+        assert "length" in result.error.lower()
+
+    def test_validate_empty_vin(self) -> None:
+        """Test validation handles empty VIN."""
+        validator = VinValidator()
+
+        result = validator.validate("")
+        assert result.is_valid is False
+        assert "empty" in result.error.lower()
+
+    def test_validate_invalid_characters(self) -> None:
+        """Test validation rejects invalid characters when correction is off."""
+        validator = VinValidator()
+
+        # Contains characters not in VIN alphabet
+        result = validator.validate("1HGBH41JXMN!@#186", correct_errors=False)
+        assert result.is_valid is False
+        assert "character" in result.error.lower()
+
+    def test_validate_legacy_vin_allowed(self) -> None:
+        """Test validation allows legacy VINs when enabled."""
+        validator = VinValidator()
+
+        # 13-character VIN (pre-1981)
+        result = validator.validate("ABCD123456789", allow_legacy=True)
+        assert result.is_valid is True
+        assert result.confidence_adjustment < 0  # Reduced confidence for legacy
+
+    def test_validate_legacy_vin_rejected(self) -> None:
+        """Test validation rejects legacy VINs when allow_legacy is False."""
+        validator = VinValidator()
+
+        result = validator.validate("ABCD123456789", allow_legacy=False)
+        assert result.is_valid is False
+
+    def test_extract_candidates_finds_vin(self) -> None:
+        """Test candidate extraction from text."""
+        validator = VinValidator()
+
+        text = "VIN: 1HGBH41JXMN109186 is shown here"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
+    def test_extract_candidates_multiple_vins(self) -> None:
+        """Test candidate extraction with multiple VINs."""
+        validator = VinValidator()
+
+        text = "First VIN: 1HGBH41JXMN109186 Second VIN: 5YJSA1E28HF123456"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 2
+        vins = [c[0] for c in candidates]
+        assert "1HGBH41JXMN109186" in vins
+        assert "5YJSA1E28HF123456" in vins
+
+    def test_extract_candidates_with_ocr_errors(self) -> None:
+        """Test candidate extraction corrects OCR errors."""
+        validator = VinValidator()
+
+        # Contains O instead of 0
+        text = "VIN: 1HGBH41JXMN1O9186"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) >= 1
+        assert candidates[0][0] == "1HGBH41JXMN109186"
+
+    def test_extract_candidates_no_vin(self) -> None:
+        """Test candidate extraction with no VIN."""
+        validator = VinValidator()
+
+        text = "This text contains no VIN numbers"
+        candidates = validator.extract_candidates(text)
+
+        assert len(candidates) == 0
+
+    def test_singleton_instance(self) -> None:
+        """Test singleton instance is available."""
+        assert vin_validator is not None
+        assert isinstance(vin_validator, VinValidator)
+
+
+class TestVinValidatorEdgeCases:
+    """Boundary inputs for ``VinValidator``: zeros, casing, padding, X digit."""
+
+    def test_all_zeros_vin(self) -> None:
+        """Seventeen zeros are accepted and come back 17 characters long."""
+        all_zeros = "00000000000000000"
+        outcome = VinValidator().validate(all_zeros)
+
+        assert outcome.is_valid is True
+        assert len(outcome.vin) == 17
+
+    def test_mixed_case_vin(self) -> None:
+        """Mixed-case input validates and is returned fully upper-cased."""
+        mixed_case = "1hGbH41jXmN109186"
+        outcome = VinValidator().validate(mixed_case)
+
+        assert outcome.is_valid is True
+        assert outcome.vin == "1HGBH41JXMN109186"
+
+    def test_vin_with_leading_trailing_whitespace(self) -> None:
+        """Surrounding spaces are dropped before the VIN is validated."""
+        padded = "  1HGBH41JXMN109186  "
+        outcome = VinValidator().validate(padded)
+
+        assert outcome.is_valid is True
+        assert outcome.vin == "1HGBH41JXMN109186"
+
+    def test_check_digit_x(self) -> None:
+        """A VIN whose check digit is the letter X passes the check."""
+        # Position 9 of this VIN is X
+        vin_with_x = "1HGBH41JXMN109186"
+        digit_ok = VinValidator().validate_check_digit(vin_with_x)
+        assert digit_ok is True