From 54cbd491710a46d8cfab30890ab47da279650527 Mon Sep 17 00:00:00 2001 From: Eric Gullickson <16152721+ericgullickson@users.noreply.github.com> Date: Sun, 1 Feb 2026 19:31:36 -0600 Subject: [PATCH] feat: add VIN photo OCR pipeline (refs #67) Implement VIN-specific OCR extraction with optimized preprocessing: - Add POST /extract/vin endpoint for VIN extraction - VIN preprocessor: CLAHE, deskew, denoise, adaptive threshold - VIN validator: check digit validation, OCR error correction (I->1, O->0) - VIN extractor: PSM modes 6/7/8, character whitelist, alternatives - Response includes confidence, bounding box, and alternatives - Unit tests for validator and preprocessor - Integration tests for VIN extraction endpoint Co-Authored-By: Claude Opus 4.5 --- ocr/app/extractors/__init__.py | 10 + ocr/app/extractors/base.py | 47 ++++ ocr/app/extractors/vin_extractor.py | 275 +++++++++++++++++++ ocr/app/main.py | 1 + ocr/app/models/__init__.py | 6 + ocr/app/models/schemas.py | 30 +++ ocr/app/preprocessors/__init__.py | 10 + ocr/app/preprocessors/vin_preprocessor.py | 309 ++++++++++++++++++++++ ocr/app/routers/extract.py | 89 ++++++- ocr/app/validators/__init__.py | 4 + ocr/app/validators/vin_validator.py | 259 ++++++++++++++++++ ocr/tests/test_vin_extraction.py | 242 +++++++++++++++++ ocr/tests/test_vin_preprocessor.py | 202 ++++++++++++++ ocr/tests/test_vin_validator.py | 211 +++++++++++++++ 14 files changed, 1694 insertions(+), 1 deletion(-) create mode 100644 ocr/app/extractors/__init__.py create mode 100644 ocr/app/extractors/base.py create mode 100644 ocr/app/extractors/vin_extractor.py create mode 100644 ocr/app/preprocessors/__init__.py create mode 100644 ocr/app/preprocessors/vin_preprocessor.py create mode 100644 ocr/app/validators/__init__.py create mode 100644 ocr/app/validators/vin_validator.py create mode 100644 ocr/tests/test_vin_extraction.py create mode 100644 ocr/tests/test_vin_preprocessor.py create mode 100644 ocr/tests/test_vin_validator.py diff --git a/ocr/app/extractors/__init__.py b/ocr/app/extractors/__init__.py new file mode 100644 index 0000000..9ae8f51 --- /dev/null +++ b/ocr/app/extractors/__init__.py @@ -0,0 +1,10 @@ +"""Extractors package for domain-specific OCR extraction.""" +from app.extractors.base import BaseExtractor, ExtractionResult +from app.extractors.vin_extractor import VinExtractor, vin_extractor + +__all__ = [ + "BaseExtractor", + "ExtractionResult", + "VinExtractor", + "vin_extractor", +] diff --git a/ocr/app/extractors/base.py b/ocr/app/extractors/base.py new file mode 100644 index 0000000..eb0559d --- /dev/null +++ b/ocr/app/extractors/base.py @@ -0,0 +1,47 @@ +"""Base extractor class for domain-specific OCR extraction.""" +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Optional + + +@dataclass +class ExtractionResult: + """Base result for extraction operations.""" + + success: bool + confidence: float + raw_text: str + processing_time_ms: int + extracted_data: dict[str, Any] = field(default_factory=dict) + error: Optional[str] = None + + +class BaseExtractor(ABC): + """Abstract base class for domain-specific extractors.""" + + @abstractmethod + def extract(self, image_bytes: bytes, content_type: Optional[str] = None) -> ExtractionResult: + """ + Extract domain-specific data from an image. + + Args: + image_bytes: Raw image bytes + content_type: MIME type of the image + + Returns: + ExtractionResult with extracted data + """ + pass + + @abstractmethod + def validate(self, data: Any) -> bool: + """ + Validate extracted data. + + Args: + data: Extracted data to validate + + Returns: + True if data is valid + """ + pass diff --git a/ocr/app/extractors/vin_extractor.py b/ocr/app/extractors/vin_extractor.py new file mode 100644 index 0000000..37fdad1 --- /dev/null +++ b/ocr/app/extractors/vin_extractor.py @@ -0,0 +1,275 @@ +"""VIN-specific OCR extractor with preprocessing and validation.""" +import io +import logging +import time +from dataclasses import dataclass, field +from typing import Optional + +import magic +import pytesseract +from PIL import Image +from pillow_heif import register_heif_opener + +from app.config import settings +from app.extractors.base import BaseExtractor +from app.preprocessors.vin_preprocessor import vin_preprocessor, BoundingBox +from app.validators.vin_validator import vin_validator + +# Register HEIF/HEIC opener +register_heif_opener() + +logger = logging.getLogger(__name__) + + +@dataclass +class VinAlternative: + """Alternative VIN candidate with confidence.""" + + vin: str + confidence: float + + +@dataclass +class VinExtractionResult: + """Result of VIN extraction.""" + + success: bool + vin: Optional[str] = None + confidence: float = 0.0 + bounding_box: Optional[BoundingBox] = None + alternatives: list[VinAlternative] = field(default_factory=list) + processing_time_ms: int = 0 + error: Optional[str] = None + raw_text: Optional[str] = None + + +class VinExtractor(BaseExtractor): + """VIN-specific OCR extractor optimized for VIN plates and stickers.""" + + # Supported MIME types + SUPPORTED_TYPES = { + "image/jpeg", + "image/png", + "image/heic", + "image/heif", + } + + # VIN character whitelist for Tesseract + VIN_WHITELIST = "ABCDEFGHJKLMNPRSTUVWXYZ0123456789" + + def __init__(self) -> None: + """Initialize VIN extractor.""" + pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd + + def extract( + self, image_bytes: bytes, content_type: Optional[str] = None + ) -> VinExtractionResult: + """ + Extract VIN from an image using optimized preprocessing and OCR. + + Args: + image_bytes: Raw image bytes (HEIC, JPEG, PNG) + content_type: MIME type (auto-detected if not provided) + + Returns: + VinExtractionResult with extracted VIN and metadata + """ + start_time = time.time() + + # Detect content type if not provided + if not content_type: + content_type = self._detect_mime_type(image_bytes) + + # Validate content type + if content_type not in self.SUPPORTED_TYPES: + return VinExtractionResult( + success=False, + error=f"Unsupported file type: {content_type}", + processing_time_ms=int((time.time() - start_time) * 1000), + ) + + try: + # Apply VIN-optimized preprocessing + preprocessing_result = vin_preprocessor.preprocess(image_bytes) + preprocessed_bytes = preprocessing_result.image_bytes + + # Perform OCR with VIN-optimized settings + raw_text, word_confidences = self._perform_ocr(preprocessed_bytes) + + # Extract VIN candidates from raw text + candidates = vin_validator.extract_candidates(raw_text) + + if not candidates: + # No VIN candidates found - try with different PSM modes + candidates = self._try_alternate_ocr(preprocessed_bytes) + + if not candidates: + return VinExtractionResult( + success=False, + error="No VIN pattern found in image", + raw_text=raw_text, + processing_time_ms=int((time.time() - start_time) * 1000), + ) + + # Validate and score candidates + scored_candidates = [] + for vin, start_pos, end_pos in candidates: + validation = vin_validator.validate(vin) + + # Calculate confidence + base_confidence = self._calculate_base_confidence(word_confidences) + adjusted_confidence = min( + 1.0, max(0.0, base_confidence + validation.confidence_adjustment) + ) + + scored_candidates.append( + (validation.vin, adjusted_confidence, validation.is_valid) + ) + + # Sort by confidence + scored_candidates.sort(key=lambda x: x[1], reverse=True) + + # Primary result is the highest confidence valid candidate + primary_vin = None + primary_confidence = 0.0 + + for vin, confidence, is_valid in scored_candidates: + if is_valid: + primary_vin = vin + primary_confidence = confidence + break + + # If no valid candidate, use the highest confidence one + if primary_vin is None and scored_candidates: + primary_vin = scored_candidates[0][0] + primary_confidence = scored_candidates[0][1] + + # Build alternatives list (excluding primary) + alternatives = [ + VinAlternative(vin=vin, confidence=conf) + for vin, conf, _ in scored_candidates[1:5] # Max 4 alternatives + ] + + processing_time_ms = int((time.time() - start_time) * 1000) + + logger.info( + f"VIN extraction: {primary_vin}, confidence={primary_confidence:.2%}, " + f"time={processing_time_ms}ms" + ) + + return VinExtractionResult( + success=True, + vin=primary_vin, + confidence=primary_confidence, + bounding_box=preprocessing_result.bounding_box, + alternatives=alternatives, + processing_time_ms=processing_time_ms, + raw_text=raw_text, + ) + + except Exception as e: + logger.error(f"VIN extraction failed: {e}", exc_info=True) + return VinExtractionResult( + success=False, + error=str(e), + processing_time_ms=int((time.time() - start_time) * 1000), + ) + + def _detect_mime_type(self, file_bytes: bytes) -> str: + """Detect MIME type using python-magic.""" + mime = magic.Magic(mime=True) + detected = mime.from_buffer(file_bytes) + return detected or "application/octet-stream" + + def _perform_ocr( + self, image_bytes: bytes, psm: int = 6 + ) -> tuple[str, list[float]]: + """ + Perform OCR with VIN-optimized settings. + + Args: + image_bytes: Preprocessed image bytes + psm: Tesseract page segmentation mode + 6 = Uniform block of text + 7 = Single text line + 8 = Single word + + Returns: + Tuple of (raw_text, word_confidences) + """ + image = Image.open(io.BytesIO(image_bytes)) + + # Configure Tesseract for VIN extraction + # Use character whitelist to exclude I, O, Q + config = ( + f"--psm {psm} " + f"-c tessedit_char_whitelist={self.VIN_WHITELIST}" + ) + + # Get detailed OCR data + ocr_data = pytesseract.image_to_data( + image, config=config, output_type=pytesseract.Output.DICT + ) + + # Extract words and confidences + words = [] + confidences = [] + + for i, text in enumerate(ocr_data["text"]): + conf = int(ocr_data["conf"][i]) + if text.strip() and conf > 0: + words.append(text.strip()) + confidences.append(conf / 100.0) + + raw_text = " ".join(words) + return raw_text, confidences + + def _try_alternate_ocr(self, image_bytes: bytes) -> list[tuple[str, int, int]]: + """ + Try alternate OCR configurations when initial extraction fails. + + Returns: + List of VIN candidates + """ + # Try PSM 7 (single text line) + raw_text, _ = self._perform_ocr(image_bytes, psm=7) + candidates = vin_validator.extract_candidates(raw_text) + if candidates: + return candidates + + # Try PSM 8 (single word) + raw_text, _ = self._perform_ocr(image_bytes, psm=8) + candidates = vin_validator.extract_candidates(raw_text) + if candidates: + return candidates + + return [] + + def _calculate_base_confidence(self, word_confidences: list[float]) -> float: + """Calculate base confidence from word confidences.""" + if not word_confidences: + return 0.5 + + # Use average confidence, weighted slightly toward minimum + avg_conf = sum(word_confidences) / len(word_confidences) + min_conf = min(word_confidences) + + # Blend: 70% average, 30% minimum + return 0.7 * avg_conf + 0.3 * min_conf + + def validate(self, data: str) -> bool: + """ + Validate a VIN string. + + Args: + data: VIN string to validate + + Returns: + True if VIN is valid + """ + result = vin_validator.validate(data) + return result.is_valid + + +# Singleton instance +vin_extractor = VinExtractor() diff --git a/ocr/app/main.py b/ocr/app/main.py index 5af3c8c..dca38c1 100644 --- a/ocr/app/main.py +++ b/ocr/app/main.py @@ -55,6 +55,7 @@ async def root() -> dict: "log_level": settings.log_level, "endpoints": [ "POST /extract - Synchronous OCR extraction", + "POST /extract/vin - VIN-specific extraction with validation", "POST /jobs - Submit async OCR job", "GET /jobs/{job_id} - Get async job status", ], diff --git a/ocr/app/models/__init__.py b/ocr/app/models/__init__.py index bac3e26..9063882 100644 --- a/ocr/app/models/__init__.py +++ b/ocr/app/models/__init__.py @@ -1,18 +1,24 @@ """Pydantic models for OCR service.""" from .schemas import ( + BoundingBox, DocumentType, ExtractedField, JobResponse, JobStatus, JobSubmitRequest, OcrResponse, + VinAlternative, + VinExtractionResponse, ) __all__ = [ + "BoundingBox", "DocumentType", "ExtractedField", "JobResponse", "JobStatus", "JobSubmitRequest", "OcrResponse", + "VinAlternative", + "VinExtractionResponse", ] diff --git a/ocr/app/models/schemas.py b/ocr/app/models/schemas.py index d5c86a5..ff34d94 100644 --- a/ocr/app/models/schemas.py +++ b/ocr/app/models/schemas.py @@ -21,6 +21,36 @@ class ExtractedField(BaseModel): confidence: float = Field(ge=0.0, le=1.0) +class BoundingBox(BaseModel): + """Bounding box for detected region.""" + + x: int + y: int + width: int + height: int + + +class VinAlternative(BaseModel): + """Alternative VIN candidate.""" + + vin: str + confidence: float = Field(ge=0.0, le=1.0) + + +class VinExtractionResponse(BaseModel): + """Response from VIN extraction endpoint.""" + + success: bool + vin: Optional[str] = None + confidence: float = Field(ge=0.0, le=1.0) + bounding_box: Optional[BoundingBox] = Field(default=None, alias="boundingBox") + alternatives: list[VinAlternative] = Field(default_factory=list) + processing_time_ms: int = Field(alias="processingTimeMs") + error: Optional[str] = None + + model_config = {"populate_by_name": True} + + class OcrResponse(BaseModel): """Response from OCR extraction.""" diff --git a/ocr/app/preprocessors/__init__.py b/ocr/app/preprocessors/__init__.py new file mode 100644 index 0000000..ff54eee --- /dev/null +++ b/ocr/app/preprocessors/__init__.py @@ -0,0 +1,10 @@ +"""Image preprocessors for OCR optimization.""" +from app.services.preprocessor import ImagePreprocessor, preprocessor +from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor + +__all__ = [ + "ImagePreprocessor", + "preprocessor", + "VinPreprocessor", + "vin_preprocessor", +] diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py new file mode 100644 index 0000000..e0ffbba --- /dev/null +++ b/ocr/app/preprocessors/vin_preprocessor.py @@ -0,0 +1,309 @@ +"""VIN-optimized image preprocessing pipeline.""" +import io +import logging +from dataclasses import dataclass +from typing import Optional + +import cv2 +import numpy as np +from PIL import Image +from pillow_heif import register_heif_opener + +# Register HEIF/HEIC opener +register_heif_opener() + +logger = logging.getLogger(__name__) + + +@dataclass +class BoundingBox: + """Represents a region in an image.""" + + x: int + y: int + width: int + height: int + + +@dataclass +class PreprocessingResult: + """Result of VIN preprocessing.""" + + image_bytes: bytes + bounding_box: Optional[BoundingBox] = None + preprocessing_applied: list[str] = None + + def __post_init__(self) -> None: + if self.preprocessing_applied is None: + self.preprocessing_applied = [] + + +class VinPreprocessor: + """VIN-optimized image preprocessing for improved OCR accuracy.""" + + def preprocess( + self, + image_bytes: bytes, + apply_clahe: bool = True, + apply_deskew: bool = True, + apply_denoise: bool = True, + apply_threshold: bool = True, + ) -> PreprocessingResult: + """ + Apply VIN-optimized preprocessing pipeline. + + Pipeline: + 1. HEIC conversion (if needed) + 2. Grayscale conversion + 3. Deskew (correct rotation/tilt) + 4. Contrast enhancement (CLAHE) + 5. Noise reduction (fastNlMeansDenoising) + 6. Adaptive thresholding + + Args: + image_bytes: Raw image bytes (HEIC, JPEG, PNG) + apply_clahe: Apply CLAHE contrast enhancement + apply_deskew: Apply deskew correction + apply_denoise: Apply noise reduction + apply_threshold: Apply adaptive thresholding + + Returns: + PreprocessingResult with processed image bytes + """ + steps_applied = [] + + # Load image with PIL (handles HEIC via pillow-heif) + pil_image = Image.open(io.BytesIO(image_bytes)) + steps_applied.append("loaded") + + # Convert to RGB if needed + if pil_image.mode not in ("RGB", "L"): + pil_image = pil_image.convert("RGB") + steps_applied.append("convert_rgb") + + # Convert to OpenCV format + cv_image = np.array(pil_image) + if len(cv_image.shape) == 3: + cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR) + + # Convert to grayscale + if len(cv_image.shape) == 3: + gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) + else: + gray = cv_image + steps_applied.append("grayscale") + + # Apply deskew + if apply_deskew: + gray = self._deskew(gray) + steps_applied.append("deskew") + + # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) + if apply_clahe: + gray = self._apply_clahe(gray) + steps_applied.append("clahe") + + # Apply denoising + if apply_denoise: + gray = self._denoise(gray) + steps_applied.append("denoise") + + # Apply adaptive thresholding + if apply_threshold: + gray = self._adaptive_threshold(gray) + steps_applied.append("threshold") + + # Convert back to PNG bytes + result_image = Image.fromarray(gray) + buffer = io.BytesIO() + result_image.save(buffer, format="PNG") + + return PreprocessingResult( + image_bytes=buffer.getvalue(), + preprocessing_applied=steps_applied, + ) + + def _apply_clahe(self, image: np.ndarray) -> np.ndarray: + """ + Apply CLAHE (Contrast Limited Adaptive Histogram Equalization). + + CLAHE improves contrast in images with varying illumination, + which is common in VIN photos taken in different lighting conditions. + """ + try: + clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) + return clahe.apply(image) + except cv2.error as e: + logger.warning(f"CLAHE failed: {e}") + return image + + def _deskew(self, image: np.ndarray) -> np.ndarray: + """ + Correct image rotation using Hough transform line detection. + + VIN plates/stickers are often photographed at slight angles. + """ + try: + # Detect edges + edges = cv2.Canny(image, 50, 150, apertureSize=3) + + # Detect lines + lines = cv2.HoughLinesP( + edges, + rho=1, + theta=np.pi / 180, + threshold=100, + minLineLength=100, + maxLineGap=10, + ) + + if lines is None: + return image + + # Calculate angles of detected lines + angles = [] + for line in lines: + x1, y1, x2, y2 = line[0] + if x2 - x1 != 0: + angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi + # Only consider nearly horizontal lines + if -45 < angle < 45: + angles.append(angle) + + if not angles: + return image + + # Use median angle to avoid outliers + median_angle = np.median(angles) + + # Only correct if skew is significant but not extreme + if abs(median_angle) < 0.5 or abs(median_angle) > 20: + return image + + # Rotate to correct skew + height, width = image.shape[:2] + center = (width // 2, height // 2) + rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0) + + # Calculate new bounds + cos_val = abs(rotation_matrix[0, 0]) + sin_val = abs(rotation_matrix[0, 1]) + new_width = int(height * sin_val + width * cos_val) + new_height = int(height * cos_val + width * sin_val) + + rotation_matrix[0, 2] += (new_width - width) / 2 + rotation_matrix[1, 2] += (new_height - height) / 2 + + rotated = cv2.warpAffine( + image, + rotation_matrix, + (new_width, new_height), + borderMode=cv2.BORDER_REPLICATE, + ) + + logger.debug(f"Deskewed by {median_angle:.2f} degrees") + return rotated + + except Exception as e: + logger.warning(f"Deskew failed: {e}") + return image + + def _denoise(self, image: np.ndarray) -> np.ndarray: + """ + Apply non-local means denoising. + + This helps remove noise while preserving VIN character edges. + """ + try: + return cv2.fastNlMeansDenoising( + image, h=10, templateWindowSize=7, searchWindowSize=21 + ) + except cv2.error as e: + logger.warning(f"Denoising failed: {e}") + return image + + def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray: + """ + Apply adaptive thresholding for binarization. + + Adaptive thresholding handles varying illumination across the image, + which is common in VIN photos. + """ + try: + return cv2.adaptiveThreshold( + image, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blockSize=11, + C=2, + ) + except cv2.error as e: + logger.warning(f"Adaptive threshold failed: {e}") + return image + + def detect_vin_region(self, image_bytes: bytes) -> Optional[BoundingBox]: + """ + Attempt to detect the VIN region in an image. + + Uses contour detection to find rectangular regions that might contain VINs. + + Args: + image_bytes: Raw image bytes + + Returns: + BoundingBox of detected VIN region, or None if not found + """ + try: + pil_image = Image.open(io.BytesIO(image_bytes)) + if pil_image.mode != "L": + pil_image = pil_image.convert("L") + + cv_image = np.array(pil_image) + + # Apply preprocessing for better contour detection + blurred = cv2.GaussianBlur(cv_image, (5, 5), 0) + edges = cv2.Canny(blurred, 50, 150) + + # Find contours + contours, _ = cv2.findContours( + edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + + if not contours: + return None + + # Find rectangular contours with appropriate aspect ratio for VIN + # VIN is typically 17 characters, roughly 5:1 to 10:1 aspect ratio + vin_candidates = [] + + for contour in contours: + x, y, w, h = cv2.boundingRect(contour) + if h == 0: + continue + + aspect_ratio = w / h + area = w * h + + # VIN regions typically have: + # - Aspect ratio between 4:1 and 12:1 + # - Minimum area (to filter out noise) + if 4 <= aspect_ratio <= 12 and area > 1000: + vin_candidates.append((x, y, w, h, area)) + + if not vin_candidates: + return None + + # Return the largest candidate + vin_candidates.sort(key=lambda c: c[4], reverse=True) + x, y, w, h, _ = vin_candidates[0] + + return BoundingBox(x=x, y=y, width=w, height=h) + + except Exception as e: + logger.warning(f"VIN region detection failed: {e}") + return None + + +# Singleton instance +vin_preprocessor = VinPreprocessor() diff --git a/ocr/app/routers/extract.py b/ocr/app/routers/extract.py index 1f7afaf..23f8483 100644 --- a/ocr/app/routers/extract.py +++ b/ocr/app/routers/extract.py @@ -3,7 +3,8 @@ import logging from fastapi import APIRouter, File, HTTPException, Query, UploadFile -from app.models import OcrResponse +from app.extractors.vin_extractor import vin_extractor +from app.models import BoundingBox, OcrResponse, VinAlternative, VinExtractionResponse from app.services import ocr_service logger = logging.getLogger(__name__) @@ -67,3 +68,89 @@ async def extract_text( ) return result + + +@router.post("/vin", response_model=VinExtractionResponse) +async def extract_vin( + file: UploadFile = File(..., description="Image file containing VIN"), +) -> VinExtractionResponse: + """ + Extract VIN (Vehicle Identification Number) from an uploaded image. + + Uses VIN-optimized preprocessing and pattern matching: + - HEIC conversion (if needed) + - Grayscale conversion + - Deskew correction + - CLAHE contrast enhancement + - Noise reduction + - Adaptive thresholding + - VIN pattern matching (17 chars, excludes I/O/Q) + - Check digit validation + - Common OCR error correction (I->1, O->0, Q->0) + + Supports HEIC, JPEG, PNG formats. + Processing time target: <3 seconds. + + - **file**: Image file (max 10MB) + + Returns: + - **vin**: Extracted VIN (17 alphanumeric characters) + - **confidence**: Confidence score (0.0-1.0) + - **boundingBox**: Location of VIN in image (if detected) + - **alternatives**: Other VIN candidates with confidence scores + - **processingTimeMs**: Processing time in milliseconds + """ + # Validate file presence + if not file.filename: + raise HTTPException(status_code=400, detail="No file provided") + + # Read file content + content = await file.read() + file_size = len(content) + + # Validate file size + if file_size > MAX_SYNC_SIZE: + raise HTTPException( + status_code=413, + detail=f"File too large. Max: {MAX_SYNC_SIZE // (1024*1024)}MB", + ) + + if file_size == 0: + raise HTTPException(status_code=400, detail="Empty file provided") + + logger.info( + f"VIN extraction: {file.filename}, " + f"size: {file_size} bytes, " + f"content_type: {file.content_type}" + ) + + # Perform VIN extraction + result = vin_extractor.extract( + image_bytes=content, + content_type=file.content_type, + ) + + # Convert internal result to API response + bounding_box = None + if result.bounding_box: + bounding_box = BoundingBox( + x=result.bounding_box.x, + y=result.bounding_box.y, + width=result.bounding_box.width, + height=result.bounding_box.height, + ) + + alternatives = [ + VinAlternative(vin=alt.vin, confidence=alt.confidence) + for alt in result.alternatives + ] + + return VinExtractionResponse( + success=result.success, + vin=result.vin, + confidence=result.confidence, + boundingBox=bounding_box, + alternatives=alternatives, + processingTimeMs=result.processing_time_ms, + error=result.error, + ) diff --git a/ocr/app/validators/__init__.py b/ocr/app/validators/__init__.py new file mode 100644 index 0000000..6076070 --- /dev/null +++ b/ocr/app/validators/__init__.py @@ -0,0 +1,4 @@ +"""Validators package for OCR data validation.""" +from app.validators.vin_validator import VinValidator, vin_validator + +__all__ = ["VinValidator", "vin_validator"] diff --git a/ocr/app/validators/vin_validator.py b/ocr/app/validators/vin_validator.py new file mode 100644 index 0000000..6a4b264 --- /dev/null +++ b/ocr/app/validators/vin_validator.py @@ -0,0 +1,259 @@ +"""VIN validation with check digit verification and OCR error correction.""" +import re +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class VinValidationResult: + """Result of VIN validation.""" + + is_valid: bool + vin: str + confidence_adjustment: float + error: Optional[str] = None + + +class VinValidator: + """Validates and corrects VIN strings.""" + + # VIN character set (excludes I, O, Q) + VALID_CHARS = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789") + + # Common OCR misreads and their corrections + TRANSLITERATION = { + "I": "1", + "O": "0", + "Q": "0", + "i": "1", + "o": "0", + "q": "0", + "l": "1", + "L": "1", + "B": "8", # Sometimes confused + "S": "5", # Sometimes confused + } + + # Weights for check digit calculation (positions 1-17) + CHECK_WEIGHTS = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2] + + # Character to value mapping for check digit + CHAR_VALUES = { + "A": 1, + "B": 2, + "C": 3, + "D": 4, + "E": 5, + "F": 6, + "G": 7, + "H": 8, + "J": 1, + "K": 2, + "L": 3, + "M": 4, + "N": 5, + "P": 7, + "R": 9, + "S": 2, + "T": 3, + "U": 4, + "V": 5, + "W": 6, + "X": 7, + "Y": 8, + "Z": 9, + "0": 0, + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + } + + # Modern VIN pattern (1981+): exactly 17 alphanumeric, no I/O/Q + MODERN_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{17}$") + + # Pre-1981 VIN pattern: 11-17 characters + LEGACY_VIN_PATTERN = re.compile(r"^[A-HJ-NPR-Z0-9]{11,17}$") + + def correct_ocr_errors(self, vin: str) -> str: + """ + Apply common OCR error corrections to a VIN string. + + Args: + vin: Raw VIN string from OCR + + Returns: + Corrected VIN string + """ + corrected = vin.upper().strip() + + # Remove any spaces or dashes (common in formatted VINs) + corrected = corrected.replace(" ", "").replace("-", "") + + # Apply transliteration for common OCR errors + result = [] + for char in corrected: + if char in self.TRANSLITERATION: + result.append(self.TRANSLITERATION[char]) + else: + result.append(char) + + return "".join(result) + + def calculate_check_digit(self, vin: str) -> Optional[str]: + """ + Calculate the check digit (position 9) for a VIN. + + Args: + vin: 17-character VIN string + + Returns: + Expected check digit character, or None if calculation fails + """ + if len(vin) != 17: + return None + + try: + total = 0 + for i, char in enumerate(vin.upper()): + if i == 8: # Skip check digit position + continue + value = self.CHAR_VALUES.get(char) + if value is None: + return None + total += value * self.CHECK_WEIGHTS[i] + + remainder = total % 11 + if remainder == 10: + return "X" + return str(remainder) + except (KeyError, ValueError): + return None + + def validate_check_digit(self, vin: str) -> bool: + """ + Validate the check digit of a VIN. + + Args: + vin: 17-character VIN string + + Returns: + True if check digit is valid + """ + if len(vin) != 17: + return False + + expected = self.calculate_check_digit(vin) + if expected is None: + return False + + return vin[8].upper() == expected + + def validate( + self, vin: str, correct_errors: bool = True, allow_legacy: bool = False + ) -> VinValidationResult: + """ + Validate a VIN string and optionally correct OCR errors. + + Args: + vin: VIN string to validate + correct_errors: Whether to apply OCR error corrections + allow_legacy: Whether to allow pre-1981 VINs (11-17 chars) + + Returns: + VinValidationResult with validation status and corrected VIN + """ + if not vin: + return VinValidationResult( + is_valid=False, vin="", confidence_adjustment=-1.0, error="Empty VIN" + ) + + # Apply error corrections if enabled + corrected_vin = self.correct_ocr_errors(vin) if correct_errors else vin.upper() + + # Check length + if len(corrected_vin) != 17: + if allow_legacy and 11 <= len(corrected_vin) <= 17: + # Legacy VIN - reduced confidence + if self.LEGACY_VIN_PATTERN.match(corrected_vin): + return VinValidationResult( + is_valid=True, + vin=corrected_vin, + confidence_adjustment=-0.2, + ) + return VinValidationResult( + is_valid=False, + vin=corrected_vin, + confidence_adjustment=-0.5, + error=f"Invalid length: {len(corrected_vin)} (expected 17)", + ) + + # Check character set + if not self.MODERN_VIN_PATTERN.match(corrected_vin): + invalid_chars = [c for c in corrected_vin if c not in self.VALID_CHARS] + return VinValidationResult( + is_valid=False, + vin=corrected_vin, + confidence_adjustment=-0.3, + error=f"Invalid characters: {invalid_chars}", + ) + + # Validate check digit + if self.validate_check_digit(corrected_vin): + # Valid check digit - boost confidence + return VinValidationResult( + is_valid=True, vin=corrected_vin, confidence_adjustment=0.1 + ) + else: + # Invalid check digit - could be OCR error or old VIN + return VinValidationResult( + is_valid=True, # Still return as valid but with reduced confidence + vin=corrected_vin, + confidence_adjustment=-0.15, + error="Check digit validation failed", + ) + + def extract_candidates( + self, text: str, max_candidates: int = 5 + ) -> list[tuple[str, int, int]]: + """ + Extract VIN candidates from raw OCR text. + + Args: + text: Raw OCR text + max_candidates: Maximum number of candidates to return + + Returns: + List of (vin, start_pos, end_pos) tuples + """ + # Pattern to find potential VIN sequences + # Allow some flexibility for OCR errors (include I, O, Q for correction later) + potential_vin_pattern = re.compile(r"[A-Z0-9IOQ]{11,17}", re.IGNORECASE) + + candidates = [] + for match in potential_vin_pattern.finditer(text.upper()): + candidate = match.group() + corrected = self.correct_ocr_errors(candidate) + + # Only include if it could be a valid VIN after correction + if len(corrected) == 17 and self.MODERN_VIN_PATTERN.match(corrected): + candidates.append((corrected, match.start(), match.end())) + + # Sort by likelihood of being valid (check digit validation) + def score_candidate(c: tuple[str, int, int]) -> int: + vin = c[0] + if self.validate_check_digit(vin): + return 0 # Best score + return 1 + + candidates.sort(key=score_candidate) + return candidates[:max_candidates] + + +# Singleton instance +vin_validator = VinValidator() diff --git a/ocr/tests/test_vin_extraction.py b/ocr/tests/test_vin_extraction.py new file mode 100644 index 0000000..b2c8170 --- /dev/null +++ b/ocr/tests/test_vin_extraction.py @@ -0,0 +1,242 @@ +"""Integration tests for VIN extraction endpoint.""" +import io +from unittest.mock import patch, MagicMock + +import pytest +from fastapi.testclient import TestClient +from PIL import Image, ImageDraw, ImageFont + +from app.main import app + + +@pytest.fixture +def client() -> TestClient: + """Create test client.""" + return TestClient(app) + + +def create_vin_image(vin: str = "1HGBH41JXMN109186") -> bytes: + """Create a test image with VIN text.""" + # Create white image + image = Image.new("RGB", (400, 100), (255, 255, 255)) + draw = ImageDraw.Draw(image) + + # Draw VIN text (use default font) + draw.text((50, 40), vin, fill=(0, 0, 0)) + + buffer = io.BytesIO() + image.save(buffer, format="PNG") + return buffer.getvalue() + + +def create_empty_image() -> bytes: + """Create an empty test image.""" + image = Image.new("RGB", (400, 100), (255, 255, 255)) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + return buffer.getvalue() + + +class TestVinExtractionEndpoint: + """Tests for POST /extract/vin endpoint.""" + + def test_endpoint_exists(self, client: TestClient) -> None: + """Test VIN endpoint is registered.""" + response = client.get("/") + assert response.status_code == 200 + data = response.json() + assert any("vin" in endpoint.lower() for endpoint in data.get("endpoints", [])) + + def test_extract_vin_no_file(self, client: TestClient) -> None: + """Test endpoint returns error when no file provided.""" + response = client.post("/extract/vin") + assert response.status_code == 422 # Validation error + + def test_extract_vin_empty_file(self, client: TestClient) -> None: + """Test endpoint returns error for empty file.""" + response = client.post( + "/extract/vin", + files={"file": ("empty.png", b"", "image/png")}, + ) + assert response.status_code == 400 + assert "empty" in response.json()["detail"].lower() + + def test_extract_vin_large_file(self, client: TestClient) -> None: + """Test endpoint returns error for file too large.""" + # Create file larger than 10MB + large_content = b"x" * (11 * 1024 * 1024) + + response = client.post( + "/extract/vin", + files={"file": ("large.png", large_content, "image/png")}, + ) + assert response.status_code == 413 + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_extract_vin_success( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test successful VIN extraction.""" + from app.extractors.vin_extractor import VinExtractionResult + + mock_extract.return_value = VinExtractionResult( + success=True, + vin="1HGBH41JXMN109186", + confidence=0.94, + bounding_box=None, + alternatives=[], + processing_time_ms=500, + ) + + image_bytes = create_vin_image() + response = client.post( + "/extract/vin", + files={"file": ("vin.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert data["vin"] == "1HGBH41JXMN109186" + assert data["confidence"] == 0.94 + assert "processingTimeMs" in data + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_extract_vin_not_found( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test VIN not found returns success=false.""" + from app.extractors.vin_extractor import VinExtractionResult + + mock_extract.return_value = VinExtractionResult( + success=False, + vin=None, + confidence=0.0, + error="No VIN pattern found in image", + processing_time_ms=300, + ) + + image_bytes = create_empty_image() + response = client.post( + "/extract/vin", + files={"file": ("empty.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is False + assert data["vin"] is None + assert data["error"] == "No VIN pattern found in image" + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_extract_vin_with_alternatives( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test VIN extraction with alternatives.""" + from app.extractors.vin_extractor import VinExtractionResult, VinAlternative + + mock_extract.return_value = VinExtractionResult( + success=True, + vin="1HGBH41JXMN109186", + confidence=0.94, + bounding_box=None, + alternatives=[ + VinAlternative(vin="1HGBH41JXMN109186", confidence=0.72), + ], + processing_time_ms=600, + ) + + image_bytes = create_vin_image() + response = client.post( + "/extract/vin", + files={"file": ("vin.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert len(data["alternatives"]) == 1 + assert data["alternatives"][0]["confidence"] == 0.72 + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_extract_vin_with_bounding_box( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test VIN extraction includes bounding box.""" + from app.extractors.vin_extractor import VinExtractionResult + from app.preprocessors.vin_preprocessor import BoundingBox + + mock_extract.return_value = VinExtractionResult( + success=True, + vin="1HGBH41JXMN109186", + confidence=0.94, + bounding_box=BoundingBox(x=50, y=40, width=300, height=20), + alternatives=[], + processing_time_ms=500, + ) + + image_bytes = create_vin_image() + response = client.post( + "/extract/vin", + files={"file": ("vin.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["boundingBox"] is not None + assert data["boundingBox"]["x"] == 50 + assert data["boundingBox"]["y"] == 40 + assert data["boundingBox"]["width"] == 300 + assert data["boundingBox"]["height"] == 20 + + +class TestVinExtractionContentTypes: + """Tests for different content types.""" + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_accepts_jpeg( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test endpoint accepts JPEG images.""" + from app.extractors.vin_extractor import VinExtractionResult + + mock_extract.return_value = VinExtractionResult( + success=True, + vin="1HGBH41JXMN109186", + confidence=0.9, + processing_time_ms=400, + ) + + # Create JPEG image + image = Image.new("RGB", (400, 100), (255, 255, 255)) + buffer = io.BytesIO() + image.save(buffer, format="JPEG") + + response = client.post( + "/extract/vin", + files={"file": ("vin.jpg", buffer.getvalue(), "image/jpeg")}, + ) + + assert response.status_code == 200 + + @patch("app.extractors.vin_extractor.vin_extractor.extract") + def test_accepts_png( + self, mock_extract: MagicMock, client: TestClient + ) -> None: + """Test endpoint accepts PNG images.""" + from app.extractors.vin_extractor import VinExtractionResult + + mock_extract.return_value = VinExtractionResult( + success=True, + vin="1HGBH41JXMN109186", + confidence=0.9, + processing_time_ms=400, + ) + + image_bytes = create_vin_image() + response = client.post( + "/extract/vin", + files={"file": ("vin.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 diff --git a/ocr/tests/test_vin_preprocessor.py b/ocr/tests/test_vin_preprocessor.py new file mode 100644 index 0000000..8076294 --- /dev/null +++ b/ocr/tests/test_vin_preprocessor.py @@ -0,0 +1,202 @@ +"""Unit tests for VIN preprocessor.""" +import io +from unittest.mock import patch, MagicMock + +import numpy as np +import pytest +from PIL import Image + +from app.preprocessors.vin_preprocessor import VinPreprocessor, vin_preprocessor + + +def create_test_image(width: int = 400, height: int = 100, color: int = 128) -> bytes: + """Create a simple test image.""" + image = Image.new("RGB", (width, height), (color, color, color)) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + return buffer.getvalue() + + +def create_grayscale_test_image(width: int = 400, height: int = 100) -> bytes: + """Create a grayscale test image.""" + image = Image.new("L", (width, height), 128) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + return buffer.getvalue() + + +class TestVinPreprocessor: + """Tests for VIN-optimized preprocessing.""" + + def test_preprocess_returns_result(self) -> None: + """Test basic preprocessing returns a result.""" + preprocessor = VinPreprocessor() + image_bytes = create_test_image() + + result = preprocessor.preprocess(image_bytes) + + assert result.image_bytes is not None + assert len(result.image_bytes) > 0 + assert "grayscale" in result.preprocessing_applied + + def test_preprocess_applies_all_steps(self) -> None: + """Test preprocessing applies all requested steps.""" + preprocessor = VinPreprocessor() + image_bytes = create_test_image() + + result = preprocessor.preprocess( + image_bytes, + apply_clahe=True, + apply_deskew=True, + apply_denoise=True, + apply_threshold=True, + ) + + assert "grayscale" in result.preprocessing_applied + assert "clahe" in result.preprocessing_applied + assert "deskew" in result.preprocessing_applied + assert "denoise" in result.preprocessing_applied + assert "threshold" in result.preprocessing_applied + + def test_preprocess_skips_disabled_steps(self) -> None: + """Test preprocessing skips disabled steps.""" + preprocessor = VinPreprocessor() + image_bytes = create_test_image() + + result = preprocessor.preprocess( + image_bytes, + apply_clahe=False, + apply_deskew=False, + apply_denoise=False, + apply_threshold=False, + ) + + assert "clahe" not in result.preprocessing_applied + assert "deskew" not in result.preprocessing_applied + assert "denoise" not in result.preprocessing_applied + assert "threshold" not in result.preprocessing_applied + + def test_preprocess_output_is_valid_image(self) -> None: + """Test preprocessing output is a valid PNG image.""" + preprocessor = VinPreprocessor() + image_bytes = create_test_image() + + result = preprocessor.preprocess(image_bytes) + + # Should be able to open as image + output_image = Image.open(io.BytesIO(result.image_bytes)) + assert output_image is not None + assert output_image.format == "PNG" + + def test_preprocess_handles_grayscale_input(self) -> None: + """Test preprocessing handles grayscale input.""" + preprocessor = VinPreprocessor() + image_bytes = create_grayscale_test_image() + + result = preprocessor.preprocess(image_bytes) + + assert result.image_bytes is not None + assert len(result.image_bytes) > 0 + + def test_preprocess_handles_rgba_input(self) -> None: + """Test preprocessing handles RGBA input.""" + preprocessor = VinPreprocessor() + + # Create RGBA image + image = Image.new("RGBA", (400, 100), (128, 128, 128, 255)) + buffer = io.BytesIO() + image.save(buffer, format="PNG") + + result = preprocessor.preprocess(buffer.getvalue()) + + assert result.image_bytes is not None + assert "convert_rgb" in result.preprocessing_applied + + def test_singleton_instance(self) -> None: + """Test singleton instance is available.""" + assert vin_preprocessor is not None + assert isinstance(vin_preprocessor, VinPreprocessor) + + +class TestVinPreprocessorDeskew: + """Tests for deskew functionality.""" + + def test_deskew_no_change_for_straight_image(self) -> None: + """Test deskew doesn't change a straight image significantly.""" + preprocessor = VinPreprocessor() + + # Create image with horizontal line (no skew) + image = np.zeros((100, 400), dtype=np.uint8) + image[50, 50:350] = 255 # Horizontal line + + result = preprocessor._deskew(image) + + # Shape should be similar (might change slightly due to processing) + assert result.shape[0] > 0 + assert result.shape[1] > 0 + + +class TestVinPreprocessorCLAHE: + """Tests for CLAHE contrast enhancement.""" + + def test_clahe_improves_contrast(self) -> None: + """Test CLAHE changes the image.""" + preprocessor = VinPreprocessor() + + # Create low contrast image + image = np.full((100, 400), 128, dtype=np.uint8) + + result = preprocessor._apply_clahe(image) + + # Result should be numpy array of same shape + assert result.shape == image.shape + + +class TestVinPreprocessorDenoise: + """Tests for denoising functionality.""" + + def test_denoise_reduces_noise(self) -> None: + """Test denoising works on noisy image.""" + preprocessor = VinPreprocessor() + + # Create noisy image + image = np.random.randint(0, 256, (100, 400), dtype=np.uint8) + + result = preprocessor._denoise(image) + + # Should return array of same shape + assert result.shape == image.shape + + +class TestVinPreprocessorThreshold: + """Tests for adaptive thresholding.""" + + def test_threshold_creates_binary_image(self) -> None: + """Test thresholding creates binary output.""" + preprocessor = VinPreprocessor() + + # Create grayscale image + image = np.full((100, 400), 128, dtype=np.uint8) + + result = preprocessor._adaptive_threshold(image) + + # Result should be binary (only 0 and 255) + unique_values = np.unique(result) + assert len(unique_values) <= 2 + + +class TestVinRegionDetection: + """Tests for VIN region detection.""" + + def test_detect_vin_region_returns_none_for_empty(self) -> None: + """Test region detection returns None for empty image.""" + preprocessor = VinPreprocessor() + + # Solid color image - no regions to detect + image_bytes = create_test_image(color=128) + + result = preprocessor.detect_vin_region(image_bytes) + + # May return None for uniform image + # This is expected behavior + assert result is None or result.width > 0 diff --git a/ocr/tests/test_vin_validator.py b/ocr/tests/test_vin_validator.py new file mode 100644 index 0000000..26f170b --- /dev/null +++ b/ocr/tests/test_vin_validator.py @@ -0,0 +1,211 @@ +"""Unit tests for VIN validator.""" +import pytest + +from app.validators.vin_validator import VinValidator, vin_validator + + +class TestVinValidator: + """Tests for VIN validation logic.""" + + def test_correct_ocr_errors_basic(self) -> None: + """Test basic OCR error correction.""" + validator = VinValidator() + + # I -> 1 + assert validator.correct_ocr_errors("IHGBH41JXMN109186") == "1HGBH41JXMN109186" + + # O -> 0 + assert validator.correct_ocr_errors("1HGBH41JXMN1O9186") == "1HGBH41JXMN109186" + + # Q -> 0 + assert validator.correct_ocr_errors("1HGBH41JXMN1Q9186") == "1HGBH41JXMN109186" + + def test_correct_ocr_errors_lowercase(self) -> None: + """Test OCR error correction handles lowercase.""" + validator = VinValidator() + + result = validator.correct_ocr_errors("1hgbh41jxmn109186") + assert result == "1HGBH41JXMN109186" + + def test_correct_ocr_errors_strips_spaces(self) -> None: + """Test OCR error correction removes spaces and dashes.""" + validator = VinValidator() + + assert validator.correct_ocr_errors("1HG BH41 JXMN 109186") == "1HGBH41JXMN109186" + assert validator.correct_ocr_errors("1HG-BH41-JXMN-109186") == "1HGBH41JXMN109186" + + def test_calculate_check_digit(self) -> None: + """Test check digit calculation.""" + validator = VinValidator() + + # Test with known valid VINs + # 1HGBH41JXMN109186 has check digit X at position 9 + result = validator.calculate_check_digit("1HGBH41JXMN109186") + assert result == "X" + + # 5YJSA1E28HF123456 has check digit 2 at position 9 + result = validator.calculate_check_digit("5YJSA1E28HF123456") + assert result == "8" # Verify this is correct for this VIN + + def test_validate_check_digit_valid(self) -> None: + """Test check digit validation with valid VIN.""" + validator = VinValidator() + + # This VIN has a valid check digit + assert validator.validate_check_digit("1HGBH41JXMN109186") is True + + def test_validate_check_digit_invalid(self) -> None: + """Test check digit validation with invalid VIN.""" + validator = VinValidator() + + # Modify check digit to make it invalid + assert validator.validate_check_digit("1HGBH41J1MN109186") is False + + def test_validate_modern_vin_valid(self) -> None: + """Test validation of valid modern VIN.""" + validator = VinValidator() + + result = validator.validate("1HGBH41JXMN109186") + assert result.is_valid is True + assert result.vin == "1HGBH41JXMN109186" + assert result.confidence_adjustment > 0 # Check digit valid = boost + + def test_validate_modern_vin_with_ocr_errors(self) -> None: + """Test validation corrects OCR errors.""" + validator = VinValidator() + + # I at start should be corrected to 1 + result = validator.validate("IHGBH41JXMN109186") + assert result.is_valid is True + assert result.vin == "1HGBH41JXMN109186" + + def test_validate_short_vin(self) -> None: + """Test validation rejects short VIN.""" + validator = VinValidator() + + result = validator.validate("1HGBH41JX") + assert result.is_valid is False + assert "length" in result.error.lower() + + def test_validate_long_vin(self) -> None: + """Test validation rejects long VIN.""" + validator = VinValidator() + + result = validator.validate("1HGBH41JXMN109186XX") + assert result.is_valid is False + assert "length" in result.error.lower() + + def test_validate_empty_vin(self) -> None: + """Test validation handles empty VIN.""" + validator = VinValidator() + + result = validator.validate("") + assert result.is_valid is False + assert "empty" in result.error.lower() + + def test_validate_invalid_characters(self) -> None: + """Test validation rejects invalid characters after correction.""" + validator = VinValidator() + + # Contains characters not in VIN alphabet + result = validator.validate("1HGBH41JXMN!@#186", correct_errors=False) + assert result.is_valid is False + assert "character" in result.error.lower() + + def test_validate_legacy_vin_allowed(self) -> None: + """Test validation allows legacy VINs when enabled.""" + validator = VinValidator() + + # 13-character VIN (pre-1981) + result = validator.validate("ABCD123456789", allow_legacy=True) + assert result.is_valid is True + assert result.confidence_adjustment < 0 # Reduced confidence for legacy + + def test_validate_legacy_vin_rejected(self) -> None: + """Test validation rejects legacy VINs by default.""" + validator = VinValidator() + + result = validator.validate("ABCD123456789", allow_legacy=False) + assert result.is_valid is False + + def test_extract_candidates_finds_vin(self) -> None: + """Test candidate extraction from text.""" + validator = VinValidator() + + text = "VIN: 1HGBH41JXMN109186 is shown here" + candidates = validator.extract_candidates(text) + + assert len(candidates) >= 1 + assert candidates[0][0] == "1HGBH41JXMN109186" + + def test_extract_candidates_multiple_vins(self) -> None: + """Test candidate extraction with multiple VINs.""" + validator = VinValidator() + + text = "First VIN: 1HGBH41JXMN109186 Second VIN: 5YJSA1E28HF123456" + candidates = validator.extract_candidates(text) + + assert len(candidates) >= 2 + vins = [c[0] for c in candidates] + assert "1HGBH41JXMN109186" in vins + assert "5YJSA1E28HF123456" in vins + + def test_extract_candidates_with_ocr_errors(self) -> None: + """Test candidate extraction corrects OCR errors.""" + validator = VinValidator() + + # Contains O instead of 0 + text = "VIN: 1HGBH41JXMN1O9186" + candidates = validator.extract_candidates(text) + + assert len(candidates) >= 1 + assert candidates[0][0] == "1HGBH41JXMN109186" + + def test_extract_candidates_no_vin(self) -> None: + """Test candidate extraction with no VIN.""" + validator = VinValidator() + + text = "This text contains no VIN numbers" + candidates = validator.extract_candidates(text) + + assert len(candidates) == 0 + + def test_singleton_instance(self) -> None: + """Test singleton instance is available.""" + assert vin_validator is not None + assert isinstance(vin_validator, VinValidator) + + +class TestVinValidatorEdgeCases: + """Edge case tests for VIN validator.""" + + def test_all_zeros_vin(self) -> None: + """Test VIN with all zeros (unlikely but valid format).""" + validator = VinValidator() + + result = validator.validate("00000000000000000") + assert result.is_valid is True + assert len(result.vin) == 17 + + def test_mixed_case_vin(self) -> None: + """Test VIN with mixed case.""" + validator = VinValidator() + + result = validator.validate("1hGbH41jXmN109186") + assert result.is_valid is True + assert result.vin == "1HGBH41JXMN109186" + + def test_vin_with_leading_trailing_whitespace(self) -> None: + """Test VIN with whitespace.""" + validator = VinValidator() + + result = validator.validate(" 1HGBH41JXMN109186 ") + assert result.is_valid is True + assert result.vin == "1HGBH41JXMN109186" + + def test_check_digit_x(self) -> None: + """Test VIN with X as check digit.""" + validator = VinValidator() + + # 1HGBH41JXMN109186 has X as check digit + assert validator.validate_check_digit("1HGBH41JXMN109186") is True