All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 35s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 51s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m31s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 8s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped
Root cause: Tesseract fragments VINs into multiple words, but candidate extraction required a continuous 17-character sequence, so every result was rejected.

Changes:
- Fix candidate extraction to concatenate adjacent OCR fragments
- Disable Tesseract dictionaries (VINs are not dictionary words)
- Set OEM 1 (LSTM engine) for better accuracy
- Add PSM 11 (sparse text) and PSM 13 (raw line) fallback modes
- Add Otsu's thresholding as an alternative preprocessing pipeline
- Upscale small images to meet Tesseract's 300 DPI requirement
- Remove incorrect B->8 and S->5 transliterations (B and S are valid VIN characters)
- Fix pre-existing test bug in check digit expected value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
233 lines
8.3 KiB
Python
233 lines
8.3 KiB
Python
"""Unit tests for VIN validator."""
|
|
import pytest
|
|
|
|
from app.validators.vin_validator import VinValidator, vin_validator
|
|
|
|
|
|
class TestVinValidator:
    """Core VinValidator tests: OCR correction, check digits, validation, extraction."""

    def test_correct_ocr_errors_basic(self) -> None:
        """Expect the look-alike letters I, O and Q to come back as digits."""
        checker = VinValidator()
        expected = "1HGBH41JXMN109186"

        # Leading I is expected to become 1.
        assert checker.correct_ocr_errors("IHGBH41JXMN109186") == expected

        # O is expected to become 0.
        assert checker.correct_ocr_errors("1HGBH41JXMN1O9186") == expected

        # Q is expected to become 0 as well.
        assert checker.correct_ocr_errors("1HGBH41JXMN1Q9186") == expected

    def test_correct_ocr_errors_lowercase(self) -> None:
        """Expect lowercase input to be returned upper-cased."""
        checker = VinValidator()

        assert checker.correct_ocr_errors("1hgbh41jxmn109186") == "1HGBH41JXMN109186"

    def test_correct_ocr_errors_strips_spaces(self) -> None:
        """Expect space and dash separators to be dropped from the result."""
        checker = VinValidator()

        # Same VIN, first split by spaces and then by dashes.
        for raw in ("1HG BH41 JXMN 109186", "1HG-BH41-JXMN-109186"):
            assert checker.correct_ocr_errors(raw) == "1HGBH41JXMN109186"

    def test_calculate_check_digit(self) -> None:
        """Expect the computed check digit to equal the one at position 9."""
        checker = VinValidator()

        # Known-good VIN whose ninth character is X.
        assert checker.calculate_check_digit("1HGBH41JXMN109186") == "X"

        # Second sample VIN; the expected computed digit is 5.
        assert checker.calculate_check_digit("5YJSA1E28HF123456") == "5"

    def test_validate_check_digit_valid(self) -> None:
        """Expect a VIN with a matching check digit to pass."""
        checker = VinValidator()

        assert checker.validate_check_digit("1HGBH41JXMN109186") is True

    def test_validate_check_digit_invalid(self) -> None:
        """Expect a VIN with an altered check digit to fail."""
        checker = VinValidator()

        # Position 9 holds 1 here instead of the X used by the valid sample.
        assert checker.validate_check_digit("1HGBH41J1MN109186") is False

    def test_validate_modern_vin_valid(self) -> None:
        """Expect a well-formed 17-character VIN to validate unchanged."""
        outcome = VinValidator().validate("1HGBH41JXMN109186")

        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"
        # A valid check digit is expected to yield a positive adjustment.
        assert outcome.confidence_adjustment > 0

    def test_validate_modern_vin_with_ocr_errors(self) -> None:
        """Expect validate() to return the OCR-corrected VIN."""
        # The leading I stands in for a misread 1.
        outcome = VinValidator().validate("IHGBH41JXMN109186")

        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"

    def test_validate_short_vin(self) -> None:
        """Expect a too-short VIN to be rejected with a length error."""
        outcome = VinValidator().validate("1HGBH41JX")

        assert outcome.is_valid is False
        assert "length" in outcome.error.lower()

    def test_validate_long_vin(self) -> None:
        """Expect a too-long VIN to be rejected with a length error."""
        outcome = VinValidator().validate("1HGBH41JXMN109186XX")

        assert outcome.is_valid is False
        assert "length" in outcome.error.lower()

    def test_validate_empty_vin(self) -> None:
        """Expect an empty string to be rejected with an 'empty' error."""
        outcome = VinValidator().validate("")

        assert outcome.is_valid is False
        assert "empty" in outcome.error.lower()

    def test_validate_invalid_characters(self) -> None:
        """Expect punctuation in the VIN to be rejected with a character error."""
        # Called with correct_errors=False; "!@#" are not VIN characters.
        outcome = VinValidator().validate("1HGBH41JXMN!@#186", correct_errors=False)

        assert outcome.is_valid is False
        assert "character" in outcome.error.lower()

    def test_validate_legacy_vin_allowed(self) -> None:
        """Expect a 13-character VIN to pass when allow_legacy=True."""
        # 13-character VIN (pre-1981 style).
        outcome = VinValidator().validate("ABCD123456789", allow_legacy=True)

        assert outcome.is_valid is True
        # Legacy VINs are expected to carry a negative adjustment.
        assert outcome.confidence_adjustment < 0

    def test_validate_legacy_vin_rejected(self) -> None:
        """Expect a 13-character VIN to fail when allow_legacy=False."""
        outcome = VinValidator().validate("ABCD123456789", allow_legacy=False)

        assert outcome.is_valid is False

    def test_extract_candidates_finds_vin(self) -> None:
        """Expect a VIN embedded in a sentence to be the first candidate."""
        found = VinValidator().extract_candidates("VIN: 1HGBH41JXMN109186 is shown here")

        assert len(found) >= 1
        assert found[0][0] == "1HGBH41JXMN109186"

    def test_extract_candidates_multiple_vins(self) -> None:
        """Expect both VINs in one text to appear among the candidates."""
        found = VinValidator().extract_candidates(
            "First VIN: 1HGBH41JXMN109186 Second VIN: 5YJSA1E28HF123456"
        )

        assert len(found) >= 2
        # The first element of each candidate is compared against the VINs.
        leading_fields = [entry[0] for entry in found]
        assert "1HGBH41JXMN109186" in leading_fields
        assert "5YJSA1E28HF123456" in leading_fields

    def test_extract_candidates_with_ocr_errors(self) -> None:
        """Expect extraction to return the OCR-corrected VIN."""
        # The text carries a letter O where the VIN has a zero.
        found = VinValidator().extract_candidates("VIN: 1HGBH41JXMN1O9186")

        assert len(found) >= 1
        assert found[0][0] == "1HGBH41JXMN109186"

    def test_extract_candidates_fragmented_vin(self) -> None:
        """Expect a VIN split into space-separated pieces to be reassembled."""
        # OCR output can split one VIN into several words.
        found = VinValidator().extract_candidates("1HGBH 41JXMN 109186")

        assert len(found) >= 1
        assert found[0][0] == "1HGBH41JXMN109186"

    def test_extract_candidates_dash_fragmented_vin(self) -> None:
        """Expect a VIN split by a dash to be reassembled."""
        found = VinValidator().extract_candidates("1HGBH41J-XMN109186")

        assert len(found) >= 1
        assert found[0][0] == "1HGBH41JXMN109186"

    def test_extract_candidates_no_vin(self) -> None:
        """Expect ordinary prose to yield no candidates."""
        found = VinValidator().extract_candidates("This text contains no VIN numbers")

        assert len(found) == 0

    def test_singleton_instance(self) -> None:
        """Expect the module-level vin_validator to be a VinValidator."""
        assert vin_validator is not None
        assert isinstance(vin_validator, VinValidator)
|
|
|
|
|
|
class TestVinValidatorEdgeCases:
    """Boundary-condition tests for VinValidator."""

    def test_all_zeros_vin(self) -> None:
        """Expect an all-zero VIN to validate and stay 17 characters long."""
        outcome = VinValidator().validate("00000000000000000")

        assert outcome.is_valid is True
        assert len(outcome.vin) == 17

    def test_mixed_case_vin(self) -> None:
        """Expect a mixed-case VIN to validate and come back upper-cased."""
        outcome = VinValidator().validate("1hGbH41jXmN109186")

        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"

    def test_vin_with_leading_trailing_whitespace(self) -> None:
        """Expect surrounding whitespace to be absent from the returned VIN."""
        outcome = VinValidator().validate(" 1HGBH41JXMN109186 ")

        assert outcome.is_valid is True
        assert outcome.vin == "1HGBH41JXMN109186"

    def test_check_digit_x(self) -> None:
        """Expect a VIN whose check digit is the letter X to pass."""
        checker = VinValidator()

        # The ninth character of this VIN is X.
        assert checker.validate_check_digit("1HGBH41JXMN109186") is True
|