feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing

Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required

File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 16:02:11 -06:00
parent 94e49306dc
commit 852c9013b5
25 changed files with 1931 additions and 3 deletions

View File

@@ -0,0 +1,275 @@
"""Core OCR service using Tesseract with HEIC support."""
import io
import logging
import re
import time
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
# Register the HEIF/HEIC opener with Pillow at import time so that
# Image.open() can read HEIC/HEIF input directly.
register_heif_opener()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class OcrService:
    """Core OCR processing service.

    Runs Tesseract (through pytesseract) over an uploaded image, a HEIC/HEIF
    photo, or the first page of a PDF, then classifies the recognized text and
    extracts a few well-known fields (VIN, receipt total/date/gallons).
    """

    # Supported MIME types
    SUPPORTED_TYPES = {
        "image/jpeg",
        "image/png",
        "image/heic",
        "image/heif",
        "application/pdf",
    }

    # Keyword hints for document classification. They are matched against the
    # lower-cased OCR text by _count_indicators().
    _VIN_INDICATORS = (
        "vin",
        "vehicle identification",
        "title",
        "registration",
        "certificate of title",
    )
    _RECEIPT_INDICATORS = (
        "receipt",
        "total",
        "subtotal",
        "tax",
        "payment",
        "invoice",
        "amount due",
        "gallons",
        "price/gallon",
    )
    _MANUAL_INDICATORS = (
        "owner's manual",
        "maintenance schedule",
        "service interval",
        "chapter",
        "table of contents",
        "specifications",
    )

    # VIN: 17 alphanumeric characters, excluding I, O and Q.
    _VIN_RE = re.compile(r"\b[A-HJ-NPR-Z0-9]{17}\b")
    # Currency amount: either comma-grouped thousands ("1,234.56") or a plain
    # digit run ("1234.56"), each with an optional two-digit fraction. The
    # plain-digit alternative keeps "$1234.56" from being cut down to "123".
    _AMOUNT_RE = re.compile(
        r"\$\s*(\d{1,3}(?:,\d{3})+(?:\.\d{2})?|\d+(?:\.\d{2})?)"
    )
    # Numeric date such as 1/2/24 or 01-02-2024.
    _DATE_RE = re.compile(r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})")
    # Fuel quantity followed by gal / gals / gallon / gallons. The trailing
    # word boundary stops unrelated words such as "galaxy" from matching.
    _GALLONS_RE = re.compile(
        r"(\d+(?:\.\d+)?)\s*gal(?:lons?|s)?\b", re.IGNORECASE
    )

    def __init__(self) -> None:
        """Initialize OCR service by pointing pytesseract at the configured binary."""
        pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd

    def extract(
        self,
        file_bytes: bytes,
        content_type: Optional[str] = None,
        preprocess: bool = True,
    ) -> OcrResponse:
        """
        Extract text from an image file.

        Args:
            file_bytes: Raw file bytes
            content_type: MIME type (optional, will be detected if not provided).
                Parameters (``; charset=...``) and letter case are ignored.
            preprocess: Whether to apply preprocessing (deskew + denoise)

        Returns:
            OcrResponse with extracted text and metadata. Unsupported types and
            processing errors yield ``success=False`` instead of raising.
        """
        # perf_counter is monotonic, so the reported duration cannot go
        # negative if the wall clock is adjusted mid-request.
        start_time = time.perf_counter()

        # Reduce a caller-supplied header value to its bare media type.
        if content_type:
            content_type = content_type.split(";", 1)[0].strip().lower()

        # Detect file type if not provided
        if not content_type:
            content_type = self._detect_mime_type(file_bytes)

        # Validate file type
        if content_type not in self.SUPPORTED_TYPES:
            logger.warning("Unsupported content type: %s", content_type)
            return self._failure_response(start_time)

        try:
            if content_type in ("image/heic", "image/heif"):
                # Convert HEIC/HEIF to standard format
                file_bytes = self._convert_heic(file_bytes)
            elif content_type == "application/pdf":
                # Handle PDF (extract first page as image)
                file_bytes = self._extract_pdf_first_page(file_bytes)
                if not file_bytes:
                    # Rendering failed; fail fast instead of feeding empty
                    # bytes to the preprocessor and Pillow.
                    logger.error(
                        "OCR extraction failed: PDF page could not be rendered"
                    )
                    return self._failure_response(start_time)

            # Apply preprocessing if enabled
            if preprocess:
                file_bytes = preprocessor.preprocess(
                    file_bytes, deskew=True, denoise=True
                )

            # Perform OCR; the context manager releases the image when done.
            with Image.open(io.BytesIO(file_bytes)) as image:
                ocr_data = pytesseract.image_to_data(
                    image, output_type=pytesseract.Output.DICT
                )

            # Extract text and calculate confidence
            raw_text, confidence = self._process_ocr_data(ocr_data)

            # Detect document type from content
            document_type = self._detect_document_type(raw_text)

            # Extract fields based on document type
            extracted_fields = self._extract_fields(raw_text, document_type)

            processing_time_ms = self._elapsed_ms(start_time)
            logger.info(
                "OCR completed: %d chars, %.2f%% confidence, %dms",
                len(raw_text),
                confidence * 100,
                processing_time_ms,
            )
            return OcrResponse(
                success=True,
                documentType=document_type,
                rawText=raw_text,
                confidence=confidence,
                extractedFields=extracted_fields,
                processingTimeMs=processing_time_ms,
            )
        except Exception as e:
            # Service boundary: log with traceback and report failure to the
            # caller rather than propagating.
            logger.exception("OCR extraction failed: %s", e)
            return self._failure_response(start_time)

    @staticmethod
    def _elapsed_ms(start_time: float) -> int:
        """Return whole milliseconds elapsed since a perf_counter() reading."""
        return int((time.perf_counter() - start_time) * 1000)

    @classmethod
    def _failure_response(cls, start_time: float) -> OcrResponse:
        """Build the empty ``success=False`` response shared by all failure paths."""
        return OcrResponse(
            success=False,
            documentType=DocumentType.UNKNOWN,
            rawText="",
            confidence=0.0,
            extractedFields={},
            processingTimeMs=cls._elapsed_ms(start_time),
        )

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """Detect MIME type using python-magic.

        Falls back to ``application/octet-stream`` when nothing is detected.
        """
        mime = magic.Magic(mime=True)
        detected = mime.from_buffer(file_bytes)
        return detected or "application/octet-stream"

    def _convert_heic(self, heic_bytes: bytes) -> bytes:
        """Convert HEIC/HEIF to PNG format and return the PNG bytes."""
        buffer = io.BytesIO()
        # pillow-heif registers itself with PIL, so we can open HEIC directly
        with Image.open(io.BytesIO(heic_bytes)) as image:
            image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
        """Extract first page of PDF as PNG image.

        Returns:
            PNG bytes of page 1 rendered at 300 dpi, or ``b""`` when pdf2image
            is not installed, rendering fails, or no page is produced.
        """
        try:
            # pdf2image is optional, so it is imported lazily here.
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(
                pdf_bytes, first_page=1, last_page=1, dpi=300
            )
            if images:
                buffer = io.BytesIO()
                images[0].save(buffer, format="PNG")
                return buffer.getvalue()
        except ImportError:
            logger.warning("pdf2image not available, PDF support limited")
        except Exception as e:
            logger.error("PDF extraction failed: %s", e)
        return b""

    def _process_ocr_data(
        self, ocr_data: dict
    ) -> tuple[str, float]:
        """Process Tesseract output to extract text and confidence.

        Args:
            ocr_data: Mapping with parallel ``"text"`` and ``"conf"`` sequences.

        Returns:
            ``(raw_text, confidence)``: the kept words joined by single spaces,
            and their mean confidence normalized to the 0-1 range (``0.0``
            when no word is kept).
        """
        words = []
        confidences = []
        for text, raw_conf in zip(ocr_data["text"], ocr_data["conf"]):
            # Confidence may arrive as an int, a float, or a numeric string
            # such as "96.5"; int() would raise on the latter.
            try:
                conf = float(raw_conf)
            except (TypeError, ValueError):
                continue
            word = text.strip()
            # Filter out empty strings and entries without positive confidence
            if word and conf > 0:
                words.append(word)
                confidences.append(conf)

        raw_text = " ".join(words)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
        # Normalize confidence to 0-1 range (Tesseract returns 0-100)
        return raw_text, avg_confidence / 100.0

    @staticmethod
    def _count_indicators(text_lower: str, indicators: tuple[str, ...]) -> int:
        """Count how many distinct indicators occur in lower-cased text.

        An indicator only counts when it is not preceded by a letter or digit,
        so "subtotal" does not also count as "total" and "moving" does not
        count as "vin". Suffixes are still allowed ("receipts" matches
        "receipt").
        """
        return sum(
            1
            for indicator in indicators
            if re.search(r"(?<![a-z0-9])" + re.escape(indicator), text_lower)
        )

    def _detect_document_type(self, text: str) -> DocumentType:
        """Detect document type from extracted text content.

        Checks run in order: VIN (a keyword plus a 17-character VIN-shaped
        token), receipt (at least two distinct receipt keywords), manual (any
        manual keyword). Anything else is ``DocumentType.UNKNOWN``.
        """
        text_lower = text.lower()

        # A VIN keyword alone is not enough; require a VIN-shaped token too.
        if self._count_indicators(text_lower, self._VIN_INDICATORS) > 0:
            if self._VIN_RE.search(text.upper()):
                return DocumentType.VIN

        if self._count_indicators(text_lower, self._RECEIPT_INDICATORS) >= 2:
            return DocumentType.RECEIPT

        if self._count_indicators(text_lower, self._MANUAL_INDICATORS) > 0:
            return DocumentType.MANUAL

        return DocumentType.UNKNOWN

    def _extract_fields(
        self, text: str, document_type: DocumentType
    ) -> dict[str, ExtractedField]:
        """Extract specific fields based on document type.

        Returns:
            For VIN documents, an optional ``"vin"`` field. For receipts,
            optional ``"total"``, ``"date"`` and ``"gallons"`` fields. Empty
            for every other document type. Confidence values are fixed
            per-field heuristics.
        """
        fields: dict[str, ExtractedField] = {}

        if document_type == DocumentType.VIN:
            # Extract VIN (17 alphanumeric characters, excluding I, O, Q)
            match = self._VIN_RE.search(text.upper())
            if match:
                fields["vin"] = ExtractedField(value=match.group(0), confidence=0.9)

        elif document_type == DocumentType.RECEIPT:
            # Extract amounts (currency patterns)
            amounts = self._AMOUNT_RE.findall(text)
            if amounts:
                # Last amount is often the total
                fields["total"] = ExtractedField(
                    value=f"${amounts[-1]}", confidence=0.7
                )

            # Extract date
            date_match = self._DATE_RE.search(text)
            if date_match:
                fields["date"] = ExtractedField(
                    value=date_match.group(1), confidence=0.8
                )

            # Extract gallons (for fuel receipts)
            gallon_match = self._GALLONS_RE.search(text)
            if gallon_match:
                fields["gallons"] = ExtractedField(
                    value=gallon_match.group(1), confidence=0.85
                )

        return fields
# Singleton instance, created at import time. Constructing it runs
# OcrService.__init__, which sets pytesseract's tesseract_cmd from settings.
ocr_service = OcrService()