feat: add core OCR API integration (refs #65)
All checks were successful
Deploy to Staging / Build Images (pull_request) Successful in 5m59s
Deploy to Staging / Deploy to Staging (pull_request) Successful in 31s
Deploy to Staging / Verify Staging (pull_request) Successful in 2m19s
Deploy to Staging / Notify Staging Ready (pull_request) Successful in 7s
Deploy to Staging / Notify Staging Failure (pull_request) Has been skipped

OCR Service (Python/FastAPI):
- POST /extract for synchronous OCR extraction
- POST /jobs and GET /jobs/{job_id} for async processing
- Image preprocessing (deskew, denoise) for accuracy
- HEIC conversion via pillow-heif
- Redis job queue for async processing

Backend (Fastify):
- POST /api/ocr/extract - authenticated proxy to OCR
- POST /api/ocr/jobs - async job submission
- GET /api/ocr/jobs/:jobId - job polling
- Multipart file upload handling
- JWT authentication required

File size limits: 10MB sync, 200MB async
Processing time target: <3 seconds for typical photos

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Eric Gullickson
2026-02-01 16:02:11 -06:00
parent 94e49306dc
commit 852c9013b5
25 changed files with 1931 additions and 3 deletions

View File

@@ -0,0 +1,6 @@
"""OCR service layer."""
from .job_queue import job_queue
from .ocr_service import ocr_service
from .preprocessor import preprocessor
__all__ = ["job_queue", "ocr_service", "preprocessor"]

View File

@@ -0,0 +1,233 @@
"""Redis-based job queue for async OCR processing."""
import asyncio
import json
import logging
import uuid
from typing import Optional
import redis.asyncio as redis
from app.config import settings
from app.models import JobResponse, JobStatus, OcrResponse
logger = logging.getLogger(__name__)
# Job TTL in seconds (1 hour)
JOB_TTL = 3600
# Key prefixes
JOB_PREFIX = "ocr:job:"
JOB_DATA_PREFIX = "ocr:job:data:"
JOB_RESULT_PREFIX = "ocr:job:result:"
class JobQueue:
    """Manages async OCR jobs using Redis.

    Job state lives under three key families, all expiring after ``JOB_TTL``
    seconds:

    - ``ocr:job:<id>``        -- hash of status / progress / metadata
    - ``ocr:job:data:<id>``   -- raw uploaded file bytes (deleted once processed)
    - ``ocr:job:result:<id>`` -- JSON-serialized ``OcrResponse``
    """

    def __init__(self) -> None:
        """Initialize job queue."""
        # Connection is created lazily on first use; see get_redis().
        self._redis: Optional[redis.Redis] = None

    async def get_redis(self) -> redis.Redis:
        """Get or create Redis connection."""
        if self._redis is None:
            # decode_responses=True: this client yields str values, which is
            # what the metadata/result accessors expect. Binary file payloads
            # are read through a separate non-decoding client in get_job_data().
            self._redis = redis.Redis(
                host=settings.redis_host,
                port=settings.redis_port,
                db=settings.redis_db,
                decode_responses=True,
            )
        return self._redis

    async def close(self) -> None:
        """Close Redis connection."""
        # NOTE(review): redis-py >= 5 deprecates close() in favor of
        # aclose() -- confirm against the pinned redis version.
        if self._redis:
            await self._redis.close()
            self._redis = None

    async def submit_job(
        self,
        file_bytes: bytes,
        content_type: str,
        callback_url: Optional[str] = None,
    ) -> str:
        """
        Submit a new OCR job.

        Args:
            file_bytes: Raw file bytes to process
            content_type: MIME type of the file
            callback_url: Optional URL to call when job completes

        Returns:
            Job ID (a random UUID4 string)
        """
        r = await self.get_redis()
        job_id = str(uuid.uuid4())
        # Job metadata hash; empty string stands in for "no callback" because
        # Redis hashes cannot store None.
        job_meta = {
            "status": JobStatus.PENDING.value,
            "progress": 0,
            "content_type": content_type,
            "callback_url": callback_url or "",
        }
        # File bytes are stored under a separate key so the metadata hash
        # stays small and the payload can be deleted independently.
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        meta_key = f"{JOB_PREFIX}{job_id}"
        # Pipeline so metadata + payload + TTLs land in one round trip.
        # NOTE(review): awaiting buffered pipeline commands relies on
        # version-specific redis.asyncio behavior (commands queue and return
        # the pipeline); confirm against the pinned redis-py version.
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping=job_meta)  # type: ignore
            await pipe.expire(meta_key, JOB_TTL)
            await pipe.set(data_key, file_bytes)
            await pipe.expire(data_key, JOB_TTL)
            await pipe.execute()
        logger.info(f"Job {job_id} submitted")
        return job_id

    async def get_job_status(self, job_id: str) -> Optional[JobResponse]:
        """
        Get the status of a job.

        Args:
            job_id: Job ID to check

        Returns:
            JobResponse or None if job doesn't exist (never submitted or
            expired past JOB_TTL)
        """
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        # An empty hash means the key is missing -- unknown/expired job.
        meta = await r.hgetall(meta_key)  # type: ignore
        if not meta:
            return None
        status = JobStatus(meta.get("status", JobStatus.PENDING.value))
        progress = int(meta.get("progress", 0))
        error = meta.get("error")
        # The stored OcrResponse JSON is only fetched once the job completed.
        result = None
        if status == JobStatus.COMPLETED:
            result_json = await r.get(result_key)
            if result_json:
                result_dict = json.loads(result_json)
                result = OcrResponse(**result_dict)
        # progress/error are surfaced only in the states where they are
        # meaningful (PROCESSING / FAILED respectively).
        return JobResponse(
            jobId=job_id,
            status=status,
            progress=progress if status == JobStatus.PROCESSING else None,
            result=result,
            error=error if status == JobStatus.FAILED else None,
        )

    async def update_job_progress(self, job_id: str, progress: int) -> None:
        """Update job progress percentage and mark the job as PROCESSING."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        await r.hset(meta_key, mapping={  # type: ignore
            "status": JobStatus.PROCESSING.value,
            "progress": progress,
        })

    async def complete_job(self, job_id: str, result: OcrResponse) -> None:
        """Mark job as completed, store the result, and fire any callback."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        result_key = f"{JOB_RESULT_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        # by_alias=True keeps the stored JSON keys consistent with the API's
        # camelCase schema, matching what get_job_status re-hydrates.
        result_dict = result.model_dump(by_alias=True)
        result_json = json.dumps(result_dict)
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.COMPLETED.value,
                "progress": 100,
            })
            await pipe.set(result_key, result_json)
            await pipe.expire(result_key, JOB_TTL)
            # The uploaded file is no longer needed once a result exists.
            await pipe.delete(data_key)
            await pipe.execute()
        logger.info(f"Job {job_id} completed")
        # Trigger the optional completion callback configured at submit time.
        meta = await r.hgetall(meta_key)  # type: ignore
        callback_url = meta.get("callback_url")
        if callback_url:
            # Fire-and-forget callback (don't block job completion).
            # NOTE(review): the task reference is not retained, so asyncio may
            # garbage-collect it before it runs -- consider keeping a reference.
            asyncio.create_task(self._send_callback(callback_url, job_id, result_dict))

    async def fail_job(self, job_id: str, error: str) -> None:
        """Mark job as failed with error message and drop the file payload."""
        r = await self.get_redis()
        meta_key = f"{JOB_PREFIX}{job_id}"
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        async with r.pipeline() as pipe:
            await pipe.hset(meta_key, mapping={  # type: ignore
                "status": JobStatus.FAILED.value,
                "error": error,
            })
            # Failed jobs will not be retried from stored bytes, so free them.
            await pipe.delete(data_key)
            await pipe.execute()
        logger.error(f"Job {job_id} failed: {error}")

    async def get_job_data(self, job_id: str) -> Optional[bytes]:
        """Get the raw file bytes for a job, or None if absent/expired."""
        r = await self.get_redis()
        data_key = f"{JOB_DATA_PREFIX}{job_id}"
        # A dedicated non-decoding connection is opened per call because the
        # shared client has decode_responses=True and would corrupt binary
        # data. NOTE(review): this adds a connect/teardown per fetch --
        # consider caching a second raw client if this path gets hot.
        raw_redis = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=False,
        )
        try:
            data = await raw_redis.get(data_key)
            return data  # type: ignore
        finally:
            await raw_redis.close()

    async def _send_callback(
        self, url: str, job_id: str, result: dict
    ) -> None:
        """POST a completion notification to the job's callback URL.

        Errors are logged and swallowed: a broken callback endpoint must not
        affect the completed job.
        """
        try:
            # Imported lazily so httpx is only required when callbacks are used.
            import httpx
            async with httpx.AsyncClient(timeout=10.0) as client:
                await client.post(
                    url,
                    json={"jobId": job_id, "result": result},
                )
            logger.info(f"Callback sent for job {job_id}")
        except Exception as e:
            logger.error(f"Callback failed for job {job_id}: {e}")
# Module-level singleton shared by the API routes and the worker loop.
job_queue = JobQueue()

View File

@@ -0,0 +1,275 @@
"""Core OCR service using Tesseract with HEIC support."""
import io
import logging
import re
import time
from typing import Optional

import magic
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener

from app.config import settings
from app.models import DocumentType, ExtractedField, OcrResponse
from app.services.preprocessor import preprocessor
# Register HEIF/HEIC opener with Pillow
register_heif_opener()
logger = logging.getLogger(__name__)
class OcrService:
"""Core OCR processing service."""
# Supported MIME types
SUPPORTED_TYPES = {
"image/jpeg",
"image/png",
"image/heic",
"image/heif",
"application/pdf",
}
def __init__(self) -> None:
"""Initialize OCR service."""
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
def extract(
self,
file_bytes: bytes,
content_type: Optional[str] = None,
preprocess: bool = True,
) -> OcrResponse:
"""
Extract text from an image file.
Args:
file_bytes: Raw file bytes
content_type: MIME type (optional, will be detected if not provided)
preprocess: Whether to apply preprocessing
Returns:
OcrResponse with extracted text and metadata
"""
start_time = time.time()
# Detect file type if not provided
if not content_type:
content_type = self._detect_mime_type(file_bytes)
# Validate file type
if content_type not in self.SUPPORTED_TYPES:
return OcrResponse(
success=False,
documentType=DocumentType.UNKNOWN,
rawText="",
confidence=0.0,
extractedFields={},
processingTimeMs=int((time.time() - start_time) * 1000),
)
try:
# Convert HEIC/HEIF to standard format
if content_type in ("image/heic", "image/heif"):
file_bytes = self._convert_heic(file_bytes)
content_type = "image/png"
# Handle PDF (extract first page as image)
if content_type == "application/pdf":
file_bytes = self._extract_pdf_first_page(file_bytes)
content_type = "image/png"
# Apply preprocessing if enabled
if preprocess:
file_bytes = preprocessor.preprocess(
file_bytes, deskew=True, denoise=True
)
# Perform OCR
image = Image.open(io.BytesIO(file_bytes))
ocr_data = pytesseract.image_to_data(
image, output_type=pytesseract.Output.DICT
)
# Extract text and calculate confidence
raw_text, confidence = self._process_ocr_data(ocr_data)
# Detect document type from content
document_type = self._detect_document_type(raw_text)
# Extract fields based on document type
extracted_fields = self._extract_fields(raw_text, document_type)
processing_time_ms = int((time.time() - start_time) * 1000)
logger.info(
f"OCR completed: {len(raw_text)} chars, "
f"{confidence:.2%} confidence, {processing_time_ms}ms"
)
return OcrResponse(
success=True,
documentType=document_type,
rawText=raw_text,
confidence=confidence,
extractedFields=extracted_fields,
processingTimeMs=processing_time_ms,
)
except Exception as e:
logger.error(f"OCR extraction failed: {e}", exc_info=True)
return OcrResponse(
success=False,
documentType=DocumentType.UNKNOWN,
rawText="",
confidence=0.0,
extractedFields={},
processingTimeMs=int((time.time() - start_time) * 1000),
)
def _detect_mime_type(self, file_bytes: bytes) -> str:
"""Detect MIME type using python-magic."""
mime = magic.Magic(mime=True)
detected = mime.from_buffer(file_bytes)
return detected or "application/octet-stream"
def _convert_heic(self, heic_bytes: bytes) -> bytes:
"""Convert HEIC/HEIF to PNG format."""
# pillow-heif registers itself with PIL, so we can open HEIC directly
image = Image.open(io.BytesIO(heic_bytes))
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return buffer.getvalue()
def _extract_pdf_first_page(self, pdf_bytes: bytes) -> bytes:
"""Extract first page of PDF as PNG image."""
try:
# Use pdf2image if available, otherwise return empty
from pdf2image import convert_from_bytes
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=300)
if images:
buffer = io.BytesIO()
images[0].save(buffer, format="PNG")
return buffer.getvalue()
except ImportError:
logger.warning("pdf2image not available, PDF support limited")
except Exception as e:
logger.error(f"PDF extraction failed: {e}")
return b""
def _process_ocr_data(
self, ocr_data: dict
) -> tuple[str, float]:
"""Process Tesseract output to extract text and confidence."""
words = []
confidences = []
for i, text in enumerate(ocr_data["text"]):
# Filter out empty strings and low-confidence results
conf = int(ocr_data["conf"][i])
if text.strip() and conf > 0:
words.append(text)
confidences.append(conf)
raw_text = " ".join(words)
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Normalize confidence to 0-1 range (Tesseract returns 0-100)
return raw_text, avg_confidence / 100.0
def _detect_document_type(self, text: str) -> DocumentType:
"""Detect document type from extracted text content."""
text_lower = text.lower()
# VIN document indicators
vin_indicators = [
"vin",
"vehicle identification",
"title",
"registration",
"certificate of title",
]
if any(indicator in text_lower for indicator in vin_indicators):
# Additional check: look for 17-character alphanumeric sequences
import re
vin_pattern = r"\b[A-HJ-NPR-Z0-9]{17}\b"
if re.search(vin_pattern, text.upper()):
return DocumentType.VIN
# Receipt indicators
receipt_indicators = [
"receipt",
"total",
"subtotal",
"tax",
"payment",
"invoice",
"amount due",
"gallons",
"price/gallon",
]
if sum(1 for ind in receipt_indicators if ind in text_lower) >= 2:
return DocumentType.RECEIPT
# Manual indicators
manual_indicators = [
"owner's manual",
"maintenance schedule",
"service interval",
"chapter",
"table of contents",
"specifications",
]
if any(indicator in text_lower for indicator in manual_indicators):
return DocumentType.MANUAL
return DocumentType.UNKNOWN
def _extract_fields(
self, text: str, document_type: DocumentType
) -> dict[str, ExtractedField]:
"""Extract specific fields based on document type."""
import re
fields: dict[str, ExtractedField] = {}
if document_type == DocumentType.VIN:
# Extract VIN (17 alphanumeric characters, excluding I, O, Q)
vin_pattern = r"\b([A-HJ-NPR-Z0-9]{17})\b"
match = re.search(vin_pattern, text.upper())
if match:
fields["vin"] = ExtractedField(value=match.group(1), confidence=0.9)
elif document_type == DocumentType.RECEIPT:
# Extract amounts (currency patterns)
amount_pattern = r"\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)"
amounts = re.findall(amount_pattern, text)
if amounts:
# Last amount is often the total
fields["total"] = ExtractedField(
value=f"${amounts[-1]}", confidence=0.7
)
# Extract date
date_pattern = r"(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})"
date_match = re.search(date_pattern, text)
if date_match:
fields["date"] = ExtractedField(value=date_match.group(1), confidence=0.8)
# Extract gallons (for fuel receipts)
gallon_pattern = r"(\d+\.?\d*)\s*(?:gal|gallons)"
gallon_match = re.search(gallon_pattern, text.lower())
if gallon_match:
fields["gallons"] = ExtractedField(
value=gallon_match.group(1), confidence=0.85
)
return fields
# Module-level singleton used by the API route handlers and the job worker.
ocr_service = OcrService()

View File

@@ -0,0 +1,176 @@
"""Image preprocessing service for OCR accuracy improvement."""
import io
import logging
from typing import Optional
import cv2
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class ImagePreprocessor:
    """Handles image preprocessing for improved OCR accuracy.

    The pipeline converts the input to grayscale, then optionally denoises,
    deskews, and binarizes. Every step degrades gracefully: on an OpenCV
    error the unmodified image is passed through.
    """

    def preprocess(
        self,
        image_bytes: bytes,
        deskew: bool = True,
        denoise: bool = True,
        binarize: bool = False,
    ) -> bytes:
        """
        Apply preprocessing to an image for better OCR results.

        Args:
            image_bytes: Raw image bytes
            deskew: Whether to correct image rotation
            denoise: Whether to apply noise reduction
            binarize: Whether to convert to black and white

        Returns:
            Preprocessed grayscale image encoded as PNG bytes
        """
        # Decode via PIL, then hand off to OpenCV as a numpy array.
        # NOTE(review): EXIF orientation is not applied here -- confirm
        # whether rotated phone photos are normalized upstream.
        pil_image = Image.open(io.BytesIO(image_bytes))
        # Convert to RGB if necessary (handles RGBA, palette, CMYK, etc.);
        # "L" (grayscale) is already fine as-is.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")
        cv_image = np.array(pil_image)
        # RGB -> BGR: OpenCV's channel order (only for 3-channel images).
        if len(cv_image.shape) == 3:
            cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
        # All downstream steps operate on a single-channel image.
        if len(cv_image.shape) == 3:
            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = cv_image
        # Denoise before deskewing so edge detection sees cleaner input.
        if denoise:
            gray = self._denoise(gray)
        if deskew:
            gray = self._deskew(gray)
        # Binarization is opt-in: it helps some documents but can hurt photos.
        if binarize:
            gray = self._binarize(gray)
        # Re-encode the processed grayscale array as PNG.
        result_image = Image.fromarray(gray)
        buffer = io.BytesIO()
        result_image.save(buffer, format="PNG")
        return buffer.getvalue()

    def _denoise(self, image: np.ndarray) -> np.ndarray:
        """Apply noise reduction using non-local means denoising.

        Falls back to the unmodified image if OpenCV rejects the input.
        """
        try:
            # fastNlMeansDenoising is effective for grayscale images;
            # h=10 is a moderate filter strength.
            return cv2.fastNlMeansDenoising(image, h=10, templateWindowSize=7, searchWindowSize=21)
        except cv2.error as e:
            logger.warning(f"Denoising failed: {e}")
            return image

    def _deskew(self, image: np.ndarray) -> np.ndarray:
        """Correct image rotation using Hough transform line detection.

        Estimates the skew as the median angle of near-horizontal detected
        lines and rotates to compensate. Returns the input unchanged when no
        usable lines are found, the skew is negligible (< 0.5 deg), or it is
        implausibly large (> 15 deg, likely a mis-detection).
        """
        try:
            # Edge map feeds the probabilistic Hough transform.
            edges = cv2.Canny(image, 50, 150, apertureSize=3)
            lines = cv2.HoughLinesP(
                edges,
                rho=1,
                theta=np.pi / 180,
                threshold=100,
                minLineLength=100,
                maxLineGap=10,
            )
            if lines is None:
                return image
            # Collect the angle of each detected segment.
            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                if x2 - x1 != 0:  # Avoid division by zero
                    angle = np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi
                    # Only consider nearly horizontal lines (within 45 degrees)
                    # -- text baselines, not vertical rules.
                    if -45 < angle < 45:
                        angles.append(angle)
            if not angles:
                return image
            # Median is robust against outlier segments.
            median_angle = np.median(angles)
            # Only correct if skew is significant but not too extreme.
            if abs(median_angle) < 0.5 or abs(median_angle) > 15:
                return image
            # Rotate about the center to correct the skew.
            # NOTE(review): rotation sign assumes image-coordinate angles
            # (y down) from HoughLinesP -- verify with a known-skewed sample.
            height, width = image.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
            # Expand the canvas so the rotated image is not cropped, and
            # shift the transform to re-center it.
            cos_val = abs(rotation_matrix[0, 0])
            sin_val = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_val + width * cos_val)
            new_height = int(height * cos_val + width * sin_val)
            rotation_matrix[0, 2] += (new_width - width) / 2
            rotation_matrix[1, 2] += (new_height - height) / 2
            rotated = cv2.warpAffine(
                image,
                rotation_matrix,
                (new_width, new_height),
                # Replicate edge pixels instead of introducing black borders
                # that could confuse OCR.
                borderMode=cv2.BORDER_REPLICATE,
            )
            logger.debug(f"Deskewed image by {median_angle:.2f} degrees")
            return rotated
        except Exception as e:
            logger.warning(f"Deskewing failed: {e}")
            return image

    def _binarize(self, image: np.ndarray) -> np.ndarray:
        """Convert to binary (black and white) using adaptive thresholding.

        Adaptive (per-neighborhood) thresholding tolerates uneven lighting
        better than a single global threshold. Falls back to the input on
        OpenCV errors.
        """
        try:
            return cv2.adaptiveThreshold(
                image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                blockSize=11,
                C=2,
            )
        except cv2.error as e:
            logger.warning(f"Binarization failed: {e}")
            return image

    def get_image_info(self, image_bytes: bytes) -> dict:
        """Get basic information (dimensions, mode, format) about an image."""
        pil_image = Image.open(io.BytesIO(image_bytes))
        return {
            "width": pil_image.width,
            "height": pil_image.height,
            "mode": pil_image.mode,
            "format": pil_image.format,
        }
# Module-level singleton consumed by OcrService.extract().
preprocessor = ImagePreprocessor()