diff --git a/ocr/Dockerfile b/ocr/Dockerfile index 8028575..49a89bb 100644 --- a/ocr/Dockerfile +++ b/ocr/Dockerfile @@ -36,7 +36,8 @@ RUN pip install --no-cache-dir -r requirements.txt \ # Pre-download PaddleOCR PP-OCRv4 models during build (not at runtime). # Models are baked into the image so container starts are fast and # no network access is needed at runtime for model download. -RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)" \ +ENV PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK=True +RUN python -c "from paddleocr import PaddleOCR; PaddleOCR(ocr_version='PP-OCRv4', use_textline_orientation=True, lang='en', device='cpu')" \ && echo "PaddleOCR PP-OCRv4 models downloaded and verified" COPY . . diff --git a/ocr/app/engines/paddle_engine.py b/ocr/app/engines/paddle_engine.py index 41433f1..61fb0cf 100644 --- a/ocr/app/engines/paddle_engine.py +++ b/ocr/app/engines/paddle_engine.py @@ -34,12 +34,12 @@ class PaddleOcrEngine(OcrEngine): from paddleocr import PaddleOCR # type: ignore[import-untyped] self._ocr = PaddleOCR( - use_angle_cls=True, + ocr_version="PP-OCRv4", + use_textline_orientation=True, lang="en", - use_gpu=False, - show_log=False, + device="cpu", ) - logger.info("PaddleOCR PP-OCRv4 initialized (CPU, angle_cls=True)") + logger.info("PaddleOCR PP-OCRv4 initialized (CPU, textline_orientation=True)") return self._ocr except ImportError as exc: raise EngineUnavailableError( @@ -54,8 +54,9 @@ class PaddleOcrEngine(OcrEngine): def recognize(self, image_bytes: bytes, config: OcrConfig) -> OcrEngineResult: """Run PaddleOCR on image bytes. - PaddleOCR returns: list of pages, each page is a list of - ``[[box_coords], (text, confidence)]`` entries. + PaddleOCR v3.x ``predict()`` returns an iterator of result objects. + Each result has a ``res`` dict with ``dt_polys``, ``rec_texts``, + and ``rec_scores``. """ ocr = self._get_ocr() @@ -66,10 +67,22 @@ class PaddleOcrEngine(OcrEngine): image = Image.open(io.BytesIO(image_bytes)).convert("RGB") img_array = np.array(image) - # PaddleOCR accepts numpy arrays - results = ocr.ocr(img_array, cls=config.use_angle_cls) + results = list(ocr.predict(img_array)) - if not results or not results[0]: + if not results: + return OcrEngineResult( + text="", + confidence=0.0, + word_boxes=[], + engine_name=self.name, + ) + + res = results[0].res + dt_polys = res.get("dt_polys", []) + rec_texts = res.get("rec_texts", []) + rec_scores = res.get("rec_scores", []) + + if not rec_texts: return OcrEngineResult( text="", confidence=0.0, @@ -81,10 +94,8 @@ class PaddleOcrEngine(OcrEngine): texts: list[str] = [] confidences: list[float] = [] - for line in results[0]: - box_coords = line[0] # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] - text = line[1][0] - conf = float(line[1][1]) + for i, text in enumerate(rec_texts): + conf = float(rec_scores[i]) if i < len(rec_scores) else 0.0 # Apply character whitelist filter if configured if config.char_whitelist: @@ -94,11 +105,16 @@ class PaddleOcrEngine(OcrEngine): if not text.strip(): continue - # Convert quadrilateral to bounding box - xs = [pt[0] for pt in box_coords] - ys = [pt[1] for pt in box_coords] - x_min, y_min = int(min(xs)), int(min(ys)) - x_max, y_max = int(max(xs)), int(max(ys)) + # Convert quadrilateral polygon to bounding box + x_min, y_min, width, height = 0, 0, 0, 0 + if i < len(dt_polys): + poly = dt_polys[i] + xs = [pt[0] for pt in poly] + ys = [pt[1] for pt in poly] + x_min, y_min = int(min(xs)), int(min(ys)) + x_max, y_max = int(max(xs)), int(max(ys)) + width = x_max - x_min + height = y_max - y_min word_boxes.append( WordBox( @@ -106,8 +122,8 @@ class PaddleOcrEngine(OcrEngine): confidence=conf, x=x_min, y=y_min, - width=x_max - x_min, - height=y_max - y_min, + width=width, + height=height, ) ) texts.append(text.strip()) diff --git a/ocr/tests/test_engine_abstraction.py b/ocr/tests/test_engine_abstraction.py index 44c314f..a85f0f5 100644 --- a/ocr/tests/test_engine_abstraction.py +++ b/ocr/tests/test_engine_abstraction.py @@ -41,6 +41,19 @@ def _make_result( ) +def _mock_paddle_result( + dt_polys: list, rec_texts: list[str], rec_scores: list[float] +) -> MagicMock: + """Create a mock PaddleOCR v3.x predict() result object.""" + result = MagicMock() + result.res = { + "dt_polys": dt_polys, + "rec_texts": rec_texts, + "rec_scores": rec_scores, + } + return result + + # --------------------------------------------------------------------------- # Exception hierarchy # --------------------------------------------------------------------------- @@ -182,7 +195,9 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() - mock_ocr.ocr.return_value = [None] + mock_ocr.predict.return_value = iter([ + _mock_paddle_result(dt_polys=[], rec_texts=[], rec_scores=[]) + ]) engine._ocr = mock_ocr result = engine.recognize(_create_test_image_bytes(), OcrConfig()) @@ -196,12 +211,16 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() - mock_ocr.ocr.return_value = [ - [ - [[[10, 20], [110, 20], [110, 50], [10, 50]], ("HELLO", 0.95)], - [[[10, 60], [110, 60], [110, 90], [10, 90]], ("WORLD", 0.88)], - ] - ] + mock_ocr.predict.return_value = iter([ + _mock_paddle_result( + dt_polys=[ + [[10, 20], [110, 20], [110, 50], [10, 50]], + [[10, 60], [110, 60], [110, 90], [10, 90]], + ], + rec_texts=["HELLO", "WORLD"], + rec_scores=[0.95, 0.88], + ) + ]) engine._ocr = mock_ocr result = engine.recognize(_create_test_image_bytes(), OcrConfig()) @@ -218,11 +237,13 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() - mock_ocr.ocr.return_value = [ - [ - [[[0, 0], [100, 0], [100, 30], [0, 30]], ("1HG-BH4!", 0.9)], - ] - ] + mock_ocr.predict.return_value = iter([ + _mock_paddle_result( + dt_polys=[[[0, 0], [100, 0], [100, 30], [0, 30]]], + rec_texts=["1HG-BH4!"], + rec_scores=[0.9], + ) + ]) engine._ocr = mock_ocr config = OcrConfig(char_whitelist="ABCDEFGHJKLMNPRSTUVWXYZ0123456789") @@ -237,11 +258,13 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() # Slightly rotated quad: min x=8, min y=20, max x=110, max y=55 - mock_ocr.ocr.return_value = [ - [ - [[[10, 20], [110, 25], [108, 55], [8, 50]], ("TEXT", 0.9)], - ] - ] + mock_ocr.predict.return_value = iter([ + _mock_paddle_result( + dt_polys=[[[10, 20], [110, 25], [108, 55], [8, 50]]], + rec_texts=["TEXT"], + rec_scores=[0.9], + ) + ]) engine._ocr = mock_ocr result = engine.recognize(_create_test_image_bytes(), OcrConfig()) @@ -257,11 +280,13 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() - mock_ocr.ocr.return_value = [ - [ - [[[0, 0], [50, 0], [50, 20], [0, 20]], ("---", 0.9)], - ] - ] + mock_ocr.predict.return_value = iter([ + _mock_paddle_result( + dt_polys=[[[0, 0], [50, 0], [50, 20], [0, 20]]], + rec_texts=["---"], + rec_scores=[0.9], + ) + ]) engine._ocr = mock_ocr config = OcrConfig(char_whitelist="ABC") @@ -296,7 +321,7 @@ class TestPaddleOcrEngine: engine = PaddleOcrEngine() mock_ocr = MagicMock() - mock_ocr.ocr.side_effect = RuntimeError("OCR crashed") + mock_ocr.predict.side_effect = RuntimeError("OCR crashed") engine._ocr = mock_ocr with pytest.raises(EngineProcessingError, match="PaddleOCR recognition failed"):