feat: Improve OCR process - replace Tesseract with PaddleOCR (#115) #122
@@ -56,8 +56,9 @@ class PaddleOcrEngine(OcrEngine):
|
|||||||
"""Run PaddleOCR on image bytes.
|
"""Run PaddleOCR on image bytes.
|
||||||
|
|
||||||
PaddleOCR v3.x ``predict()`` returns an iterator of result objects.
|
PaddleOCR v3.x ``predict()`` returns an iterator of result objects.
|
||||||
Each result has a ``res`` dict with ``dt_polys``, ``rec_texts``,
|
Each result's ``.json`` property returns a dict. The OCR fields
|
||||||
and ``rec_scores``.
|
(``dt_polys``, ``rec_texts``, ``rec_scores``) may be at the top
|
||||||
|
level or nested under a ``"res"`` key depending on the version.
|
||||||
"""
|
"""
|
||||||
ocr = self._get_ocr()
|
ocr = self._get_ocr()
|
||||||
|
|
||||||
@@ -78,7 +79,13 @@ class PaddleOcrEngine(OcrEngine):
|
|||||||
engine_name=self.name,
|
engine_name=self.name,
|
||||||
)
|
)
|
||||||
|
|
||||||
res = results[0].json
|
raw = results[0].json
|
||||||
|
# Unwrap nested "res" key if present (save_to_json format)
|
||||||
|
res = raw.get("res", raw) if isinstance(raw, dict) else raw
|
||||||
|
logger.debug(
|
||||||
|
"PaddleOCR result keys: %s",
|
||||||
|
list(res.keys()) if isinstance(res, dict) else type(res).__name__,
|
||||||
|
)
|
||||||
dt_polys = res.get("dt_polys", [])
|
dt_polys = res.get("dt_polys", [])
|
||||||
rec_texts = res.get("rec_texts", [])
|
rec_texts = res.get("rec_texts", [])
|
||||||
rec_scores = res.get("rec_scores", [])
|
rec_scores = res.get("rec_scores", [])
|
||||||
|
|||||||
@@ -44,12 +44,17 @@ def _make_result(
|
|||||||
def _mock_paddle_result(
|
def _mock_paddle_result(
|
||||||
dt_polys: list, rec_texts: list[str], rec_scores: list[float]
|
dt_polys: list, rec_texts: list[str], rec_scores: list[float]
|
||||||
) -> MagicMock:
|
) -> MagicMock:
|
||||||
"""Create a mock PaddleOCR v3.x predict() result object."""
|
"""Create a mock PaddleOCR v3.x predict() result object.
|
||||||
|
|
||||||
|
Wraps data under ``"res"`` key to match save_to_json format.
|
||||||
|
"""
|
||||||
result = MagicMock()
|
result = MagicMock()
|
||||||
result.json = {
|
result.json = {
|
||||||
"dt_polys": dt_polys,
|
"res": {
|
||||||
"rec_texts": rec_texts,
|
"dt_polys": dt_polys,
|
||||||
"rec_scores": rec_scores,
|
"rec_texts": rec_texts,
|
||||||
|
"rec_scores": rec_scores,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user