fix: VIN OCR scanning fails with "No VIN Pattern found" on all images (#113) #114

Merged
egullickson merged 15 commits from issue-113-fix-vin-ocr-scanning into main 2026-02-07 15:47:37 +00:00
Showing only changes of commit a07ec324fe - Show all commits

View File

@@ -115,6 +115,7 @@ class VinPreprocessor:
# Apply adaptive thresholding
if apply_threshold:
gray = self._adaptive_threshold(gray)
gray = self._morphological_cleanup(gray)
steps_applied.append("threshold")
# Convert back to PNG bytes
@@ -152,43 +153,34 @@ class VinPreprocessor:
def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
    """
    Compute a grayscale image that maximizes text-to-background contrast.

    Uses per-pixel minimum across B, G, R channels. White text has
    min(255,255,255) = 255 regardless of channel, while any colored
    background has a low value in at least one channel (e.g. green
    sticker: min(130,230,150) = 130). This gives ~125 units of
    contrast vs ~60 from standard grayscale.

    Falls back to standard grayscale when the min-channel doesn't
    improve contrast (i.e. for already-neutral/gray images).

    Args:
        bgr_image: 3-channel image in OpenCV BGR channel order.

    Returns:
        Single-channel uint8 image: either the per-pixel channel
        minimum or the standard grayscale conversion, whichever has
        the higher standard deviation (contrast proxy).
    """
    b_channel, g_channel, r_channel = cv2.split(bgr_image)

    # Per-pixel minimum across all three channels: dark wherever ANY
    # channel is dark, so colored backgrounds drop while white text stays.
    min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
    min_std = float(np.std(min_channel))

    gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
    gray_std = float(np.std(gray))

    # Use min-channel when it provides meaningfully more contrast
    # (10% margin avoids flip-flopping on near-neutral images).
    if min_std > gray_std * 1.1:
        logger.debug(
            "Using min-channel (std=%.1f) over grayscale (std=%.1f)",
            min_std, gray_std,
        )
        return min_channel
    return gray
def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
"""
@@ -309,6 +301,20 @@ class VinPreprocessor:
logger.warning(f"Adaptive threshold failed: {e}")
return image
def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray:
    """
    Strip small noise speckles from a thresholded binary image.

    A morphological opening (erosion followed by dilation) with a tiny
    2x2 rectangular kernel erases isolated pixels and hair-thin
    artifacts while leaving full-size text glyphs intact. On any
    OpenCV failure the input is returned unchanged (best effort).
    """
    try:
        open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        cleaned = cv2.morphologyEx(image, cv2.MORPH_OPEN, open_kernel)
    except cv2.error as e:
        logger.warning(f"Morphological cleanup failed: {e}")
        return image
    return cleaned
def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
"""
Apply Otsu's thresholding for binarization.
@@ -361,6 +367,7 @@ class VinPreprocessor:
steps_applied.append("denoise")
gray = self._otsu_threshold(gray)
gray = self._morphological_cleanup(gray)
steps_applied.append("otsu_threshold")
result_image = Image.fromarray(gray)