diff --git a/ocr/app/preprocessors/vin_preprocessor.py b/ocr/app/preprocessors/vin_preprocessor.py
index aee5a86..cd9b388 100644
--- a/ocr/app/preprocessors/vin_preprocessor.py
+++ b/ocr/app/preprocessors/vin_preprocessor.py
@@ -115,6 +115,7 @@ class VinPreprocessor:
         # Apply adaptive thresholding
         if apply_threshold:
             gray = self._adaptive_threshold(gray)
+            gray = self._morphological_cleanup(gray)
             steps_applied.append("threshold")
 
         # Convert back to PNG bytes
@@ -152,43 +153,34 @@ class VinPreprocessor:
 
     def _best_contrast_channel(self, bgr_image: np.ndarray) -> np.ndarray:
         """
-        Select the single color channel with the highest contrast.
+        Compute a grayscale image that maximizes text-to-background contrast.
 
-        Standard grayscale conversion (0.299R + 0.587G + 0.114B) averages
-        channels, which destroys contrast when text and background differ
-        primarily in one channel. For example, white text on a green VIN
-        sticker has almost identical luminance, but the blue and red channels
-        show strong contrast.
+        Uses per-pixel minimum across B, G, R channels. White text has
+        min(255,255,255) = 255 regardless of channel, while any colored
+        background has a low value in at least one channel (e.g. green
+        sticker: min(130,230,150) = 130). This gives ~125 units of
+        contrast vs ~60 from standard grayscale.
 
-        This method evaluates each BGR channel by its standard deviation
-        (a proxy for contrast) and returns the one with the highest value.
-        Falls back to standard grayscale when all channels are similar.
+        Falls back to standard grayscale when the min-channel doesn't
+        improve contrast (i.e. for already-neutral/gray images).
         """
         b_channel, g_channel, r_channel = cv2.split(bgr_image)
-        stds = [
-            float(np.std(b_channel)),
-            float(np.std(g_channel)),
-            float(np.std(r_channel)),
-        ]
-        channels = [b_channel, g_channel, r_channel]
-        channel_names = ["blue", "green", "red"]
+        min_channel = np.minimum(np.minimum(b_channel, g_channel), r_channel)
 
-        best_idx = int(np.argmax(stds))
-        max_std = stds[best_idx]
-        min_std = min(stds)
+        min_std = float(np.std(min_channel))
+        gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
+        gray_std = float(np.std(gray))
 
-        # Only use single-channel extraction when one channel is notably
-        # better (>20% higher std than the weakest). Otherwise, standard
-        # grayscale is fine and more robust for neutral-colored images.
-        if max_std > 0 and (max_std - min_std) / max_std > 0.20:
+        # Use min-channel when it provides meaningfully more contrast
+        if min_std > gray_std * 1.1:
             logger.debug(
-                "Using %s channel (std=%.1f) over grayscale (stds: B=%.1f G=%.1f R=%.1f)",
-                channel_names[best_idx], max_std, stds[0], stds[1], stds[2],
+                "Using min-channel (std=%.1f) over grayscale (std=%.1f)",
+                min_std, gray_std,
             )
-            return channels[best_idx]
+            return min_channel
 
-        return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
+        return gray
 
     def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
         """
@@ -309,6 +301,20 @@ class VinPreprocessor:
             logger.warning(f"Adaptive threshold failed: {e}")
             return image
 
+    def _morphological_cleanup(self, image: np.ndarray) -> np.ndarray:
+        """
+        Remove small noise artifacts from a thresholded binary image.
+
+        Morphological opening (erosion then dilation) removes isolated
+        pixels and thin noise lines while preserving larger text characters.
+        """
+        try:
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+            return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
+        except cv2.error as e:
+            logger.warning(f"Morphological cleanup failed: {e}")
+            return image
+
     def _otsu_threshold(self, image: np.ndarray) -> np.ndarray:
         """
         Apply Otsu's thresholding for binarization.
@@ -361,6 +367,7 @@ class VinPreprocessor:
             steps_applied.append("denoise")
 
         gray = self._otsu_threshold(gray)
+        gray = self._morphological_cleanup(gray)
         steps_applied.append("otsu_threshold")
 
         result_image = Image.fromarray(gray)