motovaultpro/ocr/app/table_extraction/parser.py

"""Parse maintenance schedule tables into structured data."""
import logging
import re
from dataclasses import dataclass, field
from typing import Optional

from app.patterns.maintenance_patterns import maintenance_matcher
from app.patterns.service_mapping import service_mapper

logger = logging.getLogger(__name__)


@dataclass
class ParsedScheduleRow:
    """One maintenance schedule entry produced by TableParser.

    Holds the service text as found in the source, the result of mapping it
    to a normalized service, whatever mileage/time intervals were extracted,
    and the raw cells the entry was built from.
    """

    # Service text taken from the source (a table cell or a text line).
    service: str
    # Normalized name reported by the service mapper; None when nothing matched.
    normalized_service: Optional[str]
    # Subtypes reported by the service mapper; empty when nothing matched.
    subtypes: list[str]
    # Extracted mileage interval; None when none was found.
    interval_miles: Optional[int]
    # Extracted time interval; None when none was found.
    interval_months: Optional[int]
    # Text of a details/notes column; None when the row has no such cell.
    details: Optional[str]
    # Fluid specification extracted from the row text; None when none was found.
    fluid_spec: Optional[str]
    # Confidence score computed by the parsing path that produced the row.
    confidence: float
    # Source cells the entry was built from (a one-element list holding the
    # line for text-block parsing). Defaults to a fresh empty list.
    raw_row: list[str] = field(default_factory=list)


class TableParser:
    """Parse detected tables into maintenance schedules.

    Handles various table formats:
    - Service | Miles | Months | Notes
    - Service | Interval | Description
    - Miles/Months header with service rows
    """

    # Common column header patterns
    COLUMN_PATTERNS = {
        "service": [
            r"service", r"item", r"maintenance", r"operation",
            r"component", r"part", r"system", r"description",
        ],
        "miles": [
            r"miles?", r"mi\.?", r"mileage", r"odometer",
            r"km", r"kilometers?",
        ],
        "months": [
            r"months?", r"mo\.?", r"time", r"interval",
            r"years?", r"yr\.?",
        ],
        "details": [
            r"notes?", r"details?", r"remarks?", r"comments?",
            r"specification", r"specs?", r"procedure",
        ],
    }

    def parse_table(
        self,
        header_row: list[str],
        data_rows: list[list[str]],
    ) -> list[ParsedScheduleRow]:
        """
        Parse a maintenance table into structured schedule rows.

        Args:
            header_row: Table header cells
            data_rows: Table data rows

        Returns:
            List of ParsedScheduleRow objects
        """
        # Identify column types
        column_types = self._identify_columns(header_row)

        if not column_types:
            logger.warning("Could not identify table columns")
            return self._parse_without_headers(data_rows)

        results = []

        for row in data_rows:
            parsed = self._parse_row(row, column_types)
            if parsed:
                results.append(parsed)

        return results

    def parse_text_block(self, text: str) -> list[ParsedScheduleRow]:
        """
        Extract schedule entries from free-form text, one line at a time.

        Intended as a fallback when table detection fails but the text still
        contains schedule information. A line produces an entry only when it
        maps to a service AND carries a mileage or time interval.

        Args:
            text: Text block that may contain maintenance schedules

        Returns:
            List of ParsedScheduleRow objects, in line order
        """
        schedule_rows: list[ParsedScheduleRow] = []

        for raw_line in text.split("\n"):
            mapped = service_mapper.map_service(raw_line)
            miles = maintenance_matcher.extract_mileage_interval(raw_line)
            months = maintenance_matcher.extract_time_interval(raw_line)

            # A usable line needs a service plus at least one interval.
            if not mapped or not (miles or months):
                continue

            fluid = maintenance_matcher.extract_fluid_spec(raw_line)

            # A missing interval contributes a neutral 1.0 so that it does
            # not lower the overall score.
            miles_confidence = miles.confidence if miles else 1.0
            months_confidence = months.confidence if months else 1.0

            entry = ParsedScheduleRow(
                service=raw_line.strip(),
                normalized_service=mapped.normalized_name,
                subtypes=mapped.subtypes,
                interval_miles=miles.value if miles else None,
                interval_months=months.value if months else None,
                details=None,
                fluid_spec=fluid.value if fluid else None,
                confidence=min(
                    mapped.confidence, miles_confidence, months_confidence
                ),
                raw_row=[raw_line],
            )
            schedule_rows.append(entry)

        return schedule_rows

    def _identify_columns(
        self, header_row: list[str]
    ) -> dict[int, str]:
        """
        Identify column types from header row.

        Args:
            header_row: Table header cells

        Returns:
            Dict mapping column index to type
        """
        column_types: dict[int, str] = {}

        for i, header in enumerate(header_row):
            header_lower = header.lower().strip()

            for col_type, patterns in self.COLUMN_PATTERNS.items():
                for pattern in patterns:
                    if re.search(pattern, header_lower, re.IGNORECASE):
                        column_types[i] = col_type
                        break
                if i in column_types:
                    break

        # If no service column found, assume first column
        if "service" not in column_types.values() and header_row:
            for i, header in enumerate(header_row):
                if i not in column_types:
                    column_types[i] = "service"
                    break

        return column_types

    def _parse_row(
        self,
        row: list[str],
        column_types: dict[int, str],
    ) -> Optional[ParsedScheduleRow]:
        """
        Build a schedule entry from one data row.

        Cells are first read according to ``column_types``. Any interval
        still missing afterwards is looked for in the service text and then
        in every cell of the row.

        Args:
            row: Table row cells
            column_types: Column index to type mapping

        Returns:
            ParsedScheduleRow, or None when the row has no service text
        """
        service_text = ""
        miles_value: Optional[int] = None
        months_value: Optional[int] = None
        notes: Optional[str] = None
        fluid: Optional[str] = None

        # Pass 1: read every non-blank cell according to its column role.
        for position, raw_cell in enumerate(row):
            text = raw_cell.strip()
            if not text:
                continue

            role = column_types.get(position)
            if role == "service":
                service_text = text
            elif role == "miles":
                # Keep the previous value when this cell yields nothing.
                miles_value = self._extract_miles(text) or miles_value
            elif role == "months":
                months_value = self._extract_months(text) or months_value
            elif role == "details":
                notes = text
                # Detail cells may also carry a fluid specification.
                spec = maintenance_matcher.extract_fluid_spec(text)
                if spec:
                    fluid = spec.value

        # Pass 2: with no interval from dedicated columns, try the service text.
        if not (miles_value or months_value):
            service_miles = maintenance_matcher.extract_mileage_interval(
                service_text
            )
            service_time = maintenance_matcher.extract_time_interval(service_text)
            if service_miles:
                miles_value = service_miles.value
            if service_time:
                months_value = service_time.value

        # Pass 3: last resort, take the first hit in any cell of the row.
        if not miles_value:
            miles_hit = next(
                filter(
                    None,
                    map(maintenance_matcher.extract_mileage_interval, row),
                ),
                None,
            )
            if miles_hit is not None:
                miles_value = miles_hit.value

        if not months_value:
            time_hit = next(
                filter(
                    None,
                    map(maintenance_matcher.extract_time_interval, row),
                ),
                None,
            )
            if time_hit is not None:
                months_value = time_hit.value

        # A row without service text is not a schedule entry.
        if not service_text:
            return None

        # Map service to normalized name and subtypes
        mapped = service_mapper.map_service(service_text)
        if mapped:
            normalized_name = mapped.normalized_name
            service_subtypes = mapped.subtypes
            service_score = mapped.confidence
        else:
            normalized_name = None
            service_subtypes = []
            service_score = 0.5

        # Any interval contributes a fixed 0.8 that is averaged with the
        # service score; with no interval the service score is discounted.
        if miles_value or months_value:
            overall_score = (service_score + 0.8) / 2
        else:
            overall_score = service_score * 0.7

        return ParsedScheduleRow(
            service=service_text,
            normalized_service=normalized_name,
            subtypes=service_subtypes,
            interval_miles=miles_value,
            interval_months=months_value,
            details=notes,
            fluid_spec=fluid,
            confidence=overall_score,
            raw_row=row,
        )

    def _parse_without_headers(
        self, data_rows: list[list[str]]
    ) -> list[ParsedScheduleRow]:
        """
        Parse table without clear headers by analyzing content.

        Args:
            data_rows: Table rows

        Returns:
            List of ParsedScheduleRow
        """
        results = []

        for row in data_rows:
            if not row:
                continue

            # Join all cells and try to extract info
            row_text = " ".join(row)

            service_match = service_mapper.map_service(row_text)
            mileage_match = maintenance_matcher.extract_mileage_interval(row_text)
            time_match = maintenance_matcher.extract_time_interval(row_text)
            fluid_match = maintenance_matcher.extract_fluid_spec(row_text)

            if service_match:
                results.append(
                    ParsedScheduleRow(
                        service=row[0] if row else row_text,
                        normalized_service=service_match.normalized_name,
                        subtypes=service_match.subtypes,
                        interval_miles=mileage_match.value if mileage_match else None,
                        interval_months=time_match.value if time_match else None,
                        details=None,
                        fluid_spec=fluid_match.value if fluid_match else None,
                        confidence=service_match.confidence * 0.8,  # Reduce for no-header parsing
                        raw_row=row,
                    )
                )

        return results

    def _extract_miles(self, text: str) -> Optional[int]:
        """Extract mileage value from cell text.

        The pattern matcher is tried first. If it finds nothing, the first
        bare number in the cell is used; it understands "5,000", "5 000",
        "5000", "5K", "5 K" and "7.5K".

        The thousands suffix must directly follow the number. A "k" elsewhere
        in the cell, or one that begins a kilometre unit ("km", "kms",
        "kilometers"), is not treated as a multiplier, so "750 km" yields 750
        rather than being scaled out of range.

        Args:
            text: Cell text

        Returns:
            The value when it lies within 500..150000, otherwise None
        """
        # First try pattern matcher
        match = maintenance_matcher.extract_mileage_interval(text)
        if match:
            return match.value

        # Groups: integer part (commas/spaces as thousands separators),
        # optional decimal fraction, optional K suffix. The number must start
        # with a digit so that a stray comma is never taken for a number.
        number_match = re.search(
            r"(\d[\d, ]*)(\.\d+)? *(k(?!m\b|ms\b|ilo))?",
            text,
            re.IGNORECASE,
        )
        if not number_match:
            return None

        whole, fraction, thousands_suffix = number_match.groups()
        value = int(whole.replace(",", "").replace(" ", ""))

        # Handle "5K" notation; the fraction only matters here ("7.5K" -> 7500).
        if thousands_suffix and value < 1000:
            value = round(float(f"{value}{fraction or ''}") * 1000)

        if 500 <= value <= 150000:
            return value

        return None

    def _extract_months(self, text: str) -> Optional[int]:
        """Return the month interval found in a cell, if any.

        The pattern matcher's result wins when it finds one. Otherwise the
        first run of digits in the text is accepted, provided it lies within
        1..120.
        """
        matched = maintenance_matcher.extract_time_interval(text)
        if matched:
            return matched.value

        # Fallback: a bare number such as "12".
        digits = re.search(r"(\d+)", text)
        if digits is None:
            return None

        try:
            months = int(digits.group(1))
        except ValueError:
            return None

        return months if 1 <= months <= 120 else None


# Module-level TableParser instance, created once when this module is imported.
table_parser = TableParser()