dataset-yolo-script/sam2-cpu/frigate_mini/detector/onnx_detector.py

"""
ONNX Runtime detector backend.
"""

import numpy as np
import logging
from typing import List, Tuple, Optional

from .base import BaseDetector, Detection, BBox

logger = logging.getLogger(__name__)


class ONNXDetector(BaseDetector):
    """
    ONNX Runtime-based YOLO detector.

    Supports CPU and CUDA execution providers.
    This is the recommended backend for CPU-only inference.

    Features:
    - Cross-platform (Linux, Windows, macOS, ARM)
    - No special hardware required
    - Optimized CPU inference with threading
    - Optional CUDA support

    Usage: construct, call ``load_model()``, call ``detect()`` per frame,
    then ``release()`` when done.
    """

    # Gray value used to fill the letterbox border around the resized frame.
    _PAD_COLOR = (114, 114, 114)

    # Row length that is interpreted as the YOLOv5 layout
    # (4 box values + objectness + 80 class scores).
    _YOLOV5_ROW_LEN = 85

    def __init__(
        self,
        model_path: str,
        input_size: Tuple[int, int] = (640, 640),
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        device: str = "cpu",
        num_threads: int = 0,
        optimization_level: str = "all",
        class_names: Optional[dict] = None,
    ):
        """
        Initialize ONNX detector.

        No model is loaded here; call ``load_model()`` before ``detect()``.

        Args:
            model_path: Path to .onnx model file
            input_size: Model input size (width, height)
            conf_threshold: Confidence threshold
            nms_threshold: NMS IoU threshold
            device: Device ('cpu' or 'cuda')
            num_threads: CPU threads (0 = auto based on CPU cores)
            optimization_level: Graph optimization ('none', 'basic', 'extended', 'all')
            class_names: Class ID to name mapping
        """
        super().__init__(
            model_path=model_path,
            input_size=input_size,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            class_names=class_names,
        )
        self.device = device
        self.num_threads = num_threads
        self.optimization_level = optimization_level
        self.session = None
        self.input_name = None
        self.output_names = None

    def load_model(self) -> bool:
        """
        Load the ONNX model and create the inference session.

        If the model declares a fixed 4-D input shape, ``self.input_size`` is
        overwritten with the model's (width, height). Symbolic/dynamic
        dimensions are ignored and the configured ``input_size`` is kept.

        Returns:
            True on success, False if onnxruntime is missing or loading failed.
        """
        # Only the import itself should be reported as "onnxruntime not found".
        try:
            import onnxruntime as ort
        except ImportError:
            logger.error("onnxruntime not found. Install with: pip install onnxruntime")
            return False

        try:
            # Select execution providers
            if self.device == "cuda":
                providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
            else:
                providers = ['CPUExecutionProvider']

            logger.info(f"Loading ONNX model: {self.model_path}")
            logger.info(f"  Device: {self.device}")
            logger.info(f"  Threads: {self.num_threads if self.num_threads > 0 else 'auto'}")

            # Create session options
            sess_options = ort.SessionOptions()

            # Set optimization level (unknown names fall back to 'all')
            opt_levels = {
                'none': ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
                'basic': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
                'extended': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
                'all': ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
            }
            if self.optimization_level not in opt_levels:
                logger.warning(
                    "Unknown optimization level %r, using 'all'",
                    self.optimization_level,
                )
            sess_options.graph_optimization_level = opt_levels.get(
                self.optimization_level,
                ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
            )

            # Set CPU threading options (leave ONNX Runtime defaults when 0)
            if self.num_threads > 0:
                sess_options.intra_op_num_threads = self.num_threads
                sess_options.inter_op_num_threads = self.num_threads

            # Enable memory optimization
            sess_options.enable_mem_pattern = True
            sess_options.enable_cpu_mem_arena = True

            # Create session
            session = ort.InferenceSession(
                self.model_path,
                sess_options=sess_options,
                providers=providers,
            )

            # Get input/output info
            model_input = session.get_inputs()[0]
            input_name = model_input.name
            output_names = [o.name for o in session.get_outputs()]

            # Adopt the model's input size only when it is a concrete NCHW
            # shape. Dynamic axes are reported as strings or None and would
            # break the arithmetic in _preprocess.
            input_size = self.input_size
            input_shape = model_input.shape or []
            if len(input_shape) == 4:
                height, width = input_shape[2], input_shape[3]
                if (
                    isinstance(height, int) and isinstance(width, int)
                    and height > 0 and width > 0
                ):
                    input_size = (width, height)
                else:
                    logger.warning(
                        "Model input size is dynamic (%s); keeping configured size %s",
                        input_shape,
                        self.input_size,
                    )

            actual_provider = session.get_providers()[0]
            if self.device == "cuda" and actual_provider != 'CUDAExecutionProvider':
                logger.warning(
                    "CUDA requested but session is running on %s", actual_provider
                )

            # Commit state only once everything succeeded.
            self.session = session
            self.input_name = input_name
            self.output_names = output_names
            self.input_size = input_size

            logger.info("ONNX model loaded successfully")
            logger.info(f"  Provider: {actual_provider}")
            logger.info(f"  Input size: {self.input_size}")

            return True

        except Exception as e:
            logger.error(f"Failed to load ONNX model: {e}")
            return False

    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame.

        Args:
            frame: Input image (BGR, HWC)

        Returns:
            List of Detection objects. Empty when the session is not loaded,
            the frame is empty, or inference raised (the error is logged).
        """
        if self.session is None:
            logger.warning("ONNX session not initialized")
            return []

        try:
            if frame is None or frame.size == 0:
                logger.warning("Empty frame passed to ONNX detector")
                return []

            orig_h, orig_w = frame.shape[:2]

            # Preprocess
            input_tensor, ratio, pad = self._preprocess(frame)

            # Run inference
            outputs = self.session.run(self.output_names, {self.input_name: input_tensor})

            # Postprocess
            return self._postprocess(outputs, (orig_h, orig_w), ratio, pad)

        except Exception as e:
            logger.error(f"ONNX inference error: {e}")
            return []

    def _preprocess(self, frame: np.ndarray) -> Tuple[np.ndarray, float, Tuple[float, float]]:
        """
        Letterbox a frame to the model input size and convert it to a tensor.

        The frame is resized with preserved aspect ratio, padded with gray to
        ``self.input_size``, converted BGR -> RGB, scaled to [0, 1] float32 and
        laid out as NCHW with a batch dimension of 1.

        Args:
            frame: Input image (BGR, HWC)

        Returns:
            (tensor, ratio, (pad_left, pad_top)) where ``ratio`` is the resize
            scale and the pad values are the border widths actually added on
            the left and top, i.e. the offsets to subtract when mapping model
            coordinates back to the original frame.
        """
        import cv2

        input_w, input_h = self.input_size
        orig_h, orig_w = frame.shape[:2]

        # Calculate scale
        ratio = min(input_w / orig_w, input_h / orig_h)

        # Round (not truncate) and clamp so the resize target is never 0 px
        # and never exceeds the model input because of float error.
        new_w = min(input_w, max(1, int(round(orig_w * ratio))))
        new_h = min(input_h, max(1, int(round(orig_h * ratio))))

        # Resize
        resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # Pad: split the leftover evenly; an odd leftover puts the extra
        # pixel on the bottom/right.
        left = (input_w - new_w) // 2
        right = input_w - new_w - left
        top = (input_h - new_h) // 2
        bottom = input_h - new_h - top

        padded = cv2.copyMakeBorder(
            resized, top, bottom, left, right,
            cv2.BORDER_CONSTANT, value=self._PAD_COLOR
        )

        # BGR to RGB
        rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)

        # Normalize
        normalized = rgb.astype(np.float32) / 255.0

        # HWC to NCHW (made contiguous after the transpose)
        batched = np.ascontiguousarray(normalized.transpose(2, 0, 1)[np.newaxis])

        # Report the applied integer offsets rather than the fractional
        # half-difference, so box un-padding matches the actual image.
        return batched, ratio, (float(left), float(top))

    def _postprocess(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
        ratio: float,
        pad: Tuple[float, float],
    ) -> List[Detection]:
        """
        Convert raw model outputs into detections in original-frame pixels.

        Only ``outputs[0]`` is used (first batch item if it is 3-D). The
        layout is detected heuristically: the array is transposed when it has
        fewer rows than columns, and rows of length 85 are treated as the
        YOLOv5 layout (box, objectness, class scores); any other row length
        is treated as the YOLOv8/v9 layout (box, class scores).

        Args:
            outputs: Raw outputs returned by the inference session
            original_shape: (height, width) of the original frame
            ratio: Resize scale returned by ``_preprocess``
            pad: (pad_left, pad_top) returned by ``_preprocess``

        Returns:
            Detections above ``conf_threshold`` that survive ``self.nms``.
        """
        if not outputs:
            return []

        orig_h, orig_w = original_shape
        pad_w, pad_h = pad

        # Handle different output formats
        output = np.asarray(outputs[0])
        if output.ndim == 3:
            output = output[0]
        if output.ndim != 2 or output.size == 0:
            return []

        # Transpose if needed (num_classes+4 x num_boxes -> num_boxes x num_classes+4)
        if output.shape[0] < output.shape[1]:
            output = output.T

        row_len = output.shape[1]
        if row_len < 5:
            return []

        # Parse based on format
        if row_len == self._YOLOV5_ROW_LEN:  # YOLOv5 format with obj_conf
            class_scores = output[:, 5:]
            class_ids = class_scores.argmax(axis=1)
            confidences = output[:, 4] * class_scores.max(axis=1)
        else:  # YOLOv8/v9 format without obj_conf
            class_scores = output[:, 4:]
            class_ids = class_scores.argmax(axis=1)
            confidences = class_scores.max(axis=1)

        # Drop low-confidence candidates before doing any box math.
        selected = confidences >= self.conf_threshold
        if not selected.any():
            return []

        xywh = output[selected, :4].astype(np.float64)
        scores = confidences[selected].astype(np.float64)
        class_ids = class_ids[selected]

        center_x, center_y = xywh[:, 0], xywh[:, 1]
        half_w, half_h = xywh[:, 2] / 2, xywh[:, 3] / 2

        # Convert to xyxy, undo letterbox padding/scale, and clip to the frame
        boxes = np.stack(
            [
                np.clip((center_x - half_w - pad_w) / ratio, 0, orig_w),
                np.clip((center_y - half_h - pad_h) / ratio, 0, orig_h),
                np.clip((center_x + half_w - pad_w) / ratio, 0, orig_w),
                np.clip((center_y + half_h - pad_h) / ratio, 0, orig_h),
            ],
            axis=1,
        )

        # Apply NMS (implementation provided by the base class)
        keep = self.nms(boxes, scores, self.nms_threshold)

        names = self.class_names or {}
        detections = []
        for idx in keep:
            class_id = int(class_ids[idx])
            x1, y1, x2, y2 = (float(v) for v in boxes[idx])
            detections.append(
                Detection(
                    class_id=class_id,
                    class_name=names.get(class_id, str(class_id)),
                    confidence=float(scores[idx]),
                    bbox=BBox(x1=x1, y1=y1, x2=x2, y2=y2),
                )
            )

        return detections

    def release(self) -> None:
        """Release ONNX session; ``detect()`` returns [] until reloaded."""
        self.session = None
        self.input_name = None
        self.output_names = None
        logger.info("ONNX detector released")