# dataset-yolo-script/sam2-cpu/frigate_mini/detector/rknn_detector.py

"""
RKNN detector backend for Rockchip NPU.
"""

import numpy as np
import logging
from typing import List, Tuple, Optional

from .base import BaseDetector, Detection, BBox

logger = logging.getLogger(__name__)


class RKNNDetector(BaseDetector):
    """
    RKNN-based YOLO detector for Rockchip NPU.

    Supports: RK3588, RK3568, RK3566, RK3562, RV1106, etc.

    Intended call order is ``load_model()`` once, ``detect(frame)`` per frame
    and ``release()`` when done.  ``detect`` keeps the letterbox parameters of
    the most recent frame on the instance (``_ratio``, ``_pad``) so that the
    postprocessing step can map boxes back to the original image.
    """

    # Row width of the classic 80-class layout that carries a separate
    # objectness column: 4 box values + 1 objectness + 80 class scores.
    _LEGACY_OBJECTNESS_WIDTH = 85

    def __init__(
        self,
        model_path: str,
        target_platform: str = "rk3588",
        core_mask: int = 7,
        input_size: Tuple[int, int] = (640, 640),
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        class_names: Optional[dict] = None,
    ):
        """
        Initialize RKNN detector.

        Args:
            model_path: Path to .rknn model file
            target_platform: Target Rockchip platform (only used with the
                full RKNN-Toolkit2 runtime)
            core_mask: NPU core mask (RK3588: 7=all 3 cores; only used with
                the RKNNLite runtime)
            input_size: Model input size, unpacked as (width, height)
            conf_threshold: Confidence threshold
            nms_threshold: NMS threshold
            class_names: Class ID to name mapping
        """
        super().__init__(
            model_path=model_path,
            input_size=input_size,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            class_names=class_names,
        )

        self.target_platform = target_platform
        self.core_mask = core_mask
        self.rknn = None

        # Letterbox state of the last preprocessed frame.  Initialised to an
        # identity mapping so postprocessing never hits a missing attribute.
        self._ratio = 1.0
        self._pad = (0, 0)
        self._orig_shape = None

        # Keys of warnings that were already emitted (see _warn_once).
        self._warned = set()

    def load_model(self) -> bool:
        """
        Load the RKNN model and initialise the runtime.

        Any runtime left from a previous successful load is released first.
        ``self.rknn`` is only set once both ``load_rknn`` and ``init_runtime``
        succeeded; on every failure path the partially initialised runtime is
        released and ``self.rknn`` stays ``None``.

        Returns:
            True if the model is ready for inference, False otherwise.
        """
        runtime = None
        try:
            # Do not leak an earlier runtime when the model is reloaded.
            self.release()

            # Try rknnlite2 first (for ARM devices)
            try:
                from rknnlite.api import RKNNLite
                runtime = RKNNLite()
                is_lite = True
                logger.info("Using RKNNLite2 runtime")
            except ImportError:
                # Fall back to the full rknn-toolkit2 package.  A target
                # platform is passed to init_runtime below.
                # NOTE(review): with a target set, the toolkit presumably
                # runs on a connected board rather than a simulator - confirm.
                from rknn.api import RKNN
                runtime = RKNN()
                is_lite = False
                logger.info("Using RKNN-Toolkit2 runtime")

            # Load model
            logger.info(f"Loading RKNN model: {self.model_path}")
            ret = runtime.load_rknn(self.model_path)
            if ret != 0:
                logger.error(f"Failed to load RKNN model: {ret}")
                return False

            # Initialize runtime
            if is_lite:
                ret = runtime.init_runtime(core_mask=self.core_mask)
            else:
                ret = runtime.init_runtime(
                    target=self.target_platform,
                    device_id=None,
                )

            if ret != 0:
                logger.error(f"Failed to init RKNN runtime: {ret}")
                return False

            # Success: hand ownership to the instance so the cleanup in the
            # finally clause leaves the runtime alone.
            self.rknn = runtime
            runtime = None

            logger.info("RKNN model loaded successfully")
            return True

        except ImportError as e:
            logger.error(f"RKNN library not available: {e}")
            logger.info(
                "Install with: pip install rknn-toolkit-lite2 (ARM) "
                "or rknn-toolkit2 (x86)"
            )
            return False
        except Exception as e:
            logger.error(f"Failed to load RKNN model: {e}")
            return False
        finally:
            if runtime is not None:
                # Best-effort cleanup of a runtime that never became usable.
                try:
                    runtime.release()
                except Exception as e:
                    logger.debug(f"Ignoring error while releasing RKNN runtime: {e}")

    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame using NPU.

        Args:
            frame: Input image (BGR, HWC)

        Returns:
            List of Detection objects; empty if the runtime is not loaded,
            the frame is empty, or inference returned nothing.
        """
        if self.rknn is None:
            logger.warning("RKNN not initialized")
            return []

        # An empty frame would divide by zero while computing the letterbox
        # scale, so reject it up front.
        if frame is None or frame.size == 0:
            logger.warning("Empty frame passed to RKNN detector")
            return []

        orig_h, orig_w = frame.shape[:2]

        # Preprocess
        input_data = self._preprocess_rknn(frame)

        # Run inference
        outputs = self.rknn.inference(inputs=[input_data])

        if outputs is None:
            logger.warning("RKNN inference returned None")
            return []

        # Postprocess
        return self._postprocess_yolo(outputs, (orig_h, orig_w))

    def _preprocess_rknn(self, frame: np.ndarray) -> np.ndarray:
        """
        Letterbox the frame to the model input size and convert BGR to RGB.

        Side effect: stores the scale ratio, the applied (left, top) padding
        and the original (height, width) on the instance for postprocessing.

        Returns:
            The letterboxed RGB image in HWC layout (no batch dimension).
        """
        import cv2

        input_w, input_h = self.input_size

        # Resize with letterbox; _letterbox takes the target as (height, width)
        img, ratio, pad = self._letterbox(frame, (input_h, input_w))

        # Store for postprocessing
        self._ratio = ratio
        self._pad = pad
        self._orig_shape = frame.shape[:2]

        # BGR to RGB (RKNN typically expects RGB)
        # NOTE(review): the image is passed to inference() without a batch
        # dimension - confirm the installed runtime accepts 3-D input.
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def _letterbox(
        self,
        img: np.ndarray,
        new_shape: Tuple[int, int],
        color: Tuple[int, int, int] = (114, 114, 114),
    ) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """
        Resize and pad image while maintaining aspect ratio.

        Args:
            img: Source image, HWC.
            new_shape: Target size as (height, width).
            color: Border colour used for the padding.

        Returns:
            (padded image, scale ratio, (left, top)) where ``left`` and
            ``top`` are the border widths in pixels that were actually added
            on the left and top edge.  These are the offsets that have to be
            subtracted when mapping model coordinates back to the source.
        """
        import cv2

        src_h, src_w = img.shape[:2]
        dst_h, dst_w = new_shape

        # Scale ratio: largest scale at which the image still fits the target
        scale = min(dst_h / src_h, dst_w / src_w)

        # Size after scaling, before padding.  Clamp to one pixel so that an
        # extreme aspect ratio cannot round a side down to zero.
        scaled_w = max(1, int(round(src_w * scale)))
        scaled_h = max(1, int(round(src_h * scale)))

        if (src_w, src_h) != (scaled_w, scaled_h):
            img = cv2.resize(img, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR)

        # Split the remaining space between both sides.  The +/-0.1 sends the
        # odd pixel of an uneven split to the bottom/right edge.
        half_h = (dst_h - scaled_h) / 2
        half_w = (dst_w - scaled_w) / 2
        top, bottom = int(round(half_h - 0.1)), int(round(half_h + 0.1))
        left, right = int(round(half_w - 0.1)), int(round(half_w + 0.1))

        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
        )

        return img, scale, (left, top)

    def _warn_once(self, key: str, message: str) -> None:
        """Log ``message`` as a warning the first time ``key`` is seen."""
        if key not in self._warned:
            self._warned.add(key)
            logger.warning(message)

    def _postprocess_yolo(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Postprocess YOLO outputs from RKNN.

        Handles common YOLO output formats:
        - YOLOv5/v8/v9 style: [1, num_boxes, 5+num_classes]
        - Split outputs: boxes, scores, classes separate

        Args:
            outputs: Output tensors returned by the runtime.
            original_shape: (height, width) of the source frame.

        Returns:
            List of detections; empty when the layout is not supported or
            parsing fails (the error is logged, not raised).
        """
        detections = []

        try:
            # Handle different output formats
            if len(outputs) == 1:
                # Single output tensor
                output = np.asarray(outputs[0])
                if output.ndim == 3:
                    output = output[0]  # Remove batch dim
                elif output.ndim > 3:
                    # Drop all singleton axes of higher-rank tensors
                    output = np.squeeze(output)

                if output.ndim != 2:
                    self._warn_once(
                        "output_rank",
                        f"Unsupported RKNN output shape: {np.shape(outputs[0])}",
                    )
                    return detections

                # Assume format: [num_boxes, 5+num_classes] or [5+num_classes, num_boxes]
                # The longer axis is taken to be the box axis.
                if output.shape[0] < output.shape[1]:
                    output = output.T

                detections = self._parse_yolo_output(output, original_shape)

            elif len(outputs) >= 3:
                # Split outputs (boxes, scores, classes)
                # This is common for quantized RKNN models
                detections = self._parse_split_outputs(outputs, original_shape)

            else:
                self._warn_once(
                    "output_count",
                    f"Unsupported number of RKNN outputs: {len(outputs)}",
                )

        except Exception as e:
            # Keep the detector running, but record the traceback.
            logger.exception(f"Postprocessing error: {e}")

        return detections

    def _has_objectness(self, num_columns: int) -> bool:
        """
        Decide whether rows of width ``num_columns`` carry an objectness column.

        When the number of configured class names matches one of the two
        layouts exactly, that layout is used.  Otherwise the classic rule
        applies: a width of 85 (4 + 1 + 80 classes) means objectness is
        present, every other width means it is not.
        """
        num_classes = len(self.class_names or {})
        if num_classes > 0:
            if num_columns == 5 + num_classes:
                return True
            if num_columns == 4 + num_classes:
                return False
        return num_columns == self._LEGACY_OBJECTNESS_WIDTH

    def _parse_yolo_output(
        self,
        output: np.ndarray,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Parse standard YOLO output format.

        Each row is either ``[x, y, w, h, obj_conf, cls1_conf, ...]`` or
        ``[x, y, w, h, cls1_conf, ...]`` with a centre-based box in model
        input coordinates.

        Args:
            output: 2-D array with one candidate per row.
            original_shape: (height, width) of the source frame.

        Returns:
            Detections above the confidence threshold, mapped back to the
            source frame, clipped to its bounds and filtered by NMS.
        """
        rows = np.asarray(output)
        if rows.ndim != 2 or rows.shape[0] == 0 or rows.shape[1] < 5:
            return []

        orig_h, orig_w = original_shape
        ratio = self._ratio
        pad_x, pad_y = self._pad

        # Score all candidates at once; only the survivors of the confidence
        # threshold are turned into Python objects below.
        with_objectness = self._has_objectness(rows.shape[1])
        class_scores = rows[:, 5:] if with_objectness else rows[:, 4:]
        class_ids = class_scores.argmax(axis=1)
        confidences = class_scores[np.arange(rows.shape[0]), class_ids]
        if with_objectness:
            confidences = rows[:, 4] * confidences

        keep = confidences >= self.conf_threshold
        if not keep.any():
            return []

        # Convert to xyxy, remove padding, scale back, clip to image bounds
        cx, cy, bw, bh = rows[keep, :4].astype(np.float64).T
        x1 = np.clip((cx - bw / 2 - pad_x) / ratio, 0, orig_w)
        y1 = np.clip((cy - bh / 2 - pad_y) / ratio, 0, orig_h)
        x2 = np.clip((cx + bw / 2 - pad_x) / ratio, 0, orig_w)
        y2 = np.clip((cy + bh / 2 - pad_y) / ratio, 0, orig_h)

        names = self.class_names or {}
        detections = [
            Detection(
                class_id=class_id,
                class_name=names.get(class_id, str(class_id)),
                confidence=confidence,
                bbox=BBox(x1=left, y1=top, x2=right, y2=bottom),
            )
            for class_id, confidence, left, top, right, bottom in zip(
                class_ids[keep].tolist(),
                confidences[keep].tolist(),
                x1.tolist(),
                y1.tolist(),
                x2.tolist(),
                y2.tolist(),
            )
        ]

        # Apply NMS
        return self._apply_nms(detections)

    def _parse_split_outputs(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Parse split output format (common in quantized models).

        Not implemented: the layout varies by model (a common one is
        ``[boxes, scores, class_ids, num_dets]``).  Always returns an empty
        list and logs a single warning per detector instance.
        """
        # Placeholder - implement based on actual model output format
        self._warn_once("split_outputs", "Split output parsing not fully implemented")
        return []

    def _apply_nms(self, detections: List[Detection]) -> List[Detection]:
        """
        Apply NMS to detections.

        Boxes and scores of all detections are passed to ``self.nms`` in a
        single call; class ids are not passed along.

        Returns:
            The detections whose indices ``self.nms`` returned.
        """
        if not detections:
            return []

        boxes = np.array([[d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] for d in detections])
        scores = np.array([d.confidence for d in detections])

        keep_indices = self.nms(boxes, scores, self.nms_threshold)

        return [detections[i] for i in keep_indices]

    def release(self) -> None:
        """
        Release RKNN resources.

        Safe to call repeatedly.  The handle is cleared before the runtime's
        ``release()`` is invoked, so the detector never keeps a reference to
        a runtime whose release failed part-way.
        """
        runtime, self.rknn = self.rknn, None
        if runtime is None:
            return

        runtime.release()
        logger.info("RKNN resources released")