add sam2 yolo auto annotation

2026-02-04 15:29:36 +07:00
parent 7e56948ece
commit 5a951d8812
2061 changed files with 316473 additions and 0 deletions
@@ -0,0 +1,118 @@
+"""
+Object detection backends.
+"""
+
+from typing import Dict, Any, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def create_detector(config: Dict[str, Any]):
+    """
+    Create detector based on configuration.
+    
+    Args:
+        config: Detector configuration dict
+        
+    Returns:
+        Detector instance
+    """
+    detector_type = config.get('type', 'yolo').lower()
+    model_path = config.get('model_path', 'models/yolov9t.pt')
+    
+    logger.info(f"Creating detector: type={detector_type}, model={model_path}")
+    
+    # Try RKNN first if specified
+    if detector_type == 'rknn':
+        try:
+            from .rknn_detector import RKNNDetector
+            
+            rknn_config = config.get('rknn', {})
+            detector = RKNNDetector(
+                model_path=model_path,
+                target_platform=rknn_config.get('target_platform', 'rk3588'),
+                core_mask=rknn_config.get('core_mask', 7),
+                input_size=tuple(config.get('input_size', [640, 640])),
+                conf_threshold=config.get('conf_threshold', 0.25),
+                nms_threshold=config.get('nms_threshold', 0.45),
+            )
+            
+            if detector.load_model():
+                logger.info("RKNN detector initialized successfully")
+                return detector
+            else:
+                logger.warning("RKNN detector failed to load, trying fallback")
+                
+        except ImportError as e:
+            logger.warning(f"RKNN not available: {e}")
+        except Exception as e:
+            logger.warning(f"RKNN initialization failed: {e}")
+    
+    # Try ONNX if specified or as fallback
+    if detector_type == 'onnx' or (detector_type == 'rknn' and config.get('fallback', {}).get('enabled', True)):
+        fallback_config = config.get('fallback', {})
+        onnx_config = config.get('onnx', {})
+        
+        if fallback_config.get('type') == 'onnx' or detector_type == 'onnx':
+            try:
+                from .onnx_detector import ONNXDetector
+                
+                # Determine model path
+                onnx_model_path = model_path
+                if model_path.endswith('.rknn'):
+                    onnx_model_path = model_path.replace('.rknn', '.onnx')
+                elif model_path.endswith('.pt'):
+                    onnx_model_path = model_path.replace('.pt', '.onnx')
+                
+                # Get device from onnx config or fallback config
+                device = onnx_config.get('device') or fallback_config.get('device', 'cpu')
+                
+                detector = ONNXDetector(
+                    model_path=onnx_model_path,
+                    input_size=tuple(config.get('input_size', [640, 640])),
+                    conf_threshold=config.get('conf_threshold', 0.25),
+                    nms_threshold=config.get('nms_threshold', 0.45),
+                    device=device,
+                    num_threads=onnx_config.get('num_threads', 0),
+                    optimization_level=onnx_config.get('optimization_level', 'all'),
+                )
+                
+                if detector.load_model():
+                    logger.info("ONNX detector initialized successfully")
+                    return detector
+                    
+            except ImportError as e:
+                logger.warning(f"ONNX runtime not available: {e}")
+                logger.info("Install with: pip install onnxruntime")
+            except Exception as e:
+                logger.warning(f"ONNX initialization failed: {e}")
+    
+    # Use Ultralytics YOLO as default/fallback
+    try:
+        from .yolo_detector import YOLODetector
+        
+        fallback_config = config.get('fallback', {})
+        device = fallback_config.get('device', 'cpu')
+        
+        # Adjust model path
+        if model_path.endswith('.rknn'):
+            model_path = model_path.replace('.rknn', '.pt')
+        elif model_path.endswith('.onnx'):
+            model_path = model_path.replace('.onnx', '.pt')
+        
+        detector = YOLODetector(
+            model_path=model_path,
+            conf_threshold=config.get('conf_threshold', 0.25),
+            nms_threshold=config.get('nms_threshold', 0.45),
+            device=device,
+        )
+        
+        if detector.load_model():
+            logger.info(f"YOLO detector initialized on {device}")
+            return detector
+            
+    except Exception as e:
+        logger.error(f"Failed to initialize any detector: {e}")
+    
+    return None
@@ -0,0 +1,230 @@
+"""
+Base detector interface.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Optional
+from dataclasses import dataclass
+import numpy as np
+
+
+@dataclass
+class BBox:
+    """Bounding box."""
+    x1: float
+    y1: float
+    x2: float
+    y2: float
+    
+    def to_yolo(self, img_width: int, img_height: int) -> Tuple[float, float, float, float]:
+        """Convert to YOLO format (normalized x_center, y_center, width, height)."""
+        x_center = ((self.x1 + self.x2) / 2) / img_width
+        y_center = ((self.y1 + self.y2) / 2) / img_height
+        width = (self.x2 - self.x1) / img_width
+        height = (self.y2 - self.y1) / img_height
+        return (x_center, y_center, width, height)
+    
+    def area(self) -> float:
+        """Calculate area in pixels."""
+        return (self.x2 - self.x1) * (self.y2 - self.y1)
+    
+    @property
+    def width(self) -> float:
+        return self.x2 - self.x1
+    
+    @property
+    def height(self) -> float:
+        return self.y2 - self.y1
+
+
+@dataclass
+class Detection:
+    """Single detection result: one classified, scored bounding box."""
+    # Numeric class index.
+    class_id: int
+    # Human-readable label for ``class_id``.
+    class_name: str
+    # Score attached to the detection.
+    confidence: float
+    # Location of the detected object.
+    bbox: BBox
+    # Optional identifier; stays ``None`` unless a caller sets it.
+    # NOTE(review): presumably filled in by a tracker - confirm against callers.
+    track_id: Optional[int] = None
+
+
+class BaseDetector(ABC):
+    """Abstract base class for object detectors."""
+    
+    # COCO class names
+    COCO_CLASSES = {
+        0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
+        5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+        10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
+        14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+        20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
+        25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+        30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite',
+        34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard',
+        38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork',
+        43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple',
+        48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog',
+        53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch',
+        58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv',
+        63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
+        68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator',
+        73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
+        78: 'hair drier', 79: 'toothbrush'
+    }
+    
+    def __init__(
+        self,
+        model_path: str,
+        input_size: Tuple[int, int] = (640, 640),
+        conf_threshold: float = 0.25,
+        nms_threshold: float = 0.45,
+        class_names: Optional[dict] = None,
+    ):
+        """
+        Initialize detector.
+        
+        Args:
+            model_path: Path to model file
+            input_size: Model input size (width, height)
+            conf_threshold: Confidence threshold
+            nms_threshold: NMS IoU threshold
+            class_names: Class ID to name mapping
+        """
+        self.model_path = model_path
+        self.input_size = input_size
+        self.conf_threshold = conf_threshold
+        self.nms_threshold = nms_threshold
+        self.class_names = class_names or self.COCO_CLASSES
+        self.model = None
+    
+    @abstractmethod
+    def load_model(self) -> bool:
+        """Load model. Returns True on success."""
+        pass
+    
+    @abstractmethod
+    def detect(self, frame: np.ndarray) -> List[Detection]:
+        """
+        Run detection on frame.
+        
+        Args:
+            frame: Input image (BGR, HWC)
+            
+        Returns:
+            List of Detection objects
+        """
+        pass
+    
+    @abstractmethod
+    def release(self) -> None:
+        """Release resources."""
+        pass
+    
+    def preprocess(self, frame: np.ndarray) -> np.ndarray:
+        """
+        Preprocess frame for inference.
+        
+        Args:
+            frame: Input frame (BGR, HWC)
+            
+        Returns:
+            Preprocessed input tensor
+        """
+        import cv2
+        
+        # Resize
+        input_width, input_height = self.input_size
+        resized = cv2.resize(frame, (input_width, input_height))
+        
+        # BGR to RGB
+        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+        
+        # Normalize to [0, 1]
+        normalized = rgb.astype(np.float32) / 255.0
+        
+        # HWC to CHW
+        transposed = normalized.transpose(2, 0, 1)
+        
+        # Add batch dimension
+        batched = np.expand_dims(transposed, axis=0)
+        
+        return batched
+    
+    def postprocess(
+        self,
+        outputs: np.ndarray,
+        original_shape: Tuple[int, int],
+    ) -> List[Detection]:
+        """
+        Postprocess model outputs.
+        
+        Args:
+            outputs: Raw model outputs
+            original_shape: Original frame shape (height, width)
+            
+        Returns:
+            List of Detection objects
+        """
+        # This is a generic implementation for YOLO-style outputs
+        # Override in subclasses for specific model output formats
+        
+        orig_h, orig_w = original_shape
+        input_w, input_h = self.input_size
+        
+        detections = []
+        
+        # Assume outputs shape: [1, num_boxes, 5+num_classes] or similar
+        # This will vary by model - subclasses should override
+        
+        return detections
+    
+    def nms(
+        self,
+        boxes: np.ndarray,
+        scores: np.ndarray,
+        iou_threshold: float = 0.45,
+    ) -> List[int]:
+        """
+        Non-maximum suppression.
+        
+        Args:
+            boxes: Array of boxes [N, 4] in xyxy format
+            scores: Array of scores [N]
+            iou_threshold: IoU threshold
+            
+        Returns:
+            List of indices to keep
+        """
+        if len(boxes) == 0:
+            return []
+        
+        x1 = boxes[:, 0]
+        y1 = boxes[:, 1]
+        x2 = boxes[:, 2]
+        y2 = boxes[:, 3]
+        
+        areas = (x2 - x1) * (y2 - y1)
+        order = scores.argsort()[::-1]
+        
+        keep = []
+        while order.size > 0:
+            i = order[0]
+            keep.append(i)
+            
+            if order.size == 1:
+                break
+            
+            xx1 = np.maximum(x1[i], x1[order[1:]])
+            yy1 = np.maximum(y1[i], y1[order[1:]])
+            xx2 = np.minimum(x2[i], x2[order[1:]])
+            yy2 = np.minimum(y2[i], y2[order[1:]])
+            
+            w = np.maximum(0, xx2 - xx1)
+            h = np.maximum(0, yy2 - yy1)
+            
+            inter = w * h
+            iou = inter / (areas[i] + areas[order[1:]] - inter)
+            
+            inds = np.where(iou <= iou_threshold)[0]
+            order = order[inds + 1]
+        
+        return keep
@@ -0,0 +1,283 @@
+"""
+ONNX Runtime detector backend.
+"""
+
+import numpy as np
+import logging
+from typing import List, Tuple, Optional
+
+from .base import BaseDetector, Detection, BBox
+
+logger = logging.getLogger(__name__)
+
+
+class ONNXDetector(BaseDetector):
+    """
+    ONNX Runtime-based YOLO detector.
+    
+    Supports CPU and CUDA execution providers.
+    This is the recommended backend for CPU-only inference.
+    
+    Features:
+    - Cross-platform (Linux, Windows, macOS, ARM)
+    - No special hardware required
+    - Optimized CPU inference with threading
+    - Optional CUDA support
+    """
+    
+    def __init__(
+        self,
+        model_path: str,
+        input_size: Tuple[int, int] = (640, 640),
+        conf_threshold: float = 0.25,
+        nms_threshold: float = 0.45,
+        device: str = "cpu",
+        num_threads: int = 0,
+        optimization_level: str = "all",
+        class_names: Optional[dict] = None,
+    ):
+        """
+        Initialize ONNX detector.
+        
+        Args:
+            model_path: Path to .onnx model file
+            input_size: Model input size (width, height)
+            conf_threshold: Confidence threshold
+            nms_threshold: NMS IoU threshold
+            device: Device ('cpu' or 'cuda')
+            num_threads: CPU threads (0 = auto based on CPU cores)
+            optimization_level: Graph optimization ('none', 'basic', 'extended', 'all')
+            class_names: Class ID to name mapping
+        """
+        super().__init__(
+            model_path=model_path,
+            input_size=input_size,
+            conf_threshold=conf_threshold,
+            nms_threshold=nms_threshold,
+            class_names=class_names,
+        )
+        self.device = device
+        self.num_threads = num_threads
+        self.optimization_level = optimization_level
+        self.session = None
+        self.input_name = None
+        self.output_names = None
+    
+    def load_model(self) -> bool:
+        """Load ONNX model."""
+        try:
+            import onnxruntime as ort
+            
+            # Select execution providers
+            if self.device == "cuda":
+                providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+            else:
+                providers = ['CPUExecutionProvider']
+            
+            logger.info(f"Loading ONNX model: {self.model_path}")
+            logger.info(f"  Device: {self.device}")
+            logger.info(f"  Threads: {self.num_threads if self.num_threads > 0 else 'auto'}")
+            
+            # Create session options
+            sess_options = ort.SessionOptions()
+            
+            # Set optimization level
+            opt_levels = {
+                'none': ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
+                'basic': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
+                'extended': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
+                'all': ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
+            }
+            sess_options.graph_optimization_level = opt_levels.get(
+                self.optimization_level, 
+                ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+            )
+            
+            # Set CPU threading options
+            if self.num_threads > 0:
+                sess_options.intra_op_num_threads = self.num_threads
+                sess_options.inter_op_num_threads = self.num_threads
+            
+            # Enable memory optimization
+            sess_options.enable_mem_pattern = True
+            sess_options.enable_cpu_mem_arena = True
+            
+            # Create session
+            self.session = ort.InferenceSession(
+                self.model_path,
+                sess_options=sess_options,
+                providers=providers,
+            )
+            
+            # Get input/output info
+            self.input_name = self.session.get_inputs()[0].name
+            self.output_names = [o.name for o in self.session.get_outputs()]
+            
+            # Get input shape
+            input_shape = self.session.get_inputs()[0].shape
+            if len(input_shape) == 4:
+                self.input_size = (input_shape[3], input_shape[2])  # width, height
+            
+            actual_provider = self.session.get_providers()[0]
+            logger.info(f"ONNX model loaded successfully")
+            logger.info(f"  Provider: {actual_provider}")
+            logger.info(f"  Input size: {self.input_size}")
+            
+            return True
+            
+        except ImportError:
+            logger.error("onnxruntime not found. Install with: pip install onnxruntime")
+            return False
+        except Exception as e:
+            logger.error(f"Failed to load ONNX model: {e}")
+            return False
+    
+    def detect(self, frame: np.ndarray) -> List[Detection]:
+        """
+        Run detection on frame.
+        
+        Args:
+            frame: Input image (BGR, HWC)
+            
+        Returns:
+            List of Detection objects
+        """
+        if self.session is None:
+            logger.warning("ONNX session not initialized")
+            return []
+        
+        try:
+            orig_h, orig_w = frame.shape[:2]
+            
+            # Preprocess
+            input_tensor, ratio, pad = self._preprocess(frame)
+            
+            # Run inference
+            outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
+            
+            # Postprocess
+            detections = self._postprocess(outputs, (orig_h, orig_w), ratio, pad)
+            
+            return detections
+            
+        except Exception as e:
+            logger.error(f"ONNX inference error: {e}")
+            return []
+    
+    def _preprocess(self, frame: np.ndarray) -> Tuple[np.ndarray, float, Tuple[float, float]]:
+        """Preprocess frame for ONNX inference."""
+        import cv2
+        
+        input_w, input_h = self.input_size
+        orig_h, orig_w = frame.shape[:2]
+        
+        # Calculate scale
+        ratio = min(input_w / orig_w, input_h / orig_h)
+        new_w = int(orig_w * ratio)
+        new_h = int(orig_h * ratio)
+        
+        # Resize
+        resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+        
+        # Pad
+        pad_w = (input_w - new_w) / 2
+        pad_h = (input_h - new_h) / 2
+        
+        top = int(round(pad_h - 0.1))
+        bottom = int(round(pad_h + 0.1))
+        left = int(round(pad_w - 0.1))
+        right = int(round(pad_w + 0.1))
+        
+        padded = cv2.copyMakeBorder(
+            resized, top, bottom, left, right,
+            cv2.BORDER_CONSTANT, value=(114, 114, 114)
+        )
+        
+        # BGR to RGB
+        rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
+        
+        # Normalize
+        normalized = rgb.astype(np.float32) / 255.0
+        
+        # HWC to NCHW
+        transposed = normalized.transpose(2, 0, 1)
+        batched = np.expand_dims(transposed, axis=0)
+        
+        return batched, ratio, (pad_w, pad_h)
+    
+    def _postprocess(
+        self,
+        outputs: list,
+        original_shape: Tuple[int, int],
+        ratio: float,
+        pad: Tuple[float, float],
+    ) -> List[Detection]:
+        """Postprocess ONNX outputs."""
+        detections = []
+        orig_h, orig_w = original_shape
+        pad_w, pad_h = pad
+        
+        # Handle different output formats
+        output = outputs[0]
+        
+        if output.ndim == 3:
+            output = output[0]
+        
+        # Transpose if needed (num_classes+4 x num_boxes -> num_boxes x num_classes+4)
+        if output.shape[0] < output.shape[1]:
+            output = output.T
+        
+        for row in output:
+            if len(row) < 5:
+                continue
+            
+            # Parse based on format
+            if len(row) == 85:  # YOLOv5 format with obj_conf
+                x, y, w, h, obj_conf = row[:5]
+                class_confs = row[5:]
+                class_id = np.argmax(class_confs)
+                confidence = obj_conf * class_confs[class_id]
+            else:  # YOLOv8/v9 format without obj_conf
+                x, y, w, h = row[:4]
+                class_confs = row[4:]
+                class_id = np.argmax(class_confs)
+                confidence = class_confs[class_id]
+            
+            if confidence < self.conf_threshold:
+                continue
+            
+            # Convert to xyxy and scale back
+            x1 = (x - w / 2 - pad_w) / ratio
+            y1 = (y - h / 2 - pad_h) / ratio
+            x2 = (x + w / 2 - pad_w) / ratio
+            y2 = (y + h / 2 - pad_h) / ratio
+            
+            # Clip
+            x1 = max(0, min(orig_w, x1))
+            y1 = max(0, min(orig_h, y1))
+            x2 = max(0, min(orig_w, x2))
+            y2 = max(0, min(orig_h, y2))
+            
+            class_name = self.class_names.get(int(class_id), str(class_id))
+            
+            detection = Detection(
+                class_id=int(class_id),
+                class_name=class_name,
+                confidence=float(confidence),
+                bbox=BBox(x1=x1, y1=y1, x2=x2, y2=y2),
+            )
+            detections.append(detection)
+        
+        # Apply NMS
+        if detections:
+            boxes = np.array([[d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] for d in detections])
+            scores = np.array([d.confidence for d in detections])
+            keep = self.nms(boxes, scores, self.nms_threshold)
+            detections = [detections[i] for i in keep]
+        
+        return detections
+    
+    def release(self) -> None:
+        """Release ONNX session."""
+        self.session = None
+        logger.info("ONNX detector released")
@@ -0,0 +1,327 @@
+"""
+RKNN detector backend for Rockchip NPU.
+"""
+
+import numpy as np
+import logging
+from typing import List, Tuple, Optional
+
+from .base import BaseDetector, Detection, BBox
+
+logger = logging.getLogger(__name__)
+
+
+class RKNNDetector(BaseDetector):
+    """
+    RKNN-based YOLO detector for Rockchip NPU.
+    
+    Supports: RK3588, RK3568, RK3566, RK3562, RV1106, etc.
+    """
+    
+    def __init__(
+        self,
+        model_path: str,
+        target_platform: str = "rk3588",
+        core_mask: int = 7,
+        input_size: Tuple[int, int] = (640, 640),
+        conf_threshold: float = 0.25,
+        nms_threshold: float = 0.45,
+        class_names: Optional[dict] = None,
+    ):
+        """
+        Initialize RKNN detector.
+        
+        Args:
+            model_path: Path to .rknn model file
+            target_platform: Target Rockchip platform
+            core_mask: NPU core mask (RK3588: 7=all 3 cores)
+            input_size: Model input size
+            conf_threshold: Confidence threshold
+            nms_threshold: NMS threshold
+            class_names: Class ID to name mapping
+        """
+        super().__init__(
+            model_path=model_path,
+            input_size=input_size,
+            conf_threshold=conf_threshold,
+            nms_threshold=nms_threshold,
+            class_names=class_names,
+        )
+        
+        self.target_platform = target_platform
+        self.core_mask = core_mask
+        self.rknn = None
+    
+    def load_model(self) -> bool:
+        """Load RKNN model to NPU."""
+        try:
+            # Try rknnlite2 first (for ARM devices)
+            try:
+                from rknnlite.api import RKNNLite
+                self.rknn = RKNNLite()
+                is_lite = True
+                logger.info("Using RKNNLite2 runtime")
+            except ImportError:
+                # Fall back to rknn-toolkit2 (for x86 simulation)
+                from rknn.api import RKNN
+                self.rknn = RKNN()
+                is_lite = False
+                logger.info("Using RKNN-Toolkit2 runtime")
+            
+            # Load model
+            logger.info(f"Loading RKNN model: {self.model_path}")
+            ret = self.rknn.load_rknn(self.model_path)
+            if ret != 0:
+                logger.error(f"Failed to load RKNN model: {ret}")
+                return False
+            
+            # Initialize runtime
+            if is_lite:
+                ret = self.rknn.init_runtime(core_mask=self.core_mask)
+            else:
+                ret = self.rknn.init_runtime(
+                    target=self.target_platform,
+                    device_id=None,
+                )
+            
+            if ret != 0:
+                logger.error(f"Failed to init RKNN runtime: {ret}")
+                return False
+            
+            logger.info("RKNN model loaded successfully")
+            return True
+            
+        except ImportError as e:
+            logger.error(f"RKNN library not available: {e}")
+            logger.info("Install with: pip install rknnlite2 (ARM) or rknn-toolkit2 (x86)")
+            return False
+        except Exception as e:
+            logger.error(f"Failed to load RKNN model: {e}")
+            return False
+    
+    def detect(self, frame: np.ndarray) -> List[Detection]:
+        """
+        Run detection on frame using NPU.
+        
+        Args:
+            frame: Input image (BGR, HWC)
+            
+        Returns:
+            List of Detection objects
+        """
+        if self.rknn is None:
+            logger.warning("RKNN not initialized")
+            return []
+        
+        orig_h, orig_w = frame.shape[:2]
+        
+        # Preprocess
+        input_data = self._preprocess_rknn(frame)
+        
+        # Run inference
+        outputs = self.rknn.inference(inputs=[input_data])
+        
+        if outputs is None:
+            logger.warning("RKNN inference returned None")
+            return []
+        
+        # Postprocess
+        detections = self._postprocess_yolo(outputs, (orig_h, orig_w))
+        
+        return detections
+    
+    def _preprocess_rknn(self, frame: np.ndarray) -> np.ndarray:
+        """Preprocess frame for RKNN inference."""
+        import cv2
+        
+        input_w, input_h = self.input_size
+        
+        # Resize with letterbox
+        img, ratio, (dw, dh) = self._letterbox(frame, (input_h, input_w))
+        
+        # Store for postprocessing
+        self._ratio = ratio
+        self._pad = (dw, dh)
+        self._orig_shape = frame.shape[:2]
+        
+        # BGR to RGB (RKNN typically expects RGB)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        
+        return img
+    
+    def _letterbox(
+        self,
+        img: np.ndarray,
+        new_shape: Tuple[int, int],
+        color: Tuple[int, int, int] = (114, 114, 114),
+    ) -> Tuple[np.ndarray, float, Tuple[int, int]]:
+        """Resize and pad image while maintaining aspect ratio."""
+        import cv2
+        
+        shape = img.shape[:2]  # [height, width]
+        
+        # Scale ratio
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+        
+        # Compute padding
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+        dw = new_shape[1] - new_unpad[0]
+        dh = new_shape[0] - new_unpad[1]
+        
+        dw /= 2
+        dh /= 2
+        
+        if shape[::-1] != new_unpad:
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+        
+        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+        
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
+        )
+        
+        return img, r, (dw, dh)
+    
+    def _postprocess_yolo(
+        self,
+        outputs: list,
+        original_shape: Tuple[int, int],
+    ) -> List[Detection]:
+        """
+        Postprocess YOLO outputs from RKNN.
+        
+        Handles common YOLO output formats:
+        - YOLOv5/v8/v9 style: [1, num_boxes, 5+num_classes]
+        - Split outputs: boxes, scores, classes separate
+        """
+        detections = []
+        
+        try:
+            # Handle different output formats
+            if len(outputs) == 1:
+                # Single output tensor
+                output = outputs[0]
+                if output.ndim == 3:
+                    output = output[0]  # Remove batch dim
+                
+                # Assume format: [num_boxes, 5+num_classes] or [5+num_classes, num_boxes]
+                if output.shape[0] < output.shape[1]:
+                    output = output.T
+                
+                detections = self._parse_yolo_output(output, original_shape)
+                
+            elif len(outputs) >= 3:
+                # Split outputs (boxes, scores, classes)
+                # This is common for quantized RKNN models
+                detections = self._parse_split_outputs(outputs, original_shape)
+            
+        except Exception as e:
+            logger.error(f"Postprocessing error: {e}")
+        
+        return detections
+    
+    def _parse_yolo_output(
+        self,
+        output: np.ndarray,
+        original_shape: Tuple[int, int],
+    ) -> List[Detection]:
+        """Parse standard YOLO output format."""
+        detections = []
+        orig_h, orig_w = original_shape
+        input_w, input_h = self.input_size
+        
+        ratio = self._ratio
+        dw, dh = self._pad
+        
+        for row in output:
+            # Format: [x, y, w, h, obj_conf, cls1_conf, cls2_conf, ...]
+            # or: [x, y, w, h, cls1_conf, cls2_conf, ...] (obj_conf = max class conf)
+            
+            if len(row) < 5:
+                continue
+            
+            # Check if obj_conf exists
+            if len(row) == 85:  # 4 + 1 + 80 classes (with obj_conf)
+                x, y, w, h, obj_conf = row[:5]
+                class_confs = row[5:]
+                class_id = np.argmax(class_confs)
+                class_conf = class_confs[class_id]
+                confidence = obj_conf * class_conf
+            else:  # No separate obj_conf
+                x, y, w, h = row[:4]
+                class_confs = row[4:]
+                class_id = np.argmax(class_confs)
+                confidence = class_confs[class_id]
+            
+            if confidence < self.conf_threshold:
+                continue
+            
+            # Convert to xyxy
+            x1 = x - w / 2
+            y1 = y - h / 2
+            x2 = x + w / 2
+            y2 = y + h / 2
+            
+            # Remove padding and scale back
+            x1 = (x1 - dw) / ratio
+            y1 = (y1 - dh) / ratio
+            x2 = (x2 - dw) / ratio
+            y2 = (y2 - dh) / ratio
+            
+            # Clip to image bounds
+            x1 = max(0, min(orig_w, x1))
+            y1 = max(0, min(orig_h, y1))
+            x2 = max(0, min(orig_w, x2))
+            y2 = max(0, min(orig_h, y2))
+            
+            class_name = self.class_names.get(int(class_id), str(class_id))
+            
+            detection = Detection(
+                class_id=int(class_id),
+                class_name=class_name,
+                confidence=float(confidence),
+                bbox=BBox(x1=x1, y1=y1, x2=x2, y2=y2),
+            )
+            detections.append(detection)
+        
+        # Apply NMS
+        if detections:
+            detections = self._apply_nms(detections)
+        
+        return detections
+    
+    def _parse_split_outputs(
+        self,
+        outputs: list,
+        original_shape: Tuple[int, int],
+    ) -> List[Detection]:
+        """Parse split output format (common in quantized models)."""
+        # This format varies by model - implement based on specific model output
+        # Common format: [boxes, scores, class_ids, num_dets]
+        
+        detections = []
+        
+        # Placeholder - implement based on actual model output format
+        logger.warning("Split output parsing not fully implemented")
+        
+        return detections
+    
+    def _apply_nms(self, detections: List[Detection]) -> List[Detection]:
+        """Apply NMS to detections."""
+        if not detections:
+            return []
+        
+        boxes = np.array([[d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] for d in detections])
+        scores = np.array([d.confidence for d in detections])
+        
+        keep_indices = self.nms(boxes, scores, self.nms_threshold)
+        
+        return [detections[i] for i in keep_indices]
+    
+    def release(self) -> None:
+        """Release RKNN resources."""
+        if self.rknn is not None:
+            self.rknn.release()
+            self.rknn = None
+            logger.info("RKNN resources released")
@@ -0,0 +1,123 @@
+"""
+Ultralytics YOLO detector backend.
+"""
+
+import numpy as np
+import logging
+from typing import List, Optional
+
+from .base import BaseDetector, Detection, BBox
+
+logger = logging.getLogger(__name__)
+
+
+class YOLODetector(BaseDetector):
+    """
+    Ultralytics YOLO detector.
+
+    Supports YOLOv5, YOLOv8, YOLOv9, etc.
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        conf_threshold: float = 0.25,
+        nms_threshold: float = 0.45,
+        device: str = "cpu",
+        class_names: Optional[dict] = None,
+    ):
+        """
+        Initialize YOLO detector.
+
+        Args:
+            model_path: Path to .pt model file
+            conf_threshold: Confidence threshold
+            nms_threshold: NMS IoU threshold
+            device: Device to run on ('cpu', 'cuda', '0', etc.)
+            class_names: Class ID to name mapping
+        """
+        super().__init__(
+            model_path=model_path,
+            conf_threshold=conf_threshold,
+            nms_threshold=nms_threshold,
+            class_names=class_names,
+        )
+        self.device = device
+
+    def load_model(self) -> bool:
+        """
+        Load YOLO model.
+
+        Returns:
+            True if the model was loaded and moved to ``self.device``,
+            False if ultralytics is missing or loading raised
+        """
+        try:
+            # Imported lazily so the module can be imported without ultralytics
+            from ultralytics import YOLO
+
+            logger.info(f"Loading YOLO model: {self.model_path}")
+            self.model = YOLO(self.model_path)
+            self.model.to(self.device)
+
+            # Prefer the names embedded in the model, but keep the
+            # caller-supplied mapping when the model provides none, so
+            # detect() never ends up with an unusable class_names.
+            model_names = getattr(self.model, 'names', None)
+            if model_names:
+                if isinstance(model_names, dict):
+                    self.class_names = model_names
+                else:
+                    # A plain sequence of names: index them by class id
+                    self.class_names = dict(enumerate(model_names))
+
+            logger.info(f"YOLO model loaded on {self.device}")
+            return True
+
+        except ImportError:
+            logger.error("ultralytics package not found. Install with: pip install ultralytics")
+            return False
+        except Exception as e:
+            logger.error(f"Failed to load YOLO model: {e}")
+            return False
+
+    def detect(self, frame: np.ndarray) -> List[Detection]:
+        """
+        Run detection on frame.
+
+        Args:
+            frame: Input image (BGR, HWC)
+
+        Returns:
+            List of Detection objects; empty if the model is not loaded
+            or inference raised (the error is logged)
+        """
+        if self.model is None:
+            logger.warning("Model not loaded")
+            return []
+
+        # Tolerate a missing mapping; unknown ids fall back to str(class_id)
+        names = self.class_names or {}
+
+        try:
+            # Run inference
+            results = self.model.predict(
+                frame,
+                conf=self.conf_threshold,
+                iou=self.nms_threshold,
+                verbose=False,
+            )
+
+            detections = []
+
+            for result in results:
+                boxes = result.boxes
+                if boxes is None:
+                    continue
+
+                # Convert each attribute once per result rather than
+                # indexing the Boxes container and its tensors per detection
+                per_box = zip(
+                    boxes.xyxy.tolist(),
+                    boxes.conf.tolist(),
+                    boxes.cls.tolist(),
+                )
+                for (x1, y1, x2, y2), confidence, cls_value in per_box:
+                    class_id = int(cls_value)
+                    detections.append(
+                        Detection(
+                            class_id=class_id,
+                            class_name=names.get(class_id, str(class_id)),
+                            confidence=float(confidence),
+                            bbox=BBox(x1=x1, y1=y1, x2=x2, y2=y2),
+                        )
+                    )
+
+            return detections
+
+        except Exception as e:
+            # Best-effort boundary: a failed frame yields no detections
+            logger.error(f"Detection error: {e}")
+            return []
+
+    def release(self) -> None:
+        """Release resources."""
+        # Drop the reference to the loaded model
+        self.model = None
+        logger.info("YOLO detector released")