add sam2 yolo auto annotation

This commit is contained in:
2026-02-04 15:29:36 +07:00
parent 7e56948ece
commit 5a951d8812
2061 changed files with 316473 additions and 0 deletions
+118
View File
@@ -0,0 +1,118 @@
"""
Object detection backends.
"""
from typing import Dict, Any, Optional
import logging
logger = logging.getLogger(__name__)
def create_detector(config: Dict[str, Any]) -> Optional[Any]:
    """
    Create a detector backend from a configuration dict.

    Backends are tried in this order:

    1. RKNN, only when ``type`` is ``'rknn'``.
    2. ONNX, when ``type`` is ``'onnx'``, or as the RKNN fallback when
       ``fallback.enabled`` is true (default) and ``fallback.type`` is ``'onnx'``.
    3. Ultralytics YOLO, as the default and last resort.

    Args:
        config: Detector configuration dict. Keys read here: ``type``,
            ``model_path``, ``input_size``, ``conf_threshold``,
            ``nms_threshold``, ``rknn``, ``onnx`` and ``fallback``.

    Returns:
        A detector whose ``load_model()`` succeeded, or ``None`` when every
        attempted backend failed.
    """
    # ``or`` guards against keys that are present but explicitly set to None.
    detector_type = str(config.get('type') or 'yolo').lower()
    model_path = str(config.get('model_path') or 'models/yolov9t.pt')
    fallback_config = config.get('fallback') or {}
    logger.info(f"Creating detector: type={detector_type}, model={model_path}")

    if detector_type == 'rknn':
        detector = _create_rknn_detector(config, model_path)
        if detector is not None:
            return detector

    use_onnx = detector_type == 'onnx' or (
        detector_type == 'rknn'
        and fallback_config.get('enabled', True)
        and fallback_config.get('type') == 'onnx'
    )
    if use_onnx:
        detector = _create_onnx_detector(config, model_path)
        if detector is not None:
            return detector

    # NOTE(review): YOLO is attempted even when fallback.enabled is false;
    # confirm whether that is the intended behavior.
    return _create_yolo_detector(config, model_path)


def _swap_model_suffix(model_path: str, old_suffixes, new_suffix: str) -> str:
    """Return model_path with a trailing old suffix replaced by new_suffix."""
    # Only the end of the path is rewritten; str.replace() would also alter
    # directory names that happen to contain the extension text.
    for old_suffix in old_suffixes:
        if model_path.endswith(old_suffix):
            return model_path[:-len(old_suffix)] + new_suffix
    return model_path


def _create_rknn_detector(config: Dict[str, Any], model_path: str) -> Optional[Any]:
    """Build and load the RKNN backend; return None when it is unusable."""
    try:
        from .rknn_detector import RKNNDetector

        rknn_config = config.get('rknn') or {}
        detector = RKNNDetector(
            model_path=model_path,
            target_platform=rknn_config.get('target_platform', 'rk3588'),
            core_mask=rknn_config.get('core_mask', 7),
            input_size=tuple(config.get('input_size', [640, 640])),
            conf_threshold=config.get('conf_threshold', 0.25),
            nms_threshold=config.get('nms_threshold', 0.45),
        )
        if detector.load_model():
            logger.info("RKNN detector initialized successfully")
            return detector
        logger.warning("RKNN detector failed to load, trying fallback")
    except ImportError as e:
        logger.warning(f"RKNN not available: {e}")
    except Exception as e:
        logger.warning(f"RKNN initialization failed: {e}")
    return None


def _create_onnx_detector(config: Dict[str, Any], model_path: str) -> Optional[Any]:
    """Build and load the ONNX backend; return None when it is unusable."""
    fallback_config = config.get('fallback') or {}
    onnx_config = config.get('onnx') or {}
    try:
        from .onnx_detector import ONNXDetector

        # The ONNX export is expected next to the configured model file.
        onnx_model_path = _swap_model_suffix(model_path, ('.rknn', '.pt'), '.onnx')
        # The ONNX-specific device wins over the fallback device.
        device = onnx_config.get('device') or fallback_config.get('device', 'cpu')
        detector = ONNXDetector(
            model_path=onnx_model_path,
            input_size=tuple(config.get('input_size', [640, 640])),
            conf_threshold=config.get('conf_threshold', 0.25),
            nms_threshold=config.get('nms_threshold', 0.45),
            device=device,
            num_threads=onnx_config.get('num_threads', 0),
            optimization_level=onnx_config.get('optimization_level', 'all'),
        )
        if detector.load_model():
            logger.info("ONNX detector initialized successfully")
            return detector
        logger.warning("ONNX detector failed to load, trying fallback")
    except ImportError as e:
        logger.warning(f"ONNX runtime not available: {e}")
        logger.info("Install with: pip install onnxruntime")
    except Exception as e:
        logger.warning(f"ONNX initialization failed: {e}")
    return None


def _create_yolo_detector(config: Dict[str, Any], model_path: str) -> Optional[Any]:
    """Build and load the Ultralytics YOLO backend; return None on failure."""
    try:
        from .yolo_detector import YOLODetector

        # The YOLO device is read from the fallback section of the config.
        fallback_config = config.get('fallback') or {}
        device = fallback_config.get('device', 'cpu')
        yolo_model_path = _swap_model_suffix(model_path, ('.rknn', '.onnx'), '.pt')
        detector = YOLODetector(
            model_path=yolo_model_path,
            conf_threshold=config.get('conf_threshold', 0.25),
            nms_threshold=config.get('nms_threshold', 0.45),
            device=device,
        )
        if detector.load_model():
            logger.info(f"YOLO detector initialized on {device}")
            return detector
        logger.error("Failed to initialize any detector: YOLO model failed to load")
    except Exception as e:
        logger.error(f"Failed to initialize any detector: {e}")
    return None
+230
View File
@@ -0,0 +1,230 @@
"""
Base detector interface.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Optional
from dataclasses import dataclass
import numpy as np
@dataclass
class BBox:
    """Rectangle stored as the two corner points (x1, y1) and (x2, y2)."""

    x1: float
    y1: float
    x2: float
    y2: float

    @property
    def width(self) -> float:
        """Horizontal extent, ``x2 - x1``."""
        return self.x2 - self.x1

    @property
    def height(self) -> float:
        """Vertical extent, ``y2 - y1``."""
        return self.y2 - self.y1

    def area(self) -> float:
        """Return the box area (width times height) in pixels."""
        return self.width * self.height

    def to_yolo(self, img_width: int, img_height: int) -> Tuple[float, float, float, float]:
        """
        Express the box in YOLO format.

        Args:
            img_width: Image width used to normalize x values
            img_height: Image height used to normalize y values
        Returns:
            Normalized ``(x_center, y_center, width, height)``
        """
        center_x = (self.x1 + self.x2) / 2
        center_y = (self.y1 + self.y2) / 2
        return (
            center_x / img_width,
            center_y / img_height,
            self.width / img_width,
            self.height / img_height,
        )
@dataclass
class Detection:
    """
    One detected object.

    Attributes:
        class_id: Integer class index
        class_name: Class label
        confidence: Detection confidence score
        bbox: Bounding box of the object
        track_id: Optional track identifier, ``None`` by default
    """

    class_id: int
    class_name: str
    confidence: float
    bbox: BBox
    track_id: Optional[int] = None
class BaseDetector(ABC):
    """
    Abstract base class for object detectors.

    Subclasses implement ``load_model``, ``detect`` and ``release``. The base
    class stores the common settings and provides a plain resize
    ``preprocess``, a placeholder ``postprocess`` and a NumPy ``nms``.
    """

    # COCO class names
    COCO_CLASSES = {
        0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
        5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
        10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
        14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
        20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
        25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
        30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite',
        34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard',
        38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork',
        43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple',
        48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog',
        53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch',
        58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv',
        63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
        68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator',
        73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
        78: 'hair drier', 79: 'toothbrush'
    }

    def __init__(
        self,
        model_path: str,
        input_size: Tuple[int, int] = (640, 640),
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        class_names: Optional[dict] = None,
    ):
        """
        Initialize detector.

        Args:
            model_path: Path to model file
            input_size: Model input size (width, height)
            conf_threshold: Confidence threshold
            nms_threshold: NMS IoU threshold
            class_names: Class ID to name mapping; a missing or empty
                mapping falls back to ``COCO_CLASSES``
        """
        self.model_path = model_path
        self.input_size = input_size
        self.conf_threshold = conf_threshold
        self.nms_threshold = nms_threshold
        self.class_names = class_names or self.COCO_CLASSES
        self.model = None

    @abstractmethod
    def load_model(self) -> bool:
        """Load model. Returns True on success."""
        pass

    @abstractmethod
    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame.

        Args:
            frame: Input image (BGR, HWC)
        Returns:
            List of Detection objects
        """
        pass

    @abstractmethod
    def release(self) -> None:
        """Release resources."""
        pass

    def preprocess(self, frame: np.ndarray) -> np.ndarray:
        """
        Preprocess frame for inference.

        The frame is resized directly to ``input_size`` (no letterboxing),
        converted from BGR to RGB, scaled to [0, 1] as float32 and laid out
        as a single-item NCHW batch.

        Args:
            frame: Input frame (BGR, HWC)
        Returns:
            Preprocessed input tensor
        """
        import cv2

        input_width, input_height = self.input_size
        resized = cv2.resize(frame, (input_width, input_height))
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        normalized = rgb.astype(np.float32) / 255.0
        # HWC -> CHW, then add the batch dimension.
        return np.expand_dims(normalized.transpose(2, 0, 1), axis=0)

    def postprocess(
        self,
        outputs: np.ndarray,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Postprocess model outputs.

        This base implementation is a placeholder that always returns an
        empty list; the output layout varies by model, so subclasses are
        expected to override it.

        Args:
            outputs: Raw model outputs
            original_shape: Original frame shape (height, width)
        Returns:
            List of Detection objects (always empty here)
        """
        return []

    def nms(
        self,
        boxes: np.ndarray,
        scores: np.ndarray,
        iou_threshold: float = 0.45,
    ) -> List[int]:
        """
        Non-maximum suppression.

        Boxes are visited from highest to lowest score; a box is dropped when
        its IoU with an already kept box exceeds ``iou_threshold``. Classes
        are not considered.

        Args:
            boxes: Array of boxes [N, 4] in xyxy format
            scores: Array of scores [N]
            iou_threshold: IoU threshold
        Returns:
            List of indices to keep (plain Python ints), best score first
        """
        # Accept plain sequences as well as arrays.
        boxes = np.asarray(boxes)
        scores = np.asarray(scores)
        if boxes.size == 0:
            return []

        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]
        areas = (x2 - x1) * (y2 - y1)

        order = scores.argsort()[::-1]
        keep: List[int] = []
        while order.size > 0:
            best = order[0]
            keep.append(int(best))
            rest = order[1:]
            if rest.size == 0:
                break

            inter_w = np.maximum(0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
            inter_h = np.maximum(0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
            inter = inter_w * inter_h
            union = areas[best] + areas[rest] - inter

            # Degenerate (zero-area) pairs have a zero union; treat their IoU
            # as 0 instead of dividing by zero and producing NaN.
            iou = np.zeros(union.shape, dtype=np.float64)
            np.divide(inter, union, out=iou, where=union > 0)

            order = rest[iou <= iou_threshold]
        return keep
@@ -0,0 +1,283 @@
"""
ONNX Runtime detector backend.
"""
import numpy as np
import logging
from typing import List, Tuple, Optional
from .base import BaseDetector, Detection, BBox
logger = logging.getLogger(__name__)
class ONNXDetector(BaseDetector):
    """
    ONNX Runtime-based YOLO detector.

    Supports CPU and CUDA execution providers.
    This is the recommended backend for CPU-only inference.

    Features:
    - Cross-platform (Linux, Windows, macOS, ARM)
    - No special hardware required
    - Optimized CPU inference with threading
    - Optional CUDA support
    """

    def __init__(
        self,
        model_path: str,
        input_size: Tuple[int, int] = (640, 640),
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        device: str = "cpu",
        num_threads: int = 0,
        optimization_level: str = "all",
        class_names: Optional[dict] = None,
    ):
        """
        Initialize ONNX detector.

        Args:
            model_path: Path to .onnx model file
            input_size: Model input size (width, height)
            conf_threshold: Confidence threshold
            nms_threshold: NMS IoU threshold
            device: Device ('cpu' or 'cuda')
            num_threads: CPU threads (0 = leave the runtime default)
            optimization_level: Graph optimization ('none', 'basic', 'extended', 'all')
            class_names: Class ID to name mapping
        """
        super().__init__(
            model_path=model_path,
            input_size=input_size,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            class_names=class_names,
        )
        self.device = device
        self.num_threads = num_threads
        self.optimization_level = optimization_level
        self.session = None
        self.input_name = None
        self.output_names = None

    def load_model(self) -> bool:
        """
        Load the ONNX model and create the inference session.

        When the model declares a static 4-D input, ``input_size`` is
        replaced by the model's own (width, height). Dynamic dimensions are
        ignored and the configured ``input_size`` is kept.

        Returns:
            True on success, False otherwise (the error is logged).
        """
        try:
            import onnxruntime as ort
        except ImportError:
            logger.error("onnxruntime not found. Install with: pip install onnxruntime")
            return False

        try:
            # Select execution providers
            if self.device == "cuda":
                providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
            else:
                providers = ['CPUExecutionProvider']

            logger.info(f"Loading ONNX model: {self.model_path}")
            logger.info(f"  Device: {self.device}")
            logger.info(f"  Threads: {self.num_threads if self.num_threads > 0 else 'auto'}")

            sess_options = ort.SessionOptions()

            # Unknown level names fall back to full optimization.
            opt_levels = {
                'none': ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
                'basic': ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
                'extended': ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
                'all': ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
            }
            sess_options.graph_optimization_level = opt_levels.get(
                self.optimization_level,
                ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
            )

            if self.num_threads > 0:
                sess_options.intra_op_num_threads = self.num_threads
                sess_options.inter_op_num_threads = self.num_threads

            sess_options.enable_mem_pattern = True
            sess_options.enable_cpu_mem_arena = True

            session = ort.InferenceSession(
                self.model_path,
                sess_options=sess_options,
                providers=providers,
            )

            model_input = session.get_inputs()[0]
            input_name = model_input.name
            output_names = [o.name for o in session.get_outputs()]

            input_shape = model_input.shape
            if len(input_shape) == 4:
                height, width = input_shape[2], input_shape[3]
                # Dynamic axes are reported as strings or None; only a fixed
                # positive size may override the configured input size.
                if (
                    isinstance(height, int) and isinstance(width, int)
                    and height > 0 and width > 0
                ):
                    self.input_size = (width, height)
                else:
                    logger.info(
                        f"  Dynamic input shape {input_shape}; "
                        f"using configured size {self.input_size}"
                    )

            # Publish the session only once everything above succeeded.
            self.session = session
            self.input_name = input_name
            self.output_names = output_names

            actual_provider = session.get_providers()[0]
            logger.info("ONNX model loaded successfully")
            logger.info(f"  Provider: {actual_provider}")
            logger.info(f"  Input size: {self.input_size}")
            return True
        except Exception as e:
            logger.error(f"Failed to load ONNX model: {e}")
            self.session = None
            return False

    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame.

        Args:
            frame: Input image (BGR, HWC)
        Returns:
            List of Detection objects; empty when the session is not
            initialized or inference fails (the error is logged).
        """
        if self.session is None:
            logger.warning("ONNX session not initialized")
            return []
        try:
            orig_h, orig_w = frame.shape[:2]
            input_tensor, ratio, pad = self._preprocess(frame)
            outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
            return self._postprocess(outputs, (orig_h, orig_w), ratio, pad)
        except Exception as e:
            logger.error(f"ONNX inference error: {e}")
            return []

    def _preprocess(self, frame: np.ndarray) -> Tuple[np.ndarray, float, Tuple[float, float]]:
        """
        Letterbox the frame to ``input_size`` and build the input tensor.

        Args:
            frame: Input image (BGR, HWC)
        Returns:
            ``(tensor, ratio, (pad_left, pad_top))`` where ``tensor`` is a
            float32 NCHW batch scaled to [0, 1], ``ratio`` is the resize
            scale and the pad values are the border widths actually added
            on the left and top.
        """
        import cv2

        input_w, input_h = self.input_size
        orig_h, orig_w = frame.shape[:2]

        ratio = min(input_w / orig_w, input_h / orig_h)
        # Round instead of truncating, and keep each side within [1, input].
        new_w = min(input_w, max(1, int(round(orig_w * ratio))))
        new_h = min(input_h, max(1, int(round(orig_h * ratio))))

        resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # An odd leftover is split so the extra pixel goes to bottom/right.
        pad_w = (input_w - new_w) / 2
        pad_h = (input_h - new_h) / 2
        top = int(round(pad_h - 0.1))
        bottom = int(round(pad_h + 0.1))
        left = int(round(pad_w - 0.1))
        right = int(round(pad_w + 0.1))
        padded = cv2.copyMakeBorder(
            resized, top, bottom, left, right,
            cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
        normalized = rgb.astype(np.float32) / 255.0
        # HWC -> NCHW, stored contiguously.
        batched = np.ascontiguousarray(np.expand_dims(normalized.transpose(2, 0, 1), axis=0))

        # Report the applied integer offsets so boxes map back exactly.
        return batched, ratio, (float(left), float(top))

    def _postprocess(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
        ratio: float,
        pad: Tuple[float, float],
    ) -> List[Detection]:
        """
        Convert raw ONNX outputs into detections.

        Only ``outputs[0]`` is used. Rows with 85 columns are read as
        ``x, y, w, h, obj_conf, class scores``; any other width is read as
        ``x, y, w, h, class scores``. Boxes are mapped back to the original
        frame, clipped to it and filtered with class-agnostic NMS.

        Args:
            outputs: Raw session outputs
            original_shape: Original frame shape (height, width)
            ratio: Resize scale used during preprocessing
            pad: (left, top) padding used during preprocessing
        Returns:
            List of Detection objects, best score first
        """
        orig_h, orig_w = original_shape
        pad_w, pad_h = pad

        preds = np.asarray(outputs[0])
        if preds.ndim == 3:
            preds = preds[0]
        if preds.ndim != 2 or preds.size == 0:
            return []
        # Transpose if needed (num_classes+4 x num_boxes -> num_boxes x num_classes+4)
        if preds.shape[0] < preds.shape[1]:
            preds = preds.T
        if preds.shape[1] < 5:
            return []

        rows = np.arange(preds.shape[0])
        if preds.shape[1] == 85:  # YOLOv5 format with obj_conf
            class_scores = preds[:, 5:]
            class_ids = class_scores.argmax(axis=1)
            confidences = preds[:, 4] * class_scores[rows, class_ids]
        else:  # YOLOv8/v9 format without obj_conf
            class_scores = preds[:, 4:]
            class_ids = class_scores.argmax(axis=1)
            confidences = class_scores[rows, class_ids]

        selected = confidences >= self.conf_threshold
        if not selected.any():
            return []

        xywh = preds[selected, :4].astype(np.float64)
        class_ids = class_ids[selected]
        confidences = confidences[selected].astype(np.float64)

        # Center/size -> corners, undo the letterbox, clip to the frame.
        half_w = xywh[:, 2] / 2
        half_h = xywh[:, 3] / 2
        boxes = np.stack(
            [
                np.clip((xywh[:, 0] - half_w - pad_w) / ratio, 0, orig_w),
                np.clip((xywh[:, 1] - half_h - pad_h) / ratio, 0, orig_h),
                np.clip((xywh[:, 0] + half_w - pad_w) / ratio, 0, orig_w),
                np.clip((xywh[:, 1] + half_h - pad_h) / ratio, 0, orig_h),
            ],
            axis=1,
        )

        detections = []
        for i in self.nms(boxes, confidences, self.nms_threshold):
            class_id = int(class_ids[i])
            detections.append(
                Detection(
                    class_id=class_id,
                    class_name=self.class_names.get(class_id, str(class_id)),
                    confidence=float(confidences[i]),
                    bbox=BBox(
                        x1=float(boxes[i, 0]),
                        y1=float(boxes[i, 1]),
                        x2=float(boxes[i, 2]),
                        y2=float(boxes[i, 3]),
                    ),
                )
            )
        return detections

    def release(self) -> None:
        """Release ONNX session."""
        self.session = None
        logger.info("ONNX detector released")
@@ -0,0 +1,327 @@
"""
RKNN detector backend for Rockchip NPU.
"""
import numpy as np
import logging
from typing import List, Tuple, Optional
from .base import BaseDetector, Detection, BBox
logger = logging.getLogger(__name__)
class RKNNDetector(BaseDetector):
    """
    RKNN-based YOLO detector for Rockchip NPU.

    Supports: RK3588, RK3568, RK3566, RK3562, RV1106, etc.
    """

    def __init__(
        self,
        model_path: str,
        target_platform: str = "rk3588",
        core_mask: int = 7,
        input_size: Tuple[int, int] = (640, 640),
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        class_names: Optional[dict] = None,
    ):
        """
        Initialize RKNN detector.

        Args:
            model_path: Path to .rknn model file
            target_platform: Target Rockchip platform
            core_mask: NPU core mask (RK3588: 7=all 3 cores)
            input_size: Model input size
            conf_threshold: Confidence threshold
            nms_threshold: NMS threshold
            class_names: Class ID to name mapping
        """
        super().__init__(
            model_path=model_path,
            input_size=input_size,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            class_names=class_names,
        )
        self.target_platform = target_platform
        self.core_mask = core_mask
        self.rknn = None
        # Letterbox parameters of the most recent frame; set by
        # _preprocess_rknn and read back during postprocessing.
        self._ratio = 1.0
        self._pad = (0, 0)
        self._orig_shape = None

    def load_model(self) -> bool:
        """
        Load the RKNN model and initialize the runtime.

        ``rknnlite`` is tried first and ``rknn`` is used when it cannot be
        imported. On any failure the partially created runtime is released
        and ``self.rknn`` is reset to ``None`` so ``detect`` will not use it.

        Returns:
            True on success, False otherwise (the error is logged).
        """
        try:
            # Try rknnlite2 first (for ARM devices)
            try:
                from rknnlite.api import RKNNLite
                self.rknn = RKNNLite()
                is_lite = True
                logger.info("Using RKNNLite2 runtime")
            except ImportError:
                # Fall back to rknn-toolkit2 (for x86 simulation)
                from rknn.api import RKNN
                self.rknn = RKNN()
                is_lite = False
                logger.info("Using RKNN-Toolkit2 runtime")

            logger.info(f"Loading RKNN model: {self.model_path}")
            ret = self.rknn.load_rknn(self.model_path)
            if ret != 0:
                logger.error(f"Failed to load RKNN model: {ret}")
                self._discard_runtime()
                return False

            if is_lite:
                ret = self.rknn.init_runtime(core_mask=self.core_mask)
            else:
                ret = self.rknn.init_runtime(
                    target=self.target_platform,
                    device_id=None,
                )
            if ret != 0:
                logger.error(f"Failed to init RKNN runtime: {ret}")
                self._discard_runtime()
                return False

            logger.info("RKNN model loaded successfully")
            return True
        except ImportError as e:
            logger.error(f"RKNN library not available: {e}")
            logger.info("Install with: pip install rknnlite2 (ARM) or rknn-toolkit2 (x86)")
            self._discard_runtime()
            return False
        except Exception as e:
            logger.error(f"Failed to load RKNN model: {e}")
            self._discard_runtime()
            return False

    def _discard_runtime(self) -> None:
        """Release a partially initialized runtime and reset ``self.rknn``."""
        runtime, self.rknn = self.rknn, None
        if runtime is None:
            return
        try:
            runtime.release()
        except Exception as e:
            # Best effort: the load failure itself has already been reported.
            logger.debug(f"Ignoring error while releasing RKNN runtime: {e}")

    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame using NPU.

        Args:
            frame: Input image (BGR, HWC)
        Returns:
            List of Detection objects; empty when the runtime is not
            initialized or inference returns ``None``.
        """
        if self.rknn is None:
            logger.warning("RKNN not initialized")
            return []

        orig_h, orig_w = frame.shape[:2]
        input_data = self._preprocess_rknn(frame)

        # NOTE(review): the image is passed as HWC without a batch dimension;
        # confirm that the installed RKNN runtime accepts 3-D inputs.
        outputs = self.rknn.inference(inputs=[input_data])
        if outputs is None:
            logger.warning("RKNN inference returned None")
            return []

        return self._postprocess_yolo(outputs, (orig_h, orig_w))

    def _preprocess_rknn(self, frame: np.ndarray) -> np.ndarray:
        """
        Letterbox the frame to the model input size and convert it to RGB.

        The scale, padding and original shape are stored on the instance
        for use by the postprocessing step.
        """
        import cv2

        input_w, input_h = self.input_size
        img, ratio, (dw, dh) = self._letterbox(frame, (input_h, input_w))

        # Store for postprocessing
        self._ratio = ratio
        self._pad = (dw, dh)
        self._orig_shape = frame.shape[:2]

        # BGR to RGB (RKNN typically expects RGB)
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def _letterbox(
        self,
        img: np.ndarray,
        new_shape: Tuple[int, int],
        color: Tuple[int, int, int] = (114, 114, 114),
    ) -> Tuple[np.ndarray, float, Tuple[int, int]]:
        """
        Resize and pad image while maintaining aspect ratio.

        Args:
            img: Input image (HWC)
            new_shape: Target shape (height, width)
            color: Border color
        Returns:
            ``(image, ratio, (pad_left, pad_top))`` where the pad values are
            the integer border widths actually added on the left and top.
        """
        import cv2

        src_h, src_w = img.shape[:2]
        dst_h, dst_w = new_shape

        r = min(dst_h / src_h, dst_w / src_w)
        # Keep each resized side within [1, target].
        new_w = min(dst_w, max(1, int(round(src_w * r))))
        new_h = min(dst_h, max(1, int(round(src_h * r))))

        dw = (dst_w - new_w) / 2
        dh = (dst_h - new_h) / 2

        if (src_w, src_h) != (new_w, new_h):
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # An odd leftover is split so the extra pixel goes to bottom/right.
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
        )
        # Report the applied offsets so boxes map back exactly.
        return img, r, (left, top)

    def _postprocess_yolo(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Postprocess YOLO outputs from RKNN.

        Handles common YOLO output formats:
        - Single tensor: [1, num_boxes, 5+num_classes] or its transpose
        - Split outputs (three or more tensors): not implemented yet

        Errors are logged and result in an empty list.
        """
        detections = []
        try:
            if len(outputs) == 1:
                output = np.asarray(outputs[0])
                if output.ndim == 3:
                    output = output[0]  # Remove batch dim
                # Assume format: [num_boxes, 5+num_classes] or [5+num_classes, num_boxes]
                if output.ndim == 2 and output.shape[0] < output.shape[1]:
                    output = output.T
                detections = self._parse_yolo_output(output, original_shape)
            elif len(outputs) >= 3:
                # Split outputs (boxes, scores, classes)
                detections = self._parse_split_outputs(outputs, original_shape)
            else:
                logger.warning(f"Unsupported number of RKNN outputs: {len(outputs)}")
        except Exception as e:
            logger.error(f"Postprocessing error: {e}")
        return detections

    def _parse_yolo_output(
        self,
        output: np.ndarray,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Parse a [num_boxes, columns] YOLO output.

        Rows with 85 columns are read as ``x, y, w, h, obj_conf, class
        scores``; any other width is read as ``x, y, w, h, class scores``.
        Boxes are mapped back with the stored letterbox parameters, clipped
        to the frame and filtered with class-agnostic NMS.
        """
        orig_h, orig_w = original_shape
        ratio = self._ratio
        dw, dh = self._pad

        output = np.asarray(output)
        if output.ndim != 2 or output.size == 0 or output.shape[1] < 5:
            return []

        rows = np.arange(output.shape[0])
        if output.shape[1] == 85:  # 4 + 1 + 80 classes (with obj_conf)
            class_scores = output[:, 5:]
            class_ids = class_scores.argmax(axis=1)
            confidences = output[:, 4] * class_scores[rows, class_ids]
        else:  # No separate obj_conf
            class_scores = output[:, 4:]
            class_ids = class_scores.argmax(axis=1)
            confidences = class_scores[rows, class_ids]

        selected = confidences >= self.conf_threshold
        if not selected.any():
            return []

        xywh = output[selected, :4].astype(np.float64)
        class_ids = class_ids[selected]
        confidences = confidences[selected].astype(np.float64)

        # Center/size -> corners, remove padding, scale back, clip.
        half_w = xywh[:, 2] / 2
        half_h = xywh[:, 3] / 2
        boxes = np.stack(
            [
                np.clip((xywh[:, 0] - half_w - dw) / ratio, 0, orig_w),
                np.clip((xywh[:, 1] - half_h - dh) / ratio, 0, orig_h),
                np.clip((xywh[:, 0] + half_w - dw) / ratio, 0, orig_w),
                np.clip((xywh[:, 1] + half_h - dh) / ratio, 0, orig_h),
            ],
            axis=1,
        )

        detections = []
        for i in self.nms(boxes, confidences, self.nms_threshold):
            class_id = int(class_ids[i])
            detections.append(
                Detection(
                    class_id=class_id,
                    class_name=self.class_names.get(class_id, str(class_id)),
                    confidence=float(confidences[i]),
                    bbox=BBox(
                        x1=float(boxes[i, 0]),
                        y1=float(boxes[i, 1]),
                        x2=float(boxes[i, 2]),
                        y2=float(boxes[i, 3]),
                    ),
                )
            )
        return detections

    def _parse_split_outputs(
        self,
        outputs: list,
        original_shape: Tuple[int, int],
    ) -> List[Detection]:
        """
        Parse split output format (placeholder).

        Not implemented: logs a warning and returns an empty list.
        """
        logger.warning("Split output parsing not fully implemented")
        return []

    def _apply_nms(self, detections: List[Detection]) -> List[Detection]:
        """Apply class-agnostic NMS to a list of detections."""
        if not detections:
            return []
        boxes = np.array([[d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] for d in detections])
        scores = np.array([d.confidence for d in detections])
        keep_indices = self.nms(boxes, scores, self.nms_threshold)
        return [detections[i] for i in keep_indices]

    def release(self) -> None:
        """Release RKNN resources."""
        if self.rknn is not None:
            try:
                self.rknn.release()
            finally:
                # Drop the handle even if the runtime raised while releasing.
                self.rknn = None
            logger.info("RKNN resources released")
@@ -0,0 +1,123 @@
"""
Ultralytics YOLO detector backend.
"""
import numpy as np
import logging
from typing import List, Optional
from .base import BaseDetector, Detection, BBox
logger = logging.getLogger(__name__)
class YOLODetector(BaseDetector):
    """
    Ultralytics YOLO detector.

    Supports YOLOv5, YOLOv8, YOLOv9, etc.
    """

    def __init__(
        self,
        model_path: str,
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        device: str = "cpu",
        class_names: Optional[dict] = None,
    ):
        """
        Initialize YOLO detector.

        Args:
            model_path: Path to .pt model file
            conf_threshold: Confidence threshold
            nms_threshold: NMS IoU threshold
            device: Device to run on ('cpu', 'cuda', '0', etc.)
            class_names: Class ID to name mapping
        """
        super().__init__(
            model_path=model_path,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            class_names=class_names,
        )
        self.device = device

    def load_model(self) -> bool:
        """
        Load the YOLO model and move it to the configured device.

        When the model exposes a non-empty ``names`` attribute it replaces
        ``class_names``; a sequence of names is converted to an
        index-to-name dict. ``self.model`` is only set once loading fully
        succeeded.

        Returns:
            True on success, False otherwise (the error is logged).
        """
        try:
            from ultralytics import YOLO

            logger.info(f"Loading YOLO model: {self.model_path}")
            model = YOLO(self.model_path)
            model.to(self.device)

            # detect() looks names up with .get(), so always store a dict.
            names = getattr(model, 'names', None)
            if names:
                if isinstance(names, dict):
                    self.class_names = dict(names)
                else:
                    self.class_names = dict(enumerate(names))

            self.model = model
            logger.info(f"YOLO model loaded on {self.device}")
            return True
        except ImportError:
            logger.error("ultralytics package not found. Install with: pip install ultralytics")
            self.model = None
            return False
        except Exception as e:
            logger.error(f"Failed to load YOLO model: {e}")
            self.model = None
            return False

    def detect(self, frame: np.ndarray) -> List[Detection]:
        """
        Run detection on frame.

        Args:
            frame: Input image (BGR, HWC)
        Returns:
            List of Detection objects; empty when the model is not loaded
            or inference fails (the error is logged).
        """
        if self.model is None:
            logger.warning("Model not loaded")
            return []
        try:
            results = self.model.predict(
                frame,
                conf=self.conf_threshold,
                iou=self.nms_threshold,
                verbose=False,
            )
            detections = []
            for result in results:
                boxes = result.boxes
                if boxes is None:
                    continue
                # Convert each field once per result instead of per box.
                class_ids = boxes.cls.tolist()
                confidences = boxes.conf.tolist()
                corners = boxes.xyxy.tolist()
                for raw_id, confidence, (x1, y1, x2, y2) in zip(class_ids, confidences, corners):
                    class_id = int(raw_id)
                    detections.append(
                        Detection(
                            class_id=class_id,
                            class_name=self.class_names.get(class_id, str(class_id)),
                            confidence=float(confidence),
                            bbox=BBox(x1=x1, y1=y1, x2=x2, y2=y2),
                        )
                    )
            return detections
        except Exception as e:
            logger.error(f"Detection error: {e}")
            return []

    def release(self) -> None:
        """Release resources."""
        self.model = None
        logger.info("YOLO detector released")