Files
dataset-yolo-script/sam2-cpu/notebooks/02_create_yolo_dataset.ipynb
T
2026-02-04 15:29:36 +07:00

727 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create YOLO Dataset from SAM2 Annotations\n",
"\n",
"Convert SAM2 mask annotations to YOLO detection format (bounding boxes).\n",
"\n",
"## Input\n",
"- Frames from `01_sam2_video_annotation.ipynb`\n",
"- Annotations JSON file\n",
"\n",
"## Output\n",
"- YOLO format dataset ready for training\n",
"- `data.yaml` configuration file\n",
"\n",
"**Platform:** Kaggle"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import cv2\n",
"import json\n",
"import yaml\n",
"import shutil\n",
"import random\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from tqdm.notebook import tqdm\n",
"from collections import defaultdict\n",
"\n",
"print(\"Setup complete!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Configuration - UPDATE THESE PATHS\n",
"FRAMES_DIR = './frames' # From notebook 01\n",
"ANNOTATIONS_FILE = './annotations/annotations.json' # From notebook 01\n",
"MASKS_DIR = './annotations/masks' # Optional: mask images\n",
"\n",
"# Output dataset\n",
"DATASET_DIR = './yolo_dataset'\n",
"\n",
"# Dataset settings\n",
"CLASS_NAMES = ['object'] # Single class for generic objects\n",
"VAL_SPLIT = 0.2 # 20% validation\n",
"SEED = 42 # Random seed for reproducibility\n",
"\n",
"# Filtering\n",
"MIN_BBOX_AREA = 100 # Minimum bbox area in pixels\n",
"MIN_BBOX_SIZE = 0.01 # Minimum bbox dimension (normalized, 0-1)\n",
"MAX_OBJECTS_PER_IMAGE = 100 # Maximum objects per image\n",
"MIN_IOU_SCORE = 0.5 # Minimum SAM2 IoU score\n",
"\n",
"random.seed(SEED)\n",
"np.random.seed(SEED)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Annotations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load annotations\n",
"with open(ANNOTATIONS_FILE, 'r') as f:\n",
" annotations = json.load(f)\n",
"\n",
"print(f\"Loaded annotations for {len(annotations)} frames\")\n",
"\n",
"# Show sample\n",
"sample_frame = list(annotations.keys())[0]\n",
"print(f\"\\nSample annotation ({sample_frame}):\")\n",
"print(json.dumps(annotations[sample_frame][:2], indent=2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify frames exist\n",
"frames_path = Path(FRAMES_DIR)\n",
"frame_files = list(frames_path.glob(\"*.jpg\")) + list(frames_path.glob(\"*.png\"))\n",
"\n",
"print(f\"Found {len(frame_files)} frame images\")\n",
"\n",
"# Check matching\n",
"annotation_frames = set(annotations.keys())\n",
"image_frames = {f.name for f in frame_files}\n",
"\n",
"matched = annotation_frames & image_frames\n",
"print(f\"Matched frames: {len(matched)}\")\n",
"\n",
"if len(matched) < len(annotation_frames):\n",
" missing = annotation_frames - image_frames\n",
" print(f\"Warning: {len(missing)} annotated frames missing images\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Create YOLO Dataset Structure"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create directory structure\n",
"dataset_path = Path(DATASET_DIR)\n",
"\n",
"images_train = dataset_path / 'images' / 'train'\n",
"images_val = dataset_path / 'images' / 'val'\n",
"labels_train = dataset_path / 'labels' / 'train'\n",
"labels_val = dataset_path / 'labels' / 'val'\n",
"\n",
"for dir_path in [images_train, images_val, labels_train, labels_val]:\n",
" dir_path.mkdir(parents=True, exist_ok=True)\n",
" print(f\"Created: {dir_path}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Convert Annotations to YOLO Format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def bbox_xywh_to_yolo(bbox, image_width, image_height):\n",
" \"\"\"\n",
" Convert [x, y, w, h] bbox to YOLO format [x_center, y_center, width, height] normalized.\n",
" \"\"\"\n",
" x, y, w, h = bbox\n",
" \n",
" x_center = (x + w / 2) / image_width\n",
" y_center = (y + h / 2) / image_height\n",
" width = w / image_width\n",
" height = h / image_height\n",
" \n",
" # Clamp to [0, 1]\n",
" x_center = max(0, min(1, x_center))\n",
" y_center = max(0, min(1, y_center))\n",
" width = max(0, min(1, width))\n",
" height = max(0, min(1, height))\n",
" \n",
" return x_center, y_center, width, height\n",
"\n",
"\n",
"def filter_annotations(anns, img_width, img_height, \n",
" min_area=100, min_size=0.01, \n",
" min_iou=0.5, max_objects=100):\n",
" \"\"\"\n",
" Filter annotations based on criteria.\n",
" \"\"\"\n",
" filtered = []\n",
" \n",
" for ann in anns:\n",
" bbox = ann.get('bbox', [])\n",
" area = ann.get('area', 0)\n",
" iou = ann.get('predicted_iou', 1.0)\n",
" \n",
" # Check area\n",
" if area < min_area:\n",
" continue\n",
" \n",
" # Check IoU score\n",
" if iou < min_iou:\n",
" continue\n",
" \n",
" # Check bbox dimensions\n",
" if len(bbox) == 4:\n",
" _, _, w, h = bbox\n",
" if w / img_width < min_size or h / img_height < min_size:\n",
" continue\n",
" \n",
" filtered.append(ann)\n",
" \n",
" # Limit number of objects (keep highest IoU)\n",
" if len(filtered) > max_objects:\n",
" filtered.sort(key=lambda x: x.get('predicted_iou', 0), reverse=True)\n",
" filtered = filtered[:max_objects]\n",
" \n",
" return filtered"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_frame(frame_name, frame_anns, frames_dir, images_dir, labels_dir, class_id=0):\n",
" \"\"\"\n",
" Process a single frame: copy image and create YOLO label file.\n",
" \"\"\"\n",
" frame_path = Path(frames_dir) / frame_name\n",
" \n",
" if not frame_path.exists():\n",
" return 0\n",
" \n",
" # Read image dimensions\n",
" image = cv2.imread(str(frame_path))\n",
" if image is None:\n",
" return 0\n",
" \n",
" height, width = image.shape[:2]\n",
" \n",
" # Filter annotations\n",
" filtered_anns = filter_annotations(\n",
" frame_anns, width, height,\n",
" min_area=MIN_BBOX_AREA,\n",
" min_size=MIN_BBOX_SIZE,\n",
" min_iou=MIN_IOU_SCORE,\n",
" max_objects=MAX_OBJECTS_PER_IMAGE\n",
" )\n",
" \n",
" # Copy image\n",
" dest_image = images_dir / frame_name\n",
" shutil.copy2(frame_path, dest_image)\n",
" \n",
" # Create YOLO labels\n",
" labels = []\n",
" for ann in filtered_anns:\n",
" bbox = ann.get('bbox', [])\n",
" if len(bbox) != 4:\n",
" continue\n",
" \n",
" x_center, y_center, w, h = bbox_xywh_to_yolo(bbox, width, height)\n",
" \n",
" # YOLO format: class x_center y_center width height\n",
" label_line = f\"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\"\n",
" labels.append(label_line)\n",
" \n",
" # Write label file\n",
" label_name = Path(frame_name).stem + '.txt'\n",
" label_path = labels_dir / label_name\n",
" \n",
" with open(label_path, 'w') as f:\n",
" f.write('\\n'.join(labels))\n",
" \n",
" return len(labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split frames into train/val\n",
"frame_names = list(annotations.keys())\n",
"random.shuffle(frame_names)\n",
"\n",
"split_idx = int(len(frame_names) * (1 - VAL_SPLIT))\n",
"train_frames = frame_names[:split_idx]\n",
"val_frames = frame_names[split_idx:]\n",
"\n",
"print(f\"Train frames: {len(train_frames)}\")\n",
"print(f\"Val frames: {len(val_frames)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process training frames\n",
"train_objects = 0\n",
"for frame_name in tqdm(train_frames, desc=\"Processing train\"):\n",
" count = process_frame(\n",
" frame_name,\n",
" annotations.get(frame_name, []),\n",
" FRAMES_DIR,\n",
" images_train,\n",
" labels_train,\n",
" class_id=0\n",
" )\n",
" train_objects += count\n",
"\n",
"print(f\"\\nTrain: {len(train_frames)} images, {train_objects} objects\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process validation frames\n",
"val_objects = 0\n",
"for frame_name in tqdm(val_frames, desc=\"Processing val\"):\n",
" count = process_frame(\n",
" frame_name,\n",
" annotations.get(frame_name, []),\n",
" FRAMES_DIR,\n",
" images_val,\n",
" labels_val,\n",
" class_id=0\n",
" )\n",
" val_objects += count\n",
"\n",
"print(f\"\\nVal: {len(val_frames)} images, {val_objects} objects\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Create data.yaml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create YOLO data.yaml configuration\n",
"data_config = {\n",
" 'path': str(Path(DATASET_DIR).absolute()),\n",
" 'train': 'images/train',\n",
" 'val': 'images/val',\n",
" 'names': {i: name for i, name in enumerate(CLASS_NAMES)},\n",
" 'nc': len(CLASS_NAMES)\n",
"}\n",
"\n",
"yaml_path = dataset_path / 'data.yaml'\n",
"with open(yaml_path, 'w') as f:\n",
" yaml.dump(data_config, f, default_flow_style=False, sort_keys=False)\n",
"\n",
"print(f\"Created: {yaml_path}\")\n",
"print(\"\\nContents:\")\n",
"with open(yaml_path) as f:\n",
" print(f.read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Validate Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def validate_dataset(dataset_dir):\n",
" \"\"\"Validate YOLO dataset structure.\"\"\"\n",
" dataset_path = Path(dataset_dir)\n",
" results = {'valid': True, 'errors': [], 'warnings': [], 'stats': {}}\n",
" \n",
" # Check data.yaml\n",
" yaml_path = dataset_path / 'data.yaml'\n",
" if not yaml_path.exists():\n",
" results['errors'].append(\"Missing data.yaml\")\n",
" results['valid'] = False\n",
" else:\n",
" with open(yaml_path) as f:\n",
" config = yaml.safe_load(f)\n",
" results['stats']['num_classes'] = config.get('nc', 0)\n",
" results['stats']['class_names'] = config.get('names', {})\n",
" \n",
" # Check directories and count files\n",
" for split in ['train', 'val']:\n",
" images_dir = dataset_path / 'images' / split\n",
" labels_dir = dataset_path / 'labels' / split\n",
" \n",
" if not images_dir.exists():\n",
" results['errors'].append(f\"Missing images/{split}\")\n",
" results['valid'] = False\n",
" continue\n",
" \n",
" image_files = list(images_dir.glob(\"*.jpg\")) + list(images_dir.glob(\"*.png\"))\n",
" label_files = list(labels_dir.glob(\"*.txt\"))\n",
" \n",
" results['stats'][f'{split}_images'] = len(image_files)\n",
" results['stats'][f'{split}_labels'] = len(label_files)\n",
" \n",
" # Check for missing labels\n",
" image_stems = {f.stem for f in image_files}\n",
" label_stems = {f.stem for f in label_files}\n",
" missing = image_stems - label_stems\n",
" \n",
" if missing:\n",
" results['warnings'].append(f\"{len(missing)} {split} images missing labels\")\n",
" \n",
" # Count total objects\n",
" total_objects = 0\n",
" for label_file in label_files:\n",
" with open(label_file) as f:\n",
" lines = [l.strip() for l in f if l.strip()]\n",
" total_objects += len(lines)\n",
" results['stats'][f'{split}_objects'] = total_objects\n",
" \n",
" return results\n",
"\n",
"# Validate\n",
"validation = validate_dataset(DATASET_DIR)\n",
"\n",
"print(\"Dataset Validation:\")\n",
"print(f\" Valid: {validation['valid']}\")\n",
"print(f\"\\nStatistics:\")\n",
"for key, value in validation['stats'].items():\n",
" print(f\" {key}: {value}\")\n",
"\n",
"if validation['errors']:\n",
" print(f\"\\nErrors:\")\n",
" for err in validation['errors']:\n",
" print(f\" - {err}\")\n",
"\n",
"if validation['warnings']:\n",
" print(f\"\\nWarnings:\")\n",
" for warn in validation['warnings']:\n",
" print(f\" - {warn}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Visualize Dataset Samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def visualize_yolo_sample(image_path, label_path, class_names):\n",
" \"\"\"Visualize YOLO annotation on image.\"\"\"\n",
" image = cv2.imread(str(image_path))\n",
" image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
" height, width = image.shape[:2]\n",
" \n",
" # Read labels\n",
" if label_path.exists():\n",
" with open(label_path) as f:\n",
" labels = [l.strip().split() for l in f if l.strip()]\n",
" else:\n",
" labels = []\n",
" \n",
" # Draw bboxes\n",
" colors = plt.cm.tab10(np.linspace(0, 1, 10))\n",
" \n",
" for label in labels:\n",
" class_id = int(label[0])\n",
" x_center, y_center, w, h = map(float, label[1:5])\n",
" \n",
" # Convert to pixel coordinates\n",
" x1 = int((x_center - w/2) * width)\n",
" y1 = int((y_center - h/2) * height)\n",
" x2 = int((x_center + w/2) * width)\n",
" y2 = int((y_center + h/2) * height)\n",
" \n",
" color = tuple(int(c * 255) for c in colors[class_id % 10][:3])\n",
" cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)\n",
" \n",
" # Add label\n",
" class_name = class_names.get(class_id, str(class_id))\n",
" cv2.putText(image, class_name, (x1, y1-5), \n",
" cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)\n",
" \n",
" return image, len(labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize train samples\n",
"train_images = sorted(images_train.glob(\"*.jpg\"))[:6]\n",
"\n",
"fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n",
"class_names_dict = {i: name for i, name in enumerate(CLASS_NAMES)}\n",
"\n",
"for ax, img_path in zip(axes.flat, train_images):\n",
" label_path = labels_train / (img_path.stem + '.txt')\n",
" vis, count = visualize_yolo_sample(img_path, label_path, class_names_dict)\n",
" \n",
" ax.imshow(vis)\n",
" ax.set_title(f\"{img_path.name} ({count} objects)\")\n",
" ax.axis('off')\n",
"\n",
"plt.suptitle('Training Samples with YOLO Annotations', fontsize=14)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Object count distribution\n",
"train_label_files = list(labels_train.glob(\"*.txt\"))\n",
"val_label_files = list(labels_val.glob(\"*.txt\"))\n",
"\n",
"def count_objects_in_labels(label_files):\n",
" counts = []\n",
" for lf in label_files:\n",
" with open(lf) as f:\n",
" lines = [l.strip() for l in f if l.strip()]\n",
" counts.append(len(lines))\n",
" return counts\n",
"\n",
"train_counts = count_objects_in_labels(train_label_files)\n",
"val_counts = count_objects_in_labels(val_label_files)\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"axes[0].hist(train_counts, bins=30, edgecolor='black', alpha=0.7, label='Train')\n",
"axes[0].hist(val_counts, bins=30, edgecolor='black', alpha=0.7, label='Val')\n",
"axes[0].set_xlabel('Objects per image')\n",
"axes[0].set_ylabel('Frequency')\n",
"axes[0].set_title('Objects per Image Distribution')\n",
"axes[0].legend()\n",
"\n",
"# Bbox size distribution\n",
"bbox_sizes = []\n",
"for lf in train_label_files:\n",
" with open(lf) as f:\n",
" for line in f:\n",
" parts = line.strip().split()\n",
" if len(parts) >= 5:\n",
" w, h = float(parts[3]), float(parts[4])\n",
" bbox_sizes.append(w * h)\n",
"\n",
"axes[1].hist(bbox_sizes, bins=50, edgecolor='black', alpha=0.7)\n",
"axes[1].set_xlabel('Bbox area (normalized)')\n",
"axes[1].set_ylabel('Frequency')\n",
"axes[1].set_title('Bounding Box Size Distribution')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(f\"\\nBbox size stats:\")\n",
"print(f\" Min: {min(bbox_sizes):.4f}\")\n",
"print(f\" Max: {max(bbox_sizes):.4f}\")\n",
"print(f\" Mean: {np.mean(bbox_sizes):.4f}\")\n",
"print(f\" Median: {np.median(bbox_sizes):.4f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Export for Kaggle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create zip archive for Kaggle dataset\n",
"import zipfile\n",
"\n",
"EXPORT_ZIP = 'yolo_dataset.zip'\n",
"\n",
"print(f\"Creating {EXPORT_ZIP}...\")\n",
"\n",
"with zipfile.ZipFile(EXPORT_ZIP, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
" for root, dirs, files in os.walk(DATASET_DIR):\n",
" for file in files:\n",
" file_path = os.path.join(root, file)\n",
" arcname = os.path.relpath(file_path, os.path.dirname(DATASET_DIR))\n",
" zipf.write(file_path, arcname)\n",
"\n",
"zip_size = os.path.getsize(EXPORT_ZIP) / 1024 / 1024\n",
"print(f\"\\nExport complete!\")\n",
"print(f\" File: {EXPORT_ZIP}\")\n",
"print(f\" Size: {zip_size:.1f} MB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Alternative: Create Kaggle dataset directly (if kaggle CLI available)\n",
"# Uncomment to use\n",
"\n",
"# KAGGLE_USERNAME = 'your-username'\n",
"# DATASET_NAME = 'sam2-yolo-custom'\n",
"\n",
"# # Create dataset metadata\n",
"# metadata = {\n",
"# 'title': 'SAM2 Auto-Annotated YOLO Dataset',\n",
"# 'id': f'{KAGGLE_USERNAME}/{DATASET_NAME}',\n",
"# 'licenses': [{'name': 'CC0-1.0'}]\n",
"# }\n",
"\n",
"# metadata_path = dataset_path / 'dataset-metadata.json'\n",
"# with open(metadata_path, 'w') as f:\n",
"# json.dump(metadata, f, indent=2)\n",
"\n",
"# # Upload to Kaggle\n",
"# !kaggle datasets create -p {DATASET_DIR}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9. Dataset Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final summary\n",
"print(\"=\" * 50)\n",
"print(\"YOLO DATASET SUMMARY\")\n",
"print(\"=\" * 50)\n",
"print(f\"\\nDataset location: {Path(DATASET_DIR).absolute()}\")\n",
"print(f\"\\nClasses ({len(CLASS_NAMES)}):\")\n",
"for i, name in enumerate(CLASS_NAMES):\n",
" print(f\" {i}: {name}\")\n",
"\n",
"print(f\"\\nSplit:\")\n",
"print(f\" Train: {validation['stats']['train_images']} images, {validation['stats']['train_objects']} objects\")\n",
"print(f\" Val: {validation['stats']['val_images']} images, {validation['stats']['val_objects']} objects\")\n",
"print(f\" Total: {validation['stats']['train_images'] + validation['stats']['val_images']} images\")\n",
"\n",
"print(f\"\\nFiles:\")\n",
"print(f\" data.yaml: {yaml_path}\")\n",
"print(f\" Export: {EXPORT_ZIP}\")\n",
"\n",
"print(\"\\n\" + \"=\" * 50)\n",
"print(\"Ready for YOLOv9t training!\")\n",
"print(\"=\" * 50)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"## Next Steps\n",
"\n",
"1. **Upload dataset to Kaggle** (if not already done)\n",
" - Go to kaggle.com/datasets/new\n",
" - Upload `yolo_dataset.zip`\n",
" \n",
"2. **Run `03_train_yolov9t.ipynb`** to train YOLOv9t\n",
"\n",
"### Dataset Structure\n",
"```\n",
"yolo_dataset/\n",
"├── data.yaml\n",
"├── images/\n",
"│ ├── train/\n",
"│ └── val/\n",
"└── labels/\n",
" ├── train/\n",
" └── val/\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}