From de99aef9addf8167411a78ae99a06f34613972b1 Mon Sep 17 00:00:00 2001
From: Hare <kei.hiracchi.0928@gmail.com>
Date: Mon, 23 Feb 2026 03:56:23 +0900
Subject: [PATCH] =?UTF-8?q?=E5=A7=BF=E5=8B=A2=E6=8E=A8=E5=AE=9A=E3=83=A2?=
 =?UTF-8?q?=E3=83=87=E3=83=AB=E3=82=92=E6=AD=A2=E3=82=81=E3=80=81=E9=A0=AD?=
 =?UTF-8?q?=E9=83=A8=E7=89=B9=E5=8C=96=E3=83=A2=E3=83=87=E3=83=AB=E3=81=AB?=
 =?UTF-8?q?=E5=A4=89=E6=9B=B4=E3=81=97=E3=81=A6=E3=83=86=E3=82=B9=E3=83=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 panels/vse_panel.py |   4 +-
 server/detector.py  | 146 ++++++++++----------------------------------
 2 files changed, 34 insertions(+), 116 deletions(-)

diff --git a/panels/vse_panel.py b/panels/vse_panel.py
index e6adcc8..af03f39 100644
--- a/panels/vse_panel.py
+++ b/panels/vse_panel.py
@@ -239,7 +239,9 @@ class SEQUENCER_PT_face_mask(Panel):
         if not selected_movies:
             return
         count = len(selected_movies)
-        label = f"Batch ({count} strip{'s' if count > 1 else ''} selected)"
+        image_count = sum(1 for s in selected_movies if s.type == "IMAGE")
+        video_count = sum(1 for s in selected_movies if s.type == "MOVIE")
+        label = f"Batch ({count} selected, image: {image_count}, video: {video_count})"
         box = layout.box()
         box.label(text=label, icon='RENDER_ANIMATION')
         box.operator(
diff --git a/server/detector.py b/server/detector.py
index abb03db..1454ca7 100644
--- a/server/detector.py
+++ b/server/detector.py
@@ -1,8 +1,8 @@
 """
-YOLOv8 Pose Head Detector using PyTorch with ROCm support.
+YOLOv8 Head Detector using CrowdHuman-trained model with PyTorch ROCm support.
 
-Detects human heads from all angles (frontal, profile, rear) by using
-YOLOv8 pose estimation and extracting head bounding boxes from keypoints.
+Directly detects human heads (frontal, profile, rear) using the Owen718
+CrowdHuman YOLOv8 model, which was trained on dense crowd scenes.
 """
 
 import os
@@ -10,96 +10,27 @@ from typing import List, Tuple, Optional
 import numpy as np
 
 
-# COCO pose keypoint indices
-_HEAD_KP = [0, 1, 2, 3, 4]      # nose, left_eye, right_eye, left_ear, right_ear
-_SHOULDER_KP = [5, 6]            # left_shoulder, right_shoulder
-_KP_CONF_THRESH = 0.3
+def _download_model(dest_path: str):
+    """Raise RuntimeError with manual download steps for the missing model file."""
+    gdrive_id = "1qlBmiEU4GBV13fxPhLZqjhjBbREvs8-m"  # Google Drive file ID interpolated into the URL below
+    raise RuntimeError(
+        f"モデルファイルが見つかりません: {dest_path}\n"
+        "以下の手順でダウンロードしてください:\n"
+        f"  1. https://drive.google.com/file/d/{gdrive_id} を開く\n"
+        f"  2. ダウンロードしたファイルを {dest_path} に配置する"
+    )
 
 
-def _head_bbox_from_pose(
-    kp_xy: np.ndarray,
-    kp_conf: np.ndarray,
-    person_x1: float,
-    person_y1: float,
-    person_x2: float,
-    person_y2: float,
-) -> Tuple[int, int, int, int]:
+class YOLOHeadDetector:
     """
-    Estimate head bounding box (x, y, w, h) from COCO pose keypoints.
+    Head detector using CrowdHuman-trained YOLOv8 model with PyTorch ROCm support.
 
-    Strategy:
-    1. Use head keypoints (0-4: nose, eyes, ears) if visible.
-    2. Fall back to shoulder keypoints (5-6) to infer head position.
-    3. Last resort: use top of the person bounding box.
-    """
-    person_w = max(person_x2 - person_x1, 1.0)
-
-    # --- Step 1: head keypoints ---
-    visible_head = [
-        (float(kp_xy[i][0]), float(kp_xy[i][1]))
-        for i in _HEAD_KP
-        if float(kp_conf[i]) > _KP_CONF_THRESH
-    ]
-    if visible_head:
-        xs = [p[0] for p in visible_head]
-        ys = [p[1] for p in visible_head]
-        kp_x1, kp_y1 = min(xs), min(ys)
-        kp_x2, kp_y2 = max(xs), max(ys)
-        span = max(kp_x2 - kp_x1, kp_y2 - kp_y1, 1.0)
-        cx = (kp_x1 + kp_x2) / 2.0
-        cy = (kp_y1 + kp_y2) / 2.0
-
-        # span はキーポイントの外接幅（≒顔幅）なので、半径 = span/2 で顔と等倍になる
-        r = max(span * 0.5, person_w * 0.10)
-        x1 = int(cx - r)
-        y1 = int(cy - r)
-        x2 = int(cx + r)
-        y2 = int(cy + r)
-        return x1, y1, x2 - x1, y2 - y1
-
-    # --- Step 2: shoulder keypoints ---
-    visible_shoulder = [
-        (float(kp_xy[i][0]), float(kp_xy[i][1]))
-        for i in _SHOULDER_KP
-        if float(kp_conf[i]) > _KP_CONF_THRESH
-    ]
-    if visible_shoulder:
-        cx = sum(p[0] for p in visible_shoulder) / len(visible_shoulder)
-        cy_sh = sum(p[1] for p in visible_shoulder) / len(visible_shoulder)
-        if len(visible_shoulder) == 2:
-            sh_width = abs(visible_shoulder[1][0] - visible_shoulder[0][0])
-        else:
-            sh_width = person_w * 0.5
-        # 肩幅は顔幅の約1.5〜2倍なので、0.3倍で顔サイズに近い半径になる
-        r = max(sh_width * 0.3, person_w * 0.12)
-        cy = cy_sh - r * 1.3   # 頭の中心は肩より上
-        x1 = int(cx - r)
-        y1 = int(cy - r)
-        x2 = int(cx + r)
-        y2 = int(cy + r)
-        return x1, y1, x2 - x1, y2 - y1
-
-    # --- Step 3: person bbox top ---
-    # 顔幅は人物幅の約20〜30%なので、半径 = person_w * 0.15 で顔サイズに近い
-    r = max(person_w * 0.15, 20.0)
-    cx = (person_x1 + person_x2) / 2.0
-    x1 = int(cx - r)
-    y1 = int(person_y1)
-    x2 = int(cx + r)
-    y2 = int(person_y1 + r * 2.0)
-    return x1, y1, x2 - x1, y2 - y1
-
-
-class YOLOPoseHeadDetector:
-    """
-    Head detector using YOLOv8 pose estimation with PyTorch ROCm support.
-
-    Extracts head bounding boxes from COCO pose keypoints (nose, eyes, ears)
-    so that detection works regardless of the person's facing direction.
+    Directly detects heads (class 0: head) without pose estimation,
+    enabling robust detection of rear-facing, side-facing, and partially
+    visible people in dense crowd scenes.
     """
 
-    # Standard Ultralytics model — auto-downloaded on first use
-    DEFAULT_MODEL = os.path.join("models", "yolov8n-pose.pt")
+    DEFAULT_MODEL = os.path.join("models", "crowdhuman_yolov8_head.pt")
 
     def __init__(
         self,
@@ -117,19 +48,19 @@ class YOLOPoseHeadDetector:
 
     @property
     def model(self):
-        """Lazy-load YOLO pose model."""
+        """Lazy-load YOLO head detection model."""
         if self._model is None:
             from ultralytics import YOLO
             import torch
 
-            # Use provided path or let Ultralytics auto-download the default
             if self._model_path is not None:
                 if not os.path.exists(self._model_path):
                     raise FileNotFoundError(f"Model not found: {self._model_path}")
                 model_path = self._model_path
             else:
                 model_path = self.DEFAULT_MODEL
-                os.makedirs(os.path.dirname(model_path), exist_ok=True)
+                if not os.path.exists(model_path):
+                    _download_model(model_path)
 
             if torch.cuda.is_available():
                 self._device = 'cuda'
@@ -141,7 +72,7 @@ class YOLOPoseHeadDetector:
 
             try:
                 self._model = YOLO(model_path)
-                print(f"[FaceMask] Pose model loaded: {model_path}")
+                print(f"[FaceMask] Head detection model loaded: {model_path}")
                 print(f"[FaceMask] Device: {self._device}")
             except Exception as e:
                 print(f"[FaceMask] Error loading model: {e}")
@@ -152,29 +83,14 @@ class YOLOPoseHeadDetector:
         return self._model
 
     def _results_to_detections(self, result) -> List[Tuple[int, int, int, int, float]]:
-        """Convert a single YOLO pose result to (x, y, w, h, conf) tuples."""
+        """Convert a single YOLO result to (x, y, w, h, conf) tuples."""
+        if result.boxes is None:
+            return []
         detections = []
-        if result.boxes is None or result.keypoints is None:
-            return detections
-
-        boxes = result.boxes
-        keypoints = result.keypoints
-
-        for i, box in enumerate(boxes):
+        for box in result.boxes:
             conf = float(box.conf[0].cpu().numpy())
             x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-
-            # Extract keypoints for this person
-            kp_data = keypoints.data[i].cpu().numpy()  # shape (17, 3): x, y, conf
-            kp_xy = kp_data[:, :2]
-            kp_conf = kp_data[:, 2]
-
-            hx, hy, hw, hh = _head_bbox_from_pose(
-                kp_xy, kp_conf,
-                float(x1), float(y1), float(x2), float(y2),
-            )
-            detections.append((hx, hy, hw, hh, conf))
-
+            detections.append((int(x1), int(y1), int(x2 - x1), int(y2 - y1), conf))
         return detections
 
     def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
@@ -303,12 +219,12 @@ class YOLOPoseHeadDetector:
 
 
 # Singleton instance
-_detector: Optional[YOLOPoseHeadDetector] = None
+_detector: Optional[YOLOHeadDetector] = None
 
 
-def get_detector(**kwargs) -> YOLOPoseHeadDetector:
-    """Get or create the global YOLO pose head detector instance."""
+def get_detector(**kwargs) -> YOLOHeadDetector:
+    """Get or create the global YOLO head detector instance."""
     global _detector
     if _detector is None:
-        _detector = YOLOPoseHeadDetector(**kwargs)
+        _detector = YOLOHeadDetector(**kwargs)
     return _detector