From de99aef9addf8167411a78ae99a06f34613972b1 Mon Sep 17 00:00:00 2001 From: Hare Date: Mon, 23 Feb 2026 03:56:23 +0900 Subject: [PATCH] =?UTF-8?q?=E5=A7=BF=E5=8B=A2=E6=8E=A8=E5=AE=9A=E3=83=A2?= =?UTF-8?q?=E3=83=87=E3=83=AB=E3=82=92=E6=AD=A2=E3=82=81=E3=80=81=E9=A0=AD?= =?UTF-8?q?=E9=83=A8=E7=89=B9=E5=8C=96=E3=83=A2=E3=83=87=E3=83=AB=E3=81=AB?= =?UTF-8?q?=E5=A4=89=E6=9B=B4=E3=81=97=E3=81=A6=E3=83=86=E3=82=B9=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- panels/vse_panel.py | 4 +- server/detector.py | 146 ++++++++++---------------------------------- 2 files changed, 34 insertions(+), 116 deletions(-) diff --git a/panels/vse_panel.py b/panels/vse_panel.py index e6adcc8..af03f39 100644 --- a/panels/vse_panel.py +++ b/panels/vse_panel.py @@ -239,7 +239,9 @@ class SEQUENCER_PT_face_mask(Panel): if not selected_movies: return count = len(selected_movies) - label = f"Batch ({count} strip{'s' if count > 1 else ''} selected)" + image_count = sum(1 for s in selected_movies if s.type == "IMAGE") + video_count = sum(1 for s in selected_movies if s.type == "MOVIE") + label = f"Batch ({count} selected, image: {image_count}, video: {video_count})" box = layout.box() box.label(text=label, icon='RENDER_ANIMATION') box.operator( diff --git a/server/detector.py b/server/detector.py index abb03db..1454ca7 100644 --- a/server/detector.py +++ b/server/detector.py @@ -1,8 +1,8 @@ """ -YOLOv8 Pose Head Detector using PyTorch with ROCm support. +YOLOv8 Head Detector using CrowdHuman-trained model with PyTorch ROCm support. -Detects human heads from all angles (frontal, profile, rear) by using -YOLOv8 pose estimation and extracting head bounding boxes from keypoints. +Directly detects human heads (frontal, profile, rear) using the Owen718 +CrowdHuman YOLOv8 model, which was trained on dense crowd scenes. """ import os @@ -10,96 +10,27 @@ from typing import List, Tuple, Optional import numpy as np -# COCO pose keypoint indices -_HEAD_KP = [0, 1, 2, 3, 4] # nose, left_eye, right_eye, left_ear, right_ear -_SHOULDER_KP = [5, 6] # left_shoulder, right_shoulder -_KP_CONF_THRESH = 0.3 +def _download_model(dest_path: str): + """モデルが存在しない場合に手動ダウンロード手順を表示して例外を送出する。""" + gdrive_id = "1qlBmiEU4GBV13fxPhLZqjhjBbREvs8-m" + raise RuntimeError( + f"モデルファイルが見つかりません: {dest_path}\n" + "以下の手順でダウンロードしてください:\n" + f" 1. https://drive.google.com/file/d/{gdrive_id} を開く\n" + f" 2. ダウンロードしたファイルを {dest_path} に配置する" + ) -def _head_bbox_from_pose( - kp_xy: np.ndarray, - kp_conf: np.ndarray, - person_x1: float, - person_y1: float, - person_x2: float, - person_y2: float, -) -> Tuple[int, int, int, int]: +class YOLOHeadDetector: """ - Estimate head bounding box (x, y, w, h) from COCO pose keypoints. + Head detector using CrowdHuman-trained YOLOv8 model with PyTorch ROCm support. - Strategy: - 1. Use head keypoints (0-4: nose, eyes, ears) if visible. - 2. Fall back to shoulder keypoints (5-6) to infer head position. - 3. Last resort: use top of the person bounding box. - """ - person_w = max(person_x2 - person_x1, 1.0) - - # --- Step 1: head keypoints --- - visible_head = [ - (float(kp_xy[i][0]), float(kp_xy[i][1])) - for i in _HEAD_KP - if float(kp_conf[i]) > _KP_CONF_THRESH - ] - if visible_head: - xs = [p[0] for p in visible_head] - ys = [p[1] for p in visible_head] - kp_x1, kp_y1 = min(xs), min(ys) - kp_x2, kp_y2 = max(xs), max(ys) - span = max(kp_x2 - kp_x1, kp_y2 - kp_y1, 1.0) - cx = (kp_x1 + kp_x2) / 2.0 - cy = (kp_y1 + kp_y2) / 2.0 - - # span はキーポイントの外接幅(≒顔幅)なので、半径 = span/2 で顔と等倍になる - r = max(span * 0.5, person_w * 0.10) - x1 = int(cx - r) - y1 = int(cy - r) - x2 = int(cx + r) - y2 = int(cy + r) - return x1, y1, x2 - x1, y2 - y1 - - # --- Step 2: shoulder keypoints --- - visible_shoulder = [ - (float(kp_xy[i][0]), float(kp_xy[i][1])) - for i in _SHOULDER_KP - if float(kp_conf[i]) > _KP_CONF_THRESH - ] - if visible_shoulder: - cx = sum(p[0] for p in visible_shoulder) / len(visible_shoulder) - cy_sh = sum(p[1] for p in visible_shoulder) / len(visible_shoulder) - if len(visible_shoulder) == 2: - sh_width = abs(visible_shoulder[1][0] - visible_shoulder[0][0]) - else: - sh_width = person_w * 0.5 - # 肩幅は顔幅の約1.5〜2倍なので、0.3倍で顔サイズに近い半径になる - r = max(sh_width * 0.3, person_w * 0.12) - cy = cy_sh - r * 1.3 # 頭の中心は肩より上 - x1 = int(cx - r) - y1 = int(cy - r) - x2 = int(cx + r) - y2 = int(cy + r) - return x1, y1, x2 - x1, y2 - y1 - - # --- Step 3: person bbox top --- - # 顔幅は人物幅の約20〜30%なので、半径 = person_w * 0.15 で顔サイズに近い - r = max(person_w * 0.15, 20.0) - cx = (person_x1 + person_x2) / 2.0 - x1 = int(cx - r) - y1 = int(person_y1) - x2 = int(cx + r) - y2 = int(person_y1 + r * 2.0) - return x1, y1, x2 - x1, y2 - y1 - - -class YOLOPoseHeadDetector: - """ - Head detector using YOLOv8 pose estimation with PyTorch ROCm support. - - Extracts head bounding boxes from COCO pose keypoints (nose, eyes, ears) - so that detection works regardless of the person's facing direction. + Directly detects heads (class 0: head) without pose estimation, + enabling robust detection of rear-facing, side-facing, and partially + visible people in dense crowd scenes. """ - # Standard Ultralytics model — auto-downloaded on first use - DEFAULT_MODEL = os.path.join("models", "yolov8n-pose.pt") + DEFAULT_MODEL = os.path.join("models", "crowdhuman_yolov8_head.pt") def __init__( self, @@ -117,19 +48,19 @@ class YOLOPoseHeadDetector: @property def model(self): - """Lazy-load YOLO pose model.""" + """Lazy-load YOLO head detection model.""" if self._model is None: from ultralytics import YOLO import torch - # Use provided path or let Ultralytics auto-download the default if self._model_path is not None: if not os.path.exists(self._model_path): raise FileNotFoundError(f"Model not found: {self._model_path}") model_path = self._model_path else: model_path = self.DEFAULT_MODEL - os.makedirs(os.path.dirname(model_path), exist_ok=True) + if not os.path.exists(model_path): + _download_model(model_path) if torch.cuda.is_available(): self._device = 'cuda' @@ -141,7 +72,7 @@ class YOLOPoseHeadDetector: try: self._model = YOLO(model_path) - print(f"[FaceMask] Pose model loaded: {model_path}") + print(f"[FaceMask] Head detection model loaded: {model_path}") print(f"[FaceMask] Device: {self._device}") except Exception as e: print(f"[FaceMask] Error loading model: {e}") @@ -152,29 +83,14 @@ class YOLOPoseHeadDetector: return self._model def _results_to_detections(self, result) -> List[Tuple[int, int, int, int, float]]: - """Convert a single YOLO pose result to (x, y, w, h, conf) tuples.""" + """Convert a single YOLO result to (x, y, w, h, conf) tuples.""" + if result.boxes is None: + return [] detections = [] - if result.boxes is None or result.keypoints is None: - return detections - - boxes = result.boxes - keypoints = result.keypoints - - for i, box in enumerate(boxes): + for box in result.boxes: conf = float(box.conf[0].cpu().numpy()) x1, y1, x2, y2 = box.xyxy[0].cpu().numpy() - - # Extract keypoints for this person - kp_data = keypoints.data[i].cpu().numpy() # shape (17, 3): x, y, conf - kp_xy = kp_data[:, :2] - kp_conf = kp_data[:, 2] - - hx, hy, hw, hh = _head_bbox_from_pose( - kp_xy, kp_conf, - float(x1), float(y1), float(x2), float(y2), - ) - detections.append((hx, hy, hw, hh, conf)) - + detections.append((int(x1), int(y1), int(x2 - x1), int(y2 - y1), conf)) return detections def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]: @@ -303,12 +219,12 @@ class YOLOPoseHeadDetector: # Singleton instance -_detector: Optional[YOLOPoseHeadDetector] = None +_detector: Optional[YOLOHeadDetector] = None -def get_detector(**kwargs) -> YOLOPoseHeadDetector: - """Get or create the global YOLO pose head detector instance.""" +def get_detector(**kwargs) -> YOLOHeadDetector: + """Get or create the global YOLO head detector instance.""" global _detector if _detector is None: - _detector = YOLOPoseHeadDetector(**kwargs) + _detector = YOLOHeadDetector(**kwargs) return _detector