Change model: face -> pose

2026-02-18 20:18:53 +09:00 · 2026-02-18 20:18:53 +09:00 · 08f20fa6fe
commit 08f20fa6fe
parent 920695696b
1 changed file with 138 additions and 99 deletions
--- a/server/detector.py
+++ b/server/detector.py
@ -1,28 +1,104 @@
 """
-YOLOv8 Face Detector using PyTorch with ROCm support.
+YOLOv8 Pose Head Detector using PyTorch with ROCm support.

-This module provides high-performance face detection using
-YOLOv8-face model with AMD GPU (ROCm) acceleration.
+Detects human heads from all angles (frontal, profile, rear) by using
+YOLOv8 pose estimation and extracting head bounding boxes from keypoints.
 """

 import os
 from typing import List, Tuple, Optional
-from pathlib import Path
 import numpy as np


-class YOLOFaceDetector:
-    """
-    YOLOv8 face detector with PyTorch ROCm support.
+# COCO pose keypoint indices
+_HEAD_KP = [0, 1, 2, 3, 4]      # nose, left_eye, right_eye, left_ear, right_ear
+_SHOULDER_KP = [5, 6]            # left_shoulder, right_shoulder
+# A keypoint is used only when its confidence is strictly greater than this value
+_KP_CONF_THRESH = 0.3

-    Features:
-    - ROCm GPU acceleration for AMD GPUs
-    - High accuracy face detection
-    - Automatic NMS for overlapping detections
+
+def _head_bbox_from_pose(
+    kp_xy: np.ndarray,
+    kp_conf: np.ndarray,
+    person_x1: float,
+    person_y1: float,
+    person_x2: float,
+    person_y2: float,
+) -> Tuple[int, int, int, int]:
+    """
+    Derive a head box from one person's COCO pose keypoints.
+
+    The first source of evidence that is available wins:
+    - facial keypoints (nose, eyes, ears) rated above ``_KP_CONF_THRESH``;
+    - otherwise shoulder keypoints, with the head placed above them;
+    - otherwise a square hanging from the top edge of the person box.
+
+    Args:
+        kp_xy: Per-keypoint (x, y) coordinates, indexed by COCO keypoint id.
+        kp_conf: Per-keypoint confidence, indexed the same way.
+        person_x1: Left edge of the person box.
+        person_y1: Top edge of the person box.
+        person_x2: Right edge of the person box.
+        person_y2: Bottom edge of the person box (accepted but not used).
+
+    Returns:
+        ``(x, y, w, h)`` as ints. No clipping is applied here, so the box
+        may extend past the person box and ``x`` / ``y`` may be negative.
+    """
+
+    def _confident(indices):
+        # (x, y) pairs of the listed keypoints rated above the threshold
+        return [
+            (float(kp_xy[k][0]), float(kp_xy[k][1]))
+            for k in indices
+            if float(kp_conf[k]) > _KP_CONF_THRESH
+        ]
+
+    def _xywh(left, top, right, bottom):
+        # Truncate the corners to ints first, then derive width and height
+        left, top, right, bottom = int(left), int(top), int(right), int(bottom)
+        return left, top, right - left, bottom - top
+
+    body_w = max(person_x2 - person_x1, 1.0)
+    # Lower bound on the half-extent used by both keypoint-based branches
+    min_half = body_w * 0.20
+
+    # --- Facial keypoints ---
+    face_pts = _confident(_HEAD_KP)
+    if face_pts:
+        xs, ys = zip(*face_pts)
+        lo_x, hi_x = min(xs), max(xs)
+        lo_y, hi_y = min(ys), max(ys)
+        extent = max(hi_x - lo_x, hi_y - lo_y, 1.0)
+        mid_x = (lo_x + hi_x) / 2.0
+        mid_y = (lo_y + hi_y) / 2.0
+
+        # Half-extent of the box: 1.25x the landmark span, never below the floor
+        half = max(extent * 1.25, min_half)
+        # Asymmetric vertically: more room above (scalp) than below (chin)
+        return _xywh(
+            mid_x - half,
+            mid_y - half * 1.15,
+            mid_x + half,
+            mid_y + half * 0.85,
+        )
+
+    # --- Shoulder keypoints ---
+    shoulder_pts = _confident(_SHOULDER_KP)
+    if shoulder_pts:
+        count = len(shoulder_pts)
+        mid_x = sum(pt[0] for pt in shoulder_pts) / count
+        shoulder_y = sum(pt[1] for pt in shoulder_pts) / count
+        if count == 2:
+            shoulder_span = abs(shoulder_pts[1][0] - shoulder_pts[0][0])
+        else:
+            # Only one shoulder seen: assume half of the person width
+            shoulder_span = body_w * 0.5
+        half = max(shoulder_span * 0.5, min_half)
+        # Head centre sits above the shoulder line
+        mid_y = shoulder_y - half * 1.3
+        return _xywh(mid_x - half, mid_y - half, mid_x + half, mid_y + half)
+
+    # --- Top of the person box ---
+    half = max(body_w * 0.35, 20.0)
+    mid_x = (person_x1 + person_x2) / 2.0
+    return _xywh(mid_x - half, person_y1, mid_x + half, person_y1 + half * 2.0)
+
+
+class YOLOPoseHeadDetector:
+    """
+    Head detector using YOLOv8 pose estimation with PyTorch ROCm support.
+
+    Extracts head bounding boxes from COCO pose keypoints (nose, eyes, ears)
+    so that detection works regardless of the person's facing direction.
    """

-    # Default model path relative to this file
-    DEFAULT_MODEL = "yolov8n-face-lindevs.pt"
+    # Standard Ultralytics model — auto-downloaded on first use
+    DEFAULT_MODEL = "yolov8n-pose.pt"

    def __init__(
        self,
@ -31,15 +107,6 @@ class YOLOFaceDetector:
        iou_threshold: float = 0.45,
        input_size: Tuple[int, int] = (640, 640),
    ):
-        """
-        Initialize the YOLO face detector.
-
-        Args:
-            model_path: Path to PyTorch model file. If None, uses default model.
-            conf_threshold: Confidence threshold for detections
-            iou_threshold: IoU threshold for NMS
-            input_size: Model input size (width, height)
-        """
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_size = input_size
@ -49,23 +116,19 @@ class YOLOFaceDetector:

    @property
    def model(self):
-        """Lazy-load YOLO model."""
+        """Lazy-load YOLO pose model."""
        if self._model is None:
            from ultralytics import YOLO
            import torch

-            # Determine model path
-            if self._model_path is None:
-                # Assuming models are in ../models relative to server/detector.py
-                models_dir = Path(__file__).parent.parent / "models"
-                model_path = str(models_dir / self.DEFAULT_MODEL)
-            else:
+            # Use provided path or let Ultralytics auto-download the default
+            if self._model_path is not None:
+                if not os.path.exists(self._model_path):
+                    raise FileNotFoundError(f"Model not found: {self._model_path}")
                model_path = self._model_path
+            else:
+                model_path = self.DEFAULT_MODEL

-            if not os.path.exists(model_path):
-                raise FileNotFoundError(f"Model not found: {model_path}")
-
-            # Detect device (ROCm GPU or CPU)
            if torch.cuda.is_available():
                self._device = 'cuda'
                device_name = torch.cuda.get_device_name(0)
@ -74,25 +137,47 @@ class YOLOFaceDetector:
                self._device = 'cpu'
                print("[FaceMask] Using CPU for inference (ROCm GPU not available)")

-            # Load model (let Ultralytics handle device management)
            try:
                self._model = YOLO(model_path)
-                # Don't call .to() - let predict() handle device assignment
-                print(f"[FaceMask] Model loaded, will use device: {self._device}")
+                print(f"[FaceMask] Pose model loaded: {model_path}")
+                print(f"[FaceMask] Device: {self._device}")
            except Exception as e:
                print(f"[FaceMask] Error loading model: {e}")
                import traceback
                traceback.print_exc()
                raise

-            print(f"[FaceMask] YOLO model loaded: {model_path}")
-            print(f"[FaceMask] Device: {self._device}")
-
        return self._model

+    def _results_to_detections(self, result) -> List[Tuple[int, int, int, int, float]]:
+        """
+        Convert a single YOLO pose result to (x, y, w, h, conf) tuples.
+
+        Args:
+            result: One per-image result from ``self.model.predict``; its
+                ``boxes`` and ``keypoints`` attributes are read.
+
+        Returns:
+            One ``(x, y, w, h, conf)`` tuple per detected person. The box is
+            the head region computed by ``_head_bbox_from_pose`` and ``conf``
+            is the confidence of the person box. Empty when the result has no
+            boxes or no keypoints.
+        """
+        boxes = result.boxes
+        keypoints = result.keypoints
+        if boxes is None or keypoints is None or len(boxes) == 0:
+            return []
+
+        # Copy each tensor to host memory once per frame instead of issuing
+        # three device-to-host transfers for every detected person.
+        confs = boxes.conf.cpu().numpy()        # one confidence per person
+        corners = boxes.xyxy.cpu().numpy()      # one (x1, y1, x2, y2) row per person
+        kp_all = keypoints.data.cpu().numpy()   # per person: rows of (x, y, conf)
+
+        detections = []
+        for conf, (x1, y1, x2, y2), kp in zip(confs, corners, kp_all):
+            hx, hy, hw, hh = _head_bbox_from_pose(
+                kp[:, :2], kp[:, 2],
+                float(x1), float(y1), float(x2), float(y2),
+            )
+            detections.append((hx, hy, hw, hh, float(conf)))
+
+        return detections
+
    def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
        """
-        Detect faces in a frame.
+        Detect heads in a frame.

        Args:
            frame: BGR image as numpy array (H, W, C)
@ -100,7 +185,6 @@ class YOLOFaceDetector:
        Returns:
            List of detections as (x, y, width, height, confidence)
        """
-        # Run inference
        import torch
        print(f"[FaceMask] Inference device: {self._device}, CUDA available: {torch.cuda.is_available()}")
        try:
@ -116,7 +200,6 @@ class YOLOFaceDetector:
            print(f"[FaceMask] ERROR during inference: {e}")
            import traceback
            traceback.print_exc()
-            # Fallback to CPU
            print("[FaceMask] Falling back to CPU inference...")
            self._device = 'cpu'
            results = self.model.predict(
@ -128,28 +211,13 @@ class YOLOFaceDetector:
                device='cpu',
            )

-        # Extract detections
-        detections = []
-        if len(results) > 0 and results[0].boxes is not None:
-            boxes = results[0].boxes
-            for box in boxes:
-                # Get coordinates in xyxy format
-                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-                conf = float(box.conf[0].cpu().numpy())
-
-                # Convert to x, y, width, height
-                x = int(x1)
-                y = int(y1)
-                w = int(x2 - x1)
-                h = int(y2 - y1)
-
-                detections.append((x, y, w, h, conf))
-
-        return detections
+        if results:
+            return self._results_to_detections(results[0])
+        return []

    def detect_batch(self, frames: List[np.ndarray]) -> List[List[Tuple[int, int, int, int, float]]]:
        """
-        Detect faces in multiple frames at once (batch processing).
+        Detect heads in multiple frames at once (batch processing).

        Args:
            frames: List of BGR images as numpy arrays (H, W, C)
@ -161,7 +229,6 @@ class YOLOFaceDetector:
        if not frames:
            return []

-        # Run batch inference
        try:
            results = self.model.predict(
                frames,
@ -175,7 +242,6 @@ class YOLOFaceDetector:
            print(f"[FaceMask] ERROR during batch inference: {e}")
            import traceback
            traceback.print_exc()
-            # Fallback to CPU
            print("[FaceMask] Falling back to CPU inference...")
            self._device = 'cpu'
            results = self.model.predict(
@ -187,28 +253,7 @@ class YOLOFaceDetector:
                device='cpu',
            )

-        # Extract detections for each frame
-        all_detections = []
-        for result in results:
-            detections = []
-            if result.boxes is not None:
-                boxes = result.boxes
-                for box in boxes:
-                    # Get coordinates in xyxy format
-                    x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-                    conf = float(box.conf[0].cpu().numpy())
-
-                    # Convert to x, y, width, height
-                    x = int(x1)
-                    y = int(y1)
-                    w = int(x2 - x1)
-                    h = int(y2 - y1)
-
-                    detections.append((x, y, w, h, conf))
-
-            all_detections.append(detections)
-
-        return all_detections
+        return [self._results_to_detections(r) for r in results]

    def generate_mask(
        self,
@ -218,11 +263,11 @@ class YOLOFaceDetector:
        feather_radius: int = 20,
    ) -> np.ndarray:
        """
-        Generate a mask image from face detections.
+        Generate a mask image from head detections.

        Args:
            frame_shape: Shape of the original frame (height, width, channels)
-            detections: List of face detections (x, y, w, h, conf)
+            detections: List of head detections (x, y, w, h, conf)
            mask_scale: Scale factor for mask region
            feather_radius: Radius for edge feathering

@ -235,25 +280,19 @@ class YOLOFaceDetector:
        mask = np.zeros((height, width), dtype=np.uint8)

        for (x, y, w, h, conf) in detections:
-            # Scale the bounding box
            center_x = x + w // 2
            center_y = y + h // 2
-
            scaled_w = int(w * mask_scale)
            scaled_h = int(h * mask_scale)

-            # Draw ellipse for natural face shape
            cv2.ellipse(
                mask,
                (center_x, center_y),
                (scaled_w // 2, scaled_h // 2),
-                0,  # angle
-                0, 360,  # arc
-                255,  # color (white)
-                -1,  # filled
+                0, 0, 360,
+                255, -1,
            )

-        # Apply Gaussian blur for feathering
        if feather_radius > 0 and len(detections) > 0:
            kernel_size = feather_radius * 2 + 1
            mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)
@ -262,12 +301,12 @@ class YOLOFaceDetector:


 # Singleton instance
-_detector: Optional[YOLOFaceDetector] = None
+_detector: Optional[YOLOPoseHeadDetector] = None


-def get_detector(**kwargs) -> YOLOFaceDetector:
-    """Get or create the global YOLO detector instance."""
+def get_detector(**kwargs) -> YOLOPoseHeadDetector:
+    """
+    Get or create the global YOLO pose head detector instance.
+
+    Keyword arguments are forwarded to ``YOLOPoseHeadDetector`` only on the
+    call that creates the instance; once ``_detector`` is set, later calls
+    return it unchanged and their arguments are ignored.
+    """
    global _detector
    if _detector is None:
-        _detector = YOLOFaceDetector(**kwargs)
+        _detector = YOLOPoseHeadDetector(**kwargs)
    return _detector