# blender-mask-peoples/server/detector.py

"""
YOLOv11 Face Detector using ONNX Runtime with GPU support.

This module provides high-performance face detection using
YOLOv11-face model with CUDA acceleration.
"""

import os
from typing import List, Tuple, Optional
from pathlib import Path
import numpy as np


class YOLOFaceDetector:
    """
    YOLOv11 face detector with ONNX Runtime GPU support.

    Features:
    - CUDA GPU acceleration (falls back to CPU when CUDA is unavailable)
    - Letterbox preprocessing with boxes mapped back to frame coordinates
    - NMS for overlapping detections

    Note:
        ``_preprocess`` stores the letterbox geometry of the most recent frame
        on the instance and ``_postprocess`` reads it back, so one instance
        should not be driven from several threads at once.
    """

    # File name of the default model, looked up in ../models relative to this file
    DEFAULT_MODEL = "yolov11n-face.onnx"

    def __init__(
        self,
        model_path: Optional[str] = None,
        conf_threshold: float = 0.25,
        iou_threshold: float = 0.45,
        input_size: Tuple[int, int] = (640, 640),
    ):
        """
        Initialize the YOLO face detector.

        No model is loaded here; the ONNX Runtime session is created on the
        first access to ``session``.

        Args:
            model_path: Path to ONNX model file. If None, uses default model.
            conf_threshold: Confidence threshold for detections
            iou_threshold: IoU threshold for NMS
            input_size: Model input size (width, height)
        """
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_size = input_size
        self._session = None
        self._model_path = model_path
        # Letterbox geometry of the last preprocessed frame (None until
        # _preprocess has run at least once).
        self._last_letterbox_meta = None

    @property
    def session(self):
        """
        Lazy-load ONNX Runtime session.

        Returns:
            The cached ``onnxruntime.InferenceSession``; created on first access.

        Raises:
            FileNotFoundError: If the resolved model file does not exist.
        """
        if self._session is None:
            import onnxruntime as ort

            # Determine model path
            if self._model_path is None:
                # Assuming models are in ../models relative to server/detector.py
                models_dir = Path(__file__).parent.parent / "models"
                model_path = str(models_dir / self.DEFAULT_MODEL)
            else:
                model_path = self._model_path

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model not found: {model_path}")

            # Configure providers (prefer CUDA, always keep CPU as a fallback)
            providers = []
            if 'CUDAExecutionProvider' in ort.get_available_providers():
                providers.append('CUDAExecutionProvider')
                print("[FaceMask] Using CUDA GPU for inference")
            providers.append('CPUExecutionProvider')

            # Create session
            sess_options = ort.SessionOptions()
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

            self._session = ort.InferenceSession(
                model_path,
                sess_options=sess_options,
                providers=providers,
            )

            print(f"[FaceMask] YOLO model loaded: {model_path}")
            print(f"[FaceMask] Providers: {self._session.get_providers()}")

        return self._session

    def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
        """
        Detect faces in a frame.

        Args:
            frame: BGR image as numpy array (H, W, C)

        Returns:
            List of detections as (x, y, width, height, confidence), in
            original-frame pixel coordinates. A ``None`` or empty frame yields
            an empty list without touching the model.
        """
        # Nothing to analyse: avoid a division by zero in the letterbox math.
        if frame is None or frame.size == 0:
            return []

        original_height, original_width = frame.shape[:2]

        input_tensor = self._preprocess(frame)

        # Run inference
        session = self.session
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: input_tensor})

        # Only the first output is decoded.
        return self._postprocess(outputs[0], original_width, original_height)

    def _preprocess(self, frame: np.ndarray) -> np.ndarray:
        """
        Preprocess frame for YOLO input with letterbox resizing.

        The frame is scaled uniformly to fit inside ``input_size``
        (width, height), padded with gray (114) borders, converted from BGR to
        RGB, scaled to [0, 1] and laid out as a float32 NCHW batch of one.

        Side effect: records the scale and the applied left/top padding in
        ``self._last_letterbox_meta`` for ``_postprocess``.

        Args:
            frame: BGR image as numpy array (H, W, C)

        Returns:
            Array of shape (1, C, target_height, target_width), dtype float32.
        """
        import cv2

        src_h, src_w = frame.shape[:2]
        # input_size is documented as (width, height); unpack it that way so
        # non-square model inputs are letterboxed along the correct axes.
        target_w, target_h = self.input_size

        # Scale ratio (new / old), identical on both axes to keep aspect ratio
        scale = min(target_w / src_w, target_h / src_h)

        # Size after scaling, before padding. Never let a side collapse to 0,
        # which cv2.resize would reject for extreme aspect ratios.
        resized_w = max(1, int(round(src_w * scale)))
        resized_h = max(1, int(round(src_h * scale)))

        # Total padding split evenly over both sides
        pad_w = (target_w - resized_w) / 2
        pad_h = (target_h - resized_h) / 2

        if (src_w, src_h) != (resized_w, resized_h):
            frame = cv2.resize(frame, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR)

        # The +/-0.1 nudges send an odd padding pixel to the bottom/right side.
        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))

        # Add border
        frame = cv2.copyMakeBorder(
            frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        # Store metadata for postprocessing. The offset is the border that was
        # actually drawn (left/top), not the unrounded half-padding, so boxes
        # map back without a half-pixel shift.
        self._last_letterbox_meta = {
            'ratio': (scale, scale),  # width, height ratios
            'dwdh': (float(left), float(top)),
        }

        # Convert BGR to RGB
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Normalize to [0, 1]
        normalized = rgb.astype(np.float32) / 255.0

        # HWC -> CHW, then add the batch dimension
        batched = np.expand_dims(np.transpose(normalized, (2, 0, 1)), axis=0)

        # Hand the runtime a dense C-ordered buffer rather than a strided view.
        return np.ascontiguousarray(batched)

    def _postprocess(
        self,
        output: np.ndarray,
        original_width: int,
        original_height: int,
    ) -> List[Tuple[int, int, int, int, float]]:
        """
        Postprocess YOLO output to get detections.

        Args:
            output: Raw model output, either [1, num_candidates, 5+] or
                [1, 5+, num_candidates]; the first five values per candidate
                are read as x_center, y_center, w, h, conf in model-input
                pixels.
            original_width: Width of the frame that was passed to ``detect``.
            original_height: Height of the frame that was passed to ``detect``.

        Returns:
            Detections as (x, y, width, height, confidence) in original-frame
            pixels, clipped to the frame, with empty boxes removed and NMS
            applied.
        """
        # Handle different output formats: bring to [num_candidates, 5+].
        # Heuristic: the shorter of the two trailing axes is the attribute axis.
        predictions = output[0]
        if output.shape[1] < output.shape[2]:
            predictions = predictions.T

        # Filter by confidence
        candidates = predictions[predictions[:, 4] > self.conf_threshold]
        if len(candidates) == 0:
            return []

        confidences = candidates[:, 4]
        half_w = candidates[:, 2] / 2
        half_h = candidates[:, 3] / 2

        # Convert center/size to corners (still in model-input coordinates)
        x1 = candidates[:, 0] - half_w
        y1 = candidates[:, 1] - half_h
        x2 = candidates[:, 0] + half_w
        y2 = candidates[:, 1] + half_h

        meta = getattr(self, '_last_letterbox_meta', None)
        if meta:
            # Undo the letterbox: remove the padding, then undo the scaling.
            ratio_w, ratio_h = meta['ratio']
            pad_w, pad_h = meta['dwdh']
            x1 = (x1 - pad_w) / ratio_w
            x2 = (x2 - pad_w) / ratio_w
            y1 = (y1 - pad_h) / ratio_h
            y2 = (y2 - pad_h) / ratio_h
        else:
            # Fallback for non-letterbox (legacy): plain per-axis stretch
            scale_x = original_width / self.input_size[0]
            scale_y = original_height / self.input_size[1]
            x1 = x1 * scale_x
            x2 = x2 * scale_x
            y1 = y1 * scale_y
            y2 = y2 * scale_y

        # Clip to image bounds (applied on both paths for consistency)
        x1 = np.clip(x1, 0, original_width)
        y1 = np.clip(y1, 0, original_height)
        x2 = np.clip(x2, 0, original_width)
        y2 = np.clip(y2, 0, original_height)

        # Convert back to x, y, w, h
        box_w = x2 - x1
        box_h = y2 - y1

        # Boxes that lay entirely in the padding / outside the frame have been
        # clipped to zero size; they are not real detections.
        valid = (box_w > 0) & (box_h > 0)
        if not valid.any():
            return []

        boxes = np.stack([x1, y1, box_w, box_h], axis=1)[valid]
        confidences = confidences[valid]

        # Apply NMS
        keep = self._nms(boxes, confidences, self.iou_threshold)

        # Format output
        return [
            (
                int(boxes[i, 0]),
                int(boxes[i, 1]),
                int(boxes[i, 2]),
                int(boxes[i, 3]),
                float(confidences[i]),
            )
            for i in keep
        ]

    def _nms(
        self,
        boxes: np.ndarray,
        scores: np.ndarray,
        iou_threshold: float,
    ) -> List[int]:
        """
        Non-Maximum Suppression.

        Args:
            boxes: Array of shape (N, 4) holding (x, y, w, h) per box.
            scores: Array of N confidence scores.
            iou_threshold: A box is dropped when its IoU with an already kept,
                higher-scoring box is strictly greater than this value.

        Returns:
            Indices (plain Python ints) of the kept boxes, highest score first.
        """
        # Work in float64 so integer inputs do not truncate the IoU division.
        boxes = np.asarray(boxes, dtype=np.float64).reshape(-1, 4)
        scores = np.asarray(scores)

        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = x1 + boxes[:, 2]
        y2 = y1 + boxes[:, 3]
        areas = boxes[:, 2] * boxes[:, 3]

        # Candidate indices, best score first
        order = scores.argsort()[::-1]

        keep: List[int] = []
        while order.size > 0:
            best = order[0]
            keep.append(int(best))

            rest = order[1:]
            if rest.size == 0:
                break

            # Intersection of the best box with every remaining box
            inter_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
            inter_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
            inter = inter_w * inter_h

            union = areas[best] + areas[rest] - inter
            # Zero-area pairs have an empty union; treat their IoU as 0
            # instead of dividing by zero and producing NaN.
            iou = np.divide(inter, union, out=np.zeros_like(inter), where=union > 0)

            order = rest[iou <= iou_threshold]

        return keep

    def generate_mask(
        self,
        frame_shape: Tuple[int, int, int],
        detections: List[Tuple[int, int, int, int, float]],
        mask_scale: float = 1.5,
        feather_radius: int = 20,
    ) -> np.ndarray:
        """
        Generate a mask image from face detections.

        Each detection is drawn as a filled ellipse centred on its box and
        enlarged by ``mask_scale``; the result is optionally softened with a
        Gaussian blur.

        Args:
            frame_shape: Shape of the original frame (height, width, channels)
            detections: List of face detections (x, y, w, h, conf)
            mask_scale: Scale factor for mask region
            feather_radius: Radius for edge feathering; values <= 0 disable it

        Returns:
            Grayscale uint8 mask image (white = blur, black = keep)
        """
        import cv2

        height, width = frame_shape[:2]
        mask = np.zeros((height, width), dtype=np.uint8)

        if len(detections) == 0:
            return mask

        for (x, y, w, h, _conf) in detections:
            # OpenCV drawing calls need plain integer coordinates.
            center = (int(x + w // 2), int(y + h // 2))

            # Half-axes of the scaled bounding box
            axes = (int(w * mask_scale) // 2, int(h * mask_scale) // 2)

            # Draw ellipse for natural face shape
            cv2.ellipse(
                mask,
                center,
                axes,
                0,  # angle
                0, 360,  # arc
                255,  # color (white)
                -1,  # filled
            )

        # Apply Gaussian blur for feathering
        radius = int(feather_radius)
        if radius > 0:
            kernel_size = radius * 2 + 1  # always odd, as GaussianBlur requires
            mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)

        return mask


# Process-wide detector, created on first request and then reused.
_detector: Optional[YOLOFaceDetector] = None


def get_detector(**kwargs) -> YOLOFaceDetector:
    """Return the shared YOLOFaceDetector, building it on first use.

    Keyword arguments are forwarded to the YOLOFaceDetector constructor only
    when the shared instance does not exist yet. Once an instance has been
    created, later calls return it unchanged and their kwargs are not applied.
    """
    global _detector
    if _detector is not None:
        return _detector

    _detector = YOLOFaceDetector(**kwargs)
    return _detector