"""
YOLOv11 Face Detector using ONNX Runtime with GPU support.

This module provides high-performance face detection using YOLOv11-face model
with CUDA acceleration.
"""

import os
from typing import List, Tuple, Optional
from pathlib import Path

import numpy as np


class YOLOFaceDetector:
    """
    YOLOv11 face detector with ONNX Runtime GPU support.

    Features:
    - CUDA GPU acceleration (falls back to CPU when CUDA is unavailable)
    - Letterbox preprocessing and mapping of boxes back to frame coordinates
    - NMS for overlapping detections
    """

    # Default model file name, resolved against ../models relative to this file
    DEFAULT_MODEL = "yolov11n-face.onnx"

    def __init__(
        self,
        model_path: Optional[str] = None,
        conf_threshold: float = 0.25,
        iou_threshold: float = 0.45,
        input_size: Tuple[int, int] = (640, 640),
    ):
        """
        Initialize the YOLO face detector.

        The ONNX Runtime session is not created here; it is built lazily on
        first access of :attr:`session`.

        Args:
            model_path: Path to ONNX model file. If None, uses default model.
            conf_threshold: Confidence threshold for detections
            iou_threshold: IoU threshold for NMS
            input_size: Model input size (width, height)
        """
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_size = input_size
        self._session = None
        self._model_path = model_path
        # Geometry of the most recent letterbox, written by _preprocess and
        # read by _postprocess. None until a frame has been preprocessed.
        self._last_letterbox_meta = None

    @property
    def session(self):
        """
        Lazy-load ONNX Runtime session.

        Raises:
            FileNotFoundError: If the resolved model path does not exist.
        """
        if self._session is None:
            import onnxruntime as ort

            # Determine model path
            if self._model_path is None:
                # Assuming models are in ../models relative to server/detector.py
                models_dir = Path(__file__).parent.parent / "models"
                model_path = str(models_dir / self.DEFAULT_MODEL)
            else:
                model_path = self._model_path

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model not found: {model_path}")

            # Configure providers (prefer CUDA, always keep CPU as fallback)
            providers = []
            if 'CUDAExecutionProvider' in ort.get_available_providers():
                providers.append('CUDAExecutionProvider')
                print("[FaceMask] Using CUDA GPU for inference")
            providers.append('CPUExecutionProvider')

            # Create session
            sess_options = ort.SessionOptions()
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

            self._session = ort.InferenceSession(
                model_path,
                sess_options=sess_options,
                providers=providers,
            )

            print(f"[FaceMask] YOLO model loaded: {model_path}")
            print(f"[FaceMask] Providers: {self._session.get_providers()}")

        return self._session

    def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
        """
        Detect faces in a frame.

        Args:
            frame: BGR image as numpy array (H, W, C)

        Returns:
            List of detections as (x, y, width, height, confidence), in
            original-frame pixel coordinates. An empty or ``None`` frame
            yields an empty list.
        """
        # Nothing to detect; also avoids a division by zero in the letterbox.
        if frame is None or frame.size == 0:
            return []

        original_height, original_width = frame.shape[:2]

        input_tensor = self._preprocess(frame)

        # Run inference
        session = self.session
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: input_tensor})

        return self._postprocess(outputs[0], original_width, original_height)

    def _letterbox_geometry(
        self,
        src_height: int,
        src_width: int,
    ) -> Tuple[float, Tuple[int, int], Tuple[int, int, int, int]]:
        """
        Compute the letterbox resize and padding for a source frame size.

        Args:
            src_height: Height of the source frame in pixels.
            src_width: Width of the source frame in pixels.

        Returns:
            ``(scale, (new_width, new_height), (top, bottom, left, right))``
            where ``scale`` is the uniform resize factor, the second item is
            the resized (unpadded) size and the third is the border, in
            pixels, needed to reach ``input_size``.
        """
        # input_size is documented as (width, height).
        in_w, in_h = self.input_size

        # Uniform scale that fits the frame inside the model input.
        scale = min(in_h / src_height, in_w / src_width)

        # Never let a dimension collapse to 0 for extreme aspect ratios.
        new_w = max(1, int(round(src_width * scale)))
        new_h = max(1, int(round(src_height * scale)))

        # Split the leftover space between the two sides; the +/-0.1 nudges
        # put the extra pixel of an odd remainder on the bottom/right.
        pad_w = (in_w - new_w) / 2
        pad_h = (in_h - new_h) / 2
        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))

        return scale, (new_w, new_h), (top, bottom, left, right)

    def _preprocess(self, frame: np.ndarray) -> np.ndarray:
        """
        Preprocess frame for YOLO input with letterbox resizing.

        Side effect: stores the letterbox geometry in
        ``self._last_letterbox_meta`` for use by :meth:`_postprocess`.

        Args:
            frame: BGR image as numpy array (H, W, C)

        Returns:
            float32 array of shape (1, C, H, W) with RGB values in [0, 1].
        """
        import cv2

        src_h, src_w = frame.shape[:2]
        scale, new_unpad, (top, bottom, left, right) = self._letterbox_geometry(src_h, src_w)

        if (src_w, src_h) != new_unpad:  # resize
            frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR)

        # Add border
        frame = cv2.copyMakeBorder(
            frame, top, bottom, left, right,
            cv2.BORDER_CONSTANT, value=(114, 114, 114),
        )

        # Store metadata for postprocessing. 'dwdh' holds the integer
        # left/top border actually applied, so boxes map back exactly.
        self._last_letterbox_meta = {'ratio': (scale, scale), 'dwdh': (left, top)}

        # Convert BGR to RGB
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Normalize to [0, 1]
        normalized = rgb.astype(np.float32) / 255.0

        # HWC -> CHW, then add batch dimension
        chw = np.ascontiguousarray(np.transpose(normalized, (2, 0, 1)))
        return np.expand_dims(chw, axis=0)

    def _postprocess(
        self,
        output: np.ndarray,
        original_width: int,
        original_height: int,
    ) -> List[Tuple[int, int, int, int, float]]:
        """
        Postprocess YOLO output to get detections.

        Args:
            output: Raw model output, either ``[1, num_detections, 5+]`` or
                ``[1, 5+, num_detections]``; the first five values per
                detection are read as x_center, y_center, w, h, conf.
            original_width: Width of the original frame in pixels.
            original_height: Height of the original frame in pixels.

        Returns:
            List of (x, y, width, height, confidence) after confidence
            filtering and NMS, in original-frame coordinates.
        """
        # Handle different output formats
        preds = output[0]
        if output.shape[1] < output.shape[2]:
            # Format: [1, 5+, num_detections] - transpose
            preds = preds.T

        # Filter by confidence
        kept = preds[preds[:, 4] > self.conf_threshold]
        if len(kept) == 0:
            return []

        x_center, y_center = kept[:, 0], kept[:, 1]
        width, height = kept[:, 2], kept[:, 3]
        confidences = kept[:, 4]

        meta = getattr(self, '_last_letterbox_meta', None)
        if meta:
            ratio_w, ratio_h = meta['ratio']
            pad_left, pad_top = meta['dwdh']

            # Center -> corners, remove letterbox padding, undo the resize
            x1 = (x_center - width / 2 - pad_left) / ratio_w
            y1 = (y_center - height / 2 - pad_top) / ratio_h
            x2 = (x_center + width / 2 - pad_left) / ratio_w
            y2 = (y_center + height / 2 - pad_top) / ratio_h

            # Clip to image bounds
            x1 = np.clip(x1, 0, original_width)
            y1 = np.clip(y1, 0, original_height)
            x2 = np.clip(x2, 0, original_width)
            y2 = np.clip(y2, 0, original_height)

            # Convert back to x, y, w, h
            final_x, final_y = x1, y1
            final_w, final_h = x2 - x1, y2 - y1

            # Boxes that lay entirely in the padding / outside the frame
            # collapse to zero size after clipping; they are not detections.
            valid = (final_w > 0) & (final_h > 0)
            if not valid.all():
                final_x, final_y = final_x[valid], final_y[valid]
                final_w, final_h = final_w[valid], final_h[valid]
                confidences = confidences[valid]
            if len(final_x) == 0:
                return []
        else:
            # Fallback for non-letterbox (legacy): plain stretch to input size
            scale_x = original_width / self.input_size[0]
            scale_y = original_height / self.input_size[1]

            final_w = width * scale_x
            final_h = height * scale_y
            final_x = x_center * scale_x - final_w / 2
            final_y = y_center * scale_y - final_h / 2

        # Apply NMS
        boxes = np.stack([final_x, final_y, final_w, final_h], axis=1)
        indices = self._nms(boxes, confidences, self.iou_threshold)

        # Format output
        return [
            (
                int(final_x[i]),
                int(final_y[i]),
                int(final_w[i]),
                int(final_h[i]),
                float(confidences[i]),
            )
            for i in indices
        ]

    def _nms(
        self,
        boxes: np.ndarray,
        scores: np.ndarray,
        iou_threshold: float,
    ) -> List[int]:
        """
        Non-Maximum Suppression.

        Args:
            boxes: Array of shape (N, 4) holding (x, y, w, h) per row.
            scores: Array of shape (N,) with one score per box.
            iou_threshold: Boxes whose IoU with a kept box exceeds this
                value are suppressed.

        Returns:
            Indices of the kept boxes as plain ints, highest score first.
        """
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = x1 + boxes[:, 2]
        y2 = y1 + boxes[:, 3]
        areas = boxes[:, 2] * boxes[:, 3]

        order = scores.argsort()[::-1]
        keep: List[int] = []

        while order.size > 0:
            best = order[0]
            keep.append(int(best))

            rest = order[1:]
            if rest.size == 0:
                break

            # Intersection of the best box with every remaining box
            inter_w = np.maximum(0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
            inter_h = np.maximum(0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
            inter = inter_w * inter_h

            # Guard the division: two zero-area boxes have a zero union.
            union = areas[best] + areas[rest] - inter
            iou = inter / np.maximum(union, 1e-9)

            order = rest[iou <= iou_threshold]

        return keep

    def generate_mask(
        self,
        frame_shape: Tuple[int, int, int],
        detections: List[Tuple[int, int, int, int, float]],
        mask_scale: float = 1.5,
        feather_radius: int = 20,
    ) -> np.ndarray:
        """
        Generate a mask image from face detections.

        Args:
            frame_shape: Shape of the original frame (height, width, channels)
            detections: List of face detections (x, y, w, h, conf)
            mask_scale: Scale factor for mask region
            feather_radius: Radius for edge feathering

        Returns:
            Grayscale mask image (white = blur, black = keep)
        """
        import cv2

        height, width = frame_shape[:2]
        mask = np.zeros((height, width), dtype=np.uint8)

        for x, y, w, h, _conf in detections:
            # Scale the bounding box about its center
            center_x = x + w // 2
            center_y = y + h // 2
            scaled_w = int(w * mask_scale)
            scaled_h = int(h * mask_scale)

            # Draw ellipse for natural face shape
            cv2.ellipse(
                mask,
                (center_x, center_y),
                (scaled_w // 2, scaled_h // 2),
                0,  # angle
                0, 360,  # arc
                255,  # color (white)
                -1,  # filled
            )

        # Apply Gaussian blur for feathering (kernel size must be odd)
        if feather_radius > 0 and len(detections) > 0:
            kernel_size = feather_radius * 2 + 1
            mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)

        return mask


# Singleton instance
_detector: Optional[YOLOFaceDetector] = None


def get_detector(**kwargs) -> YOLOFaceDetector:
    """
    Get or create the global YOLO detector instance.

    Args:
        **kwargs: Forwarded to :class:`YOLOFaceDetector` when the instance is
            first created. They are ignored on later calls, which return the
            already-created instance.

    Returns:
        The shared :class:`YOLOFaceDetector` instance.
    """
    global _detector
    if _detector is None:
        _detector = YOLOFaceDetector(**kwargs)
    return _detector