"""
YOLOv11 Face Detector using ONNX Runtime with GPU support.

This module provides high-performance face detection using the
YOLOv11-face model with CUDA acceleration.
"""
|
|
|
|
import os
|
|
from typing import List, Tuple, Optional
|
|
from pathlib import Path
|
|
import numpy as np
|
|
|
|
|
|
class YOLOFaceDetector:
    """
    YOLOv11 face detector with ONNX Runtime GPU support.

    Features:
    - CUDA GPU acceleration
    - High accuracy face detection
    - NMS for overlapping detections
    """

    # Default model filename; resolved against ../models (relative to this
    # file's parent) when no explicit model_path is supplied.
    DEFAULT_MODEL = "yolov11n-face.onnx"

    def __init__(
        self,
        model_path: Optional[str] = None,
        conf_threshold: float = 0.25,
        iou_threshold: float = 0.45,
        input_size: Tuple[int, int] = (640, 640),
    ):
        """
        Initialize the YOLO face detector.

        Args:
            model_path: Path to ONNX model file. If None, uses default model.
            conf_threshold: Confidence threshold for detections.
            iou_threshold: IoU threshold for NMS.
            input_size: Model input size. Nominally (width, height), but the
                letterbox preprocessing indexes it as (height, width) — the two
                conventions only agree for square sizes such as the default
                (640, 640), so keep input_size square until that inconsistency
                is resolved.
        """
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_size = input_size
        # The ONNX Runtime session is created lazily on first access.
        self._session = None
        self._model_path = model_path
|
|
|
|
@property
|
|
def session(self):
|
|
"""Lazy-load ONNX Runtime session."""
|
|
if self._session is None:
|
|
import onnxruntime as ort
|
|
|
|
# Determine model path
|
|
if self._model_path is None:
|
|
# Assuming models are in ../models relative to server/detector.py
|
|
models_dir = Path(__file__).parent.parent / "models"
|
|
model_path = str(models_dir / self.DEFAULT_MODEL)
|
|
else:
|
|
model_path = self._model_path
|
|
|
|
if not os.path.exists(model_path):
|
|
raise FileNotFoundError(f"Model not found: {model_path}")
|
|
|
|
# Configure providers (prefer CUDA)
|
|
providers = []
|
|
if 'CUDAExecutionProvider' in ort.get_available_providers():
|
|
providers.append('CUDAExecutionProvider')
|
|
print("[FaceMask] Using CUDA GPU for inference")
|
|
providers.append('CPUExecutionProvider')
|
|
|
|
# Create session
|
|
sess_options = ort.SessionOptions()
|
|
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
|
|
|
self._session = ort.InferenceSession(
|
|
model_path,
|
|
sess_options=sess_options,
|
|
providers=providers,
|
|
)
|
|
|
|
print(f"[FaceMask] YOLO model loaded: {model_path}")
|
|
print(f"[FaceMask] Providers: {self._session.get_providers()}")
|
|
|
|
return self._session
|
|
|
|
def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
|
|
"""
|
|
Detect faces in a frame.
|
|
|
|
Args:
|
|
frame: BGR image as numpy array (H, W, C)
|
|
|
|
Returns:
|
|
List of detections as (x, y, width, height, confidence)
|
|
"""
|
|
import cv2
|
|
|
|
original_height, original_width = frame.shape[:2]
|
|
|
|
input_tensor = self._preprocess(frame)
|
|
# print(f"[DEBUG] Input tensor shape: {input_tensor.shape}, Range: [{input_tensor.min():.3f}, {input_tensor.max():.3f}]", flush=True)
|
|
|
|
# Run inference
|
|
input_name = self.session.get_inputs()[0].name
|
|
outputs = self.session.run(None, {input_name: input_tensor})
|
|
|
|
raw_output = outputs[0]
|
|
# print(f"[DEBUG] Raw output shape: {raw_output.shape}, Range: [{raw_output.min():.3f}, {raw_output.max():.3f}]", flush=True)
|
|
|
|
# Postprocess
|
|
detections = self._postprocess(
|
|
raw_output,
|
|
original_width,
|
|
original_height,
|
|
)
|
|
# print(f"[DEBUG] Detections found: {len(detections)}", flush=True)
|
|
|
|
return detections
|
|
|
|
def _preprocess(self, frame: np.ndarray) -> np.ndarray:
|
|
"""Preprocess frame for YOLO input with letterbox resizing."""
|
|
import cv2
|
|
|
|
# Letterbox resize
|
|
shape = frame.shape[:2] # current shape [height, width]
|
|
new_shape = self.input_size
|
|
|
|
# Scale ratio (new / old)
|
|
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
|
|
|
|
# Compute padding
|
|
ratio = r, r # width, height ratios
|
|
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
|
|
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
|
|
|
|
dw /= 2 # divide padding into 2 sides
|
|
dh /= 2
|
|
|
|
if shape[::-1] != new_unpad: # resize
|
|
frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR)
|
|
|
|
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
|
|
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
|
|
|
|
# Add border
|
|
frame = cv2.copyMakeBorder(frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
|
|
|
|
# Store metadata for postprocessing
|
|
self._last_letterbox_meta = {'ratio': ratio, 'dwdh': (dw, dh)}
|
|
|
|
# Convert BGR to RGB
|
|
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
|
|
|
# Normalize to [0, 1]
|
|
normalized = rgb.astype(np.float32) / 255.0
|
|
|
|
# Transpose to CHW format
|
|
transposed = np.transpose(normalized, (2, 0, 1))
|
|
|
|
# Add batch dimension
|
|
batched = np.expand_dims(transposed, axis=0)
|
|
|
|
return batched
|
|
|
|
def _postprocess(
|
|
self,
|
|
output: np.ndarray,
|
|
original_width: int,
|
|
original_height: int,
|
|
) -> List[Tuple[int, int, int, int, float]]:
|
|
"""
|
|
Postprocess YOLO output to get detections.
|
|
"""
|
|
# Output shape: [1, num_detections, 5+] where 5 = x_center, y_center, w, h, conf
|
|
|
|
# Handle different output formats
|
|
if output.shape[1] < output.shape[2]:
|
|
# Format: [1, 5+, num_detections] - transpose
|
|
output = np.transpose(output[0], (1, 0))
|
|
else:
|
|
output = output[0]
|
|
|
|
# Debug confidence stats
|
|
# if output.shape[1] >= 5:
|
|
# max_conf = output[:, 4].max()
|
|
# print(f"[DEBUG] Max confidence in raw output: {max_conf:.4f}", flush=True)
|
|
|
|
# Filter by confidence
|
|
confidences = output[:, 4]
|
|
mask = confidences > self.conf_threshold
|
|
filtered = output[mask]
|
|
|
|
if len(filtered) == 0:
|
|
return []
|
|
|
|
# Get letterbox metadata
|
|
if hasattr(self, '_last_letterbox_meta') and self._last_letterbox_meta:
|
|
ratio = self._last_letterbox_meta['ratio']
|
|
dw, dh = self._last_letterbox_meta['dwdh']
|
|
|
|
# Extract coordinates
|
|
x_center = filtered[:, 0]
|
|
y_center = filtered[:, 1]
|
|
width = filtered[:, 2]
|
|
height = filtered[:, 3]
|
|
confidences = filtered[:, 4]
|
|
|
|
# Convert center to corner
|
|
x1 = x_center - width / 2
|
|
y1 = y_center - height / 2
|
|
x2 = x_center + width / 2
|
|
y2 = y_center + height / 2
|
|
|
|
# Adjust for letterbox padding
|
|
x1 -= dw
|
|
y1 -= dh
|
|
x2 -= dw
|
|
y2 -= dh
|
|
|
|
# Adjust for resizing
|
|
x1 /= ratio[0]
|
|
y1 /= ratio[1]
|
|
x2 /= ratio[0]
|
|
y2 /= ratio[1]
|
|
|
|
# Clip to image bounds
|
|
x1 = np.clip(x1, 0, original_width)
|
|
y1 = np.clip(y1, 0, original_height)
|
|
x2 = np.clip(x2, 0, original_width)
|
|
y2 = np.clip(y2, 0, original_height)
|
|
|
|
# Convert back to x, y, w, h
|
|
final_x = x1
|
|
final_y = y1
|
|
final_w = x2 - x1
|
|
final_h = y2 - y1
|
|
|
|
else:
|
|
# Fallback for non-letterbox (legacy)
|
|
scale_x = original_width / self.input_size[0]
|
|
scale_y = original_height / self.input_size[1]
|
|
|
|
x_center = filtered[:, 0] * scale_x
|
|
y_center = filtered[:, 1] * scale_y
|
|
width = filtered[:, 2] * scale_x
|
|
height = filtered[:, 3] * scale_y
|
|
confidences = filtered[:, 4]
|
|
|
|
final_x = x_center - width / 2
|
|
final_y = y_center - height / 2
|
|
final_w = width
|
|
final_h = height
|
|
|
|
# Apply NMS
|
|
boxes = np.stack([final_x, final_y, final_w, final_h], axis=1)
|
|
indices = self._nms(boxes, confidences, self.iou_threshold)
|
|
|
|
# Format output
|
|
detections = []
|
|
for i in indices:
|
|
x = int(final_x[i])
|
|
y = int(final_y[i])
|
|
w = int(final_w[i])
|
|
h = int(final_h[i])
|
|
conf = float(confidences[i])
|
|
detections.append((x, y, w, h, conf))
|
|
|
|
return detections
|
|
|
|
def _nms(
|
|
self,
|
|
boxes: np.ndarray,
|
|
scores: np.ndarray,
|
|
iou_threshold: float,
|
|
) -> List[int]:
|
|
"""Non-Maximum Suppression."""
|
|
x1 = boxes[:, 0]
|
|
y1 = boxes[:, 1]
|
|
x2 = x1 + boxes[:, 2]
|
|
y2 = y1 + boxes[:, 3]
|
|
|
|
areas = boxes[:, 2] * boxes[:, 3]
|
|
order = scores.argsort()[::-1]
|
|
|
|
keep = []
|
|
while len(order) > 0:
|
|
i = order[0]
|
|
keep.append(i)
|
|
|
|
if len(order) == 1:
|
|
break
|
|
|
|
xx1 = np.maximum(x1[i], x1[order[1:]])
|
|
yy1 = np.maximum(y1[i], y1[order[1:]])
|
|
xx2 = np.minimum(x2[i], x2[order[1:]])
|
|
yy2 = np.minimum(y2[i], y2[order[1:]])
|
|
|
|
w = np.maximum(0, xx2 - xx1)
|
|
h = np.maximum(0, yy2 - yy1)
|
|
inter = w * h
|
|
|
|
iou = inter / (areas[i] + areas[order[1:]] - inter)
|
|
|
|
inds = np.where(iou <= iou_threshold)[0]
|
|
order = order[inds + 1]
|
|
|
|
return keep
|
|
|
|
def generate_mask(
|
|
self,
|
|
frame_shape: Tuple[int, int, int],
|
|
detections: List[Tuple[int, int, int, int, float]],
|
|
mask_scale: float = 1.5,
|
|
feather_radius: int = 20,
|
|
) -> np.ndarray:
|
|
"""
|
|
Generate a mask image from face detections.
|
|
|
|
Args:
|
|
frame_shape: Shape of the original frame (height, width, channels)
|
|
detections: List of face detections (x, y, w, h, conf)
|
|
mask_scale: Scale factor for mask region
|
|
feather_radius: Radius for edge feathering
|
|
|
|
Returns:
|
|
Grayscale mask image (white = blur, black = keep)
|
|
"""
|
|
import cv2
|
|
|
|
height, width = frame_shape[:2]
|
|
mask = np.zeros((height, width), dtype=np.uint8)
|
|
|
|
for (x, y, w, h, conf) in detections:
|
|
# Scale the bounding box
|
|
center_x = x + w // 2
|
|
center_y = y + h // 2
|
|
|
|
scaled_w = int(w * mask_scale)
|
|
scaled_h = int(h * mask_scale)
|
|
|
|
# Draw ellipse for natural face shape
|
|
cv2.ellipse(
|
|
mask,
|
|
(center_x, center_y),
|
|
(scaled_w // 2, scaled_h // 2),
|
|
0, # angle
|
|
0, 360, # arc
|
|
255, # color (white)
|
|
-1, # filled
|
|
)
|
|
|
|
# Apply Gaussian blur for feathering
|
|
if feather_radius > 0 and len(detections) > 0:
|
|
kernel_size = feather_radius * 2 + 1
|
|
mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)
|
|
|
|
return mask
|
|
|
|
|
|
# Singleton instance
_detector: Optional[YOLOFaceDetector] = None


def get_detector(**kwargs) -> YOLOFaceDetector:
    """
    Get or create the global YOLO detector instance.

    Note:
        kwargs are only honored on the first call; subsequent calls return
        the existing singleton and silently ignore any arguments passed.
    """
    global _detector
    if _detector is None:
        _detector = YOLOFaceDetector(**kwargs)
    return _detector
|