From c0ad2a551d6a4b4d737025383f827f9aa4877ff1 Mon Sep 17 00:00:00 2001 From: Hare Date: Fri, 6 Feb 2026 10:13:26 +0900 Subject: [PATCH] YOLO --- .envrc | 1 + .gitignore | 2 +- README.md | 5 + blender_manifest.toml | 10 +- core/__init__.py | 1 - core/async_generator.py | 115 +++++------- core/face_detector.py | 160 ---------------- core/inference_client.py | 159 ++++++++++++++++ flake.lock | 61 ++++++ flake.nix | 48 +++++ operators/generate_mask.py | 30 +-- server/detector.py | 371 +++++++++++++++++++++++++++++++++++++ server/main.py | 180 ++++++++++++++++++ 13 files changed, 893 insertions(+), 250 deletions(-) create mode 100644 .envrc create mode 100644 README.md delete mode 100644 core/face_detector.py create mode 100644 core/inference_client.py create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 server/detector.py create mode 100644 server/main.py diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.gitignore b/.gitignore index 7e66a1c..8508b72 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .mask_cache/ *.mp4 test.blend -wheels/ +models/ # Python __pycache__/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..13ed724 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Blender Plugin: Mask Peoples + +街歩き映像に対して自動モザイクを掛けるために開発しました。 + +使用:https://github.com/akanametov/yolo-face \ No newline at end of file diff --git a/blender_manifest.toml b/blender_manifest.toml index fa5e548..311eb62 100644 --- a/blender_manifest.toml +++ b/blender_manifest.toml @@ -1,9 +1,9 @@ schema_version = "1.0.0" id = "mask_peoples" -version = "0.2.0" +version = "0.3.0" name = "Face Mask Blur" -tagline = "Detect faces and apply blur in VSE for privacy protection" +tagline = "GPU-accelerated face detection and blur in VSE using YOLOv11" maintainer = "Hare" type = "add-on" license = ["SPDX:GPL-3.0-or-later"] @@ -15,11 +15,5 @@ copyright = ["2026 Hare"] # 
Valid tags from Blender extension platform tags = ["Sequencer"] -# Bundled Python wheels - Blender will install these automatically -wheels = [ - "./wheels/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "./wheels/opencv_python_headless-4.13.0.92-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", -] - [permissions] files = "Read video frames and write mask image cache" diff --git a/core/__init__.py b/core/__init__.py index 671bdb1..bb658f4 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -1,5 +1,4 @@ """Core module exports.""" -from .face_detector import FaceDetector from .async_generator import AsyncMaskGenerator, get_generator from .compositor_setup import create_mask_blur_node_tree, get_or_create_blur_node_tree diff --git a/core/async_generator.py b/core/async_generator.py index 7edd3b8..171c425 100644 --- a/core/async_generator.py +++ b/core/async_generator.py @@ -43,14 +43,14 @@ class AsyncMaskGenerator: start_frame: int, end_frame: int, fps: float, - scale_factor: float = 1.1, - min_neighbors: int = 5, + conf_threshold: float = 0.5, + iou_threshold: float = 0.45, mask_scale: float = 1.5, on_complete: Optional[Callable] = None, on_progress: Optional[Callable] = None, ): """ - Start asynchronous mask generation. + Start asynchronous mask generation with YOLO GPU acceleration. 
Args: video_path: Path to source video file @@ -58,8 +58,8 @@ class AsyncMaskGenerator: start_frame: First frame to process end_frame: Last frame to process fps: Video frame rate (for seeking) - scale_factor: Face detection scale factor - min_neighbors: Face detection min neighbors + conf_threshold: YOLO confidence threshold + iou_threshold: YOLO NMS IoU threshold mask_scale: Mask region scale factor on_complete: Callback when processing completes (called from main thread) on_progress: Callback for progress updates (called from main thread) @@ -93,8 +93,8 @@ class AsyncMaskGenerator: start_frame, end_frame, fps, - scale_factor, - min_neighbors, + conf_threshold, + iou_threshold, mask_scale, ), daemon=True, @@ -120,77 +120,62 @@ class AsyncMaskGenerator: start_frame: int, end_frame: int, fps: float, - scale_factor: float, - min_neighbors: int, + conf_threshold: float, + iou_threshold: float, mask_scale: float, ): """ - Worker thread function. Runs face detection and saves masks. - - IMPORTANT: Do NOT use bpy in this function! + Worker thread function. Delegates to inference server and polls status. 
""" - try: - import cv2 - print(f"[FaceMask] OpenCV loaded: {cv2.__version__}") - from .face_detector import FaceDetector - except ImportError as e: - print(f"[FaceMask] Import error: {e}") - self.result_queue.put(("error", str(e))) - return + import time + from .inference_client import get_client try: - # Initialize detector - detector = FaceDetector( - scale_factor=scale_factor, - min_neighbors=min_neighbors, + client = get_client() + + # Start task on server + print(f"[FaceMask] Requesting generation on server...") + task_id = client.generate_mask( + video_path=video_path, + output_dir=output_dir, + start_frame=start_frame, + end_frame=end_frame, + conf_threshold=conf_threshold, + iou_threshold=iou_threshold, + mask_scale=mask_scale, ) + print(f"[FaceMask] Task started: {task_id}") - # Open video - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - print(f"[FaceMask] Failed to open video: {video_path}") - self.result_queue.put(("error", f"Failed to open video: {video_path}")) - return - - total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - print(f"[FaceMask] Video opened, total frames: {total_video_frames}") - - # Process frames - for frame_idx in range(start_frame, end_frame + 1): - if not self.is_running: + # Poll loop + while self.is_running: + status = client.get_task_status(task_id) + state = status.get("status") + + if state == "completed": + self.result_queue.put(("done", output_dir)) + return + + elif state == "failed": + error_msg = status.get("message", "Unknown server error") + print(f"[FaceMask] Server task failed: {error_msg}") + self.result_queue.put(("error", error_msg)) + return + + elif state == "cancelled": self.result_queue.put(("cancelled", None)) return - # Seek to frame - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) - ret, frame = cap.read() - - if not ret: - # Skip unreadable frames - continue - - # Detect faces - detections = detector.detect(frame) - - # Generate mask - mask = detector.generate_mask( - frame.shape, - 
detections, - mask_scale=mask_scale, - ) - - # Save mask - mask_filename = f"mask_{frame_idx:06d}.png" - mask_path = os.path.join(output_dir, mask_filename) - cv2.imwrite(mask_path, mask) - # Report progress - self.progress_queue.put(("progress", frame_idx - start_frame + 1)) + progress = status.get("progress", 0) + if progress > 0: + self.progress_queue.put(("progress", progress)) + + time.sleep(0.5) - cap.release() - - # Report completion - self.result_queue.put(("done", output_dir)) + # If loop exited but task not done, cancel server task + print("[FaceMask] Cancelling server task...") + client.cancel_task(task_id) + self.result_queue.put(("cancelled", None)) except Exception as e: import traceback diff --git a/core/face_detector.py b/core/face_detector.py deleted file mode 100644 index 0d76220..0000000 --- a/core/face_detector.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Face detector using OpenCV Haar Cascades. - -This module provides face detection functionality optimized for -privacy blur in video editing workflows. -""" - -import os -from typing import List, Tuple, Optional -import numpy as np - - -class FaceDetector: - """ - Face detector using OpenCV Haar Cascades. - - Optimized for privacy blur use case: - - Detects frontal faces - - Configurable detection sensitivity - - Generates feathered masks for smooth blur edges - """ - - def __init__( - self, - scale_factor: float = 1.1, - min_neighbors: int = 5, - min_size: Tuple[int, int] = (30, 30), - ): - """ - Initialize the face detector. 
- - Args: - scale_factor: Image pyramid scale factor - min_neighbors: Minimum neighbors for detection - min_size: Minimum face size in pixels - """ - self.scale_factor = scale_factor - self.min_neighbors = min_neighbors - self.min_size = min_size - self._classifier = None - - @property - def classifier(self): - """Lazy-load the Haar cascade classifier.""" - if self._classifier is None: - import cv2 - - # Use haarcascade for frontal face detection - cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml' - - if not os.path.exists(cascade_path): - raise RuntimeError(f"Haar cascade not found: {cascade_path}") - - self._classifier = cv2.CascadeClassifier(cascade_path) - - return self._classifier - - def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int]]: - """ - Detect faces in a frame. - - Args: - frame: BGR image as numpy array - - Returns: - List of face bounding boxes as (x, y, width, height) - """ - import cv2 - - # Convert to grayscale for detection - gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - - # Detect faces - faces = self.classifier.detectMultiScale( - gray, - scaleFactor=self.scale_factor, - minNeighbors=self.min_neighbors, - minSize=self.min_size, - flags=cv2.CASCADE_SCALE_IMAGE, - ) - - # Convert to list of tuples - return [tuple(face) for face in faces] - - def generate_mask( - self, - frame_shape: Tuple[int, int, int], - detections: List[Tuple[int, int, int, int]], - mask_scale: float = 1.5, - feather_radius: int = 20, - ) -> np.ndarray: - """ - Generate a mask image from face detections. 
- - Args: - frame_shape: Shape of the original frame (height, width, channels) - detections: List of face bounding boxes - mask_scale: Scale factor for mask region (1.0 = exact bounding box) - feather_radius: Radius for edge feathering - - Returns: - Grayscale mask image (white = blur, black = keep) - """ - import cv2 - - height, width = frame_shape[:2] - mask = np.zeros((height, width), dtype=np.uint8) - - for (x, y, w, h) in detections: - # Scale the bounding box - center_x = x + w // 2 - center_y = y + h // 2 - - scaled_w = int(w * mask_scale) - scaled_h = int(h * mask_scale) - - # Calculate scaled bounding box - x1 = max(0, center_x - scaled_w // 2) - y1 = max(0, center_y - scaled_h // 2) - x2 = min(width, center_x + scaled_w // 2) - y2 = min(height, center_y + scaled_h // 2) - - # Draw ellipse for more natural face shape - cv2.ellipse( - mask, - (center_x, center_y), - (scaled_w // 2, scaled_h // 2), - 0, # angle - 0, 360, # arc - 255, # color (white) - -1, # filled - ) - - # Apply Gaussian blur for feathering - if feather_radius > 0 and len(detections) > 0: - # Ensure kernel size is odd - kernel_size = feather_radius * 2 + 1 - mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0) - - return mask - - -def detect_faces_batch( - frames: List[np.ndarray], - detector: Optional[FaceDetector] = None, -) -> List[List[Tuple[int, int, int, int]]]: - """ - Detect faces in multiple frames. - - Args: - frames: List of BGR images - detector: Optional detector instance (creates one if not provided) - - Returns: - List of detection lists, one per frame - """ - if detector is None: - detector = FaceDetector() - - return [detector.detect(frame) for frame in frames] diff --git a/core/inference_client.py b/core/inference_client.py new file mode 100644 index 0000000..da43b65 --- /dev/null +++ b/core/inference_client.py @@ -0,0 +1,159 @@ +""" +Client for interacting with the external inference server. 
+ +Manages the server process and handles HTTP communication +using standard library (avoiding requests dependency). +""" + +import subprocess +import time +import json +import urllib.request +import urllib.error +import threading +import os +import signal +from typing import Optional, Dict, Any, Tuple + + +class InferenceClient: + """Client for the YOLO inference server.""" + + SERVER_URL = "http://127.0.0.1:8181" + + def __init__(self): + self.server_process: Optional[subprocess.Popen] = None + self._server_lock = threading.Lock() + + def start_server(self): + """Start the inference server process.""" + with self._server_lock: + if self.is_server_running(): + return + + print("[FaceMask] Starting inference server...") + + # Find project root + # Assuming this file is in core/inference_client.py + root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + server_script = os.path.join(root_dir, "server", "main.py") + + # Use system python (assumed to have dependencies via Nix/venv) + # In user's environment, 'python' should refer to the environment python + python_cmd = "python" + + # Start process + self.server_process = subprocess.Popen( + [python_cmd, server_script], + cwd=root_dir, + text=True, + preexec_fn=os.setsid, # Create new process group + ) + + # Wait for startup + for _ in range(20): # Wait up to 10 seconds + if self.is_server_running(): + print("[FaceMask] Server started successfully") + return + + # Check if process died + if self.server_process.poll() is not None: + raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})") + + time.sleep(0.5) + + raise RuntimeError("Server startup timed out") + + def stop_server(self): + """Stop the inference server.""" + with self._server_lock: + if self.server_process: + print("[FaceMask] Stopping server...") + try: + os.killpg(os.getpgid(self.server_process.pid), signal.SIGTERM) + self.server_process.wait(timeout=3) + except (ProcessLookupError, subprocess.TimeoutExpired): 
+ pass + finally: + self.server_process = None + + def is_server_running(self) -> bool: + """Check if server is responding.""" + try: + with urllib.request.urlopen(f"{self.SERVER_URL}/status", timeout=1) as response: + return response.status == 200 + except (urllib.error.URLError, ConnectionRefusedError, TimeoutError): + return False + + def generate_mask( + self, + video_path: str, + output_dir: str, + start_frame: int, + end_frame: int, + conf_threshold: float, + iou_threshold: float, + mask_scale: float, + ) -> str: + """ + Request mask generation. + + Returns: + task_id (str) + """ + if not self.is_server_running(): + self.start_server() + + data = { + "video_path": video_path, + "output_dir": output_dir, + "start_frame": start_frame, + "end_frame": end_frame, + "conf_threshold": conf_threshold, + "iou_threshold": iou_threshold, + "mask_scale": mask_scale, + } + + req = urllib.request.Request( + f"{self.SERVER_URL}/generate", + data=json.dumps(data).encode('utf-8'), + headers={'Content-Type': 'application/json'}, + method='POST' + ) + + try: + with urllib.request.urlopen(req) as response: + result = json.loads(response.read().decode('utf-8')) + return result['id'] + except urllib.error.HTTPError as e: + raise RuntimeError(f"Server error: {e.read().decode('utf-8')}") + + def get_task_status(self, task_id: str) -> Dict[str, Any]: + """Get status of a task.""" + try: + with urllib.request.urlopen(f"{self.SERVER_URL}/tasks/{task_id}") as response: + return json.loads(response.read().decode('utf-8')) + except urllib.error.HTTPError: + return {"status": "unknown"} + + def cancel_task(self, task_id: str): + """Cancel a task.""" + try: + req = urllib.request.Request( + f"{self.SERVER_URL}/tasks/{task_id}/cancel", + method='POST' + ) + with urllib.request.urlopen(req): + pass + except urllib.error.HTTPError: + pass + + +# Singleton +_client: Optional[InferenceClient] = None + +def get_client() -> InferenceClient: + global _client + if _client is None: + _client = 
InferenceClient()
+    return _client
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..668ac53
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,61 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1770115704,
+        "narHash": "sha256-KHFT9UWOF2yRPlAnSXQJh6uVcgNcWlFqqiAZ7OVlHNc=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "e6eae2ee2110f3d31110d5c222cd395303343b08",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..dbebab7
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,48 @@
+{
+  description = "Blender Face Mask Plugin Development Environment";
+
+  inputs = {
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+    flake-utils.url = "github:numtide/flake-utils";
+  };
+
+  outputs =
+    {
+      self,
+      nixpkgs,
+      flake-utils,
+    }:
+    flake-utils.lib.eachDefaultSystem (
+      system:
+      let
+        pkgs = nixpkgs.legacyPackages.${system};
+      in
+      {
+        devShells.default = pkgs.mkShell {
+          buildInputs = with pkgs; [
+            python311
+            python311Packages.pip
python311Packages.requests + python311Packages.fastapi + python311Packages.uvicorn + python311Packages.numpy + python311Packages.opencv4 + python311Packages.onnxruntime + git + ]; + + shellHook = '' + python --version + blender --version | head -n 1 + + # Pythonパスにカレントディレクトリを追加 + export PYTHONPATH="$PWD:$PYTHONPATH" + + # アドオンのインストールパスを環境変数として設定 + export BLENDER_USER_SCRIPTS="$HOME/.config/blender/5.0/scripts" + export BLENDER_USER_ADDONS="$BLENDER_USER_SCRIPTS/addons" + ''; + }; + } + ); +} diff --git a/operators/generate_mask.py b/operators/generate_mask.py index 4bfcc8f..8a926fb 100644 --- a/operators/generate_mask.py +++ b/operators/generate_mask.py @@ -21,21 +21,21 @@ class SEQUENCER_OT_generate_face_mask(Operator): bl_description = "Detect faces and generate mask image sequence" bl_options = {'REGISTER', 'UNDO'} - # Detection parameters - scale_factor: FloatProperty( - name="Scale Factor", - description="Detection scale factor (larger = faster but less accurate)", - default=1.1, - min=1.01, - max=2.0, + # YOLO Detection parameters + conf_threshold: FloatProperty( + name="Confidence", + description="YOLO confidence threshold (higher = fewer false positives)", + default=0.25, + min=0.1, + max=1.0, ) - min_neighbors: IntProperty( - name="Min Neighbors", - description="Minimum neighbors for detection (higher = fewer false positives)", - default=5, - min=1, - max=20, + iou_threshold: FloatProperty( + name="IOU Threshold", + description="Non-maximum suppression IOU threshold", + default=0.45, + min=0.1, + max=1.0, ) mask_scale: FloatProperty( @@ -133,8 +133,8 @@ class SEQUENCER_OT_generate_face_mask(Operator): start_frame=0, # Frame indices in video end_frame=end_frame - start_frame, fps=fps, - scale_factor=self.scale_factor, - min_neighbors=self.min_neighbors, + conf_threshold=self.conf_threshold, + iou_threshold=self.iou_threshold, mask_scale=self.mask_scale, on_complete=on_complete, on_progress=on_progress, diff --git a/server/detector.py b/server/detector.py 
new file mode 100644 index 0000000..3c03589 --- /dev/null +++ b/server/detector.py @@ -0,0 +1,371 @@ +""" +YOLOv11 Face Detector using ONNX Runtime with GPU support. + +This module provides high-performance face detection using +YOLOv11-face model with CUDA acceleration. +""" + +import os +from typing import List, Tuple, Optional +from pathlib import Path +import numpy as np + + +class YOLOFaceDetector: + """ + YOLOv11 face detector with ONNX Runtime GPU support. + + Features: + - CUDA GPU acceleration + - High accuracy face detection + - NMS for overlapping detections + """ + + # Default model path relative to this file + DEFAULT_MODEL = "yolov11n-face.onnx" + + def __init__( + self, + model_path: Optional[str] = None, + conf_threshold: float = 0.25, + iou_threshold: float = 0.45, + input_size: Tuple[int, int] = (640, 640), + ): + """ + Initialize the YOLO face detector. + + Args: + model_path: Path to ONNX model file. If None, uses default model. + conf_threshold: Confidence threshold for detections + iou_threshold: IoU threshold for NMS + input_size: Model input size (width, height) + """ + self.conf_threshold = conf_threshold + self.iou_threshold = iou_threshold + self.input_size = input_size + self._session = None + self._model_path = model_path + + @property + def session(self): + """Lazy-load ONNX Runtime session.""" + if self._session is None: + import onnxruntime as ort + + # Determine model path + if self._model_path is None: + # Assuming models are in ../models relative to server/detector.py + models_dir = Path(__file__).parent.parent / "models" + model_path = str(models_dir / self.DEFAULT_MODEL) + else: + model_path = self._model_path + + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model not found: {model_path}") + + # Configure providers (prefer CUDA) + providers = [] + if 'CUDAExecutionProvider' in ort.get_available_providers(): + providers.append('CUDAExecutionProvider') + print("[FaceMask] Using CUDA GPU for inference") + 
providers.append('CPUExecutionProvider') + + # Create session + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + + self._session = ort.InferenceSession( + model_path, + sess_options=sess_options, + providers=providers, + ) + + print(f"[FaceMask] YOLO model loaded: {model_path}") + print(f"[FaceMask] Providers: {self._session.get_providers()}") + + return self._session + + def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]: + """ + Detect faces in a frame. + + Args: + frame: BGR image as numpy array (H, W, C) + + Returns: + List of detections as (x, y, width, height, confidence) + """ + import cv2 + + original_height, original_width = frame.shape[:2] + + input_tensor = self._preprocess(frame) + # print(f"[DEBUG] Input tensor shape: {input_tensor.shape}, Range: [{input_tensor.min():.3f}, {input_tensor.max():.3f}]", flush=True) + + # Run inference + input_name = self.session.get_inputs()[0].name + outputs = self.session.run(None, {input_name: input_tensor}) + + raw_output = outputs[0] + # print(f"[DEBUG] Raw output shape: {raw_output.shape}, Range: [{raw_output.min():.3f}, {raw_output.max():.3f}]", flush=True) + + # Postprocess + detections = self._postprocess( + raw_output, + original_width, + original_height, + ) + # print(f"[DEBUG] Detections found: {len(detections)}", flush=True) + + return detections + + def _preprocess(self, frame: np.ndarray) -> np.ndarray: + """Preprocess frame for YOLO input with letterbox resizing.""" + import cv2 + + # Letterbox resize + shape = frame.shape[:2] # current shape [height, width] + new_shape = self.input_size + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + dw /= 2 # divide padding 
into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR) + + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + + # Add border + frame = cv2.copyMakeBorder(frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)) + + # Store metadata for postprocessing + self._last_letterbox_meta = {'ratio': ratio, 'dwdh': (dw, dh)} + + # Convert BGR to RGB + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + + # Normalize to [0, 1] + normalized = rgb.astype(np.float32) / 255.0 + + # Transpose to CHW format + transposed = np.transpose(normalized, (2, 0, 1)) + + # Add batch dimension + batched = np.expand_dims(transposed, axis=0) + + return batched + + def _postprocess( + self, + output: np.ndarray, + original_width: int, + original_height: int, + ) -> List[Tuple[int, int, int, int, float]]: + """ + Postprocess YOLO output to get detections. + """ + # Output shape: [1, num_detections, 5+] where 5 = x_center, y_center, w, h, conf + + # Handle different output formats + if output.shape[1] < output.shape[2]: + # Format: [1, 5+, num_detections] - transpose + output = np.transpose(output[0], (1, 0)) + else: + output = output[0] + + # Debug confidence stats + # if output.shape[1] >= 5: + # max_conf = output[:, 4].max() + # print(f"[DEBUG] Max confidence in raw output: {max_conf:.4f}", flush=True) + + # Filter by confidence + confidences = output[:, 4] + mask = confidences > self.conf_threshold + filtered = output[mask] + + if len(filtered) == 0: + return [] + + # Get letterbox metadata + if hasattr(self, '_last_letterbox_meta') and self._last_letterbox_meta: + ratio = self._last_letterbox_meta['ratio'] + dw, dh = self._last_letterbox_meta['dwdh'] + + # Extract coordinates + x_center = filtered[:, 0] + y_center = filtered[:, 1] + width = filtered[:, 2] + height = filtered[:, 3] + confidences = filtered[:, 4] + + # Convert center 
to corner + x1 = x_center - width / 2 + y1 = y_center - height / 2 + x2 = x_center + width / 2 + y2 = y_center + height / 2 + + # Adjust for letterbox padding + x1 -= dw + y1 -= dh + x2 -= dw + y2 -= dh + + # Adjust for resizing + x1 /= ratio[0] + y1 /= ratio[1] + x2 /= ratio[0] + y2 /= ratio[1] + + # Clip to image bounds + x1 = np.clip(x1, 0, original_width) + y1 = np.clip(y1, 0, original_height) + x2 = np.clip(x2, 0, original_width) + y2 = np.clip(y2, 0, original_height) + + # Convert back to x, y, w, h + final_x = x1 + final_y = y1 + final_w = x2 - x1 + final_h = y2 - y1 + + else: + # Fallback for non-letterbox (legacy) + scale_x = original_width / self.input_size[0] + scale_y = original_height / self.input_size[1] + + x_center = filtered[:, 0] * scale_x + y_center = filtered[:, 1] * scale_y + width = filtered[:, 2] * scale_x + height = filtered[:, 3] * scale_y + confidences = filtered[:, 4] + + final_x = x_center - width / 2 + final_y = y_center - height / 2 + final_w = width + final_h = height + + # Apply NMS + boxes = np.stack([final_x, final_y, final_w, final_h], axis=1) + indices = self._nms(boxes, confidences, self.iou_threshold) + + # Format output + detections = [] + for i in indices: + x = int(final_x[i]) + y = int(final_y[i]) + w = int(final_w[i]) + h = int(final_h[i]) + conf = float(confidences[i]) + detections.append((x, y, w, h, conf)) + + return detections + + def _nms( + self, + boxes: np.ndarray, + scores: np.ndarray, + iou_threshold: float, + ) -> List[int]: + """Non-Maximum Suppression.""" + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = x1 + boxes[:, 2] + y2 = y1 + boxes[:, 3] + + areas = boxes[:, 2] * boxes[:, 3] + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + if len(order) == 1: + break + + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0, xx2 - xx1) + h = 
np.maximum(0, yy2 - yy1) + inter = w * h + + iou = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(iou <= iou_threshold)[0] + order = order[inds + 1] + + return keep + + def generate_mask( + self, + frame_shape: Tuple[int, int, int], + detections: List[Tuple[int, int, int, int, float]], + mask_scale: float = 1.5, + feather_radius: int = 20, + ) -> np.ndarray: + """ + Generate a mask image from face detections. + + Args: + frame_shape: Shape of the original frame (height, width, channels) + detections: List of face detections (x, y, w, h, conf) + mask_scale: Scale factor for mask region + feather_radius: Radius for edge feathering + + Returns: + Grayscale mask image (white = blur, black = keep) + """ + import cv2 + + height, width = frame_shape[:2] + mask = np.zeros((height, width), dtype=np.uint8) + + for (x, y, w, h, conf) in detections: + # Scale the bounding box + center_x = x + w // 2 + center_y = y + h // 2 + + scaled_w = int(w * mask_scale) + scaled_h = int(h * mask_scale) + + # Draw ellipse for natural face shape + cv2.ellipse( + mask, + (center_x, center_y), + (scaled_w // 2, scaled_h // 2), + 0, # angle + 0, 360, # arc + 255, # color (white) + -1, # filled + ) + + # Apply Gaussian blur for feathering + if feather_radius > 0 and len(detections) > 0: + kernel_size = feather_radius * 2 + 1 + mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0) + + return mask + + +# Singleton instance +_detector: Optional[YOLOFaceDetector] = None + + +def get_detector(**kwargs) -> YOLOFaceDetector: + """Get or create the global YOLO detector instance.""" + global _detector + if _detector is None: + _detector = YOLOFaceDetector(**kwargs) + return _detector diff --git a/server/main.py b/server/main.py new file mode 100644 index 0000000..0f12d8a --- /dev/null +++ b/server/main.py @@ -0,0 +1,180 @@ +""" +Face Detection Inference Server. + +This FastAPI application runs in a separate process to handle +GPU-accelerated face detection using ONNX Runtime. 
+""" + +import os +import sys +import threading +import uuid +import queue +import traceback +from typing import Dict, Optional, List +from pathlib import Path + +from fastapi import FastAPI, HTTPException, BackgroundTasks +from pydantic import BaseModel +import uvicorn +import cv2 +import numpy as np + +# Add project root to path for imports if needed +sys.path.append(str(Path(__file__).parent.parent)) + +from server.detector import YOLOFaceDetector, get_detector + +app = FastAPI(title="Face Mask Inference Server") + +# Task storage +class TaskStatus: + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + +class Task(BaseModel): + id: str + status: str + progress: int = 0 + total: int = 0 + message: Optional[str] = None + result_path: Optional[str] = None + +# In-memory storage +tasks: Dict[str, Task] = {} +cancel_events: Dict[str, threading.Event] = {} + +class GenerateRequest(BaseModel): + video_path: str + output_dir: str + start_frame: int + end_frame: int + conf_threshold: float = 0.5 + iou_threshold: float = 0.45 + mask_scale: float = 1.5 + +def process_video_task(task_id: str, req: GenerateRequest): + """Background task to process video.""" + try: + tasks[task_id].status = TaskStatus.PROCESSING + cancel_event = cancel_events.get(task_id) + + # Verify video exists + if not os.path.exists(req.video_path): + tasks[task_id].status = TaskStatus.FAILED + tasks[task_id].message = f"Video not found: {req.video_path}" + return + + # Initialize detector (will load model on first run) + print(f"Loading detector for task {task_id}...") + detector = get_detector( + conf_threshold=req.conf_threshold, + iou_threshold=req.iou_threshold + ) + # Ensure session is loaded + _ = detector.session + + # Open video + cap = cv2.VideoCapture(req.video_path) + if not cap.isOpened(): + tasks[task_id].status = TaskStatus.FAILED + tasks[task_id].message = "Failed to open video" + return + + # Determine frame range + 
total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + end_frame = min(req.end_frame, total_video_frames - 1) + frames_to_process = end_frame - req.start_frame + 1 + + tasks[task_id].total = frames_to_process + + # Ensure output directory exists + os.makedirs(req.output_dir, exist_ok=True) + + print(f"Starting processing: {req.video_path} ({frames_to_process} frames)") + + # Process loop + current_count = 0 + for frame_idx in range(req.start_frame, end_frame + 1): + if cancel_event and cancel_event.is_set(): + tasks[task_id].status = TaskStatus.CANCELLED + tasks[task_id].message = "Cancelled by user" + break + + # Read frame + cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) + ret, frame = cap.read() + + if ret: + # Detect + detections = detector.detect(frame) + + # Generate mask + mask = detector.generate_mask( + frame.shape, + detections, + mask_scale=req.mask_scale + ) + + # Save + mask_filename = f"mask_{current_count:06d}.png" # Note: using relative index for filename + mask_path = os.path.join(req.output_dir, mask_filename) + cv2.imwrite(mask_path, mask) + + # Update progress + current_count += 1 + tasks[task_id].progress = current_count + + cap.release() + + if tasks[task_id].status == TaskStatus.PROCESSING: + tasks[task_id].status = TaskStatus.COMPLETED + tasks[task_id].result_path = req.output_dir + tasks[task_id].message = "Processing completed successfully" + print(f"Task {task_id} completed.") + + except Exception as e: + tasks[task_id].status = TaskStatus.FAILED + tasks[task_id].message = str(e) + print(f"Error in task {task_id}: {e}") + traceback.print_exc() + finally: + # Cleanup + if task_id in cancel_events: + del cancel_events[task_id] + +@app.get("/status") +def get_status(): + return {"status": "running", "gpu_available": True} # TODO: check GPU + +@app.post("/generate", response_model=Task) +def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks): + task_id = str(uuid.uuid4()) + task = Task(id=task_id, 
status=TaskStatus.PENDING) + tasks[task_id] = task + cancel_events[task_id] = threading.Event() + + background_tasks.add_task(process_video_task, task_id, req) + return task + +@app.get("/tasks/{task_id}", response_model=Task) +def get_task(task_id: str): + if task_id not in tasks: + raise HTTPException(status_code=404, detail="Task not found") + return tasks[task_id] + +@app.post("/tasks/{task_id}/cancel") +def cancel_task(task_id: str): + if task_id not in tasks: + raise HTTPException(status_code=404, detail="Task not found") + + if task_id in cancel_events: + cancel_events[task_id].set() + + return {"message": "Cancellation requested"} + +if __name__ == "__main__": + uvicorn.run(app, host="127.0.0.1", port=8181)