blender-mask-peoples/server/main.py

1432 lines
50 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Face Detection Inference Server.
This FastAPI application runs in a separate process to handle
GPU-accelerated face detection using ONNX Runtime.
"""
import os
import sys
import platform
# CRITICAL: Fix LD_LIBRARY_PATH before importing cv2 or torch
# cv2 adds its own lib path to the front, which can override ROCm libraries
def fix_library_path():
    """Reorder LD_LIBRARY_PATH so ROCm/HIP entries come before everything else.

    cv2 prepends its own bundled library directory to the search path, which
    can shadow the ROCm runtime; promoting the GPU paths first avoids that.
    """
    entries = [p for p in os.environ.get('LD_LIBRARY_PATH', '').split(':') if p]

    def _is_gpu_path(entry: str) -> bool:
        lowered = entry.lower()
        return 'rocm' in lowered or 'hip' in lowered or 'clr-' in entry

    gpu_entries = [p for p in entries if _is_gpu_path(p)]
    if not gpu_entries:
        # Nothing to promote; leave the environment untouched.
        return
    rest = [p for p in entries if not _is_gpu_path(p)]
    os.environ['LD_LIBRARY_PATH'] = ':'.join(gpu_entries + rest)
    print("[FaceMask] Fixed LD_LIBRARY_PATH to prioritize ROCm libraries")
# Fix library path BEFORE any other imports
fix_library_path()
import queue # noqa: E402
import threading # noqa: E402
import uuid # noqa: E402
import traceback # noqa: E402
import subprocess # noqa: E402
from typing import Dict, Optional, List # noqa: E402
from pathlib import Path # noqa: E402
from fastapi import FastAPI, HTTPException, BackgroundTasks # noqa: E402
from pydantic import BaseModel # noqa: E402
import uvicorn # noqa: E402
import cv2 # noqa: E402
import numpy as np # noqa: E402
import msgpack # noqa: E402
# Add project root to path for imports if needed
sys.path.append(str(Path(__file__).parent.parent))
from server.detector import get_detector, get_pose_detector # noqa: E402
app = FastAPI(title="Face Mask Inference Server")
def _get_r_frame_rate(video_path: str) -> tuple:
"""ffprobe でコンテナ宣言の r_frame_rate を取得する。
Returns:
(fps_float, fps_str): fps_str は "120/1" のような分数文字列。
取得失敗時は (0.0, "")。
"""
try:
result = subprocess.run(
[
"ffprobe", "-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=r_frame_rate",
"-of", "default=noprint_wrappers=1:nokey=1",
video_path,
],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0:
rate_str = result.stdout.strip()
if "/" in rate_str:
num, den = rate_str.split("/")
fps_float = float(num) / float(den)
else:
fps_float = float(rate_str)
rate_str = str(fps_float)
return fps_float, rate_str
except Exception:
pass
return 0.0, ""
# GPU status cache
_gpu_status_cache = None
# Task storage
class TaskStatus:
    """String constants for the lifecycle states stored in Task.status."""
    PENDING = "pending"        # created, not yet picked up
    PROCESSING = "processing"  # background work in progress
    COMPLETED = "completed"    # finished successfully; result_path is set
    FAILED = "failed"          # aborted with an error; message holds detail
    CANCELLED = "cancelled"    # stopped via the task's cancel event
class Task(BaseModel):
    """Progress/result record for one background job (kept in `tasks`)."""
    id: str
    status: str                        # one of the TaskStatus constants
    progress: int = 0                  # frames processed so far
    total: int = 0                     # total frames expected
    message: Optional[str] = None      # human-readable status or error detail
    result_path: Optional[str] = None  # output artifact path when completed
# In-memory storage (single-process server; no persistence across restarts)
tasks: Dict[str, Task] = {}                      # task_id -> Task record
cancel_events: Dict[str, threading.Event] = {}   # task_id -> cooperative cancel flag
class GenerateRequest(BaseModel):
    """Request body for generating a face-detection cache from a video."""
    video_path: str               # source video to scan
    output_dir: str               # directory that receives detections.msgpack
    start_frame: int              # first frame index (inclusive)
    end_frame: int                # last frame index (inclusive)
    conf_threshold: float = 0.5   # detector confidence cutoff
    iou_threshold: float = 0.45   # NMS IoU threshold
class VideoInfoRequest(BaseModel):
    """Request carrying the path of a video to inspect."""
    video_path: str
class BakeRequest(BaseModel):
    """Request body for baking face blur into an output video."""
    video_path: str              # source video
    detections_path: str         # msgpack cache produced by detection
    output_path: str             # destination video file
    blur_size: int = 50          # Gaussian kernel size (forced odd at use)
    display_scale: float = 1.0   # bbox scale applied at bake time
    format: str = "mp4"          # output container: mp4 / mov / avi
class GenerateImagesRequest(BaseModel):
    """Request body for generating a face-detection cache from an image sequence."""
    image_dir: str                # directory containing the images
    filenames: List[str]          # ordered image filenames within image_dir
    output_dir: str               # directory that receives detections.msgpack
    start_index: int = 0          # first index into filenames (inclusive)
    end_index: int = -1           # last index (inclusive); -1 means "to the end"
    conf_threshold: float = 0.5   # detector confidence cutoff
    iou_threshold: float = 0.45   # NMS IoU threshold
class AugmentPoseRequest(BaseModel):
    """Request body for merging pose-detection boxes into an existing cache."""
    detections_path: str          # msgpack cache to augment in place
    conf_threshold: float = 0.5   # pose detector confidence cutoff
    iou_threshold: float = 0.45   # NMS IoU threshold
class BakeImagesRequest(BaseModel):
    """Request body for baking face blur into an image sequence."""
    image_dir: str               # directory containing the source images
    filenames: List[str]         # ordered image filenames within image_dir
    output_dir: str              # destination directory (same filenames)
    detections_path: str         # msgpack cache produced by detection
    blur_size: int = 50          # Gaussian kernel size (forced odd at use)
    display_scale: float = 1.0   # bbox scale applied at bake time
class _FFmpegPipeWriter:
"""Write BGR frames to ffmpeg stdin."""
def __init__(self, cmd: List[str]):
self._proc = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
)
if self._proc.stdin is None:
self._proc.kill()
raise RuntimeError("Failed to open ffmpeg stdin")
def write(self, frame: np.ndarray) -> None:
if self._proc.stdin is None:
raise RuntimeError("ffmpeg stdin is not available")
self._proc.stdin.write(frame.tobytes())
def release(self) -> None:
if self._proc.stdin:
try:
self._proc.stdin.close()
except Exception:
pass
stderr_output = ""
if self._proc.stderr:
try:
stderr_output = self._proc.stderr.read().decode("utf-8", errors="replace")
except Exception:
stderr_output = ""
rc = self._proc.wait()
if rc != 0:
tail = "\n".join(stderr_output.strip().splitlines()[-8:])
raise RuntimeError(f"ffmpeg writer failed (code={rc}): {tail}")
def _build_ffmpeg_vaapi_writer(
    output_path: str,
    fps: float,
    width: int,
    height: int,
    out_fps_str: str = "",
) -> _FFmpegPipeWriter:
    """Create an ffmpeg h264_vaapi writer with QP=24 (balanced quality/speed).

    fps: input rate of the raw pipe (source video avg_frame_rate).
    out_fps_str: r_frame_rate to declare on the output container (e.g. "120/1").
        When it differs meaningfully from the source rate, an fps filter is
        inserted to interpolate frames.
    """
    # Insert the fps filter only when the declared output rate differs
    # significantly from the source average rate.
    use_fps_filter = False
    if out_fps_str:
        try:
            if "/" in out_fps_str:
                num, den = out_fps_str.split("/")
                declared = float(num) / float(den)
            else:
                declared = float(out_fps_str)
        except ValueError:
            use_fps_filter = False  # malformed rate string -> skip the filter
        else:
            use_fps_filter = abs(declared - fps) > 0.01
    if use_fps_filter:
        vf = f"format=nv12,fps={out_fps_str},hwupload"
        print(f"[FaceMask] fps filter: {fps:.3f} -> {out_fps_str}")
    else:
        vf = "format=nv12,hwupload"
    cmd = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "error",
        "-y",
        "-vaapi_device", "/dev/dri/renderD128",
        "-f", "rawvideo",
        "-pix_fmt", "bgr24",
        "-s", f"{width}x{height}",
        "-r", f"{fps}",
        "-i", "-",
        "-an",
        "-vf", vf,
        "-c:v", "h264_vaapi",
        "-qp", "24",
        output_path,
    ]
    return _FFmpegPipeWriter(cmd)
def _build_video_writer(
    output_path: str,
    fmt: str,
    fps: float,
    width: int,
    height: int,
    out_fps_str: str = "",
) -> object:
    """Create writer with VAAPI preference and OpenCV fallback."""
    key = fmt.lower()
    # Prefer hardware VAAPI encoding for h264-capable containers.
    if key in {"mp4", "mov"}:
        try:
            vaapi_writer = _build_ffmpeg_vaapi_writer(output_path, fps, width, height, out_fps_str)
        except Exception as e:
            print(f"[FaceMask] VAAPI writer unavailable, fallback to OpenCV: {e}")
        else:
            print("[FaceMask] Using output encoder: ffmpeg h264_vaapi (-qp 24)")
            return vaapi_writer
    # Software fallback: try fourcc candidates per container format.
    candidates = {
        "mp4": ["avc1", "mp4v"],
        "mov": ["avc1", "mp4v"],
        "avi": ["MJPG", "XVID"],
    }.get(key, ["mp4v"])
    for fourcc in candidates:
        cv_writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*fourcc),
            fps,
            (width, height),
            isColor=True,
        )
        if cv_writer.isOpened():
            print(f"[FaceMask] Using output codec: {fourcc}")
            return cv_writer
        cv_writer.release()
    raise RuntimeError(f"Failed to create video writer for format='{fmt}'")
def _scale_bbox(
x: int,
y: int,
w: int,
h: int,
scale: float,
frame_width: int,
frame_height: int,
) -> Optional[List[int]]:
"""Scale bbox around center and clamp to frame boundaries."""
if w <= 0 or h <= 0:
return None
center_x = x + (w * 0.5)
center_y = y + (h * 0.5)
scaled_w = max(1, int(w * scale))
scaled_h = max(1, int(h * scale))
x1 = max(0, int(center_x - scaled_w * 0.5))
y1 = max(0, int(center_y - scaled_h * 0.5))
x2 = min(frame_width, x1 + scaled_w)
y2 = min(frame_height, y1 + scaled_h)
out_w = x2 - x1
out_h = y2 - y1
if out_w <= 0 or out_h <= 0:
return None
return [x1, y1, out_w, out_h]
def _apply_face_blur_inplace(
    frame: np.ndarray,
    frame_boxes: list,
    src_width: int,
    src_height: int,
    blur_size: int,
    display_scale: float,
    blur_margin: int,
) -> None:
    """Apply an elliptical Gaussian blur over each detected face box (in-place)."""
    if not frame_boxes:
        return
    for box in frame_boxes:
        if not isinstance(box, list) or len(box) < 4:
            continue
        bx, by, bw, bh = int(box[0]), int(box[1]), int(box[2]), int(box[3])
        if bw <= 0 or bh <= 0:
            continue
        # Display-scaled rectangle centered on the detection.
        center_x = bx + bw / 2
        center_y = by + bh / 2
        disp_w = max(1, int(bw * display_scale))
        disp_h = max(1, int(bh * display_scale))
        disp_x = int(center_x - disp_w / 2)
        disp_y = int(center_y - disp_h / 2)
        # ROI = display rect plus margin, clamped to the frame bounds.
        x1 = max(0, disp_x - blur_margin)
        y1 = max(0, disp_y - blur_margin)
        x2 = min(src_width, disp_x + disp_w + blur_margin)
        y2 = min(src_height, disp_y + disp_h + blur_margin)
        roi_w = x2 - x1
        roi_h = y2 - y1
        if roi_w <= 0 or roi_h <= 0:
            continue
        roi = frame[y1:y2, x1:x2]
        # Downsample -> blur -> upsample: comparable blur at ~1/4 the cost.
        down = cv2.resize(
            roi,
            (max(1, roi_w // 2), max(1, roi_h // 2)),
            interpolation=cv2.INTER_LINEAR,
        )
        kernel = max(3, (blur_size // 2) | 1)
        blurred = cv2.resize(
            cv2.GaussianBlur(down, (kernel, kernel), 0),
            (roi_w, roi_h),
            interpolation=cv2.INTER_LINEAR,
        )
        # Composite the blur through an elliptical binary mask.
        mask = np.zeros((roi_h, roi_w), dtype=np.uint8)
        cv2.ellipse(
            mask,
            (int(center_x) - x1, int(center_y) - y1),
            (max(1, disp_w // 2), max(1, disp_h // 2)),
            0, 0, 360, 255, -1,
        )
        blended = roi.copy()
        cv2.copyTo(blurred, mask, blended)
        frame[y1:y2, x1:x2] = blended
def process_images_task(task_id: str, req: GenerateImagesRequest):
    """Background task: detect faces in an image sequence and save a msgpack cache.

    Writes `detections.msgpack` into req.output_dir holding one bbox list per
    image in [start_index, end_index]. Progress is reported through the shared
    `tasks` dict; cooperative cancellation through `cancel_events`.
    """
    try:
        tasks[task_id].status = TaskStatus.PROCESSING
        cancel_event = cancel_events.get(task_id)
        if not os.path.exists(req.image_dir):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Image directory not found: {req.image_dir}"
            return
        if not req.filenames:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "No filenames provided"
            return
        detector = get_detector(
            conf_threshold=req.conf_threshold,
            iou_threshold=req.iou_threshold,
        )
        _ = detector.model  # trigger lazy model load before the loop
        total_files = len(req.filenames)
        start_idx = max(0, req.start_index)
        end_idx = req.end_index if req.end_index >= 0 else total_files - 1
        end_idx = min(end_idx, total_files - 1)
        if start_idx > end_idx:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Invalid index range"
            return
        indices = list(range(start_idx, end_idx + 1))
        tasks[task_id].total = len(indices)
        os.makedirs(req.output_dir, exist_ok=True)
        output_msgpack_path = os.path.join(req.output_dir, "detections.msgpack")
        # Cache dimensions come from the first image of the range.
        first_path = os.path.join(req.image_dir, req.filenames[start_idx])
        first_img = cv2.imread(first_path)
        if first_img is None:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Cannot read image: {first_path}"
            return
        height, width = first_img.shape[:2]
        frame_buffer: List[np.ndarray] = []
        frame_detections: List[List[List[float]]] = []
        batch_size = 5
        current_count = 0

        def process_batch():
            # Run the detector on buffered frames; clamp each bbox to the
            # image bounds and append one list per frame to frame_detections.
            nonlocal current_count
            if not frame_buffer:
                return
            batch_det = detector.detect_batch(frame_buffer)
            for detections in batch_det:
                packed: List[List[float]] = []
                for x, y, w, h, conf in detections:
                    bx, by, bw, bh = int(x), int(y), int(w), int(h)
                    bx = max(0, bx)
                    by = max(0, by)
                    bw = min(width - bx, bw)
                    bh = min(height - by, bh)
                    if bw <= 0 or bh <= 0:
                        continue
                    packed.append([bx, by, bw, bh, float(conf)])
                frame_detections.append(packed)
                current_count += 1
                tasks[task_id].progress = current_count
            frame_buffer.clear()

        print(
            f"[FaceMask] Starting image detection: {req.image_dir} "
            f"({len(indices)} images) -> {output_msgpack_path}"
        )
        for file_idx in indices:
            if cancel_event and cancel_event.is_set():
                tasks[task_id].status = TaskStatus.CANCELLED
                tasks[task_id].message = "Cancelled by user"
                break
            img_path = os.path.join(req.image_dir, req.filenames[file_idx])
            frame = cv2.imread(img_path)
            if frame is None:
                # BUGFIX: flush any buffered frames first, so the empty entry
                # for this unreadable image lands at its correct position.
                # Previously the placeholder was appended before the pending
                # batch, misaligning all subsequent frames' detections.
                process_batch()
                frame_detections.append([])
                current_count += 1
                tasks[task_id].progress = current_count
                continue
            frame_buffer.append(frame)
            if len(frame_buffer) >= batch_size:
                process_batch()
        if frame_buffer:
            process_batch()
        if tasks[task_id].status == TaskStatus.PROCESSING:
            payload = {
                "version": 1,
                "image_dir": req.image_dir,
                "filenames": req.filenames,
                "start_frame": start_idx,
                "end_frame": start_idx + len(frame_detections) - 1,
                "width": width,
                "height": height,
                "fps": 0.0,
                "mask_scale": 1.0,
                "frames": frame_detections,
            }
            with open(output_msgpack_path, "wb") as f:
                f.write(msgpack.packb(payload, use_bin_type=True))
            tasks[task_id].status = TaskStatus.COMPLETED
            tasks[task_id].result_path = output_msgpack_path
            tasks[task_id].message = "Image detection cache completed"
            print(f"[FaceMask] Image detection done: {output_msgpack_path}")
    except Exception as e:
        tasks[task_id].status = TaskStatus.FAILED
        tasks[task_id].message = str(e)
        traceback.print_exc()
    finally:
        if task_id in cancel_events:
            del cancel_events[task_id]
def process_bake_images_task(task_id: str, req: BakeImagesRequest):
    """Apply face blur to an image sequence and write results to a new directory.

    Loads the msgpack detection cache, blurs each listed image (boxes looked
    up by the image's position in req.filenames, offset by the cache's
    start_frame), and writes it under req.output_dir with the same filename.
    Progress/cancellation go through the shared `tasks` / `cancel_events`.
    """
    try:
        tasks[task_id].status = TaskStatus.PROCESSING
        cancel_event = cancel_events.get(task_id)
        if not os.path.exists(req.image_dir):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Image directory not found: {req.image_dir}"
            return
        if not os.path.exists(req.detections_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Detections file not found: {req.detections_path}"
            return
        with open(req.detections_path, "rb") as f:
            payload = msgpack.unpackb(f.read(), raw=False)
        frames_detections = payload.get("frames")
        if not isinstance(frames_detections, list):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Invalid detections format: 'frames' is missing"
            return
        det_start_frame = int(payload.get("start_frame", 0))
        # Gaussian kernel size must be odd.
        blur_size = max(1, int(req.blur_size))
        if blur_size % 2 == 0:
            blur_size += 1
        display_scale = max(0.1, float(req.display_scale))
        # Extra ROI padding so the blur has context near the box edges.
        blur_margin = blur_size // 2
        os.makedirs(req.output_dir, exist_ok=True)
        total = len(req.filenames)
        tasks[task_id].total = total
        print(
            f"[FaceMask] Starting image bake: {req.image_dir} "
            f"({total} images) -> {req.output_dir}"
        )
        for i, filename in enumerate(req.filenames):
            if cancel_event and cancel_event.is_set():
                tasks[task_id].status = TaskStatus.CANCELLED
                tasks[task_id].message = "Cancelled by user"
                return
            src_path = os.path.join(req.image_dir, filename)
            frame = cv2.imread(src_path)
            if frame is None:
                # Unreadable image: skip it but still advance progress.
                tasks[task_id].progress = i + 1
                continue
            h, w = frame.shape[:2]
            # NOTE(review): assumes req.filenames order matches the frame
            # order used when the cache was generated — confirm with caller.
            det_idx = i - det_start_frame
            frame_boxes = (
                frames_detections[det_idx]
                if 0 <= det_idx < len(frames_detections)
                else []
            )
            _apply_face_blur_inplace(frame, frame_boxes, w, h, blur_size, display_scale, blur_margin)
            out_path = os.path.join(req.output_dir, filename)
            cv2.imwrite(out_path, frame)
            tasks[task_id].progress = i + 1
        if tasks[task_id].status == TaskStatus.PROCESSING:
            tasks[task_id].status = TaskStatus.COMPLETED
            tasks[task_id].result_path = req.output_dir
            tasks[task_id].message = "Image blur bake completed"
            print(f"[FaceMask] Image bake completed: {req.output_dir}")
    except Exception as e:
        tasks[task_id].status = TaskStatus.FAILED
        tasks[task_id].message = str(e)
        traceback.print_exc()
    finally:
        if task_id in cancel_events:
            del cancel_events[task_id]
def augment_pose_task(task_id: str, req: AugmentPoseRequest):
    """Background task: run pose estimation and merge results into existing cache.

    Re-reads the source video referenced by the cache, runs the pose detector
    in batches, appends the resulting boxes to each frame's existing list, and
    rewrites the cache file in place on success.
    """
    cap = None
    try:
        tasks[task_id].status = TaskStatus.PROCESSING
        cancel_event = cancel_events.get(task_id)
        if not os.path.exists(req.detections_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Detections file not found: {req.detections_path}"
            return
        with open(req.detections_path, "rb") as f:
            payload = msgpack.unpackb(f.read(), raw=False)
        existing_frames: List[List[List[float]]] = payload.get("frames", [])
        video_path = payload.get("video_path")
        start_frame = int(payload.get("start_frame", 0))
        total = len(existing_frames)
        if not video_path:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Cache does not contain video_path (image caches not supported)"
            return
        if not os.path.exists(video_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Video not found: {video_path}"
            return
        if total == 0:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Cache has no frames"
            return
        tasks[task_id].total = total
        detector = get_pose_detector(
            conf_threshold=req.conf_threshold,
            iou_threshold=req.iou_threshold,
        )
        _ = detector.model  # trigger lazy model load up-front
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Failed to open video"
            return
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if start_frame > 0:
            seek_ok = cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
            if not seek_ok:
                # Fallback: step through frames when random seek is unsupported.
                for _ in range(start_frame):
                    ret, _ = cap.read()
                    if not ret:
                        tasks[task_id].status = TaskStatus.FAILED
                        tasks[task_id].message = f"Failed to seek to start frame: {start_frame}"
                        return
        frame_buffer: List[np.ndarray] = []
        buffer_indices: List[int] = []  # parallel indices into existing_frames
        current_count = 0
        batch_size = 5

        def process_pose_batch():
            # Run pose detection on the buffered frames and append the
            # clamped boxes to the matching existing_frames entries.
            nonlocal current_count
            if not frame_buffer:
                return
            batch_detections = detector.detect_batch(frame_buffer)
            for idx, detections in zip(buffer_indices, batch_detections):
                for x, y, w, h, conf in detections:
                    bx, by, bw, bh = int(x), int(y), int(w), int(h)
                    bx = max(0, bx)
                    by = max(0, by)
                    bw = min(width - bx, bw)
                    bh = min(height - by, bh)
                    if bw > 0 and bh > 0:
                        existing_frames[idx].append([bx, by, bw, bh, float(conf)])
                current_count += 1
                tasks[task_id].progress = current_count
            frame_buffer.clear()
            buffer_indices.clear()

        for i in range(total):
            if cancel_event and cancel_event.is_set():
                tasks[task_id].status = TaskStatus.CANCELLED
                tasks[task_id].message = "Cancelled by user"
                break
            ret, frame = cap.read()
            if not ret:
                break
            frame_buffer.append(frame)
            buffer_indices.append(i)
            if len(frame_buffer) >= batch_size:
                process_pose_batch()
        if frame_buffer:
            process_pose_batch()
        if tasks[task_id].status == TaskStatus.PROCESSING:
            # Persist the merged boxes back into the same cache file.
            payload["frames"] = existing_frames
            with open(req.detections_path, "wb") as f:
                f.write(msgpack.packb(payload, use_bin_type=True))
            tasks[task_id].status = TaskStatus.COMPLETED
            tasks[task_id].result_path = req.detections_path
            tasks[task_id].message = "Pose augmentation completed"
            print(f"[FaceMask] Pose augmentation completed: {req.detections_path}")
    except Exception as e:
        tasks[task_id].status = TaskStatus.FAILED
        tasks[task_id].message = str(e)
        traceback.print_exc()
    finally:
        if cap:
            cap.release()
        if task_id in cancel_events:
            del cancel_events[task_id]
def process_video_task(task_id: str, req: GenerateRequest):
    """Background task to detect faces and save bbox cache as msgpack.

    Reads frames [start_frame, end_frame] of the video, detects faces in
    batches of 5, and writes `detections.msgpack` into req.output_dir.
    """
    cap = None
    try:
        tasks[task_id].status = TaskStatus.PROCESSING
        cancel_event = cancel_events.get(task_id)
        if not os.path.exists(req.video_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Video not found: {req.video_path}"
            return
        print(f"Loading detector for task {task_id}...")
        detector = get_detector(
            conf_threshold=req.conf_threshold,
            iou_threshold=req.iou_threshold,
        )
        _ = detector.model  # trigger lazy model load before reading frames
        cap = cv2.VideoCapture(req.video_path)
        if not cap.isOpened():
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Failed to open video"
            return
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back to 30 when unknown
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        end_frame = min(req.end_frame, total_video_frames - 1)
        frames_to_process = end_frame - req.start_frame + 1
        if frames_to_process <= 0:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Invalid frame range"
            return
        tasks[task_id].total = frames_to_process
        os.makedirs(req.output_dir, exist_ok=True)
        output_msgpack_path = os.path.join(req.output_dir, "detections.msgpack")
        if req.start_frame > 0:
            seek_ok = cap.set(cv2.CAP_PROP_POS_FRAMES, req.start_frame)
            if not seek_ok:
                # Fallback: read frames one by one when random seek fails.
                for _ in range(req.start_frame):
                    ret, _ = cap.read()
                    if not ret:
                        tasks[task_id].status = TaskStatus.FAILED
                        tasks[task_id].message = (
                            f"Failed to seek to start frame: {req.start_frame}"
                        )
                        return
        frame_buffer: List[np.ndarray] = []
        frame_detections: List[List[List[float]]] = []
        batch_size = 5
        current_count = 0

        def process_batch():
            # Detect on the buffered frames, clamp boxes to frame bounds,
            # and append one list per frame to frame_detections.
            nonlocal current_count
            if not frame_buffer:
                return
            batch_detections = detector.detect_batch(frame_buffer)
            for detections in batch_detections:
                packed_detections: List[List[float]] = []
                for x, y, w, h, conf in detections:
                    # Store raw bboxes; display scale is applied at bake time.
                    bx, by, bw, bh = int(x), int(y), int(w), int(h)
                    bx = max(0, bx)
                    by = max(0, by)
                    bw = min(width - bx, bw)
                    bh = min(height - by, bh)
                    if bw <= 0 or bh <= 0:
                        continue
                    packed_detections.append([bx, by, bw, bh, float(conf)])
                frame_detections.append(packed_detections)
                current_count += 1
                tasks[task_id].progress = current_count
            frame_buffer.clear()

        print(
            f"Starting detection cache generation: {req.video_path} "
            f"({frames_to_process} frames) -> {output_msgpack_path}"
        )
        for _ in range(req.start_frame, end_frame + 1):
            if cancel_event and cancel_event.is_set():
                tasks[task_id].status = TaskStatus.CANCELLED
                tasks[task_id].message = "Cancelled by user"
                break
            ret, frame = cap.read()
            if not ret:
                break
            frame_buffer.append(frame)
            if len(frame_buffer) >= batch_size:
                process_batch()
        if frame_buffer:
            process_batch()
        if tasks[task_id].status == TaskStatus.PROCESSING:
            payload = {
                "version": 1,
                "video_path": req.video_path,
                "start_frame": req.start_frame,
                "end_frame": req.start_frame + len(frame_detections) - 1,
                "width": width,
                "height": height,
                "fps": fps,
                "mask_scale": 1.0,
                "frames": frame_detections,
            }
            with open(output_msgpack_path, "wb") as f:
                f.write(msgpack.packb(payload, use_bin_type=True))
            tasks[task_id].status = TaskStatus.COMPLETED
            tasks[task_id].result_path = output_msgpack_path
            tasks[task_id].message = "Detection cache completed"
            print(f"Task {task_id} completed: {output_msgpack_path}")
    except Exception as e:
        tasks[task_id].status = TaskStatus.FAILED
        tasks[task_id].message = str(e)
        print(f"Error in task {task_id}: {e}")
        traceback.print_exc()
    finally:
        if cap:
            cap.release()
        if task_id in cancel_events:
            del cancel_events[task_id]
def process_bake_task(task_id: str, req: BakeRequest):
    """Bake blur using async pipeline: read/process/write run in parallel for 1.35x speedup.

    Three daemon threads connected by bounded queues:
      reader  -> read_queue    (decodes frames)
      process -> process_queue (applies per-face ROI blur)
      writer                   (encodes via VAAPI or OpenCV fallback)
    A None sentinel flows through the queues to signal end-of-stream; worker
    errors are reported through error_holder.
    """
    try:
        tasks[task_id].status = TaskStatus.PROCESSING
        cancel_event = cancel_events.get(task_id)
        if not os.path.exists(req.video_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Video not found: {req.video_path}"
            return
        if not os.path.exists(req.detections_path):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = f"Detections file not found: {req.detections_path}"
            return
        with open(req.detections_path, "rb") as f:
            payload = msgpack.unpackb(f.read(), raw=False)
        frames_detections = payload.get("frames")
        if not isinstance(frames_detections, list):
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Invalid detections format: 'frames' is missing"
            return
        # Start frame of the detection cache (frame index in the source video).
        det_start_frame = int(payload.get("start_frame", 0))
        # Get video info
        temp_cap = cv2.VideoCapture(req.video_path)
        if not temp_cap.isOpened():
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Failed to open source video"
            return
        src_fps = temp_cap.get(cv2.CAP_PROP_FPS) or 30.0
        src_width = int(temp_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        src_height = int(temp_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        src_frames = int(temp_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        temp_cap.release()
        # Query r_frame_rate via ffprobe so the output container declares the
        # same FPS as the source. Example: a 60fps video recorded on a 120fps
        # timebase reports r_frame_rate=120/1 while cv2 returns avg=60fps —
        # without this, Blender would see a different FPS after the bake.
        r_fps_float, r_fps_str = _get_r_frame_rate(req.video_path)
        if r_fps_float > 0:
            print(f"[FaceMask] r_frame_rate={r_fps_str}, avg_fps={src_fps:.3f}")
        else:
            r_fps_str = ""
        if src_width <= 0 or src_height <= 0:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Invalid source video dimensions"
            return
        # Output every frame of the source video (so trim settings keep
        # working after the swapped video replaces the original).
        total = src_frames if src_frames > 0 else (det_start_frame + len(frames_detections))
        if total <= 0:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = "Source/detections frame count is zero"
            return
        tasks[task_id].total = total
        output_dir = os.path.dirname(req.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Pipeline setup: Gaussian kernel must be odd.
        blur_size = max(1, int(req.blur_size))
        if blur_size % 2 == 0:
            blur_size += 1
        display_scale = max(0.1, float(req.display_scale))
        # blur_margin is computation padding only (avoids edge artifacts);
        # it does not enlarge the visible blurred area.
        blur_margin = blur_size // 2
        # Bounded queues give back-pressure between pipeline stages.
        queue_size = 8
        read_queue: queue.Queue = queue.Queue(maxsize=queue_size)
        process_queue: queue.Queue = queue.Queue(maxsize=queue_size)
        # Shared state between the worker threads.
        error_holder = {"error": None}
        progress_lock = threading.Lock()
        current_progress = [0]

        def _reader_worker():
            """Read frames from video."""
            import time as _time
            cap = cv2.VideoCapture(req.video_path)
            if not cap.isOpened():
                error_holder["error"] = "Failed to open video in reader"
                return
            t_read_total = 0.0
            frame_count = 0
            try:
                for idx in range(total):
                    if cancel_event and cancel_event.is_set():
                        break
                    t0 = _time.perf_counter()
                    ok, frame = cap.read()
                    t_read_total += _time.perf_counter() - t0
                    if not ok:
                        break
                    read_queue.put((idx, frame))
                    frame_count += 1
            except Exception as e:
                error_holder["error"] = f"Reader error: {e}"
            finally:
                cap.release()
                read_queue.put(None)  # Sentinel
                if frame_count > 0:
                    print(
                        f"[Perf/Reader] FINAL frame={frame_count}"
                        f" read_avg={t_read_total/frame_count*1000:.1f}ms"
                        f" throughput≈{frame_count/max(t_read_total,1e-9):.1f}fps"
                    )

        def _processor_worker():
            """Process frames with ROI blur."""
            import time as _time
            t_wait_total = 0.0
            t_blur_total = 0.0
            t_blend_total = 0.0
            frame_count = 0
            REPORT_INTERVAL = 50
            try:
                while True:
                    if cancel_event and cancel_event.is_set():
                        process_queue.put(None)
                        break
                    t0 = _time.perf_counter()
                    item = read_queue.get()
                    t_wait_total += _time.perf_counter() - t0
                    if item is None:
                        process_queue.put(None)
                        break
                    idx, frame = item
                    det_idx = idx - det_start_frame
                    frame_boxes = frames_detections[det_idx] if 0 <= det_idx < len(frames_detections) else []
                    if not frame_boxes:
                        # No detections: pass the frame through untouched.
                        process_queue.put((idx, frame))
                        frame_count += 1
                        continue
                    # Process each person with an individual ROI — a single
                    # combined ROI would become huge for far-apart people.
                    valid_boxes = []
                    for box in frame_boxes:
                        if not isinstance(box, list) or len(box) < 4:
                            continue
                        x, y, w, h = int(box[0]), int(box[1]), int(box[2]), int(box[3])
                        if w <= 0 or h <= 0:
                            continue
                        valid_boxes.append((x, y, w, h))
                    if not valid_boxes:
                        process_queue.put((idx, frame))
                        frame_count += 1
                        continue
                    for x, y, w, h in valid_boxes:
                        # display_scale determines the visible blur size.
                        cx = x + w / 2
                        cy = y + h / 2
                        dw = max(1, int(w * display_scale))
                        dh = max(1, int(h * display_scale))
                        dx = int(cx - dw / 2)
                        dy = int(cy - dh / 2)
                        # ROI = display size + blur_margin (computation padding
                        # only, to avoid boundary artifacts).
                        roi_x1 = max(0, dx - blur_margin)
                        roi_y1 = max(0, dy - blur_margin)
                        roi_x2 = min(src_width, dx + dw + blur_margin)
                        roi_y2 = min(src_height, dy + dh + blur_margin)
                        roi_width = roi_x2 - roi_x1
                        roi_height = roi_y2 - roi_y1
                        if roi_width <= 0 or roi_height <= 0:
                            continue
                        # Blur the whole ROI; the margin preserves edge accuracy.
                        roi_src = frame[roi_y1:roi_y2, roi_x1:roi_x2]
                        # Downsample -> blur -> upsample: same visual blur at
                        # roughly 1/4 the compute cost.
                        t1 = _time.perf_counter()
                        small_w = max(1, roi_width // 2)
                        small_h = max(1, roi_height // 2)
                        roi_small = cv2.resize(roi_src, (small_w, small_h), interpolation=cv2.INTER_LINEAR)
                        small_blur_size = max(3, (blur_size // 2) | 1)
                        roi_small_blurred = cv2.GaussianBlur(roi_small, (small_blur_size, small_blur_size), 0)
                        roi_blurred = cv2.resize(roi_small_blurred, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)
                        t_blur_total += _time.perf_counter() - t1
                        # Composite mask is an ellipse at display_scale size
                        # only (no feathering).
                        roi_mask = np.zeros((roi_height, roi_width), dtype=np.uint8)
                        center = (int(cx) - roi_x1, int(cy) - roi_y1)
                        axes = (max(1, dw // 2), max(1, dh // 2))
                        cv2.ellipse(roi_mask, center, axes, 0, 0, 360, 255, -1)
                        # Binary mask: copyTo composites fast, no float32 pass.
                        t2 = _time.perf_counter()
                        result = roi_src.copy()
                        cv2.copyTo(roi_blurred, roi_mask, result)
                        frame[roi_y1:roi_y2, roi_x1:roi_x2] = result
                        t_blend_total += _time.perf_counter() - t2
                    process_queue.put((idx, frame))
                    frame_count += 1
                    if frame_count % REPORT_INTERVAL == 0:
                        n = max(frame_count, 1)
                        fps_proc = frame_count / max(t_wait_total + t_blur_total + t_blend_total, 1e-9)
                        print(
                            f"[Perf/Processor] frame={frame_count}"
                            f" wait={t_wait_total/n*1000:.1f}ms"
                            f" blur={t_blur_total/n*1000:.1f}ms"
                            f" blend={t_blend_total/n*1000:.1f}ms"
                            f" ROI={roi_width}x{roi_height}"
                            f" throughput≈{fps_proc:.1f}fps"
                        )
            except Exception as e:
                error_holder["error"] = f"Processor error: {e}"
                process_queue.put(None)
            finally:
                if frame_count > 0:
                    n = max(frame_count, 1)
                    print(
                        f"[Perf/Processor] FINAL frame={frame_count}"
                        f" wait_avg={t_wait_total/n*1000:.1f}ms"
                        f" blur_avg={t_blur_total/n*1000:.1f}ms"
                        f" blend_avg={t_blend_total/n*1000:.1f}ms"
                    )

        def _writer_worker():
            """Write frames to output."""
            import time as _time
            t_wait_total = 0.0
            t_write_total = 0.0
            frame_count = 0
            writer = None
            try:
                writer = _build_video_writer(req.output_path, req.format, src_fps, src_width, src_height, r_fps_str)
                while True:
                    if cancel_event and cancel_event.is_set():
                        break
                    t0 = _time.perf_counter()
                    item = process_queue.get()
                    t_wait_total += _time.perf_counter() - t0
                    if item is None:
                        break
                    idx, frame = item
                    t1 = _time.perf_counter()
                    writer.write(frame)
                    t_write_total += _time.perf_counter() - t1
                    frame_count += 1
                    with progress_lock:
                        current_progress[0] = idx + 1
                        tasks[task_id].progress = current_progress[0]
            except Exception as e:
                error_holder["error"] = f"Writer error: {e}"
            finally:
                if writer:
                    try:
                        writer.release()
                    except Exception as e:
                        print(f"[FaceMask] Writer release error: {e}")
                if frame_count > 0:
                    n = max(frame_count, 1)
                    print(
                        f"[Perf/Writer] FINAL frame={frame_count}"
                        f" wait_avg={t_wait_total/n*1000:.1f}ms"
                        f" write_avg={t_write_total/n*1000:.1f}ms"
                    )

        print(
            f"[FaceMask] Starting blur bake: {req.video_path} + "
            f"{req.detections_path} -> {req.output_path}"
        )
        # Start threads
        reader_thread = threading.Thread(target=_reader_worker, daemon=True)
        processor_thread = threading.Thread(target=_processor_worker, daemon=True)
        writer_thread = threading.Thread(target=_writer_worker, daemon=True)
        reader_thread.start()
        processor_thread.start()
        writer_thread.start()
        # Wait for completion
        reader_thread.join()
        processor_thread.join()
        writer_thread.join()
        if error_holder["error"]:
            tasks[task_id].status = TaskStatus.FAILED
            tasks[task_id].message = error_holder["error"]
            print(f"[FaceMask] Bake failed: {error_holder['error']}")
        elif cancel_event and cancel_event.is_set():
            tasks[task_id].status = TaskStatus.CANCELLED
            tasks[task_id].message = "Cancelled by user"
        else:
            tasks[task_id].status = TaskStatus.COMPLETED
            tasks[task_id].result_path = req.output_path
            tasks[task_id].message = "Blur bake completed"
            print(f"[FaceMask] Bake completed: {req.output_path}")
    except Exception as e:
        tasks[task_id].status = TaskStatus.FAILED
        tasks[task_id].message = str(e)
        print(f"Error in async bake task {task_id}: {e}")
        traceback.print_exc()
    finally:
        if task_id in cancel_events:
            del cancel_events[task_id]
def check_gpu_available() -> dict:
    """Probe torch for GPU availability, caching the answer module-wide.

    Returns a dict:
        {
            "available": bool,
            "device_name": str or None,
            "device_count": int,
            "rocm_version": str or None
        }
    """
    global _gpu_status_cache
    # Serve the cached probe result when one exists.
    if _gpu_status_cache is not None:
        return _gpu_status_cache
    status = {
        "available": False,
        "device_name": None,
        "device_count": 0,
        "rocm_version": None
    }
    try:
        import torch
        status["available"] = torch.cuda.is_available()
        if status["available"]:
            status["device_count"] = torch.cuda.device_count()
            if status["device_count"] > 0:
                status["device_name"] = torch.cuda.get_device_name(0)
            # ROCm builds expose the HIP version here.
            if hasattr(torch.version, 'hip'):
                status["rocm_version"] = torch.version.hip
    except Exception as e:
        print(f"[FaceMask] Warning: GPU detection failed: {e}")
        status["available"] = False
    _gpu_status_cache = status
    return status
def log_startup_diagnostics():
    """Print environment, ROCm, and GPU diagnostics to stdout at startup."""
    banner = "=" * 70
    print(banner)
    print("[FaceMask Server] Startup Diagnostics")
    print(banner)

    # --- Python environment ---
    print("\n[Python Environment]")
    print(f" Python Version: {sys.version.split()[0]}")
    print(f" Python Executable: {sys.executable}")
    print(f" Platform: {platform.platform()}")
    print(f" Working Directory: {os.getcwd()}")
    in_virtualenv = sys.prefix != sys.base_prefix
    print(f" Virtual Environment: {'Yes' if in_virtualenv else 'No'}")
    if in_virtualenv:
        print(f" venv path: {sys.prefix}")

    # --- ROCm environment variables ---
    print("\n[ROCm Environment Variables]")
    for var in ('ROCM_PATH', 'HSA_OVERRIDE_GFX_VERSION', 'PYTORCH_ROCM_ARCH',
                'ROCBLAS_TENSILE_LIBPATH', 'LD_LIBRARY_PATH'):
        value = os.environ.get(var)
        if not value:
            print(f" {var}: (not set)")
            continue
        if var == 'LD_LIBRARY_PATH':
            # Report whether the (possibly very long) path includes ROCm/HIP dirs.
            has_rocm = 'rocm' in value.lower() or 'clr-' in value
            has_hip = 'hip' in value.lower()
            print(f" {var}: {value[:100]}...")
            print(f" Contains ROCm paths: {has_rocm}")
            print(f" Contains HIP paths: {has_hip}")
            if not has_rocm:
                print(" ⚠️ WARNING: ROCm library paths not found!")
        else:
            display_value = value[:200] + "... (truncated)" if len(value) > 200 else value
            print(f" {var}: {display_value}")

    # --- GPU detection ---
    print("\n[GPU Detection]")
    try:
        import torch
        cuda_available = torch.cuda.is_available()
        print(f" torch.cuda.is_available(): {cuda_available}")
        if cuda_available:
            device_count = torch.cuda.device_count()
            print(f" GPU Device Count: {device_count}")
            if device_count > 0:
                device_name = torch.cuda.get_device_name(0)
                print(f" GPU Device 0: {device_name}")
            # ROCm exposes its version through torch.version.hip
            if hasattr(torch.version, 'hip'):
                print(f" ROCm Version (HIP): {torch.version.hip}")
            # CUDA version (might be emulated by ROCm)
            if torch.version.cuda:
                print(f" CUDA Version: {torch.version.cuda}")
        else:
            print(" WARNING: GPU not detected!")
            print(" Server will use CPU for inference (slower)")
            print(" Troubleshooting:")
            print(" - Check ROCm environment variables above")
            print(" - Run: python -c 'import torch; print(torch.cuda.is_available())'")
    except ImportError as e:
        print(f" ERROR: Cannot import torch: {e}")
        print(" PyTorch must be installed for inference")
    except Exception as e:
        print(f" ERROR during GPU detection: {e}")
    print(banner)
    print()
@app.get("/status")
def get_status():
gpu_info = check_gpu_available()
return {
"status": "running",
"gpu_available": gpu_info["available"],
"gpu_device": gpu_info["device_name"],
"gpu_count": gpu_info["device_count"],
"rocm_version": gpu_info["rocm_version"]
}
@app.post("/video_info")
def get_video_info(req: VideoInfoRequest):
if not os.path.exists(req.video_path):
raise HTTPException(status_code=404, detail=f"Video not found: {req.video_path}")
cap = cv2.VideoCapture(req.video_path)
if not cap.isOpened():
raise HTTPException(status_code=400, detail="Failed to open video")
try:
avg_fps = float(cap.get(cv2.CAP_PROP_FPS) or 0.0)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
finally:
cap.release()
# Blender は r_frame_rate でタイムライン配置を計算するため、
# cv2 の avg_frame_rate ではなく r_frame_rate を fps として返す。
# 例: 120fps タイムベース記録の 60fps 動画で r_frame_rate=120 を返すことで
# compute_strip_frame_range の fps_ratio が Blender の解釈と一致する。
r_fps_float, _ = _get_r_frame_rate(req.video_path)
fps = r_fps_float if r_fps_float > 0 else avg_fps
return {
"video_path": req.video_path,
"fps": fps,
"width": width,
"height": height,
"frame_count": frame_count,
}
@app.post("/generate", response_model=Task)
def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
task_id = str(uuid.uuid4())
task = Task(id=task_id, status=TaskStatus.PENDING)
tasks[task_id] = task
cancel_events[task_id] = threading.Event()
background_tasks.add_task(process_video_task, task_id, req)
return task
@app.post("/bake_blur", response_model=Task)
def bake_blur_endpoint(req: BakeRequest, background_tasks: BackgroundTasks):
task_id = str(uuid.uuid4())
task = Task(id=task_id, status=TaskStatus.PENDING)
tasks[task_id] = task
cancel_events[task_id] = threading.Event()
background_tasks.add_task(process_bake_task, task_id, req)
return task
@app.post("/generate_images", response_model=Task)
def generate_images_endpoint(req: GenerateImagesRequest, background_tasks: BackgroundTasks):
task_id = str(uuid.uuid4())
task = Task(id=task_id, status=TaskStatus.PENDING)
tasks[task_id] = task
cancel_events[task_id] = threading.Event()
background_tasks.add_task(process_images_task, task_id, req)
return task
@app.post("/augment_pose", response_model=Task)
def augment_pose_endpoint(req: AugmentPoseRequest, background_tasks: BackgroundTasks):
task_id = str(uuid.uuid4())
task = Task(id=task_id, status=TaskStatus.PENDING)
tasks[task_id] = task
cancel_events[task_id] = threading.Event()
background_tasks.add_task(augment_pose_task, task_id, req)
return task
@app.post("/bake_image_blur", response_model=Task)
def bake_image_blur_endpoint(req: BakeImagesRequest, background_tasks: BackgroundTasks):
task_id = str(uuid.uuid4())
task = Task(id=task_id, status=TaskStatus.PENDING)
tasks[task_id] = task
cancel_events[task_id] = threading.Event()
background_tasks.add_task(process_bake_images_task, task_id, req)
return task
@app.get("/tasks/{task_id}", response_model=Task)
def get_task(task_id: str):
if task_id not in tasks:
raise HTTPException(status_code=404, detail="Task not found")
return tasks[task_id]
@app.post("/tasks/{task_id}/cancel")
def cancel_task(task_id: str):
if task_id not in tasks:
raise HTTPException(status_code=404, detail="Task not found")
if task_id in cancel_events:
cancel_events[task_id].set()
return {"message": "Cancellation requested"}
if __name__ == "__main__":
log_startup_diagnostics()
uvicorn.run(app, host="127.0.0.1", port=8181)