blender-mask-peoples/debug_detector.py

#!/usr/bin/env python3
"""
顔検出処理の単体デバッグスクリプト

Usage:
    # 画像ファイルで検出をテスト
    python debug_detector.py --image path/to/image.jpg

    # 動画ファイルで検出をテスト（指定フレームのみ）
    python debug_detector.py --video path/to/video.mp4 --frame 100

    # 動画ファイルで複数フレームをテスト
    python debug_detector.py --video path/to/video.mp4 --start 0 --end 10

    # 結果を保存
    python debug_detector.py --image test.jpg --output result.jpg
"""

import argparse
import sys
from pathlib import Path
import cv2
import numpy as np

# Add the project root (the directory containing this script) to the front
# of sys.path. This must run before the project-local import below;
# presumably the `server` package lives next to this script -- verify layout.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from server.detector import YOLOFaceDetector


def draw_detections(image: np.ndarray, detections, mask=None):
    """
    検出結果を画像に描画

    Args:
        image: 元画像（BGR）
        detections: 検出結果のリスト [(x, y, w, h, conf), ...]
        mask: マスク画像（オプション）

    Returns:
        描画済み画像
    """
    output = image.copy()

    # マスクをオーバーレイ
    if mask is not None:
        # マスクを3チャンネルに変換
        mask_colored = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
        # 赤色でオーバーレイ（半透明）
        mask_overlay = np.zeros_like(output)
        mask_overlay[:, :, 2] = mask  # 赤チャンネル
        output = cv2.addWeighted(output, 1.0, mask_overlay, 0.3, 0)

    # バウンディングボックスを描画
    for (x, y, w, h, conf) in detections:
        # ボックス
        cv2.rectangle(output, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # 信頼度テキスト
        label = f"{conf:.2f}"
        label_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        y_label = max(y, label_size[1])
        cv2.rectangle(
            output,
            (x, y_label - label_size[1]),
            (x + label_size[0], y_label + baseline),
            (0, 255, 0),
            -1
        )
        cv2.putText(
            output,
            label,
            (x, y_label),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1
        )

    return output


def debug_image(args, detector):
    """Run face detection on a single image file and show or save the result.

    Args:
        args: Parsed CLI namespace. Reads ``image``, ``mask_scale``,
            ``feather_radius``, ``output`` and ``save_mask``.
        detector: Object providing ``detect(image)`` and
            ``generate_mask(shape, detections, mask_scale=..., feather_radius=...)``.

    Returns:
        None. Prints progress to stdout; either writes the annotated image
        (and optionally the mask) to disk or shows them in windows.
    """
    print(f"画像を読み込み中: {args.image}")
    image = cv2.imread(args.image)

    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        print(f"エラー: 画像を読み込めません: {args.image}")
        return

    print(f"画像サイズ: {image.shape[1]}x{image.shape[0]}")

    # Run detection.
    print("顔検出を実行中...")
    detections = detector.detect(image)

    print(f"\n検出結果: {len(detections)}個の顔を検出")
    for i, (x, y, w, h, conf) in enumerate(detections):
        print(f"  [{i+1}] x={x}, y={y}, w={w}, h={h}, conf={conf:.3f}")

    # Build a mask only when something was detected.
    if len(detections) > 0:
        mask = detector.generate_mask(
            image.shape,
            detections,
            mask_scale=args.mask_scale,
            feather_radius=args.feather_radius
        )
    else:
        mask = None

    result = draw_detections(image, detections, mask)

    if args.output:
        # cv2.imwrite returns False on failure; do not report success blindly.
        if cv2.imwrite(args.output, result):
            print(f"\n結果を保存しました: {args.output}")
        else:
            print(f"エラー: 結果を保存できません: {args.output}")

        if mask is not None and args.save_mask:
            # Insert "_mask" before the final extension only. A plain
            # str.replace('.', ...) would also rewrite dots in directory
            # names, "./" prefixes and multi-dot file names.
            out_path = Path(args.output)
            mask_path = str(
                out_path.with_name(f"{out_path.stem}_mask{out_path.suffix}")
            )
            if cv2.imwrite(mask_path, mask):
                print(f"マスクを保存しました: {mask_path}")
            else:
                print(f"エラー: マスクを保存できません: {mask_path}")
    else:
        cv2.imshow("Detection Result", result)
        if mask is not None:
            cv2.imshow("Mask", mask)
        print("\nキーを押して終了してください...")
        cv2.waitKey(0)
        cv2.destroyAllWindows()


def debug_video(args, detector):
    """Run face detection over a frame range of a video file.

    Args:
        args: Parsed CLI namespace. Reads ``video``, ``frame``, ``start``,
            ``end``, ``output``, ``mask_scale`` and ``feather_radius``.
        detector: Object providing ``detect(frame)`` and
            ``generate_mask(shape, detections, mask_scale=..., feather_radius=...)``.

    Returns:
        None. Annotated frames are written to ``args.output`` when given,
        otherwise shown in windows (press ``q`` to stop early).
    """
    print(f"動画を読み込み中: {args.video}")
    cap = cv2.VideoCapture(args.video)

    if not cap.isOpened():
        print(f"エラー: 動画を開けません: {args.video}")
        return

    out_writer = None
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"動画情報: {width}x{height}, {fps:.2f}fps, {total_frames}フレーム")

        # Resolve the frame range. Without --end, fall back to the resolved
        # start so that "--start N" alone processes frame N; falling back to
        # --frame (default 0) would yield an empty range.
        start_frame = args.start if args.start is not None else args.frame
        end_frame = args.end if args.end is not None else start_frame

        # Clamp both ends into the valid frame index range.
        start_frame = max(0, min(start_frame, total_frames - 1))
        end_frame = max(0, min(end_frame, total_frames - 1))

        print(f"処理範囲: フレーム {start_frame} - {end_frame}")

        # Prepare the output video, if requested.
        if args.output:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out_writer = cv2.VideoWriter(args.output, fourcc, fps, (width, height))
            # A writer that failed to open drops frames silently; stop early.
            if not out_writer.isOpened():
                print(f"エラー: 出力動画を開けません: {args.output}")
                return

        for frame_idx in range(start_frame, end_frame + 1):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()

            if not ret:
                print(f"警告: フレーム {frame_idx} を読み込めませんでした")
                continue

            detections = detector.detect(frame)

            # Build a mask only when something was detected.
            if len(detections) > 0:
                mask = detector.generate_mask(
                    frame.shape,
                    detections,
                    mask_scale=args.mask_scale,
                    feather_radius=args.feather_radius
                )
            else:
                mask = None

            result = draw_detections(frame, detections, mask)

            print(f"フレーム {frame_idx}: {len(detections)}個の顔を検出")

            if out_writer is not None:
                out_writer.write(result)
            else:
                cv2.imshow(f"Frame {frame_idx}", result)
                if mask is not None:
                    cv2.imshow("Mask", mask)

                # Block on a single frame; otherwise play at ~30 ms per frame.
                # Mask to the low byte: some backends set high bits in the
                # key code, which would break the comparison with 'q'.
                key = cv2.waitKey(0 if end_frame == start_frame else 30) & 0xFF
                if key == ord('q'):
                    break
    finally:
        # Release native resources even if detection or drawing raised.
        cap.release()
        if out_writer is not None:
            out_writer.release()
        elif not args.output:
            cv2.destroyAllWindows()

    if out_writer is not None:
        print(f"\n結果を保存しました: {args.output}")


def _build_parser():
    """Assemble the argument parser for the detector debug CLI."""
    cli = argparse.ArgumentParser(
        description="顔検出処理の単体デバッグスクリプト",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Exactly one input source (image or video) must be supplied.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("--image", type=str, help="テスト用画像ファイル")
    source.add_argument("--video", type=str, help="テスト用動画ファイル")

    # Remaining options, registered in the order they appear in --help.
    option_table = (
        # Video-only frame selection.
        (("--frame",), {"type": int, "default": 0,
                        "help": "処理する動画フレーム番号（デフォルト: 0）"}),
        (("--start",), {"type": int, "help": "処理開始フレーム（動画のみ）"}),
        (("--end",), {"type": int, "help": "処理終了フレーム（動画のみ）"}),
        # Detection parameters.
        (("--conf",), {"type": float, "default": 0.5,
                       "help": "信頼度閾値（デフォルト: 0.5）"}),
        (("--iou",), {"type": float, "default": 0.45,
                      "help": "NMS IoU閾値（デフォルト: 0.45）"}),
        (("--mask-scale",), {"type": float, "default": 1.5,
                             "help": "マスクスケール（デフォルト: 1.5）"}),
        (("--feather-radius",), {"type": int, "default": 20,
                                 "help": "マスクぼかし半径（デフォルト: 20）"}),
        # Output options.
        (("--output", "-o"), {"type": str, "help": "結果画像/動画の保存先"}),
        (("--save-mask",), {"action": "store_true",
                            "help": "マスク画像も保存する（画像のみ）"}),
        # Model selection.
        (("--model",), {"type": str, "help": "カスタムモデルパス"}),
    )
    for flags, spec in option_table:
        cli.add_argument(*flags, **spec)

    return cli


def main():
    """CLI entry point: parse arguments, build the detector, run the debug mode."""
    args = _build_parser().parse_args()

    # Construct the detector from the CLI thresholds.
    print("YOLOFaceDetectorを初期化中...")
    detector = YOLOFaceDetector(
        model_path=args.model,
        conf_threshold=args.conf,
        iou_threshold=args.iou,
    )

    # Access the `model` attribute up front; the intent is to load the model
    # before processing starts (presumably a lazy attribute -- confirm in
    # server.detector).
    print("モデルをロード中...")
    _ = detector.model
    print("準備完了\n")

    # An image path selects image mode; otherwise fall through to video mode.
    run_debug = debug_image if args.image else debug_video
    run_debug(args, detector)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()