WIP

2026-02-07 07:47:23 +09:00 · 2026-02-07 07:47:23 +09:00 · 0f150e8a0a
commit 0f150e8a0a
parent 97c6b288e0
6 changed files with 895 additions and 276 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .venv/
 # Blender
 *.blend1
@ -16,3 +17,4 @@ __pycache__/
 # 環境
 .direnv/
 .envrc.local
 .env
--- a/core/inference_client.py
+++ b/core/inference_client.py
@ -38,15 +38,40 @@ class InferenceClient:
            root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            server_script = os.path.join(root_dir, "server", "main.py")
-            # Use system python (assumed to have dependencies via Nix/venv)
+            # Prepare environment variables for server process
-            # In user's environment, 'python' should refer to the environment python
+            server_env = os.environ.copy()
            python_cmd = "python"
-            # Start process
+            # Load environment variables from .env file if it exists
            env_file = os.path.join(root_dir, ".env")
            if os.path.exists(env_file):
                with open(env_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if line and not line.startswith('#') and '=' in line:
                            key, value = line.split('=', 1)
                            server_env[key] = value
                print(f"[FaceMask] Loaded environment from: {env_file}")
            # Ensure PYTHONPATH includes project root
            pythonpath = server_env.get('PYTHONPATH', '')
            if pythonpath:
                server_env['PYTHONPATH'] = f"{root_dir}:{pythonpath}"
            else:
                server_env['PYTHONPATH'] = root_dir
            # If there's a venv in the project, add it to PATH
            venv_bin = os.path.join(root_dir, ".venv", "bin")
            if os.path.isdir(venv_bin):
                current_path = server_env.get('PATH', '')
                server_env['PATH'] = f"{venv_bin}:{current_path}"
                print(f"[FaceMask] Using venv from: {venv_bin}")
            # Start process with 'python' command (will use venv if PATH is set correctly)
            self.server_process = subprocess.Popen(
-                [python_cmd, server_script],
+                ["python", server_script],
                cwd=root_dir,
                text=True,
                env=server_env,
                preexec_fn=os.setsid,  # Create new process group
            )
--- a/docs/ultralytics-yolo-rocm-guide.md
+++ b/docs/ultralytics-yolo-rocm-guide.md
@ -0,0 +1,696 @@
 # Ultralytics YOLO with AMD GPU (ROCm) - 完全ガイド
 **取得日**: 2026-02-07
 **対象**: Ultralytics YOLO (YOLOv5/YOLOv8/YOLO11)
 **プラットフォーム**: AMD GPU + ROCm
 ---
 ## 目次
 1. [公式サポート状況](#公式サポート状況)
 2. [ROCm環境のセットアップ](#rocm環境のセットアップ)
 3. [デバイス指定の正しい方法](#デバイス指定の正しい方法)
 4. [Ultralytics YOLOでの使用方法](#ultralytics-yoloでの使用方法)
 5. [トラブルシューティング](#トラブルシューティング)
 6. [コミュニティ実装例](#コミュニティ実装例)
 7. [制限事項と注意点](#制限事項と注意点)
 8. [完全な動作例](#完全な動作例)
 9. [参考資料](#参考資料)
 10. [まとめ](#まとめ)
 ---
 ## 公式サポート状況
 ### Ultralyticsの公式見解
 **重要**: Ultralytics YOLOは現在、AMD ROCmのネイティブサポートを提供していません。
 - **Issue #10323** (2024年4月25日開設、2024年6月8日に"not planned"としてクローズ)
  - Glenn Jocher氏（Ultralyticsチームメンバー）の回答:
    > "YOLOv8 primarily supports Nvidia CUDA for GPU acceleration"
  - ネイティブROCmサポートの即時計画はない
  - コミュニティによるPull Requestは歓迎
 - **推奨される回避策**: PyTorchのROCm互換性レイヤーを使用
 ### 使用可能な選択肢
 1. **PyTorch ROCmバックエンド経由** (非公式だが実用可能)
 2. **Dockerコンテナを使用した環境分離**
 3. **クラウドGPUレンタル** (NVIDIA GPU使用)
 4. **ONNXエクスポート + AMD推論エンジン**
 ---
 ## ROCm環境のセットアップ
 ### 1. ROCmサポート確認
 #### 対応GPU
 - **Instinctシリーズ**: MI100 (gfx908), MI210/MI250/MI250x (gfx90a), MI300A/MI300X/MI325 (gfx942), MI350/MI355 (gfx950)
 - **Radeon RXシリーズ**: RX 7900 GRE以上、一部のRX 6000シリーズ (gfx1030, gfx1100/1101, gfx1200/1201)
 - **統合GPU**: 一部のRyzen APU (環境変数による回避策が必要)
 #### GPU情報の確認
 ```bash
 rocminfo | grep gfx
 ```
 ### 2. PyTorch ROCmのインストール
 #### 方法A: Dockerイメージ（推奨）
 AMDが検証済みのDockerイメージを使用:
 ```bash
 # 最新イメージの取得
 docker pull rocm/pytorch:latest
 # コンテナの起動
 docker run -it \
    --cap-add=SYS_PTRACE \
    --security-opt seccomp=unconfined \
    --device=/dev/kfd \
    --device=/dev/dri \
    --group-add video \
    --ipc=host \
    --shm-size 8G \
    rocm/pytorch:latest
 ```
 **利用可能なタグ（ROCm 7.2.0時点）**:
 - PyTorch 2.9.1 + Python 3.12/3.10
 - PyTorch 2.8.0 + Python 3.12/3.10
 - PyTorch 2.7.1 + Python 3.12/3.10
 #### 方法B: Wheelパッケージ（ベアメタル）
 ```bash
 # 依存関係のインストール
 sudo apt install libjpeg-dev python3-dev python3-pip
 # PyTorch ROCmのインストール
 pip3 install --pre torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/nightly/rocm7.0
 ```
 **注意**: AMDは`repo.radeon.com`で提供されるWHLの使用を推奨（PyTorch.orgのnightly buildsは頻繁に変更されテストが不十分）
 #### 方法C: ソースからビルド
 ```bash
 git clone https://github.com/pytorch/pytorch.git
 cd pytorch
 git submodule update --init --recursive
 # GPUアーキテクチャを指定
 export PYTORCH_ROCM_ARCH=gfx90a  # 例: MI210/MI250の場合
 # ビルド手順に従う
 # (pytorch/pytorch リポジトリのREADME参照)
 ```
 ### 3. インストールの確認
 ```bash
 python3 -c 'import torch; print(torch.cuda.is_available())'
 # 出力: True (GPUが利用可能な場合)
 python3 -c 'import torch; print(torch.version.hip)'
 # 出力: ROCmバージョン（例: '5.7.31921'）
 ```
 ---
 ## デバイス指定の正しい方法
 ### 重要な概念
 **PyTorch ROCmは意図的にCUDAインターフェースを再利用**します。これにより、コード変更を最小限に抑えています。
 ### 基本的なデバイス指定
 ```python
 import torch
 # デフォルトのGPUデバイス
 cuda = torch.device('cuda')
 # 特定のGPUデバイス
 cuda0 = torch.device('cuda:0')  # GPU 0
 cuda1 = torch.device('cuda:1')  # GPU 1
 cuda2 = torch.device('cuda:2')  # GPU 2
 ```
 **注意**: `"rocm"`や`"hip"`は無効なデバイス文字列です。必ず`"cuda"`を使用してください。
 ### HIP vs CUDAの検出
 ```python
 import torch
 if torch.cuda.is_available():
    if torch.version.hip:
        print("Running on AMD GPU with ROCm/HIP")
        print(f"ROCm version: {torch.version.hip}")
    elif torch.version.cuda:
        print("Running on NVIDIA GPU with CUDA")
        print(f"CUDA version: {torch.version.cuda}")
 else:
    print("No GPU available")
 ```
 ### Ultralytics YOLOでのデバイス指定
 Ultralytics APIを使用する場合:
 ```python
 from ultralytics import YOLO
 # モデルの読み込み
 model = YOLO('yolov8n.pt')
 # 推論時のデバイス指定
 # 方法1: 文字列で指定
 results = model.predict('image.jpg', device='cuda:0')
 # 方法2: 整数で指定（GPU番号）
 results = model.predict('image.jpg', device=0)
 # 方法3: CPUを使用
 results = model.predict('image.jpg', device='cpu')
 # トレーニング時のデバイス指定
 model.train(data='coco.yaml', epochs=100, device=0)
 ```
 ---
 ## Ultralytics YOLOでの使用方法
 ### 基本的な推論
 ```python
 from ultralytics import YOLO
 import torch
 # GPU確認
 print(f"CUDA available: {torch.cuda.is_available()}")
 print(f"ROCm version: {torch.version.hip if torch.version.hip else 'N/A'}")
 # モデルのロード
 model = YOLO('yolov8n.pt')
 # 推論の実行
 results = model('path/to/image.jpg', device=0)
 # 結果の処理
 for result in results:
    boxes = result.boxes  # Boxes object
    masks = result.masks  # Masks object (セグメンテーションの場合)
    probs = result.probs  # 分類の場合
 ```
 ### バッチ処理
 ```python
 from ultralytics import YOLO
 from pathlib import Path
 model = YOLO('yolov8n.pt')
 # 画像リストで推論
 image_folder = Path('path/to/images')
 image_paths = list(image_folder.glob('*.jpg'))
 # バッチサイズを指定して推論
 results = model.predict(image_paths, device=0, batch=16)
 for i, result in enumerate(results):
    result.save(filename=f'result_{i}.jpg')
 ```
 ### トレーニング
 ```python
 from ultralytics import YOLO
 # モデルの作成
 model = YOLO('yolov8n.yaml')
 # トレーニングの実行
 results = model.train(
    data='coco.yaml',
    epochs=100,
    imgsz=640,
    device=0,  # AMD GPUを使用
    batch=16,
    workers=8
 )
 ```
 ### モデルのエクスポート（ONNX）
 AMD環境での推論最適化のため、ONNXにエクスポート:
 ```python
 from ultralytics import YOLO
 model = YOLO('yolov8n.pt')
 # ONNXフォーマットにエクスポート
 model.export(format='onnx', dynamic=True)
 ```
 エクスポート後、AMD MIGraphXで最適化:
 ```bash
 migraphx-driver compile ./yolov8n.onnx \
    --optimize \
    --gpu \
    --enable-offload-copy \
    --binary \
    -o yolov8n.mxr
 ```
 ---
 ## トラブルシューティング
 ### 1. GPU認識されない
 #### 症状
 ```python
 torch.cuda.is_available()  # False
 ```
 #### 解決策
 **ユーザー権限の確認**:
 ```bash
 # ユーザーを適切なグループに追加
 sudo usermod -a -G video $USER
 sudo usermod -a -G render $USER
 # 再ログインして反映
 ```
 **デバイスアクセスの確認**:
 ```bash
 ls -la /dev/kfd /dev/dri/
 ```
 **ROCmインストールの確認**:
 ```bash
 rocm-smi
 # GPUリストが表示されるはず
 ```
 ### 2. hipErrorNoBinaryForGpu
 #### 症状
 ```
 RuntimeError: HIP error: hipErrorNoBinaryForGpu
 ```
 #### 原因
 PyTorchが対象GPUアーキテクチャ用にコンパイルされていない
 #### 解決策
 **GPUアーキテクチャの確認**:
 ```bash
 rocminfo | grep gfx
 ```
 **環境変数による回避** (統合GPUやサポート外GPU):
 ```bash
 # gfx90cの場合、gfx900として認識させる
 export HSA_OVERRIDE_GFX_VERSION=9.0.0
 # 再度Pythonを実行
 python3 your_script.py
 ```
 **ソースからビルドする場合**:
 ```bash
 export PYTORCH_ROCM_ARCH=gfx1030  # 自分のGPUアーキテクチャを指定
 # PyTorchをビルド
 ```
 ### 3. GPU転送のハング/失敗
 #### 症状
 - スクリプトが無言でハング
 - GPU-CPU間のデータ転送が失敗
 #### 解決策
 **PCIe Atomicsサポートの問題** (コンシューマー向けマザーボード):
 ```bash
 # SDMA（ダイレクトメモリアクセス）を無効化
 export HSA_ENABLE_SDMA=0
 python3 your_script.py
 ```
 この設定なしでは「GPU memory transfers will silently hang/fail」する可能性があります。
 ### 4. ROCmバージョンの非互換性
 #### 症状
 ```
 ImportError: libMIOpen.so.1: cannot open shared object file
 ```
 #### 解決策
 **特定バージョンのROCmを使用**:
 - ROCm 5.7が推奨される場合が多い
 - ROCm 6.0.0は一部環境で動作しない報告あり
 **Dockerイメージを使用**してバージョンを固定:
 ```bash
 docker pull rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
 ```
 ### 5. メモリ不足エラー
 #### 症状
 ```
 RuntimeError: HIP out of memory
 ```
 #### 解決策
 **メモリ使用状況の確認**:
 ```python
 import torch
 print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
 print(f"Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
 ```
 **キャッシュのクリア**:
 ```python
 torch.cuda.empty_cache()
 ```
 **バッチサイズの削減**:
 ```python
 model.train(data='coco.yaml', batch=8)  # デフォルトより小さく
 ```
 **キャッシュアロケータの無効化** (デバッグ用):
 ```bash
 export PYTORCH_NO_HIP_MEMORY_CACHING=1
 ```
 ### 6. パフォーマンスが遅い
 #### 最適化のヒント
 **hipBLASワークスペースの調整**:
 ```bash
 # 形式は :[SIZE]:[COUNT]（SIZEはKiB単位）。デフォルトは32 MiB、MI300以降は128 MiB
 export HIPBLAS_WORKSPACE_CONFIG=:131072:1  # 128 MiB × 1
 ```
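 **Composable Kernel (CK) の有効化** (対応GPU、CKを有効にしてビルドされたPyTorchが必要):
 ```python
 import torch
 # GEMMのバックエンドとしてComposable Kernelを選択（ROCm専用）
 torch.backends.cuda.preferred_blas_library("ck")
 # 注意: TF32はROCmでは非対応のため、allow_tf32を設定してもROCm上では効果がない
 torch.backends.cuda.matmul.allow_tf32 = True
 ```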
 **混合精度トレーニング**:
 ```python
 from ultralytics import YOLO
 model = YOLO('yolov8n.pt')
 model.train(
    data='coco.yaml',
    epochs=100,
    amp=True,  # Automatic Mixed Precision
    device=0
 )
 ```
 ---
 ## コミュニティ実装例
 ### AMD統合GPU向けYOLOv8実装
 GitHub: [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8)
 #### 主な特徴
 - DockerベースのROCm + PyTorch環境
 - 統合GPUに特化した環境変数設定
 - YOLOv5/YOLOv8の推論例
 - MIGraphXを使った本番デプロイ
 #### セットアップ例
 ```bash
 # Dockerイメージのビルド
 docker build -t rocm-pytorch .
 # ラッパースクリプトでPythonを実行
 ./rocm_python yolo8.py
 ```
 #### 必須環境変数
 ```bash
 # PCIe Atomics非サポート対策
 export HSA_ENABLE_SDMA=0
 # GPUアーキテクチャのオーバーライド
 export HSA_OVERRIDE_GFX_VERSION=9.0.0  # gfx90cの場合
 ```
 #### パフォーマンス実測値
 **テスト環境**: AMD Ryzen 3 5400U (統合GPU)
 - YOLOv8n (640x640): 約50 FPS (0.02秒/画像)
 - YOLOv8n (320x320): 約140 FPS
 #### MIGraphXデプロイ
 ```bash
 # 1. ONNXエクスポート
 python3 -c "from ultralytics import YOLO; YOLO('yolov8n.pt').export(format='onnx')"
 # 2. MIGraphXバイナリにコンパイル
 migraphx-driver compile ./yolov8n.onnx \
    --optimize \
    --gpu \
    --enable-offload-copy \
    --binary \
    -o yolov8n.mxr
 # 3. MIGraphX Python APIで推論
 # (PyTorch依存なし、高速化)
 ```
 ---
 ## 制限事項と注意点
 ### PyTorch ROCmの制限
 1. **TensorFloat-32 (TF32)**: ROCmでは非対応
 2. **分散トレーニング**: NCCLとGlooバックエンドのみサポート
 3. **hipFFT/rocFFT**: プランキャッシュサイズの設定は非サポート
 ### Ultralytics YOLO固有の問題
 1. **公式サポートなし**: UltralyticsはROCmを公式にサポートしていないため、ROCm固有の問題に対する技術サポートは期待できない
 2. **パフォーマンス**: NVIDIA CUDAと比較して遅い場合がある
 3. **互換性**: 全機能が動作する保証はない（特に新機能）
 ### 推奨される使用ケース
 **ROCmが適している場合**:
 - 既にAMD GPUを所有している
 - 開発/テスト環境
 - 小～中規模のデータセット
 **クラウドGPUを検討すべき場合**:
 - 本番環境での大規模トレーニング
 - 最高のパフォーマンスが必要
 - 公式サポートが必要
 ---
 ## 完全な動作例
 ### 統合GPU環境での推論スクリプト
 ```python
 #!/usr/bin/env python3
 """
 AMD GPU (ROCm) 向け Ultralytics YOLO 推論スクリプト
 """
 import os
 import torch
 from ultralytics import YOLO
 # 必須環境変数の設定（統合GPUの場合）
 os.environ['HSA_ENABLE_SDMA'] = '0'
 os.environ['HSA_OVERRIDE_GFX_VERSION'] = '9.0.0'  # 自分のGPUに合わせて調整
 def check_gpu():
    """GPU利用可能性の確認"""
    print("=" * 50)
    print("GPU情報")
    print("=" * 50)
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU count: {torch.cuda.device_count()}")
        print(f"Current device: {torch.cuda.current_device()}")
        print(f"Device name: {torch.cuda.get_device_name(0)}")
        print(f"ROCm version: {torch.version.hip}")
        print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    print("=" * 50)
 def main():
    # GPU確認
    check_gpu()
    if not torch.cuda.is_available():
        print("警告: GPUが利用できません。CPUで実行します。")
        device = 'cpu'
    else:
        device = 0  # GPU 0を使用
    # モデルのロード
    print(f"\nモデルをロード中... (device={device})")
    model = YOLO('yolov8n.pt')
    # 推論の実行
    print("\n推論を実行中...")
    results = model.predict(
        source='path/to/image.jpg',
        device=device,
        conf=0.25,
        iou=0.7,
        imgsz=640,
        save=True,
        save_txt=True
    )
    # 結果の表示
    for i, result in enumerate(results):
        print(f"\n結果 {i+1}:")
        print(f"  検出数: {len(result.boxes)}")
        print(f"  処理時間: {result.speed['inference']:.2f}ms")
        # ボックス情報
        for box in result.boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])
            print(f"    クラス: {model.names[cls]}, 信頼度: {conf:.2f}")
    # メモリ解放
    torch.cuda.empty_cache()
    print("\n完了!")
 if __name__ == '__main__':
    main()
 ```
 ### Dockerを使った実行例
 ```dockerfile
 # Dockerfile
 FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
 # Ultralyticsのインストール
 RUN pip install ultralytics opencv-python
 # 作業ディレクトリ
 WORKDIR /workspace
 # 環境変数の設定
 ENV HSA_ENABLE_SDMA=0
 ENV HSA_OVERRIDE_GFX_VERSION=9.0.0
 CMD ["/bin/bash"]
 ```
 ```bash
 # ビルドと実行
 docker build -t yolo-rocm .
 docker run -it --rm \
    --device=/dev/kfd \
    --device=/dev/dri \
    --group-add video \
    --ipc=host \
    --shm-size 8G \
    -v $(pwd):/workspace \
    yolo-rocm \
    python3 inference.py
 ```
 ---
 ## 参考資料
 ### 公式ドキュメント
 - [PyTorch HIP (ROCm) Semantics](https://docs.pytorch.org/docs/stable/notes/hip.html)
 - [PyTorch on ROCm Installation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html)
 - [ROCm PyTorch Compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html)
 - [Ultralytics YOLO Documentation](https://docs.ultralytics.com/)
 ### GitHub Issues & Discussions
 - [AMD GPU support and optimisation - YOLOv5 #2995](https://github.com/ultralytics/yolov5/issues/2995)
 - [Direct support for AMD GPUs/ROCm - Ultralytics #10323](https://github.com/ultralytics/ultralytics/issues/10323)
 - [Running YOLOV8 on non CUDA GPU - Discussion #10066](https://github.com/orgs/ultralytics/discussions/10066)
 ### コミュニティリソース
 - [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8) - AMD統合GPU向け実装例
 - [ROCm Docker Hub](https://hub.docker.com/r/rocm/pytorch) - 公式Dockerイメージ
 - [PyTorch for AMD ROCm Platform Blog](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/)
 ### トラブルシューティングリソース
 - [How to run torch with AMD gpu? - PyTorch Forums](https://discuss.pytorch.org/t/how-to-run-torch-with-amd-gpu/157069)
 - [Install AMD GPU ROCm and PyTorch on Ubuntu - GitHub Gist](https://gist.github.com/jurgonaut/462a6bd9b87ed085fa0fe6c893536993)
 ---
 ## まとめ
 ### 重要なポイント
 1. **公式サポートなし**: Ultralytics YOLOはROCmをネイティブサポートしていないが、PyTorchのROCmバックエンド経由で動作可能
 2. **デバイス指定**: `device='cuda'` または `device=0` を使用（`'rocm'`や`'hip'`は無効）
 3. **環境変数が重要**: 統合GPUや一部のコンシューマーGPUでは `HSA_ENABLE_SDMA=0` と `HSA_OVERRIDE_GFX_VERSION` が必須
 4. **Dockerが推奨**: 環境の一貫性とトラブル回避のため、公式ROCm PyTorchイメージの使用を推奨
 5. **パフォーマンス**: NVIDIA CUDAより遅い場合があるが、小～中規模の用途では実用的
 ### 次のステップ
 - 自分のGPUアーキテクチャを確認 (`rocminfo | grep gfx`)
 - Docker環境でテスト推論を実行
 - パフォーマンスが不十分な場合はONNX + MIGraphXを検討
 - 本番環境ではクラウドGPUの使用を検討
 ---
 **ドキュメント作成**: Claude Code (Sonnet 4.5)
 **最終更新**: 2026-02-07
--- a/flake.nix
+++ b/flake.nix
@ -15,32 +15,89 @@
    flake-utils.lib.eachDefaultSystem (
      system:
      let
-        pkgs = nixpkgs.legacyPackages.${system};
+        pkgs = import nixpkgs {
          inherit system;
          config.allowUnfree = true;
        };
      in
      {
        devShells.default = pkgs.mkShell {
          buildInputs = with pkgs; [
            # Python環境
            python311
            python311Packages.pip
-            python311Packages.requests
+            python311Packages.virtualenv
            python311Packages.fastapi
            python311Packages.uvicorn
            python311Packages.numpy
            python311Packages.opencv4
            python311Packages.onnxruntime
            git
            # C/C++標準ライブラリ（PyTorchなどに必要）
            stdenv.cc.cc.lib
            zlib
            zstd
            # ROCm関連（AMD GPU推論に必要）
            rocmPackages.clr
            rocmPackages.rocm-smi
            rocmPackages.rocm-runtime
          ];
          shellHook = ''
            python --version
            blender --version | head -n 1
            # ROCm環境変数
            export ROCM_PATH="${pkgs.rocmPackages.clr}"
            export HSA_OVERRIDE_GFX_VERSION="11.0.0"  # RX 7900 (RDNA 3 / gfx1100)
            # LD_LIBRARY_PATH: ROCm、C++標準ライブラリ、その他必要なライブラリ
            export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib:$LD_LIBRARY_PATH"
            # venvのセットアップ
            VENV_DIR="$PWD/.venv"
            if [ ! -d "$VENV_DIR" ]; then
              echo "[Setup] Creating Python virtual environment..."
              python -m venv "$VENV_DIR"
            fi
            # venvをアクティベート
            source "$VENV_DIR/bin/activate"
            # 必要なパッケージのインストール確認とインストール
            if ! python -c "import torch; print(torch.cuda.is_available())" 2>/dev/null | grep -q "True"; then
              echo "[Setup] Installing Python dependencies..."
              # まずPyTorch ROCm版をインストール（ROCm 6.2用）
              pip install --quiet torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2
              # 次に通常のPyPIから他のパッケージをインストール
              pip install --quiet \
                ultralytics \
                opencv-python-headless \
                numpy \
                fastapi \
                uvicorn \
                pydantic
              # opencv-pythonがインストールされていたら削除（headless版のみ使用）
              pip uninstall -y opencv-python opencv 2>/dev/null || true
              # opencv-python-headlessを再インストールして確実にする
              pip install --quiet --force-reinstall opencv-python-headless
              echo "[Setup] Dependencies installed successfully"
            fi
            # Pythonパスにカレントディレクトリを追加
            export PYTHONPATH="$PWD:$PYTHONPATH"
            # アドオンのインストールパスを環境変数として設定
            export BLENDER_USER_SCRIPTS="$HOME/.config/blender/5.0/scripts"
            export BLENDER_USER_ADDONS="$BLENDER_USER_SCRIPTS/addons"
            # 環境変数をファイルに保存（サーバープロセス用）
            cat > "$PWD/.env" << EOF
 LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib
 ROCM_PATH=${pkgs.rocmPackages.clr}
 HSA_OVERRIDE_GFX_VERSION=11.0.0
 PYTORCH_ROCM_ARCH=gfx1100
 ROCBLAS_TENSILE_LIBPATH=${pkgs.rocmPackages.clr}/lib/rocblas/library
 EOF
            echo "[Setup] Environment ready with GPU support"
          '';
        };
      }
--- a/server/detector.py
+++ b/server/detector.py
@ -1,8 +1,8 @@
 """
-YOLOv11 Face Detector using ONNX Runtime with GPU support.
+YOLOv8 Face Detector using PyTorch with ROCm support.
 This module provides high-performance face detection using
-YOLOv11-face model with CUDA acceleration.
+YOLOv8-face model with AMD GPU (ROCm) acceleration.
 """
 import os
@ -13,16 +13,16 @@ import numpy as np
 class YOLOFaceDetector:
    """
-    YOLOv11 face detector with ONNX Runtime GPU support.
+    YOLOv8 face detector with PyTorch ROCm support.
    Features:
-    - CUDA GPU acceleration
+    - ROCm GPU acceleration for AMD GPUs
    - High accuracy face detection
-    - NMS for overlapping detections
+    - Automatic NMS for overlapping detections
    """
    # Default model path relative to this file
-    DEFAULT_MODEL = "yolov11n-face.onnx"
+    DEFAULT_MODEL = "yolov8n-face-lindevs.pt"
    def __init__(
        self,
@ -35,7 +35,7 @@ class YOLOFaceDetector:
        Initialize the YOLO face detector.
        Args:
-            model_path: Path to ONNX model file. If None, uses default model.
+            model_path: Path to PyTorch model file. If None, uses default model.
            conf_threshold: Confidence threshold for detections
            iou_threshold: IoU threshold for NMS
            input_size: Model input size (width, height)
@ -43,14 +43,16 @@ class YOLOFaceDetector:
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_size = input_size
-        self._session = None
+        self._model = None
        self._model_path = model_path
        self._device = None
    @property
-    def session(self):
+    def model(self):
-        """Lazy-load ONNX Runtime session."""
+        """Lazy-load YOLO model."""
-        if self._session is None:
+        if self._model is None:
-            import onnxruntime as ort
+            from ultralytics import YOLO
            import torch
            # Determine model path
            if self._model_path is None:
@ -63,27 +65,30 @@ class YOLOFaceDetector:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model not found: {model_path}")
-            # Configure providers (prefer CUDA)
+            # Detect device (ROCm GPU or CPU)
-            providers = []
+            if torch.cuda.is_available():
-            if 'CUDAExecutionProvider' in ort.get_available_providers():
+                self._device = 'cuda'
-                providers.append('CUDAExecutionProvider')
+                device_name = torch.cuda.get_device_name(0)
-                print("[FaceMask] Using CUDA GPU for inference")
+                print(f"[FaceMask] Using ROCm GPU for inference: {device_name}")
-            providers.append('CPUExecutionProvider')
+            else:
                self._device = 'cpu'
                print("[FaceMask] Using CPU for inference (ROCm GPU not available)")
-            # Create session
+            # Load model (let Ultralytics handle device management)
-            sess_options = ort.SessionOptions()
+            try:
-            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+                self._model = YOLO(model_path)
-            
+                # Don't call .to() - let predict() handle device assignment
-            self._session = ort.InferenceSession(
+                print(f"[FaceMask] Model loaded, will use device: {self._device}")
-                model_path,
+            except Exception as e:
-                sess_options=sess_options,
+                print(f"[FaceMask] Error loading model: {e}")
-                providers=providers,
+                import traceback
-            )
+                traceback.print_exc()
                raise
            print(f"[FaceMask] YOLO model loaded: {model_path}")
-            print(f"[FaceMask] Providers: {self._session.get_providers()}")
+            print(f"[FaceMask] Device: {self._device}")
-        return self._session
+        return self._model
    def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
        """
@ -95,219 +100,53 @@ class YOLOFaceDetector:
        Returns:
            List of detections as (x, y, width, height, confidence)
        """
        import cv2
        original_height, original_width = frame.shape[:2]
        input_tensor = self._preprocess(frame)
        # print(f"[DEBUG] Input tensor shape: {input_tensor.shape}, Range: [{input_tensor.min():.3f}, {input_tensor.max():.3f}]", flush=True)
        # Run inference
-        input_name = self.session.get_inputs()[0].name
+        import torch
-        outputs = self.session.run(None, {input_name: input_tensor})
+        print(f"[FaceMask] Inference device: {self._device}, CUDA available: {torch.cuda.is_available()}")
        try:
            results = self.model.predict(
                frame,
                conf=self.conf_threshold,
                iou=self.iou_threshold,
                imgsz=self.input_size[0],
                verbose=False,
                device=self._device,
            )
        except Exception as e:
            print(f"[FaceMask] ERROR during inference: {e}")
            import traceback
            traceback.print_exc()
            # Fallback to CPU
            print("[FaceMask] Falling back to CPU inference...")
            self._device = 'cpu'
            results = self.model.predict(
                frame,
                conf=self.conf_threshold,
                iou=self.iou_threshold,
                imgsz=self.input_size[0],
                verbose=False,
                device='cpu',
            )
-        raw_output = outputs[0]
+        # Extract detections
        # print(f"[DEBUG] Raw output shape: {raw_output.shape}, Range: [{raw_output.min():.3f}, {raw_output.max():.3f}]", flush=True)
        # Postprocess
        detections = self._postprocess(
            raw_output,
            original_width,
            original_height,
        )
        # print(f"[DEBUG] Detections found: {len(detections)}", flush=True)
        return detections
    def _preprocess(self, frame: np.ndarray) -> np.ndarray:
        """Preprocess frame for YOLO input with letterbox resizing."""
        import cv2
        # Letterbox resize
        shape = frame.shape[:2]  # current shape [height, width]
        new_shape = self.input_size
        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        dw /= 2  # divide padding into 2 sides
        dh /= 2
        if shape[::-1] != new_unpad:  # resize
            frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR)
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        # Add border
        frame = cv2.copyMakeBorder(frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
        # Store metadata for postprocessing
        self._last_letterbox_meta = {'ratio': ratio, 'dwdh': (dw, dh)}
        # Convert BGR to RGB
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Normalize to [0, 1]
        normalized = rgb.astype(np.float32) / 255.0
        # Transpose to CHW format
        transposed = np.transpose(normalized, (2, 0, 1))
        # Add batch dimension
        batched = np.expand_dims(transposed, axis=0)
        return batched
    def _postprocess(
        self,
        output: np.ndarray,
        original_width: int,
        original_height: int,
    ) -> List[Tuple[int, int, int, int, float]]:
        """
        Postprocess YOLO output to get detections.
        """
        # Output shape: [1, num_detections, 5+] where 5 = x_center, y_center, w, h, conf
        # Handle different output formats
        if output.shape[1] < output.shape[2]:
            # Format: [1, 5+, num_detections] - transpose
            output = np.transpose(output[0], (1, 0))
        else:
            output = output[0]
        # Debug confidence stats
        # if output.shape[1] >= 5:
        #     max_conf = output[:, 4].max()
        #     print(f"[DEBUG] Max confidence in raw output: {max_conf:.4f}", flush=True)
        # Filter by confidence
        confidences = output[:, 4]
        mask = confidences > self.conf_threshold
        filtered = output[mask]
        if len(filtered) == 0:
            return []
        # Get letterbox metadata
        if hasattr(self, '_last_letterbox_meta') and self._last_letterbox_meta:
            ratio = self._last_letterbox_meta['ratio']
            dw, dh = self._last_letterbox_meta['dwdh']
            # Extract coordinates
            x_center = filtered[:, 0]
            y_center = filtered[:, 1]
            width = filtered[:, 2]
            height = filtered[:, 3]
            confidences = filtered[:, 4]
            # Convert center to corner
            x1 = x_center - width / 2
            y1 = y_center - height / 2
            x2 = x_center + width / 2
            y2 = y_center + height / 2
            # Adjust for letterbox padding
            x1 -= dw
            y1 -= dh
            x2 -= dw
            y2 -= dh
            # Adjust for resizing
            x1 /= ratio[0]
            y1 /= ratio[1]
            x2 /= ratio[0]
            y2 /= ratio[1]
            # Clip to image bounds
            x1 = np.clip(x1, 0, original_width)
            y1 = np.clip(y1, 0, original_height)
            x2 = np.clip(x2, 0, original_width)
            y2 = np.clip(y2, 0, original_height)
            # Convert back to x, y, w, h
            final_x = x1
            final_y = y1
            final_w = x2 - x1
            final_h = y2 - y1
        else:
            # Fallback for non-letterbox (legacy)
            scale_x = original_width / self.input_size[0]
            scale_y = original_height / self.input_size[1]
            x_center = filtered[:, 0] * scale_x
            y_center = filtered[:, 1] * scale_y
            width = filtered[:, 2] * scale_x
            height = filtered[:, 3] * scale_y
            confidences = filtered[:, 4]
            final_x = x_center - width / 2
            final_y = y_center - height / 2
            final_w = width
            final_h = height
        # Apply NMS
        boxes = np.stack([final_x, final_y, final_w, final_h], axis=1)
        indices = self._nms(boxes, confidences, self.iou_threshold)
        # Format output
        detections = []
-        for i in indices:
+        if len(results) > 0 and results[0].boxes is not None:
-            x = int(final_x[i])
+            boxes = results[0].boxes
-            y = int(final_y[i])
+            for box in boxes:
-            w = int(final_w[i])
+                # Get coordinates in xyxy format
-            h = int(final_h[i])
+                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
-            conf = float(confidences[i])
+                conf = float(box.conf[0].cpu().numpy())
-            detections.append((x, y, w, h, conf))
+
                # Convert to x, y, width, height
                x = int(x1)
                y = int(y1)
                w = int(x2 - x1)
                h = int(y2 - y1)
                detections.append((x, y, w, h, conf))
        return detections
    def _nms(
        self,
        boxes: np.ndarray,
        scores: np.ndarray,
        iou_threshold: float,
    ) -> List[int]:
        """Non-Maximum Suppression."""
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = x1 + boxes[:, 2]
        y2 = y1 + boxes[:, 3]
        areas = boxes[:, 2] * boxes[:, 3]
        order = scores.argsort()[::-1]
        keep = []
        while len(order) > 0:
            i = order[0]
            keep.append(i)
            if len(order) == 1:
                break
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])
            w = np.maximum(0, xx2 - xx1)
            h = np.maximum(0, yy2 - yy1)
            inter = w * h
            iou = inter / (areas[i] + areas[order[1:]] - inter)
            inds = np.where(iou <= iou_threshold)[0]
            order = order[inds + 1]
        return keep
    def generate_mask(
        self,
        frame_shape: Tuple[int, int, int],
--- a/server/main.py
+++ b/server/main.py
@ -74,8 +74,8 @@ def process_video_task(task_id: str, req: GenerateRequest):
            conf_threshold=req.conf_threshold,
            iou_threshold=req.iou_threshold
        )
-        # Ensure session is loaded
+        # Ensure model is loaded
-        _ = detector.session
+        _ = detector.model
        # Open video
        cap = cv2.VideoCapture(req.video_path)