From 0f150e8a0adf6f7b2db00e8e97d142327f812e45 Mon Sep 17 00:00:00 2001
From: Hare <kei.hiracchi.0928@gmail.com>
Date: Sat, 7 Feb 2026 07:47:23 +0900
Subject: [PATCH] WIP

---
 .gitignore                          |   2 +
 core/inference_client.py            |  39 +-
 docs/ultralytics-yolo-rocm-guide.md | 696 ++++++++++++++++++++++++++++
 flake.nix                           |  71 ++-
 server/detector.py                  | 359 ++++----------
 server/main.py                      |   4 +-
 6 files changed, 895 insertions(+), 276 deletions(-)
 create mode 100644 docs/ultralytics-yolo-rocm-guide.md

diff --git a/.gitignore b/.gitignore
index 8508b72..a4b2dba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.so
+.venv/
 
 # Blender
 *.blend1
@@ -16,3 +17,4 @@ __pycache__/
 # 環境
 .direnv/
 .envrc.local
+.env
diff --git a/core/inference_client.py b/core/inference_client.py
index da43b65..0a17143 100644
--- a/core/inference_client.py
+++ b/core/inference_client.py
@@ -37,16 +37,41 @@ class InferenceClient:
             # Assuming this file is in core/inference_client.py
             root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
             server_script = os.path.join(root_dir, "server", "main.py")
-            
-            # Use system python (assumed to have dependencies via Nix/venv)
-            # In user's environment, 'python' should refer to the environment python
-            python_cmd = "python"
-            
-            # Start process
+
+            # Prepare environment variables for server process
+            server_env = os.environ.copy()
+
+            # Load environment variables from .env file if it exists
+            env_file = os.path.join(root_dir, ".env")
+            if os.path.exists(env_file):
+                with open(env_file, 'r') as f:
+                    for line in f:
+                        line = line.strip()
+                        if line and not line.startswith('#') and '=' in line:
+                            key, value = line.split('=', 1)
+                            server_env[key] = value
+                print(f"[FaceMask] Loaded environment from: {env_file}")
+
+            # Ensure PYTHONPATH includes project root
+            pythonpath = server_env.get('PYTHONPATH', '')
+            if pythonpath:
+                server_env['PYTHONPATH'] = f"{root_dir}:{pythonpath}"
+            else:
+                server_env['PYTHONPATH'] = root_dir
+
+            # If there's a venv in the project, add it to PATH
+            venv_bin = os.path.join(root_dir, ".venv", "bin")
+            if os.path.isdir(venv_bin):
+                current_path = server_env.get('PATH', '')
+                server_env['PATH'] = f"{venv_bin}:{current_path}"
+                print(f"[FaceMask] Using venv from: {venv_bin}")
+
+            # Start process with 'python' command (will use venv if PATH is set correctly)
             self.server_process = subprocess.Popen(
-                [python_cmd, server_script],
+                ["python", server_script],
                 cwd=root_dir,
                 text=True,
+                env=server_env,
                 preexec_fn=os.setsid,  # Create new process group
             )
             
diff --git a/docs/ultralytics-yolo-rocm-guide.md b/docs/ultralytics-yolo-rocm-guide.md
new file mode 100644
index 0000000..460ab18
--- /dev/null
+++ b/docs/ultralytics-yolo-rocm-guide.md
@@ -0,0 +1,696 @@
+# Ultralytics YOLO with AMD GPU (ROCm) - 完全ガイド
+
+**取得日**: 2026-02-07
+**対象**: Ultralytics YOLO (YOLOv5/YOLOv8/YOLO11)
+**プラットフォーム**: AMD GPU + ROCm
+
+---
+
+## 目次
+
+1. [公式サポート状況](#公式サポート状況)
+2. [ROCm環境のセットアップ](#rocm環境のセットアップ)
+3. [デバイス指定の正しい方法](#デバイス指定の正しい方法)
+4. [Ultralytics YOLOでの使用方法](#ultralytics-yoloでの使用方法)
+5. [トラブルシューティング](#トラブルシューティング)
+6. [コミュニティ実装例](#コミュニティ実装例)
+7. [参考資料](#参考資料)
+
+---
+
+## 公式サポート状況
+
+### Ultralyticsの公式見解
+
+**重要**: Ultralytics YOLOは現在、AMD ROCmのネイティブサポートを提供していません。
+
+- **Issue #10323** (2024年4月25日開設、2024年6月8日に"not planned"としてクローズ)
+  - Glenn Jocher氏（Ultralyticsチームメンバー）の回答:
+    > "YOLOv8 primarily supports Nvidia CUDA for GPU acceleration"
+  - ネイティブROCmサポートの即時計画はない
+  - コミュニティによるPull Requestは歓迎
+
+- **推奨される回避策**: PyTorchのROCm互換性レイヤーを使用
+
+### 使用可能な選択肢
+
+1. **PyTorch ROCmバックエンド経由** (非公式だが実用可能)
+2. **Dockerコンテナを使用した環境分離**
+3. **クラウドGPUレンタル** (NVIDIA GPU使用)
+4. **ONNXエクスポート + AMD推論エンジン**
+
+---
+
+## ROCm環境のセットアップ
+
+### 1. ROCmサポート確認
+
+#### 対応GPU
+
+- **Instinctシリーズ**: MI100 (gfx908), MI210/MI250/MI250x (gfx90a), MI300A/MI300X/MI325 (gfx942), MI350/MI355 (gfx950)
+- **Radeon RXシリーズ**: RX 7900 GRE以上、一部のRX 6000シリーズ (gfx1030, gfx1100/1101, gfx1200/1201)
+- **統合GPU**: 一部のRyzen APU (環境変数による回避策が必要)
+
+#### GPU情報の確認
+
+```bash
+rocminfo | grep gfx
+```
+
+### 2. PyTorch ROCmのインストール
+
+#### 方法A: Dockerイメージ（推奨）
+
+AMDが検証済みのDockerイメージを使用:
+
+```bash
+# 最新イメージの取得
+docker pull rocm/pytorch:latest
+
+# コンテナの起動
+docker run -it \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --group-add video \
+    --ipc=host \
+    --shm-size 8G \
+    rocm/pytorch:latest
+```
+
+**利用可能なタグ（ROCm 7.2.0時点）**:
+- PyTorch 2.9.1 + Python 3.12/3.10
+- PyTorch 2.8.0 + Python 3.12/3.10
+- PyTorch 2.7.1 + Python 3.12/3.10
+
+#### 方法B: Wheelパッケージ（ベアメタル）
+
+```bash
+# 依存関係のインストール
+sudo apt install libjpeg-dev python3-dev python3-pip
+
+# PyTorch ROCmのインストール
+pip3 install --pre torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/nightly/rocm7.0
+```
+
+**注意**: AMDは`repo.radeon.com`で提供されるWHLの使用を推奨（PyTorch.orgのnightly buildsは頻繁に変更されテストが不十分）
+
+#### 方法C: ソースからビルド
+
+```bash
+git clone https://github.com/pytorch/pytorch.git
+cd pytorch
+git submodule update --init --recursive
+
+# GPUアーキテクチャを指定
+export PYTORCH_ROCM_ARCH=gfx90a  # 例: MI210/MI250の場合
+
+# ビルド手順に従う
+# (pytorch/pytorch リポジトリのREADME参照)
+```
+
+### 3. インストールの確認
+
+```bash
+python3 -c 'import torch; print(torch.cuda.is_available())'
+# 出力: True (GPUが利用可能な場合)
+
+python3 -c 'import torch; print(torch.version.hip)'
+# 出力: ROCmバージョン（例: '5.7.31921'）
+```
+
+---
+
+## デバイス指定の正しい方法
+
+### 重要な概念
+
+**PyTorch ROCmは意図的にCUDAインターフェースを再利用**します。これにより、コード変更を最小限に抑えています。
+
+### 基本的なデバイス指定
+
+```python
+import torch
+
+# デフォルトのGPUデバイス
+cuda = torch.device('cuda')
+
+# 特定のGPUデバイス
+cuda0 = torch.device('cuda:0')  # GPU 0
+cuda1 = torch.device('cuda:1')  # GPU 1
+cuda2 = torch.device('cuda:2')  # GPU 2
+```
+
+**注意**: `"rocm"`や`"hip"`は無効なデバイス文字列です。必ず`"cuda"`を使用してください。
+
+### HIP vs CUDAの検出
+
+```python
+import torch
+
+if torch.cuda.is_available():
+    if torch.version.hip:
+        print("Running on AMD GPU with ROCm/HIP")
+        print(f"ROCm version: {torch.version.hip}")
+    elif torch.version.cuda:
+        print("Running on NVIDIA GPU with CUDA")
+        print(f"CUDA version: {torch.version.cuda}")
+else:
+    print("No GPU available")
+```
+
+### Ultralytics YOLOでのデバイス指定
+
+Ultralytics APIを使用する場合:
+
+```python
+from ultralytics import YOLO
+
+# モデルの読み込み
+model = YOLO('yolov8n.pt')
+
+# 推論時のデバイス指定
+# 方法1: 文字列で指定
+results = model.predict('image.jpg', device='cuda:0')
+
+# 方法2: 整数で指定（GPU番号）
+results = model.predict('image.jpg', device=0)
+
+# 方法3: CPUを使用
+results = model.predict('image.jpg', device='cpu')
+
+# トレーニング時のデバイス指定
+model.train(data='coco.yaml', epochs=100, device=0)
+```
+
+---
+
+## Ultralytics YOLOでの使用方法
+
+### 基本的な推論
+
+```python
+from ultralytics import YOLO
+import torch
+
+# GPU確認
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"ROCm version: {torch.version.hip if torch.version.hip else 'N/A'}")
+
+# モデルのロード
+model = YOLO('yolov8n.pt')
+
+# 推論の実行
+results = model('path/to/image.jpg', device=0)
+
+# 結果の処理
+for result in results:
+    boxes = result.boxes  # Boxes object
+    masks = result.masks  # Masks object (セグメンテーションの場合)
+    probs = result.probs  # 分類の場合
+```
+
+### バッチ処理
+
+```python
+from ultralytics import YOLO
+from pathlib import Path
+
+model = YOLO('yolov8n.pt')
+
+# 画像リストで推論
+image_folder = Path('path/to/images')
+image_paths = list(image_folder.glob('*.jpg'))
+
+# バッチサイズを指定して推論
+results = model.predict(image_paths, device=0, batch=16)
+
+for i, result in enumerate(results):
+    result.save(filename=f'result_{i}.jpg')
+```
+
+### トレーニング
+
+```python
+from ultralytics import YOLO
+
+# モデルの作成
+model = YOLO('yolov8n.yaml')
+
+# トレーニングの実行
+results = model.train(
+    data='coco.yaml',
+    epochs=100,
+    imgsz=640,
+    device=0,  # AMD GPUを使用
+    batch=16,
+    workers=8
+)
+```
+
+### モデルのエクスポート（ONNX）
+
+AMD環境での推論最適化のため、ONNXにエクスポート:
+
+```python
+from ultralytics import YOLO
+
+model = YOLO('yolov8n.pt')
+
+# ONNXフォーマットにエクスポート
+model.export(format='onnx', dynamic=True)
+```
+
+エクスポート後、AMD MIGraphXで最適化:
+
+```bash
+migraphx-driver compile ./yolov8n.onnx \
+    --optimize \
+    --gpu \
+    --enable-offload-copy \
+    --binary \
+    -o yolov8n.mxr
+```
+
+---
+
+## トラブルシューティング
+
+### 1. GPU認識されない
+
+#### 症状
+```python
+torch.cuda.is_available()  # False
+```
+
+#### 解決策
+
+**ユーザー権限の確認**:
+```bash
+# ユーザーを適切なグループに追加
+sudo usermod -a -G video $USER
+sudo usermod -a -G render $USER
+
+# 再ログインして反映
+```
+
+**デバイスアクセスの確認**:
+```bash
+ls -la /dev/kfd /dev/dri/
+```
+
+**ROCmインストールの確認**:
+```bash
+rocm-smi
+# GPUリストが表示されるはず
+```
+
+### 2. hipErrorNoBinaryForGpu
+
+#### 症状
+```
+RuntimeError: HIP error: hipErrorNoBinaryForGpu
+```
+
+#### 原因
+PyTorchが対象GPUアーキテクチャ用にコンパイルされていない
+
+#### 解決策
+
+**GPUアーキテクチャの確認**:
+```bash
+rocminfo | grep gfx
+```
+
+**環境変数による回避** (統合GPUやサポート外GPU):
+```bash
+# gfx90cの場合、gfx900として認識させる
+export HSA_OVERRIDE_GFX_VERSION=9.0.0
+
+# 再度Pythonを実行
+python3 your_script.py
+```
+
+**ソースからビルドする場合**:
+```bash
+export PYTORCH_ROCM_ARCH=gfx1030  # 自分のGPUアーキテクチャを指定
+# PyTorchをビルド
+```
+
+### 3. GPU転送のハング/失敗
+
+#### 症状
+- スクリプトが無言でハング
+- GPU-CPU間のデータ転送が失敗
+
+#### 解決策
+
+**PCIe Atomicsサポートの問題** (コンシューマー向けマザーボード):
+
+```bash
+# SDMA（ダイレクトメモリアクセス）を無効化
+export HSA_ENABLE_SDMA=0
+
+python3 your_script.py
+```
+
+この設定がない場合、GPUメモリ転送が無言でハングまたは失敗する（"GPU memory transfers will silently hang/fail"）可能性があります。
+
+### 4. ROCmバージョンの非互換性
+
+#### 症状
+```
+ImportError: libMIOpen.so.1: cannot open shared object file
+```
+
+#### 解決策
+
+**特定バージョンのROCmを使用**:
+- ROCm 5.7が推奨される場合が多い
+- ROCm 6.0.0は一部環境で動作しない報告あり
+
+**Dockerイメージを使用**してバージョンを固定:
+```bash
+docker pull rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+```
+
+### 5. メモリ不足エラー
+
+#### 症状
+```
+RuntimeError: HIP out of memory
+```
+
+#### 解決策
+
+**メモリ使用状況の確認**:
+```python
+import torch
+
+print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+print(f"Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
+```
+
+**キャッシュのクリア**:
+```python
+torch.cuda.empty_cache()
+```
+
+**バッチサイズの削減**:
+```python
+model.train(data='coco.yaml', batch=8)  # デフォルトより小さく
+```
+
+**キャッシュアロケータの無効化** (デバッグ用):
+```bash
+export PYTORCH_NO_HIP_MEMORY_CACHING=1
+```
+
+### 6. パフォーマンスが遅い
+
+#### 最適化のヒント
+
+**hipBLASワークスペースの調整**:
+```bash
+# 書式は :[SIZE]:[COUNT]（SIZEはKiB単位）。デフォルトは32 MiB、MI300+は128 MiB
+export HIPBLAS_WORKSPACE_CONFIG=:131072:1
+```
+
+**Composable Kernelの有効化** (対応GPU):
+```python
+import torch
+torch.backends.cuda.preferred_blas_library("ck")  # CK有効でビルドされたPyTorchが必要（なお TF32 はROCmで非対応）
+```
+
+**混合精度トレーニング**:
+```python
+from ultralytics import YOLO
+
+model = YOLO('yolov8n.pt')
+model.train(
+    data='coco.yaml',
+    epochs=100,
+    amp=True,  # Automatic Mixed Precision
+    device=0
+)
+```
+
+---
+
+## コミュニティ実装例
+
+### AMD統合GPU向けYOLOv8実装
+
+GitHub: [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8)
+
+#### 主な特徴
+
+- DockerベースのROCm + PyTorch環境
+- 統合GPUに特化した環境変数設定
+- YOLOv5/YOLOv8の推論例
+- MIGraphXを使った本番デプロイ
+
+#### セットアップ例
+
+```bash
+# Dockerイメージのビルド
+docker build -t rocm-pytorch .
+
+# ラッパースクリプトでPythonを実行
+./rocm_python yolo8.py
+```
+
+#### 必須環境変数
+
+```bash
+# PCIe Atomics非サポート対策
+export HSA_ENABLE_SDMA=0
+
+# GPUアーキテクチャのオーバーライド
+export HSA_OVERRIDE_GFX_VERSION=9.0.0  # gfx90cの場合
+```
+
+#### パフォーマンス実測値
+
+**テスト環境**: AMD Ryzen 3 5400U (統合GPU)
+
+- YOLOv8n (640x640): 約50 FPS (0.02秒/画像)
+- YOLOv8n (320x320): 約140 FPS
+
+#### MIGraphXデプロイ
+
+```bash
+# 1. ONNXエクスポート
+python3 -c "from ultralytics import YOLO; YOLO('yolov8n.pt').export(format='onnx')"
+
+# 2. MIGraphXバイナリにコンパイル
+migraphx-driver compile ./yolov8n.onnx \
+    --optimize \
+    --gpu \
+    --enable-offload-copy \
+    --binary \
+    -o yolov8n.mxr
+
+# 3. MIGraphX Python APIで推論
+# (PyTorch依存なし、高速化)
+```
+
+---
+
+## 制限事項と注意点
+
+### PyTorch ROCmの制限
+
+1. **TensorFloat-32 (TF32)**: ROCmでは非対応
+2. **分散トレーニング**: NCCLとGlooバックエンドのみサポート
+3. **hipFFT/rocFFT**: プランキャッシュサイズの設定は非サポート
+
+### Ultralytics YOLO固有の問題
+
+1. **公式サポートなし**: UltralyticsはROCmを公式サポートしておらず、技術サポートは期待できない
+2. **パフォーマンス**: NVIDIA CUDAと比較して遅い場合がある
+3. **互換性**: 全機能が動作する保証はない（特に新機能）
+
+### 推奨される使用ケース
+
+**ROCmが適している場合**:
+- 既にAMD GPUを所有している
+- 開発/テスト環境
+- 小～中規模のデータセット
+
+**クラウドGPUを検討すべき場合**:
+- 本番環境での大規模トレーニング
+- 最高のパフォーマンスが必要
+- 公式サポートが必要
+
+---
+
+## 完全な動作例
+
+### 統合GPU環境での推論スクリプト
+
+```python
+#!/usr/bin/env python3
+"""
+AMD GPU (ROCm) 向け Ultralytics YOLO 推論スクリプト
+"""
+import os
+import torch
+from ultralytics import YOLO
+
+# 必須環境変数の設定（統合GPUの場合）
+os.environ['HSA_ENABLE_SDMA'] = '0'
+os.environ['HSA_OVERRIDE_GFX_VERSION'] = '9.0.0'  # 自分のGPUに合わせて調整
+
+def check_gpu():
+    """GPU利用可能性の確認"""
+    print("=" * 50)
+    print("GPU情報")
+    print("=" * 50)
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        print(f"GPU count: {torch.cuda.device_count()}")
+        print(f"Current device: {torch.cuda.current_device()}")
+        print(f"Device name: {torch.cuda.get_device_name(0)}")
+        print(f"ROCm version: {torch.version.hip}")
+        print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+        print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
+    print("=" * 50)
+
+def main():
+    # GPU確認
+    check_gpu()
+
+    if not torch.cuda.is_available():
+        print("警告: GPUが利用できません。CPUで実行します。")
+        device = 'cpu'
+    else:
+        device = 0  # GPU 0を使用
+
+    # モデルのロード
+    print(f"\nモデルをロード中... (device={device})")
+    model = YOLO('yolov8n.pt')
+
+    # 推論の実行
+    print("\n推論を実行中...")
+    results = model.predict(
+        source='path/to/image.jpg',
+        device=device,
+        conf=0.25,
+        iou=0.7,
+        imgsz=640,
+        save=True,
+        save_txt=True
+    )
+
+    # 結果の表示
+    for i, result in enumerate(results):
+        print(f"\n結果 {i+1}:")
+        print(f"  検出数: {len(result.boxes)}")
+        print(f"  処理時間: {result.speed['inference']:.2f}ms")
+
+        # ボックス情報
+        for box in result.boxes:
+            cls = int(box.cls[0])
+            conf = float(box.conf[0])
+            print(f"    クラス: {model.names[cls]}, 信頼度: {conf:.2f}")
+
+    # メモリ解放
+    torch.cuda.empty_cache()
+    print("\n完了!")
+
+if __name__ == '__main__':
+    main()
+```
+
+### Dockerを使った実行例
+
+```dockerfile
+# Dockerfile
+FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+
+# Ultralyticsのインストール
+RUN pip install ultralytics opencv-python
+
+# 作業ディレクトリ
+WORKDIR /workspace
+
+# 環境変数の設定
+ENV HSA_ENABLE_SDMA=0
+ENV HSA_OVERRIDE_GFX_VERSION=9.0.0
+
+CMD ["/bin/bash"]
+```
+
+```bash
+# ビルドと実行
+docker build -t yolo-rocm .
+
+docker run -it --rm \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --group-add video \
+    --ipc=host \
+    --shm-size 8G \
+    -v $(pwd):/workspace \
+    yolo-rocm \
+    python3 inference.py
+```
+
+---
+
+## 参考資料
+
+### 公式ドキュメント
+
+- [PyTorch HIP (ROCm) Semantics](https://docs.pytorch.org/docs/stable/notes/hip.html)
+- [PyTorch on ROCm Installation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html)
+- [ROCm PyTorch Compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html)
+- [Ultralytics YOLO Documentation](https://docs.ultralytics.com/)
+
+### GitHub Issues & Discussions
+
+- [AMD GPU support and optimisation - YOLOv5 #2995](https://github.com/ultralytics/yolov5/issues/2995)
+- [Direct support for AMD GPUs/ROCm - Ultralytics #10323](https://github.com/ultralytics/ultralytics/issues/10323)
+- [Running YOLOV8 on non CUDA GPU - Discussion #10066](https://github.com/orgs/ultralytics/discussions/10066)
+
+### コミュニティリソース
+
+- [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8) - AMD統合GPU向け実装例
+- [ROCm Docker Hub](https://hub.docker.com/r/rocm/pytorch) - 公式Dockerイメージ
+- [PyTorch for AMD ROCm Platform Blog](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/)
+
+### トラブルシューティングリソース
+
+- [How to run torch with AMD gpu? - PyTorch Forums](https://discuss.pytorch.org/t/how-to-run-torch-with-amd-gpu/157069)
+- [Install AMD GPU ROCm and PyTorch on Ubuntu - GitHub Gist](https://gist.github.com/jurgonaut/462a6bd9b87ed085fa0fe6c893536993)
+
+---
+
+## まとめ
+
+### 重要なポイント
+
+1. **公式サポートなし**: Ultralytics YOLOはROCmをネイティブサポートしていないが、PyTorchのROCmバックエンド経由で動作可能
+
+2. **デバイス指定**: `device='cuda'` または `device=0` を使用（`'rocm'`や`'hip'`は無効）
+
+3. **環境変数が重要**: 統合GPUや一部のコンシューマーGPUでは `HSA_ENABLE_SDMA=0` と `HSA_OVERRIDE_GFX_VERSION` が必須
+
+4. **Dockerが推奨**: 環境の一貫性とトラブル回避のため、公式ROCm PyTorchイメージの使用を推奨
+
+5. **パフォーマンス**: NVIDIA CUDAより遅い場合があるが、小～中規模の用途では実用的
+
+### 次のステップ
+
+- 自分のGPUアーキテクチャを確認 (`rocminfo | grep gfx`)
+- Docker環境でテスト推論を実行
+- パフォーマンスが不十分な場合はONNX + MIGraphXを検討
+- 本番環境ではクラウドGPUの使用を検討
+
+---
+
+**ドキュメント作成**: Claude Code (Sonnet 4.5)
+**最終更新**: 2026-02-07
diff --git a/flake.nix b/flake.nix
index dbebab7..b6d05ce 100644
--- a/flake.nix
+++ b/flake.nix
@@ -15,32 +15,89 @@
     flake-utils.lib.eachDefaultSystem (
       system:
       let
-        pkgs = nixpkgs.legacyPackages.${system};
+        pkgs = import nixpkgs {
+          inherit system;
+          config.allowUnfree = true;
+        };
       in
       {
         devShells.default = pkgs.mkShell {
           buildInputs = with pkgs; [
+            # Python環境
             python311
             python311Packages.pip
-            python311Packages.requests
-            python311Packages.fastapi
-            python311Packages.uvicorn
-            python311Packages.numpy
-            python311Packages.opencv4
-            python311Packages.onnxruntime
+            python311Packages.virtualenv
             git
+
+            # C/C++標準ライブラリ（PyTorchなどに必要）
+            stdenv.cc.cc.lib
+            zlib
+            zstd
+
+            # ROCm関連（AMD GPU推論に必要）
+            rocmPackages.clr
+            rocmPackages.rocm-smi
+            rocmPackages.rocm-runtime
           ];
 
           shellHook = ''
             python --version
             blender --version | head -n 1
 
+            # ROCm環境変数
+            export ROCM_PATH="${pkgs.rocmPackages.clr}"
+            export HSA_OVERRIDE_GFX_VERSION="11.0.0"  # RX 7900 (RDNA 3 / gfx1100)
+
+            # LD_LIBRARY_PATH: ROCm、C++標準ライブラリ、その他必要なライブラリ
+            export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib:$LD_LIBRARY_PATH"
+
+            # venvのセットアップ
+            VENV_DIR="$PWD/.venv"
+            if [ ! -d "$VENV_DIR" ]; then
+              echo "[Setup] Creating Python virtual environment..."
+              python -m venv "$VENV_DIR"
+            fi
+
+            # venvをアクティベート
+            source "$VENV_DIR/bin/activate"
+
+            # 必要なパッケージのインストール確認とインストール
+            if ! python -c "import torch; print(torch.cuda.is_available())" 2>/dev/null | grep -q "True"; then
+              echo "[Setup] Installing Python dependencies..."
+              # まずPyTorch ROCm版をインストール（ROCm 6.2用）
+              pip install --quiet torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2
+              # 次に通常のPyPIから他のパッケージをインストール
+              pip install --quiet \
+                ultralytics \
+                opencv-python-headless \
+                numpy \
+                fastapi \
+                uvicorn \
+                pydantic
+              # opencv-pythonがインストールされていたら削除（headless版のみ使用）
+              pip uninstall -y opencv-python opencv 2>/dev/null || true
+              # opencv-python-headlessを再インストールして確実にする
+              pip install --quiet --force-reinstall opencv-python-headless
+              echo "[Setup] Dependencies installed successfully"
+            fi
+
             # Pythonパスにカレントディレクトリを追加
             export PYTHONPATH="$PWD:$PYTHONPATH"
 
             # アドオンのインストールパスを環境変数として設定
             export BLENDER_USER_SCRIPTS="$HOME/.config/blender/5.0/scripts"
             export BLENDER_USER_ADDONS="$BLENDER_USER_SCRIPTS/addons"
+
+            # 環境変数をファイルに保存（サーバープロセス用）
+            cat > "$PWD/.env" << EOF
+LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib
+ROCM_PATH=${pkgs.rocmPackages.clr}
+HSA_OVERRIDE_GFX_VERSION=11.0.0
+PYTORCH_ROCM_ARCH=gfx1100
+ROCBLAS_TENSILE_LIBPATH=${pkgs.rocmPackages.clr}/lib/rocblas/library
+EOF
+
+            echo "[Setup] Environment ready with GPU support"
           '';
         };
       }
diff --git a/server/detector.py b/server/detector.py
index 3c03589..ede604a 100644
--- a/server/detector.py
+++ b/server/detector.py
@@ -1,8 +1,8 @@
 """
-YOLOv11 Face Detector using ONNX Runtime with GPU support.
+YOLOv8 Face Detector using PyTorch with ROCm support.
 
 This module provides high-performance face detection using
-YOLOv11-face model with CUDA acceleration.
+YOLOv8-face model with AMD GPU (ROCm) acceleration.
 """
 
 import os
@@ -13,17 +13,17 @@ import numpy as np
 
 class YOLOFaceDetector:
     """
-    YOLOv11 face detector with ONNX Runtime GPU support.
-    
+    YOLOv8 face detector with PyTorch ROCm support.
+
     Features:
-    - CUDA GPU acceleration
+    - ROCm GPU acceleration for AMD GPUs
     - High accuracy face detection
-    - NMS for overlapping detections
+    - Automatic NMS for overlapping detections
     """
-    
+
     # Default model path relative to this file
-    DEFAULT_MODEL = "yolov11n-face.onnx"
-    
+    DEFAULT_MODEL = "yolov8n-face-lindevs.pt"
+
     def __init__(
         self,
         model_path: Optional[str] = None,
@@ -33,9 +33,9 @@ class YOLOFaceDetector:
     ):
         """
         Initialize the YOLO face detector.
-        
+
         Args:
-            model_path: Path to ONNX model file. If None, uses default model.
+            model_path: Path to PyTorch model file. If None, uses default model.
             conf_threshold: Confidence threshold for detections
             iou_threshold: IoU threshold for NMS
             input_size: Model input size (width, height)
@@ -43,15 +43,17 @@ class YOLOFaceDetector:
         self.conf_threshold = conf_threshold
         self.iou_threshold = iou_threshold
         self.input_size = input_size
-        self._session = None
+        self._model = None
         self._model_path = model_path
-    
+        self._device = None
+
     @property
-    def session(self):
-        """Lazy-load ONNX Runtime session."""
-        if self._session is None:
-            import onnxruntime as ort
-            
+    def model(self):
+        """Lazy-load YOLO model."""
+        if self._model is None:
+            from ultralytics import YOLO
+            import torch
+
             # Determine model path
             if self._model_path is None:
                 # Assuming models are in ../models relative to server/detector.py
@@ -59,255 +61,92 @@ class YOLOFaceDetector:
                 model_path = str(models_dir / self.DEFAULT_MODEL)
             else:
                 model_path = self._model_path
-            
+
             if not os.path.exists(model_path):
                 raise FileNotFoundError(f"Model not found: {model_path}")
-            
-            # Configure providers (prefer CUDA)
-            providers = []
-            if 'CUDAExecutionProvider' in ort.get_available_providers():
-                providers.append('CUDAExecutionProvider')
-                print("[FaceMask] Using CUDA GPU for inference")
-            providers.append('CPUExecutionProvider')
-            
-            # Create session
-            sess_options = ort.SessionOptions()
-            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-            
-            self._session = ort.InferenceSession(
-                model_path,
-                sess_options=sess_options,
-                providers=providers,
-            )
-            
+
+            # Detect device (ROCm GPU or CPU)
+            if torch.cuda.is_available():
+                self._device = 'cuda'
+                device_name = torch.cuda.get_device_name(0)
+                print(f"[FaceMask] Using ROCm GPU for inference: {device_name}")
+            else:
+                self._device = 'cpu'
+                print("[FaceMask] Using CPU for inference (ROCm GPU not available)")
+
+            # Load model (let Ultralytics handle device management)
+            try:
+                self._model = YOLO(model_path)
+                # Don't call .to() - let predict() handle device assignment
+                print(f"[FaceMask] Model loaded, will use device: {self._device}")
+            except Exception as e:
+                print(f"[FaceMask] Error loading model: {e}")
+                import traceback
+                traceback.print_exc()
+                raise
+
             print(f"[FaceMask] YOLO model loaded: {model_path}")
-            print(f"[FaceMask] Providers: {self._session.get_providers()}")
-        
-        return self._session
-    
+            print(f"[FaceMask] Device: {self._device}")
+
+        return self._model
+
     def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
         """
         Detect faces in a frame.
-        
+
         Args:
             frame: BGR image as numpy array (H, W, C)
-            
+
         Returns:
             List of detections as (x, y, width, height, confidence)
         """
-        import cv2
-        
-        original_height, original_width = frame.shape[:2]
-        
-        input_tensor = self._preprocess(frame)
-        # print(f"[DEBUG] Input tensor shape: {input_tensor.shape}, Range: [{input_tensor.min():.3f}, {input_tensor.max():.3f}]", flush=True)
-        
         # Run inference
-        input_name = self.session.get_inputs()[0].name
-        outputs = self.session.run(None, {input_name: input_tensor})
-        
-        raw_output = outputs[0]
-        # print(f"[DEBUG] Raw output shape: {raw_output.shape}, Range: [{raw_output.min():.3f}, {raw_output.max():.3f}]", flush=True)
+        import torch
+        print(f"[FaceMask] Inference device: {self._device}, CUDA available: {torch.cuda.is_available()}")
+        try:
+            results = self.model.predict(
+                frame,
+                conf=self.conf_threshold,
+                iou=self.iou_threshold,
+                imgsz=self.input_size[0],
+                verbose=False,
+                device=self._device,
+            )
+        except Exception as e:
+            print(f"[FaceMask] ERROR during inference: {e}")
+            import traceback
+            traceback.print_exc()
+            # Fallback to CPU
+            print("[FaceMask] Falling back to CPU inference...")
+            self._device = 'cpu'
+            results = self.model.predict(
+                frame,
+                conf=self.conf_threshold,
+                iou=self.iou_threshold,
+                imgsz=self.input_size[0],
+                verbose=False,
+                device='cpu',
+            )
 
-        # Postprocess
-        detections = self._postprocess(
-            raw_output,
-            original_width,
-            original_height,
-        )
-        # print(f"[DEBUG] Detections found: {len(detections)}", flush=True)
-        
-        return detections
-    
-    def _preprocess(self, frame: np.ndarray) -> np.ndarray:
-        """Preprocess frame for YOLO input with letterbox resizing."""
-        import cv2
-        
-        # Letterbox resize
-        shape = frame.shape[:2]  # current shape [height, width]
-        new_shape = self.input_size
-        
-        # Scale ratio (new / old)
-        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-        
-        # Compute padding
-        ratio = r, r  # width, height ratios
-        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
-        
-        dw /= 2  # divide padding into 2 sides
-        dh /= 2
-        
-        if shape[::-1] != new_unpad:  # resize
-            frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR)
-            
-        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
-        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-        
-        # Add border
-        frame = cv2.copyMakeBorder(frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
-        
-        # Store metadata for postprocessing
-        self._last_letterbox_meta = {'ratio': ratio, 'dwdh': (dw, dh)}
-        
-        # Convert BGR to RGB
-        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        
-        # Normalize to [0, 1]
-        normalized = rgb.astype(np.float32) / 255.0
-        
-        # Transpose to CHW format
-        transposed = np.transpose(normalized, (2, 0, 1))
-        
-        # Add batch dimension
-        batched = np.expand_dims(transposed, axis=0)
-        
-        return batched
-    
-    def _postprocess(
-        self,
-        output: np.ndarray,
-        original_width: int,
-        original_height: int,
-    ) -> List[Tuple[int, int, int, int, float]]:
-        """
-        Postprocess YOLO output to get detections.
-        """
-        # Output shape: [1, num_detections, 5+] where 5 = x_center, y_center, w, h, conf
-        
-        # Handle different output formats
-        if output.shape[1] < output.shape[2]:
-            # Format: [1, 5+, num_detections] - transpose
-            output = np.transpose(output[0], (1, 0))
-        else:
-            output = output[0]
-            
-        # Debug confidence stats
-        # if output.shape[1] >= 5:
-        #     max_conf = output[:, 4].max()
-        #     print(f"[DEBUG] Max confidence in raw output: {max_conf:.4f}", flush=True)
-        
-        # Filter by confidence
-        confidences = output[:, 4]
-        mask = confidences > self.conf_threshold
-        filtered = output[mask]
-        
-        if len(filtered) == 0:
-            return []
-            
-        # Get letterbox metadata
-        if hasattr(self, '_last_letterbox_meta') and self._last_letterbox_meta:
-            ratio = self._last_letterbox_meta['ratio']
-            dw, dh = self._last_letterbox_meta['dwdh']
-            
-            # Extract coordinates
-            x_center = filtered[:, 0]
-            y_center = filtered[:, 1]
-            width = filtered[:, 2]
-            height = filtered[:, 3]
-            confidences = filtered[:, 4]
-            
-            # Convert center to corner
-            x1 = x_center - width / 2
-            y1 = y_center - height / 2
-            x2 = x_center + width / 2
-            y2 = y_center + height / 2
-            
-            # Adjust for letterbox padding
-            x1 -= dw
-            y1 -= dh
-            x2 -= dw
-            y2 -= dh
-            
-            # Adjust for resizing
-            x1 /= ratio[0]
-            y1 /= ratio[1]
-            x2 /= ratio[0]
-            y2 /= ratio[1]
-            
-            # Clip to image bounds
-            x1 = np.clip(x1, 0, original_width)
-            y1 = np.clip(y1, 0, original_height)
-            x2 = np.clip(x2, 0, original_width)
-            y2 = np.clip(y2, 0, original_height)
-            
-            # Convert back to x, y, w, h
-            final_x = x1
-            final_y = y1
-            final_w = x2 - x1
-            final_h = y2 - y1
-            
-        else:
-            # Fallback for non-letterbox (legacy)
-            scale_x = original_width / self.input_size[0]
-            scale_y = original_height / self.input_size[1]
-            
-            x_center = filtered[:, 0] * scale_x
-            y_center = filtered[:, 1] * scale_y
-            width = filtered[:, 2] * scale_x
-            height = filtered[:, 3] * scale_y
-            confidences = filtered[:, 4]
-            
-            final_x = x_center - width / 2
-            final_y = y_center - height / 2
-            final_w = width
-            final_h = height
-        
-        # Apply NMS
-        boxes = np.stack([final_x, final_y, final_w, final_h], axis=1)
-        indices = self._nms(boxes, confidences, self.iou_threshold)
-        
-        # Format output
+        # Extract detections
         detections = []
-        for i in indices:
-            x = int(final_x[i])
-            y = int(final_y[i])
-            w = int(final_w[i])
-            h = int(final_h[i])
-            conf = float(confidences[i])
-            detections.append((x, y, w, h, conf))
-        
+        if len(results) > 0 and results[0].boxes is not None:
+            boxes = results[0].boxes
+            for box in boxes:
+                # Get coordinates in xyxy format
+                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
+                conf = float(box.conf[0].cpu().numpy())
+
+                # Convert to x, y, width, height
+                x = int(x1)
+                y = int(y1)
+                w = int(x2 - x1)
+                h = int(y2 - y1)
+
+                detections.append((x, y, w, h, conf))
+
         return detections
-    
-    def _nms(
-        self,
-        boxes: np.ndarray,
-        scores: np.ndarray,
-        iou_threshold: float,
-    ) -> List[int]:
-        """Non-Maximum Suppression."""
-        x1 = boxes[:, 0]
-        y1 = boxes[:, 1]
-        x2 = x1 + boxes[:, 2]
-        y2 = y1 + boxes[:, 3]
-        
-        areas = boxes[:, 2] * boxes[:, 3]
-        order = scores.argsort()[::-1]
-        
-        keep = []
-        while len(order) > 0:
-            i = order[0]
-            keep.append(i)
-            
-            if len(order) == 1:
-                break
-            
-            xx1 = np.maximum(x1[i], x1[order[1:]])
-            yy1 = np.maximum(y1[i], y1[order[1:]])
-            xx2 = np.minimum(x2[i], x2[order[1:]])
-            yy2 = np.minimum(y2[i], y2[order[1:]])
-            
-            w = np.maximum(0, xx2 - xx1)
-            h = np.maximum(0, yy2 - yy1)
-            inter = w * h
-            
-            iou = inter / (areas[i] + areas[order[1:]] - inter)
-            
-            inds = np.where(iou <= iou_threshold)[0]
-            order = order[inds + 1]
-        
-        return keep
-    
+
     def generate_mask(
         self,
         frame_shape: Tuple[int, int, int],
@@ -317,29 +156,29 @@ class YOLOFaceDetector:
     ) -> np.ndarray:
         """
         Generate a mask image from face detections.
-        
+
         Args:
             frame_shape: Shape of the original frame (height, width, channels)
             detections: List of face detections (x, y, w, h, conf)
             mask_scale: Scale factor for mask region
             feather_radius: Radius for edge feathering
-            
+
         Returns:
             Grayscale mask image (white = blur, black = keep)
         """
         import cv2
-        
+
         height, width = frame_shape[:2]
         mask = np.zeros((height, width), dtype=np.uint8)
-        
+
         for (x, y, w, h, conf) in detections:
             # Scale the bounding box
             center_x = x + w // 2
             center_y = y + h // 2
-            
+
             scaled_w = int(w * mask_scale)
             scaled_h = int(h * mask_scale)
-            
+
             # Draw ellipse for natural face shape
             cv2.ellipse(
                 mask,
@@ -350,12 +189,12 @@ class YOLOFaceDetector:
                 255,  # color (white)
                 -1,  # filled
             )
-        
+
         # Apply Gaussian blur for feathering
         if feather_radius > 0 and len(detections) > 0:
             kernel_size = feather_radius * 2 + 1
             mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)
-        
+
         return mask
 
 
diff --git a/server/main.py b/server/main.py
index 0f12d8a..043245b 100644
--- a/server/main.py
+++ b/server/main.py
@@ -74,8 +74,8 @@ def process_video_task(task_id: str, req: GenerateRequest):
             conf_threshold=req.conf_threshold,
             iou_threshold=req.iou_threshold
         )
-        # Ensure session is loaded
-        _ = detector.session
+        # Ensure model is loaded
+        _ = detector.model
         
         # Open video
         cap = cv2.VideoCapture(req.video_path)