WIP
This commit is contained in:
parent
97c6b288e0
commit
0f150e8a0a
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -8,6 +8,7 @@ __pycache__/
|
|||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.venv/
|
||||
|
||||
# Blender
|
||||
*.blend1
|
||||
|
|
@ -16,3 +17,4 @@ __pycache__/
|
|||
# 環境
|
||||
.direnv/
|
||||
.envrc.local
|
||||
.env
|
||||
|
|
|
|||
|
|
@ -37,16 +37,41 @@ class InferenceClient:
|
|||
# Assuming this file is in core/inference_client.py
|
||||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
server_script = os.path.join(root_dir, "server", "main.py")
|
||||
|
||||
# Use system python (assumed to have dependencies via Nix/venv)
|
||||
# In user's environment, 'python' should refer to the environment python
|
||||
python_cmd = "python"
|
||||
|
||||
# Start process
|
||||
|
||||
# Prepare environment variables for server process
|
||||
server_env = os.environ.copy()
|
||||
|
||||
# Load environment variables from .env file if it exists
|
||||
env_file = os.path.join(root_dir, ".env")
|
||||
if os.path.exists(env_file):
|
||||
with open(env_file, 'r') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#') and '=' in line:
|
||||
key, value = line.split('=', 1)
|
||||
server_env[key] = value
|
||||
print(f"[FaceMask] Loaded environment from: {env_file}")
|
||||
|
||||
# Ensure PYTHONPATH includes project root
|
||||
pythonpath = server_env.get('PYTHONPATH', '')
|
||||
if pythonpath:
|
||||
server_env['PYTHONPATH'] = f"{root_dir}:{pythonpath}"
|
||||
else:
|
||||
server_env['PYTHONPATH'] = root_dir
|
||||
|
||||
# If there's a venv in the project, add it to PATH
|
||||
venv_bin = os.path.join(root_dir, ".venv", "bin")
|
||||
if os.path.isdir(venv_bin):
|
||||
current_path = server_env.get('PATH', '')
|
||||
server_env['PATH'] = f"{venv_bin}:{current_path}"
|
||||
print(f"[FaceMask] Using venv from: {venv_bin}")
|
||||
|
||||
# Start process with 'python' command (will use venv if PATH is set correctly)
|
||||
self.server_process = subprocess.Popen(
|
||||
[python_cmd, server_script],
|
||||
["python", server_script],
|
||||
cwd=root_dir,
|
||||
text=True,
|
||||
env=server_env,
|
||||
preexec_fn=os.setsid, # Create new process group
|
||||
)
|
||||
|
||||
|
|
|
|||
696
docs/ultralytics-yolo-rocm-guide.md
Normal file
696
docs/ultralytics-yolo-rocm-guide.md
Normal file
|
|
@ -0,0 +1,696 @@
|
|||
# Ultralytics YOLO with AMD GPU (ROCm) - 完全ガイド
|
||||
|
||||
**取得日**: 2026-02-07
|
||||
**対象**: Ultralytics YOLO (YOLOv5/YOLOv8/YOLO11)
|
||||
**プラットフォーム**: AMD GPU + ROCm
|
||||
|
||||
---
|
||||
|
||||
## 目次
|
||||
|
||||
1. [公式サポート状況](#公式サポート状況)
|
||||
2. [ROCm環境のセットアップ](#rocm環境のセットアップ)
|
||||
3. [デバイス指定の正しい方法](#デバイス指定の正しい方法)
|
||||
4. [Ultralytics YOLOでの使用方法](#ultralytics-yoloでの使用方法)
|
||||
5. [トラブルシューティング](#トラブルシューティング)
|
||||
6. [コミュニティ実装例](#コミュニティ実装例)
|
||||
7. [制限事項と注意点](#制限事項と注意点)
8. [完全な動作例](#完全な動作例)
9. [参考資料](#参考資料)
10. [まとめ](#まとめ)
|
||||
|
||||
---
|
||||
|
||||
## 公式サポート状況
|
||||
|
||||
### Ultralyticsの公式見解
|
||||
|
||||
**重要**: Ultralytics YOLOは現在、AMD ROCmのネイティブサポートを提供していません。
|
||||
|
||||
- **Issue #10323** (2024年4月25日開設、2024年6月8日に"not planned"としてクローズ)
|
||||
- Glenn Jocher氏(Ultralyticsチームメンバー)の回答:
|
||||
> "YOLOv8 primarily supports Nvidia CUDA for GPU acceleration"
|
||||
- ネイティブROCmサポートの即時計画はない
|
||||
- コミュニティによるPull Requestは歓迎
|
||||
|
||||
- **推奨される回避策**: PyTorchのROCm互換性レイヤーを使用
|
||||
|
||||
### 使用可能な選択肢
|
||||
|
||||
1. **PyTorch ROCmバックエンド経由** (非公式だが実用可能)
|
||||
2. **Dockerコンテナを使用した環境分離**
|
||||
3. **クラウドGPUレンタル** (NVIDIA GPU使用)
|
||||
4. **ONNXエクスポート + AMD推論エンジン**
|
||||
|
||||
---
|
||||
|
||||
## ROCm環境のセットアップ
|
||||
|
||||
### 1. ROCmサポート確認
|
||||
|
||||
#### 対応GPU
|
||||
|
||||
- **Instinctシリーズ**: MI100 (gfx908), MI210/MI250/MI250x (gfx90a), MI300A/MI300X/MI325 (gfx942), MI350/MI355 (gfx950)
|
||||
- **Radeon RXシリーズ**: RX 7900 GRE以上、一部のRX 6000シリーズ (gfx1030, gfx1100/1101, gfx1200/1201)
|
||||
- **統合GPU**: 一部のRyzen APU (環境変数による回避策が必要)
|
||||
|
||||
#### GPU情報の確認
|
||||
|
||||
```bash
|
||||
rocminfo | grep gfx
|
||||
```
|
||||
|
||||
### 2. PyTorch ROCmのインストール
|
||||
|
||||
#### 方法A: Dockerイメージ(推奨)
|
||||
|
||||
AMDが検証済みのDockerイメージを使用:
|
||||
|
||||
```bash
|
||||
# 最新イメージの取得
|
||||
docker pull rocm/pytorch:latest
|
||||
|
||||
# コンテナの起動
|
||||
docker run -it \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--shm-size 8G \
|
||||
rocm/pytorch:latest
|
||||
```
|
||||
|
||||
**利用可能なタグ(ROCm 7.2.0時点)**:
|
||||
- PyTorch 2.9.1 + Python 3.12/3.10
|
||||
- PyTorch 2.8.0 + Python 3.12/3.10
|
||||
- PyTorch 2.7.1 + Python 3.12/3.10
|
||||
|
||||
#### 方法B: Wheelパッケージ(ベアメタル)
|
||||
|
||||
```bash
|
||||
# 依存関係のインストール
|
||||
sudo apt install libjpeg-dev python3-dev python3-pip
|
||||
|
||||
# PyTorch ROCmのインストール
|
||||
pip3 install --pre torch torchvision torchaudio \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm7.0
|
||||
```
|
||||
|
||||
**注意**: AMDは`repo.radeon.com`で提供されるWHLの使用を推奨(PyTorch.orgのnightly buildsは頻繁に変更されテストが不十分)
|
||||
|
||||
#### 方法C: ソースからビルド
|
||||
|
||||
```bash
|
||||
git clone https://github.com/pytorch/pytorch.git
|
||||
cd pytorch
|
||||
git submodule update --init --recursive
|
||||
|
||||
# GPUアーキテクチャを指定
|
||||
export PYTORCH_ROCM_ARCH=gfx90a # 例: MI210/MI250の場合
|
||||
|
||||
# ビルド手順に従う
|
||||
# (pytorch/pytorch リポジトリのREADME参照)
|
||||
```
|
||||
|
||||
### 3. インストールの確認
|
||||
|
||||
```bash
|
||||
python3 -c 'import torch; print(torch.cuda.is_available())'
|
||||
# 出力: True (GPUが利用可能な場合)
|
||||
|
||||
python3 -c 'import torch; print(torch.version.hip)'
|
||||
# 出力: ROCmバージョン(例: '5.7.31921')
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## デバイス指定の正しい方法
|
||||
|
||||
### 重要な概念
|
||||
|
||||
**PyTorch ROCmは意図的にCUDAインターフェースを再利用**します。これにより、コード変更を最小限に抑えています。
|
||||
|
||||
### 基本的なデバイス指定
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
# デフォルトのGPUデバイス
|
||||
cuda = torch.device('cuda')
|
||||
|
||||
# 特定のGPUデバイス
|
||||
cuda0 = torch.device('cuda:0') # GPU 0
|
||||
cuda1 = torch.device('cuda:1') # GPU 1
|
||||
cuda2 = torch.device('cuda:2') # GPU 2
|
||||
```
|
||||
|
||||
**注意**: `"rocm"`や`"hip"`は無効なデバイス文字列です。必ず`"cuda"`を使用してください。
|
||||
|
||||
### HIP vs CUDAの検出
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
if torch.cuda.is_available():
|
||||
if torch.version.hip:
|
||||
print("Running on AMD GPU with ROCm/HIP")
|
||||
print(f"ROCm version: {torch.version.hip}")
|
||||
elif torch.version.cuda:
|
||||
print("Running on NVIDIA GPU with CUDA")
|
||||
print(f"CUDA version: {torch.version.cuda}")
|
||||
else:
|
||||
print("No GPU available")
|
||||
```
|
||||
|
||||
### Ultralytics YOLOでのデバイス指定
|
||||
|
||||
Ultralytics APIを使用する場合:
|
||||
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
|
||||
# モデルの読み込み
|
||||
model = YOLO('yolov8n.pt')
|
||||
|
||||
# 推論時のデバイス指定
|
||||
# 方法1: 文字列で指定
|
||||
results = model.predict('image.jpg', device='cuda:0')
|
||||
|
||||
# 方法2: 整数で指定(GPU番号)
|
||||
results = model.predict('image.jpg', device=0)
|
||||
|
||||
# 方法3: CPUを使用
|
||||
results = model.predict('image.jpg', device='cpu')
|
||||
|
||||
# トレーニング時のデバイス指定
|
||||
model.train(data='coco.yaml', epochs=100, device=0)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Ultralytics YOLOでの使用方法
|
||||
|
||||
### 基本的な推論
|
||||
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
import torch
|
||||
|
||||
# GPU確認
|
||||
print(f"CUDA available: {torch.cuda.is_available()}")
|
||||
print(f"ROCm version: {torch.version.hip if torch.version.hip else 'N/A'}")
|
||||
|
||||
# モデルのロード
|
||||
model = YOLO('yolov8n.pt')
|
||||
|
||||
# 推論の実行
|
||||
results = model('path/to/image.jpg', device=0)
|
||||
|
||||
# 結果の処理
|
||||
for result in results:
|
||||
boxes = result.boxes # Boxes object
|
||||
masks = result.masks # Masks object (セグメンテーションの場合)
|
||||
probs = result.probs # 分類の場合
|
||||
```
|
||||
|
||||
### バッチ処理
|
||||
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
from pathlib import Path
|
||||
|
||||
model = YOLO('yolov8n.pt')
|
||||
|
||||
# 画像リストで推論
|
||||
image_folder = Path('path/to/images')
|
||||
image_paths = list(image_folder.glob('*.jpg'))
|
||||
|
||||
# バッチサイズを指定して推論
|
||||
results = model.predict(image_paths, device=0, batch=16)
|
||||
|
||||
for i, result in enumerate(results):
|
||||
result.save(filename=f'result_{i}.jpg')
|
||||
```
|
||||
|
||||
### トレーニング
|
||||
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
|
||||
# モデルの作成
|
||||
model = YOLO('yolov8n.yaml')
|
||||
|
||||
# トレーニングの実行
|
||||
results = model.train(
|
||||
data='coco.yaml',
|
||||
epochs=100,
|
||||
imgsz=640,
|
||||
device=0, # AMD GPUを使用
|
||||
batch=16,
|
||||
workers=8
|
||||
)
|
||||
```
|
||||
|
||||
### モデルのエクスポート(ONNX)
|
||||
|
||||
AMD環境での推論最適化のため、ONNXにエクスポート:
|
||||
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
|
||||
model = YOLO('yolov8n.pt')
|
||||
|
||||
# ONNXフォーマットにエクスポート
|
||||
model.export(format='onnx', dynamic=True)
|
||||
```
|
||||
|
||||
エクスポート後、AMD MIGraphXで最適化:
|
||||
|
||||
```bash
|
||||
migraphx-driver compile ./yolov8n.onnx \
|
||||
--optimize \
|
||||
--gpu \
|
||||
--enable-offload-copy \
|
||||
--binary \
|
||||
-o yolov8n.mxr
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## トラブルシューティング
|
||||
|
||||
### 1. GPU認識されない
|
||||
|
||||
#### 症状
|
||||
```python
|
||||
torch.cuda.is_available() # False
|
||||
```
|
||||
|
||||
#### 解決策
|
||||
|
||||
**ユーザー権限の確認**:
|
||||
```bash
|
||||
# ユーザーを適切なグループに追加
|
||||
sudo usermod -a -G video $USER
|
||||
sudo usermod -a -G render $USER
|
||||
|
||||
# 再ログインして反映
|
||||
```
|
||||
|
||||
**デバイスアクセスの確認**:
|
||||
```bash
|
||||
ls -la /dev/kfd /dev/dri/
|
||||
```
|
||||
|
||||
**ROCmインストールの確認**:
|
||||
```bash
|
||||
rocm-smi
|
||||
# GPUリストが表示されるはず
|
||||
```
|
||||
|
||||
### 2. hipErrorNoBinaryForGpu
|
||||
|
||||
#### 症状
|
||||
```
|
||||
RuntimeError: HIP error: hipErrorNoBinaryForGpu
|
||||
```
|
||||
|
||||
#### 原因
|
||||
PyTorchが対象GPUアーキテクチャ用にコンパイルされていない
|
||||
|
||||
#### 解決策
|
||||
|
||||
**GPUアーキテクチャの確認**:
|
||||
```bash
|
||||
rocminfo | grep gfx
|
||||
```
|
||||
|
||||
**環境変数による回避** (統合GPUやサポート外GPU):
|
||||
```bash
|
||||
# gfx90cの場合、gfx900として認識させる
|
||||
export HSA_OVERRIDE_GFX_VERSION=9.0.0
|
||||
|
||||
# 再度Pythonを実行
|
||||
python3 your_script.py
|
||||
```
|
||||
|
||||
**ソースからビルドする場合**:
|
||||
```bash
|
||||
export PYTORCH_ROCM_ARCH=gfx1030 # 自分のGPUアーキテクチャを指定
|
||||
# PyTorchをビルド
|
||||
```
|
||||
|
||||
### 3. GPU転送のハング/失敗
|
||||
|
||||
#### 症状
|
||||
- スクリプトが無言でハング
|
||||
- GPU-CPU間のデータ転送が失敗
|
||||
|
||||
#### 解決策
|
||||
|
||||
**PCIe Atomicsサポートの問題** (コンシューマー向けマザーボード):
|
||||
|
||||
```bash
|
||||
# SDMA(ダイレクトメモリアクセス)を無効化
|
||||
export HSA_ENABLE_SDMA=0
|
||||
|
||||
python3 your_script.py
|
||||
```
|
||||
|
||||
この設定がない場合、GPUメモリ転送が警告なしにハングまたは失敗する可能性があります(原文: "GPU memory transfers will silently hang/fail")。
|
||||
|
||||
### 4. ROCmバージョンの非互換性
|
||||
|
||||
#### 症状
|
||||
```
|
||||
ImportError: libMIOpen.so.1: cannot open shared object file
|
||||
```
|
||||
|
||||
#### 解決策
|
||||
|
||||
**特定バージョンのROCmを使用**:
|
||||
- ROCm 5.7が推奨される場合が多い
|
||||
- ROCm 6.0.0は一部環境で動作しない報告あり
|
||||
|
||||
**Dockerイメージを使用**してバージョンを固定:
|
||||
```bash
|
||||
docker pull rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
|
||||
```
|
||||
|
||||
### 5. メモリ不足エラー
|
||||
|
||||
#### 症状
|
||||
```
|
||||
RuntimeError: HIP out of memory
|
||||
```
|
||||
|
||||
#### 解決策
|
||||
|
||||
**メモリ使用状況の確認**:
|
||||
```python
|
||||
import torch
|
||||
|
||||
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
||||
print(f"Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
**キャッシュのクリア**:
|
||||
```python
|
||||
torch.cuda.empty_cache()
|
||||
```
|
||||
|
||||
**バッチサイズの削減**:
|
||||
```python
|
||||
model.train(data='coco.yaml', batch=8) # デフォルトより小さく
|
||||
```
|
||||
|
||||
**キャッシュアロケータの無効化** (デバッグ用):
|
||||
```bash
|
||||
export PYTORCH_NO_HIP_MEMORY_CACHING=1
|
||||
```
|
||||
|
||||
### 6. パフォーマンスが遅い
|
||||
|
||||
#### 最適化のヒント
|
||||
|
||||
**hipBLASワークスペースの調整**:
|
||||
```bash
|
||||
# デフォルトは32 MiB、MI300+は128 MiB
|
||||
export HIPBLAS_WORKSPACE_CONFIG=:131072:1  # 書式は :[SIZE(KiB)]:[COUNT]。131072 KiB × 1 = 128 MiB
|
||||
```
|
||||
|
||||
**Composable Kernelの有効化** (対応GPU):
|
||||
```python
|
||||
import torch
|
||||
torch.backends.cuda.matmul.allow_tf32 = True  # 注意: TF32はROCmで非対応のため、AMD GPUではこの設定による高速化は期待できません(Composable Kernelの有効化設定ではありません)
|
||||
```
|
||||
|
||||
**混合精度トレーニング**:
|
||||
```python
|
||||
from ultralytics import YOLO
|
||||
|
||||
model = YOLO('yolov8n.pt')
|
||||
model.train(
|
||||
data='coco.yaml',
|
||||
epochs=100,
|
||||
amp=True, # Automatic Mixed Precision
|
||||
device=0
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## コミュニティ実装例
|
||||
|
||||
### AMD統合GPU向けYOLOv8実装
|
||||
|
||||
GitHub: [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8)
|
||||
|
||||
#### 主な特徴
|
||||
|
||||
- DockerベースのROCm + PyTorch環境
|
||||
- 統合GPUに特化した環境変数設定
|
||||
- YOLOv5/YOLOv8の推論例
|
||||
- MIGraphXを使った本番デプロイ
|
||||
|
||||
#### セットアップ例
|
||||
|
||||
```bash
|
||||
# Dockerイメージのビルド
|
||||
docker build -t rocm-pytorch .
|
||||
|
||||
# ラッパースクリプトでPythonを実行
|
||||
./rocm_python yolo8.py
|
||||
```
|
||||
|
||||
#### 必須環境変数
|
||||
|
||||
```bash
|
||||
# PCIe Atomics非サポート対策
|
||||
export HSA_ENABLE_SDMA=0
|
||||
|
||||
# GPUアーキテクチャのオーバーライド
|
||||
export HSA_OVERRIDE_GFX_VERSION=9.0.0 # gfx90cの場合
|
||||
```
|
||||
|
||||
#### パフォーマンス実測値
|
||||
|
||||
**テスト環境**: AMD Ryzen 3 5400U (統合GPU)
|
||||
|
||||
- YOLOv8n (640x640): 約50 FPS (0.02秒/画像)
|
||||
- YOLOv8n (320x320): 約140 FPS
|
||||
|
||||
#### MIGraphXデプロイ
|
||||
|
||||
```bash
|
||||
# 1. ONNXエクスポート
|
||||
python3 -c "from ultralytics import YOLO; YOLO('yolov8n.pt').export(format='onnx')"
|
||||
|
||||
# 2. MIGraphXバイナリにコンパイル
|
||||
migraphx-driver compile ./yolov8n.onnx \
|
||||
--optimize \
|
||||
--gpu \
|
||||
--enable-offload-copy \
|
||||
--binary \
|
||||
-o yolov8n.mxr
|
||||
|
||||
# 3. MIGraphX Python APIで推論
|
||||
# (PyTorch依存なし、高速化)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 制限事項と注意点
|
||||
|
||||
### PyTorch ROCmの制限
|
||||
|
||||
1. **TensorFloat-32 (TF32)**: ROCmでは非対応
|
||||
2. **分散トレーニング**: NCCLとGlooバックエンドのみサポート
|
||||
3. **hipFFT/rocFFT**: プランキャッシュサイズの設定は非サポート
|
||||
|
||||
### Ultralytics YOLO固有の問題
|
||||
|
||||
1. **公式サポートなし**: UltralyticsはROCmを公式にサポートしていないため、Ultralyticsからの技術サポートは期待できない
|
||||
2. **パフォーマンス**: NVIDIA CUDAと比較して遅い場合がある
|
||||
3. **互換性**: 全機能が動作する保証はない(特に新機能)
|
||||
|
||||
### 推奨される使用ケース
|
||||
|
||||
**ROCmが適している場合**:
|
||||
- 既にAMD GPUを所有している
|
||||
- 開発/テスト環境
|
||||
- 小~中規模のデータセット
|
||||
|
||||
**クラウドGPUを検討すべき場合**:
|
||||
- 本番環境での大規模トレーニング
|
||||
- 最高のパフォーマンスが必要
|
||||
- 公式サポートが必要
|
||||
|
||||
---
|
||||
|
||||
## 完全な動作例
|
||||
|
||||
### 統合GPU環境での推論スクリプト
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AMD GPU (ROCm) 向け Ultralytics YOLO 推論スクリプト
|
||||
"""
|
||||
import os
|
||||
import torch
|
||||
from ultralytics import YOLO
|
||||
|
||||
# 必須環境変数の設定(統合GPUの場合)
|
||||
os.environ['HSA_ENABLE_SDMA'] = '0'
|
||||
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '9.0.0' # 自分のGPUに合わせて調整
|
||||
|
||||
def check_gpu():
|
||||
"""GPU利用可能性の確認"""
|
||||
print("=" * 50)
|
||||
print("GPU情報")
|
||||
print("=" * 50)
|
||||
print(f"CUDA available: {torch.cuda.is_available()}")
|
||||
if torch.cuda.is_available():
|
||||
print(f"GPU count: {torch.cuda.device_count()}")
|
||||
print(f"Current device: {torch.cuda.current_device()}")
|
||||
print(f"Device name: {torch.cuda.get_device_name(0)}")
|
||||
print(f"ROCm version: {torch.version.hip}")
|
||||
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
||||
print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
|
||||
print("=" * 50)
|
||||
|
||||
def main():
|
||||
# GPU確認
|
||||
check_gpu()
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
print("警告: GPUが利用できません。CPUで実行します。")
|
||||
device = 'cpu'
|
||||
else:
|
||||
device = 0 # GPU 0を使用
|
||||
|
||||
# モデルのロード
|
||||
print(f"\nモデルをロード中... (device={device})")
|
||||
model = YOLO('yolov8n.pt')
|
||||
|
||||
# 推論の実行
|
||||
print("\n推論を実行中...")
|
||||
results = model.predict(
|
||||
source='path/to/image.jpg',
|
||||
device=device,
|
||||
conf=0.25,
|
||||
iou=0.7,
|
||||
imgsz=640,
|
||||
save=True,
|
||||
save_txt=True
|
||||
)
|
||||
|
||||
# 結果の表示
|
||||
for i, result in enumerate(results):
|
||||
print(f"\n結果 {i+1}:")
|
||||
print(f" 検出数: {len(result.boxes)}")
|
||||
print(f" 処理時間: {result.speed['inference']:.2f}ms")
|
||||
|
||||
# ボックス情報
|
||||
for box in result.boxes:
|
||||
cls = int(box.cls[0])
|
||||
conf = float(box.conf[0])
|
||||
print(f" クラス: {model.names[cls]}, 信頼度: {conf:.2f}")
|
||||
|
||||
# メモリ解放
|
||||
torch.cuda.empty_cache()
|
||||
print("\n完了!")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
```
|
||||
|
||||
### Dockerを使った実行例
|
||||
|
||||
```dockerfile
|
||||
# Dockerfile
|
||||
FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
|
||||
|
||||
# Ultralyticsのインストール
|
||||
RUN pip install ultralytics opencv-python
|
||||
|
||||
# 作業ディレクトリ
|
||||
WORKDIR /workspace
|
||||
|
||||
# 環境変数の設定
|
||||
ENV HSA_ENABLE_SDMA=0
|
||||
ENV HSA_OVERRIDE_GFX_VERSION=9.0.0
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
```
|
||||
|
||||
```bash
|
||||
# ビルドと実行
|
||||
docker build -t yolo-rocm .
|
||||
|
||||
docker run -it --rm \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--shm-size 8G \
|
||||
-v $(pwd):/workspace \
|
||||
yolo-rocm \
|
||||
python3 inference.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 参考資料
|
||||
|
||||
### 公式ドキュメント
|
||||
|
||||
- [PyTorch HIP (ROCm) Semantics](https://docs.pytorch.org/docs/stable/notes/hip.html)
|
||||
- [PyTorch on ROCm Installation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html)
|
||||
- [ROCm PyTorch Compatibility](https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html)
|
||||
- [Ultralytics YOLO Documentation](https://docs.ultralytics.com/)
|
||||
|
||||
### GitHub Issues & Discussions
|
||||
|
||||
- [AMD GPU support and optimisation - YOLOv5 #2995](https://github.com/ultralytics/yolov5/issues/2995)
|
||||
- [Direct support for AMD GPUs/ROCm - Ultralytics #10323](https://github.com/ultralytics/ultralytics/issues/10323)
|
||||
- [Running YOLOV8 on non CUDA GPU - Discussion #10066](https://github.com/orgs/ultralytics/discussions/10066)
|
||||
|
||||
### コミュニティリソース
|
||||
|
||||
- [harakas/amd_igpu_yolo_v8](https://github.com/harakas/amd_igpu_yolo_v8) - AMD統合GPU向け実装例
|
||||
- [ROCm Docker Hub](https://hub.docker.com/r/rocm/pytorch) - 公式Dockerイメージ
|
||||
- [PyTorch for AMD ROCm Platform Blog](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/)
|
||||
|
||||
### トラブルシューティングリソース
|
||||
|
||||
- [How to run torch with AMD gpu? - PyTorch Forums](https://discuss.pytorch.org/t/how-to-run-torch-with-amd-gpu/157069)
|
||||
- [Install AMD GPU ROCm and PyTorch on Ubuntu - GitHub Gist](https://gist.github.com/jurgonaut/462a6bd9b87ed085fa0fe6c893536993)
|
||||
|
||||
---
|
||||
|
||||
## まとめ
|
||||
|
||||
### 重要なポイント
|
||||
|
||||
1. **公式サポートなし**: Ultralytics YOLOはROCmをネイティブサポートしていないが、PyTorchのROCmバックエンド経由で動作可能
|
||||
|
||||
2. **デバイス指定**: `device='cuda'` または `device=0` を使用(`'rocm'`や`'hip'`は無効)
|
||||
|
||||
3. **環境変数が重要**: 統合GPUや一部のコンシューマーGPUでは `HSA_ENABLE_SDMA=0` と `HSA_OVERRIDE_GFX_VERSION` が必須
|
||||
|
||||
4. **Dockerが推奨**: 環境の一貫性とトラブル回避のため、公式ROCm PyTorchイメージの使用を推奨
|
||||
|
||||
5. **パフォーマンス**: NVIDIA CUDAより遅い場合があるが、小~中規模の用途では実用的
|
||||
|
||||
### 次のステップ
|
||||
|
||||
- 自分のGPUアーキテクチャを確認 (`rocminfo | grep gfx`)
|
||||
- Docker環境でテスト推論を実行
|
||||
- パフォーマンスが不十分な場合はONNX + MIGraphXを検討
|
||||
- 本番環境ではクラウドGPUの使用を検討
|
||||
|
||||
---
|
||||
|
||||
**ドキュメント作成**: Claude Code (Sonnet 4.5)
|
||||
**最終更新**: 2026-02-07
|
||||
71
flake.nix
71
flake.nix
|
|
@ -15,32 +15,89 @@
|
|||
flake-utils.lib.eachDefaultSystem (
|
||||
system:
|
||||
let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
config.allowUnfree = true;
|
||||
};
|
||||
in
|
||||
{
|
||||
devShells.default = pkgs.mkShell {
|
||||
buildInputs = with pkgs; [
|
||||
# Python環境
|
||||
python311
|
||||
python311Packages.pip
|
||||
python311Packages.requests
|
||||
python311Packages.fastapi
|
||||
python311Packages.uvicorn
|
||||
python311Packages.numpy
|
||||
python311Packages.opencv4
|
||||
python311Packages.onnxruntime
|
||||
python311Packages.virtualenv
|
||||
git
|
||||
|
||||
# C/C++標準ライブラリ(PyTorchなどに必要)
|
||||
stdenv.cc.cc.lib
|
||||
zlib
|
||||
zstd
|
||||
|
||||
# ROCm関連(AMD GPU推論に必要)
|
||||
rocmPackages.clr
|
||||
rocmPackages.rocm-smi
|
||||
rocmPackages.rocm-runtime
|
||||
];
|
||||
|
||||
shellHook = ''
|
||||
python --version
|
||||
blender --version | head -n 1
|
||||
|
||||
# ROCm環境変数
|
||||
export ROCM_PATH="${pkgs.rocmPackages.clr}"
|
||||
export HSA_OVERRIDE_GFX_VERSION="11.0.0" # RX 7900 (RDNA 3 / gfx1100)
|
||||
|
||||
# LD_LIBRARY_PATH: ROCm、C++標準ライブラリ、その他必要なライブラリ
|
||||
export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib:$LD_LIBRARY_PATH"
|
||||
|
||||
# venvのセットアップ
|
||||
VENV_DIR="$PWD/.venv"
|
||||
if [ ! -d "$VENV_DIR" ]; then
|
||||
echo "[Setup] Creating Python virtual environment..."
|
||||
python -m venv "$VENV_DIR"
|
||||
fi
|
||||
|
||||
# venvをアクティベート
|
||||
source "$VENV_DIR/bin/activate"
|
||||
|
||||
# 必要なパッケージのインストール確認とインストール
|
||||
if ! python -c "import torch; print(torch.cuda.is_available())" 2>/dev/null | grep -q "True"; then
|
||||
echo "[Setup] Installing Python dependencies..."
|
||||
# まずPyTorch ROCm版をインストール(ROCm 6.2用)
|
||||
pip install --quiet torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2
|
||||
# 次に通常のPyPIから他のパッケージをインストール
|
||||
pip install --quiet \
|
||||
ultralytics \
|
||||
opencv-python-headless \
|
||||
numpy \
|
||||
fastapi \
|
||||
uvicorn \
|
||||
pydantic
|
||||
# opencv-pythonがインストールされていたら削除(headless版のみ使用)
|
||||
pip uninstall -y opencv-python opencv 2>/dev/null || true
|
||||
# opencv-python-headlessを再インストールして確実にする
|
||||
pip install --quiet --force-reinstall opencv-python-headless
|
||||
echo "[Setup] Dependencies installed successfully"
|
||||
fi
|
||||
|
||||
# Pythonパスにカレントディレクトリを追加
|
||||
export PYTHONPATH="$PWD:$PYTHONPATH"
|
||||
|
||||
# アドオンのインストールパスを環境変数として設定
|
||||
export BLENDER_USER_SCRIPTS="$HOME/.config/blender/5.0/scripts"
|
||||
export BLENDER_USER_ADDONS="$BLENDER_USER_SCRIPTS/addons"
|
||||
|
||||
# 環境変数をファイルに保存(サーバープロセス用)
|
||||
cat > "$PWD/.env" << EOF
|
||||
LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:${pkgs.zstd.out}/lib:${pkgs.rocmPackages.clr}/lib:${pkgs.rocmPackages.rocm-runtime}/lib
|
||||
ROCM_PATH=${pkgs.rocmPackages.clr}
|
||||
HSA_OVERRIDE_GFX_VERSION=11.0.0
|
||||
PYTORCH_ROCM_ARCH=gfx1100
|
||||
ROCBLAS_TENSILE_LIBPATH=${pkgs.rocmPackages.clr}/lib/rocblas/library
|
||||
EOF
|
||||
|
||||
echo "[Setup] Environment ready with GPU support"
|
||||
'';
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
"""
|
||||
YOLOv11 Face Detector using ONNX Runtime with GPU support.
|
||||
YOLOv8 Face Detector using PyTorch with ROCm support.
|
||||
|
||||
This module provides high-performance face detection using
|
||||
YOLOv11-face model with CUDA acceleration.
|
||||
YOLOv8-face model with AMD GPU (ROCm) acceleration.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -13,17 +13,17 @@ import numpy as np
|
|||
|
||||
class YOLOFaceDetector:
|
||||
"""
|
||||
YOLOv11 face detector with ONNX Runtime GPU support.
|
||||
|
||||
YOLOv8 face detector with PyTorch ROCm support.
|
||||
|
||||
Features:
|
||||
- CUDA GPU acceleration
|
||||
- ROCm GPU acceleration for AMD GPUs
|
||||
- High accuracy face detection
|
||||
- NMS for overlapping detections
|
||||
- Automatic NMS for overlapping detections
|
||||
"""
|
||||
|
||||
|
||||
# Default model path relative to this file
|
||||
DEFAULT_MODEL = "yolov11n-face.onnx"
|
||||
|
||||
DEFAULT_MODEL = "yolov8n-face-lindevs.pt"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_path: Optional[str] = None,
|
||||
|
|
@ -33,9 +33,9 @@ class YOLOFaceDetector:
|
|||
):
|
||||
"""
|
||||
Initialize the YOLO face detector.
|
||||
|
||||
|
||||
Args:
|
||||
model_path: Path to ONNX model file. If None, uses default model.
|
||||
model_path: Path to PyTorch model file. If None, uses default model.
|
||||
conf_threshold: Confidence threshold for detections
|
||||
iou_threshold: IoU threshold for NMS
|
||||
input_size: Model input size (width, height)
|
||||
|
|
@ -43,15 +43,17 @@ class YOLOFaceDetector:
|
|||
self.conf_threshold = conf_threshold
|
||||
self.iou_threshold = iou_threshold
|
||||
self.input_size = input_size
|
||||
self._session = None
|
||||
self._model = None
|
||||
self._model_path = model_path
|
||||
|
||||
self._device = None
|
||||
|
||||
@property
|
||||
def session(self):
|
||||
"""Lazy-load ONNX Runtime session."""
|
||||
if self._session is None:
|
||||
import onnxruntime as ort
|
||||
|
||||
def model(self):
|
||||
"""Lazy-load YOLO model."""
|
||||
if self._model is None:
|
||||
from ultralytics import YOLO
|
||||
import torch
|
||||
|
||||
# Determine model path
|
||||
if self._model_path is None:
|
||||
# Assuming models are in ../models relative to server/detector.py
|
||||
|
|
@ -59,255 +61,92 @@ class YOLOFaceDetector:
|
|||
model_path = str(models_dir / self.DEFAULT_MODEL)
|
||||
else:
|
||||
model_path = self._model_path
|
||||
|
||||
|
||||
if not os.path.exists(model_path):
|
||||
raise FileNotFoundError(f"Model not found: {model_path}")
|
||||
|
||||
# Configure providers (prefer CUDA)
|
||||
providers = []
|
||||
if 'CUDAExecutionProvider' in ort.get_available_providers():
|
||||
providers.append('CUDAExecutionProvider')
|
||||
print("[FaceMask] Using CUDA GPU for inference")
|
||||
providers.append('CPUExecutionProvider')
|
||||
|
||||
# Create session
|
||||
sess_options = ort.SessionOptions()
|
||||
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||
|
||||
self._session = ort.InferenceSession(
|
||||
model_path,
|
||||
sess_options=sess_options,
|
||||
providers=providers,
|
||||
)
|
||||
|
||||
|
||||
# Detect device (ROCm GPU or CPU)
|
||||
if torch.cuda.is_available():
|
||||
self._device = 'cuda'
|
||||
device_name = torch.cuda.get_device_name(0)
|
||||
print(f"[FaceMask] Using ROCm GPU for inference: {device_name}")
|
||||
else:
|
||||
self._device = 'cpu'
|
||||
print("[FaceMask] Using CPU for inference (ROCm GPU not available)")
|
||||
|
||||
# Load model (let Ultralytics handle device management)
|
||||
try:
|
||||
self._model = YOLO(model_path)
|
||||
# Don't call .to() - let predict() handle device assignment
|
||||
print(f"[FaceMask] Model loaded, will use device: {self._device}")
|
||||
except Exception as e:
|
||||
print(f"[FaceMask] Error loading model: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
print(f"[FaceMask] YOLO model loaded: {model_path}")
|
||||
print(f"[FaceMask] Providers: {self._session.get_providers()}")
|
||||
|
||||
return self._session
|
||||
|
||||
print(f"[FaceMask] Device: {self._device}")
|
||||
|
||||
return self._model
|
||||
|
||||
def detect(self, frame: np.ndarray) -> List[Tuple[int, int, int, int, float]]:
|
||||
"""
|
||||
Detect faces in a frame.
|
||||
|
||||
|
||||
Args:
|
||||
frame: BGR image as numpy array (H, W, C)
|
||||
|
||||
|
||||
Returns:
|
||||
List of detections as (x, y, width, height, confidence)
|
||||
"""
|
||||
import cv2
|
||||
|
||||
original_height, original_width = frame.shape[:2]
|
||||
|
||||
input_tensor = self._preprocess(frame)
|
||||
# print(f"[DEBUG] Input tensor shape: {input_tensor.shape}, Range: [{input_tensor.min():.3f}, {input_tensor.max():.3f}]", flush=True)
|
||||
|
||||
# Run inference
|
||||
input_name = self.session.get_inputs()[0].name
|
||||
outputs = self.session.run(None, {input_name: input_tensor})
|
||||
|
||||
raw_output = outputs[0]
|
||||
# print(f"[DEBUG] Raw output shape: {raw_output.shape}, Range: [{raw_output.min():.3f}, {raw_output.max():.3f}]", flush=True)
|
||||
import torch
|
||||
print(f"[FaceMask] Inference device: {self._device}, CUDA available: {torch.cuda.is_available()}")
|
||||
try:
|
||||
results = self.model.predict(
|
||||
frame,
|
||||
conf=self.conf_threshold,
|
||||
iou=self.iou_threshold,
|
||||
imgsz=self.input_size[0],
|
||||
verbose=False,
|
||||
device=self._device,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"[FaceMask] ERROR during inference: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Fallback to CPU
|
||||
print("[FaceMask] Falling back to CPU inference...")
|
||||
self._device = 'cpu'
|
||||
results = self.model.predict(
|
||||
frame,
|
||||
conf=self.conf_threshold,
|
||||
iou=self.iou_threshold,
|
||||
imgsz=self.input_size[0],
|
||||
verbose=False,
|
||||
device='cpu',
|
||||
)
|
||||
|
||||
# Postprocess
|
||||
detections = self._postprocess(
|
||||
raw_output,
|
||||
original_width,
|
||||
original_height,
|
||||
)
|
||||
# print(f"[DEBUG] Detections found: {len(detections)}", flush=True)
|
||||
|
||||
return detections
|
||||
|
||||
def _preprocess(self, frame: np.ndarray) -> np.ndarray:
|
||||
"""Preprocess frame for YOLO input with letterbox resizing."""
|
||||
import cv2
|
||||
|
||||
# Letterbox resize
|
||||
shape = frame.shape[:2] # current shape [height, width]
|
||||
new_shape = self.input_size
|
||||
|
||||
# Scale ratio (new / old)
|
||||
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
|
||||
|
||||
# Compute padding
|
||||
ratio = r, r # width, height ratios
|
||||
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
|
||||
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
|
||||
|
||||
dw /= 2 # divide padding into 2 sides
|
||||
dh /= 2
|
||||
|
||||
if shape[::-1] != new_unpad: # resize
|
||||
frame = cv2.resize(frame, new_unpad, interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
|
||||
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
|
||||
|
||||
# Add border
|
||||
frame = cv2.copyMakeBorder(frame, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
|
||||
|
||||
# Store metadata for postprocessing
|
||||
self._last_letterbox_meta = {'ratio': ratio, 'dwdh': (dw, dh)}
|
||||
|
||||
# Convert BGR to RGB
|
||||
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
|
||||
# Normalize to [0, 1]
|
||||
normalized = rgb.astype(np.float32) / 255.0
|
||||
|
||||
# Transpose to CHW format
|
||||
transposed = np.transpose(normalized, (2, 0, 1))
|
||||
|
||||
# Add batch dimension
|
||||
batched = np.expand_dims(transposed, axis=0)
|
||||
|
||||
return batched
|
||||
|
||||
def _postprocess(
    self,
    output: np.ndarray,
    original_width: int,
    original_height: int,
) -> List[Tuple[int, int, int, int, float]]:
    """
    Convert raw YOLO output into face detections in original-frame pixels.

    Args:
        output: Raw model output shaped [1, num_detections, 5+] or
            [1, 5+, num_detections]; the first five values of each
            detection are x_center, y_center, w, h, conf.
        original_width: Width of the original frame in pixels.
        original_height: Height of the original frame in pixels.

    Returns:
        List of (x, y, w, h, conf) tuples (top-left corner plus size),
        after confidence filtering and NMS. Empty when nothing exceeds
        ``self.conf_threshold``.
    """
    # Layout heuristic: the shorter of the two trailing axes is taken to be
    # the per-detection attribute axis.
    # NOTE(review): a [1, N, 5+] output with fewer rows than attributes
    # would be transposed by mistake - confirm the model never emits that.
    if output.shape[1] < output.shape[2]:
        # Format: [1, 5+, num_detections] - transpose
        output = np.transpose(output[0], (1, 0))
    else:
        output = output[0]

    # Filter by confidence
    filtered = output[output[:, 4] > self.conf_threshold]
    if len(filtered) == 0:
        return []

    x_center = filtered[:, 0]
    y_center = filtered[:, 1]
    width = filtered[:, 2]
    height = filtered[:, 3]
    confidences = filtered[:, 4]

    # Letterbox metadata is only present after a letterbox preprocess ran.
    meta = getattr(self, '_last_letterbox_meta', None)
    if meta:
        ratio = meta['ratio']
        dw, dh = meta['dwdh']

        # Center/size -> corners, remove the padding offset, undo the
        # resize, then clip to the original image bounds.
        x1 = np.clip((x_center - width / 2 - dw) / ratio[0], 0, original_width)
        y1 = np.clip((y_center - height / 2 - dh) / ratio[1], 0, original_height)
        x2 = np.clip((x_center + width / 2 - dw) / ratio[0], 0, original_width)
        y2 = np.clip((y_center + height / 2 - dh) / ratio[1], 0, original_height)

        # Convert back to x, y, w, h
        final_x = x1
        final_y = y1
        final_w = x2 - x1
        final_h = y2 - y1
    else:
        # Fallback for non-letterbox (legacy): plain per-axis rescale.
        # NOTE(review): input_size[0] is used as the width here, while the
        # letterbox preprocessing indexes input_size as (height, width) -
        # identical for square inputs; confirm for non-square ones.
        scale_x = original_width / self.input_size[0]
        scale_y = original_height / self.input_size[1]

        final_w = width * scale_x
        final_h = height * scale_y
        final_x = x_center * scale_x - final_w / 2
        final_y = y_center * scale_y - final_h / 2

    # Apply NMS
    boxes = np.stack([final_x, final_y, final_w, final_h], axis=1)
    indices = self._nms(boxes, confidences, self.iou_threshold)

    # Only the boxes that survived NMS are emitted. The former trailing loop
    # over `results[0].boxes` was removed: `results` is never defined in this
    # method, so it raised NameError whenever a detection was found.
    return [
        (
            int(final_x[i]),
            int(final_y[i]),
            int(final_w[i]),
            int(final_h[i]),
            float(confidences[i]),
        )
        for i in indices
    ]
|
||||
|
||||
def _nms(
    self,
    boxes: np.ndarray,
    scores: np.ndarray,
    iou_threshold: float,
) -> List[int]:
    """
    Greedy Non-Maximum Suppression.

    Args:
        boxes: Array of shape (N, 4) with rows (x, y, w, h), where (x, y)
            is the top-left corner.
        scores: Array of shape (N,) with one confidence per box.
        iou_threshold: A box is dropped when its IoU with an already kept,
            higher-scoring box exceeds this value.

    Returns:
        Indices (plain ints) of the kept boxes, highest score first.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = x1 + boxes[:, 2]
    y2 = y1 + boxes[:, 3]
    areas = boxes[:, 2] * boxes[:, 3]

    # Candidates sorted by descending score.
    order = scores.argsort()[::-1]

    keep: List[int] = []
    while order.size > 0:
        best = order[0]
        keep.append(int(best))  # plain int, matching the annotation

        rest = order[1:]
        if rest.size == 0:
            break

        # Intersection rectangle between the best box and every other one.
        inter_w = np.maximum(0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        inter = inter_w * inter_h

        union = areas[best] + areas[rest] - inter

        # Zero-area boxes give a zero union; define their IoU as 0 instead
        # of dividing 0 by 0 (which produced NaN plus a RuntimeWarning).
        iou = np.zeros(union.shape, dtype=np.result_type(union.dtype, np.float32))
        np.divide(inter, union, out=iou, where=union > 0)

        # Keep only candidates that do not overlap the best box too much.
        order = rest[iou <= iou_threshold]

    return keep
|
||||
|
||||
|
||||
def generate_mask(
|
||||
self,
|
||||
frame_shape: Tuple[int, int, int],
|
||||
|
|
@ -317,29 +156,29 @@ class YOLOFaceDetector:
|
|||
) -> np.ndarray:
|
||||
"""
|
||||
Generate a mask image from face detections.
|
||||
|
||||
|
||||
Args:
|
||||
frame_shape: Shape of the original frame (height, width, channels)
|
||||
detections: List of face detections (x, y, w, h, conf)
|
||||
mask_scale: Scale factor for mask region
|
||||
feather_radius: Radius for edge feathering
|
||||
|
||||
|
||||
Returns:
|
||||
Grayscale mask image (white = blur, black = keep)
|
||||
"""
|
||||
import cv2
|
||||
|
||||
|
||||
height, width = frame_shape[:2]
|
||||
mask = np.zeros((height, width), dtype=np.uint8)
|
||||
|
||||
|
||||
for (x, y, w, h, conf) in detections:
|
||||
# Scale the bounding box
|
||||
center_x = x + w // 2
|
||||
center_y = y + h // 2
|
||||
|
||||
|
||||
scaled_w = int(w * mask_scale)
|
||||
scaled_h = int(h * mask_scale)
|
||||
|
||||
|
||||
# Draw ellipse for natural face shape
|
||||
cv2.ellipse(
|
||||
mask,
|
||||
|
|
@ -350,12 +189,12 @@ class YOLOFaceDetector:
|
|||
255, # color (white)
|
||||
-1, # filled
|
||||
)
|
||||
|
||||
|
||||
# Apply Gaussian blur for feathering
|
||||
if feather_radius > 0 and len(detections) > 0:
|
||||
kernel_size = feather_radius * 2 + 1
|
||||
mask = cv2.GaussianBlur(mask, (kernel_size, kernel_size), 0)
|
||||
|
||||
|
||||
return mask
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -74,8 +74,8 @@ def process_video_task(task_id: str, req: GenerateRequest):
|
|||
conf_threshold=req.conf_threshold,
|
||||
iou_threshold=req.iou_threshold
|
||||
)
|
||||
# Ensure session is loaded
|
||||
_ = detector.session
|
||||
# Ensure model is loaded
|
||||
_ = detector.model
|
||||
|
||||
# Open video
|
||||
cap = cv2.VideoCapture(req.video_path)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user