Addonに適用

This commit is contained in:
Keisuke Hirata 2026-02-12 18:52:55 +09:00
parent 0b6c31501e
commit a2131a962b
7 changed files with 225 additions and 7 deletions

View File

@ -6,6 +6,41 @@
## 開発者向け情報
### GPU環境の確認
推論サーバーは起動時に環境診断情報を出力します:
- Python環境バージョン、仮想環境の検出
- ROCm環境変数（ROCM_PATH、HSA_OVERRIDE_GFX_VERSION等）
- GPU検出状況（デバイス名、ROCmバージョン）
```bash
# サーバーを起動して診断ログを確認
python server/main.py
# サーバーのGPU状態を確認
python test_server_api.py --status
```
出力例:
```
[FaceMask Server] Startup Diagnostics
======================================================================
[Python Environment]
Python Version: 3.12.12
Virtual Environment: Yes
[ROCm Environment Variables]
ROCM_PATH: /nix/store/.../clr-7.1.1
HSA_OVERRIDE_GFX_VERSION: 11.0.0
[GPU Detection]
torch.cuda.is_available(): True
GPU Device 0: AMD Radeon Graphics
ROCm Version (HIP): 7.0.51831
======================================================================
```
### 処理プロセスの単体デバッグ
顔検出処理をBlenderから独立してテストできます。

View File

@ -72,21 +72,56 @@ class InferenceClient:
cwd=root_dir,
text=True,
env=server_env,
stdout=subprocess.PIPE, # Capture stdout
stderr=subprocess.PIPE, # Capture stderr
preexec_fn=os.setsid, # Create new process group
)
# Wait for startup
for _ in range(20): # Wait up to 10 seconds
if self.is_server_running():
print("[FaceMask] Server started successfully")
return
# Check if process died
if self.server_process.poll() is not None:
raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})")
# Capture and display error output
try:
stdout, stderr = self.server_process.communicate(timeout=1)
except subprocess.TimeoutExpired:
stdout, stderr = "", ""
error_msg = f"Server failed to start (exit code: {self.server_process.returncode})"
print(f"[FaceMask] ERROR: {error_msg}")
if stdout and stdout.strip():
print("[FaceMask] Server stdout:")
print(stdout.strip())
if stderr and stderr.strip():
print("[FaceMask] Server stderr:")
print(stderr.strip())
self.server_process = None
raise RuntimeError(error_msg)
time.sleep(0.5)
# If we get here, startup timed out
# Try to capture any partial output
if self.server_process:
try:
# Non-blocking read with short timeout
stdout, stderr = self.server_process.communicate(timeout=0.1)
if stdout and stdout.strip():
print("[FaceMask] Server stdout (partial):")
print(stdout.strip())
if stderr and stderr.strip():
print("[FaceMask] Server stderr (partial):")
print(stderr.strip())
except subprocess.TimeoutExpired:
pass # Process still running but not responding
raise RuntimeError("Server startup timed out")
def stop_server(self):

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# 推論サーバーの単体起動スクリプト
set -e

View File

@ -7,6 +7,7 @@ GPU-accelerated face detection using ONNX Runtime.
import os
import sys
import platform
import threading
import uuid
import queue
@ -27,6 +28,9 @@ from server.detector import YOLOFaceDetector, get_detector
app = FastAPI(title="Face Mask Inference Server")
# GPU status cache
_gpu_status_cache = None
# Task storage
class TaskStatus:
PENDING = "pending"
@ -146,9 +150,146 @@ def process_video_task(task_id: str, req: GenerateRequest):
if task_id in cancel_events:
del cancel_events[task_id]
def check_gpu_available() -> dict:
    """Probe GPU availability once and memoize the answer.

    Returns a dict with the keys:
        available     (bool)        - True when torch reports a usable GPU
        device_name   (str | None)  - name of device 0, when present
        device_count  (int)         - number of visible devices
        rocm_version  (str | None)  - HIP version string on ROCm builds
    """
    global _gpu_status_cache
    # Serve the memoized probe result on every call after the first.
    if _gpu_status_cache is not None:
        return _gpu_status_cache

    info = {
        "available": False,
        "device_name": None,
        "device_count": 0,
        "rocm_version": None,
    }
    try:
        import torch

        info["available"] = torch.cuda.is_available()
        if info["available"]:
            count = torch.cuda.device_count()
            info["device_count"] = count
            if count > 0:
                info["device_name"] = torch.cuda.get_device_name(0)
            # torch.version.hip only exists on ROCm builds of PyTorch.
            if hasattr(torch.version, 'hip'):
                info["rocm_version"] = torch.version.hip
    except Exception as exc:
        # Detection failure (including a missing torch) degrades to "no GPU".
        print(f"[FaceMask] Warning: GPU detection failed: {exc}")
        info["available"] = False

    _gpu_status_cache = info
    return info
def log_startup_diagnostics():
    """Print a startup banner with Python env, ROCm variables, and GPU state."""
    banner = "=" * 70
    print(banner)
    print("[FaceMask Server] Startup Diagnostics")
    print(banner)

    # --- Python environment ---
    print("\n[Python Environment]")
    print(f" Python Version: {sys.version.split()[0]}")
    print(f" Python Executable: {sys.executable}")
    print(f" Platform: {platform.platform()}")
    print(f" Working Directory: {os.getcwd()}")
    inside_venv = sys.prefix != sys.base_prefix
    print(f" Virtual Environment: {'Yes' if inside_venv else 'No'}")
    if inside_venv:
        print(f" venv path: {sys.prefix}")

    # --- ROCm-related environment variables ---
    print("\n[ROCm Environment Variables]")
    for name in (
        'ROCM_PATH',
        'HSA_OVERRIDE_GFX_VERSION',
        'PYTORCH_ROCM_ARCH',
        'ROCBLAS_TENSILE_LIBPATH',
        'LD_LIBRARY_PATH',
    ):
        raw = os.environ.get(name)
        if not raw:
            print(f" {name}: (not set)")
            continue
        # Keep the log readable when the value is huge (e.g. LD_LIBRARY_PATH).
        shown = raw if len(raw) <= 200 else raw[:200] + "... (truncated)"
        print(f" {name}: {shown}")

    # --- GPU detection via torch ---
    print("\n[GPU Detection]")
    try:
        import torch

        has_gpu = torch.cuda.is_available()
        print(f" torch.cuda.is_available(): {has_gpu}")
        if has_gpu:
            n_devices = torch.cuda.device_count()
            print(f" GPU Device Count: {n_devices}")
            if n_devices > 0:
                print(f" GPU Device 0: {torch.cuda.get_device_name(0)}")
            # torch.version.hip only exists on ROCm builds of PyTorch.
            if hasattr(torch.version, 'hip'):
                print(f" ROCm Version (HIP): {torch.version.hip}")
            # On ROCm, torch.version.cuda may still be populated (emulated).
            if torch.version.cuda:
                print(f" CUDA Version: {torch.version.cuda}")
        else:
            print(" WARNING: GPU not detected!")
            print(" Server will use CPU for inference (slower)")
            print(" Troubleshooting:")
            print(" - Check ROCm environment variables above")
            print(" - Run: python -c 'import torch; print(torch.cuda.is_available())'")
    except ImportError as exc:
        print(f" ERROR: Cannot import torch: {exc}")
        print(" PyTorch must be installed for inference")
    except Exception as exc:
        print(f" ERROR during GPU detection: {exc}")

    print(banner)
    print()
@app.get("/status")
def get_status():
    """Liveness endpoint for the addon client.

    Returns a JSON dict with the server status plus GPU details taken from
    check_gpu_available() (which memoizes the probe, so this stays cheap).
    """
    # NOTE(review): the old hard-coded placeholder return
    # (`gpu_available: True  # TODO`) was dead diff residue above the real
    # body and made the lines below unreachable; it has been removed.
    gpu_info = check_gpu_available()
    return {
        "status": "running",
        "gpu_available": gpu_info["available"],
        "gpu_device": gpu_info["device_name"],
        "gpu_count": gpu_info["device_count"],
        "rocm_version": gpu_info["rocm_version"],
    }
@app.post("/generate", response_model=Task)
def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
@ -177,4 +318,5 @@ def cancel_task(task_id: str):
return {"message": "Cancellation requested"}
if __name__ == "__main__":
    # Print environment/GPU diagnostics first so startup problems (missing
    # torch, no GPU, bad ROCm env vars) are visible in the console log.
    log_startup_diagnostics()
    # Bind to loopback only; the Blender addon talks to this over 127.0.0.1.
    uvicorn.run(app, host="127.0.0.1", port=8181)

Binary file not shown.

Binary file not shown.

View File

@ -30,6 +30,12 @@ def check_status():
print("✓ サーバーは稼働中です")
print(f" Status: {data.get('status')}")
print(f" GPU Available: {data.get('gpu_available')}")
if data.get('gpu_device'):
print(f" GPU Device: {data.get('gpu_device')}")
if data.get('gpu_count'):
print(f" GPU Count: {data.get('gpu_count')}")
if data.get('rocm_version'):
print(f" ROCm Version: {data.get('rocm_version')}")
return True
except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e:
print("✗ サーバーに接続できません")