Addonに適用
This commit is contained in:
parent
0b6c31501e
commit
a2131a962b
35
README.md
35
README.md
|
|
@ -6,6 +6,41 @@
|
||||||
|
|
||||||
## 開発者向け情報
|
## 開発者向け情報
|
||||||
|
|
||||||
|
### GPU環境の確認
|
||||||
|
|
||||||
|
推論サーバーは起動時に環境診断情報を出力します:
|
||||||
|
|
||||||
|
- Python環境(バージョン、仮想環境の検出)
|
||||||
|
- ROCm環境変数(ROCM_PATH、HSA_OVERRIDE_GFX_VERSION等)
|
||||||
|
- GPU検出状況(デバイス名、ROCmバージョン)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# サーバーを起動して診断ログを確認
|
||||||
|
python server/main.py
|
||||||
|
|
||||||
|
# サーバーのGPU状態を確認
|
||||||
|
python test_server_api.py --status
|
||||||
|
```
|
||||||
|
|
||||||
|
出力例:
|
||||||
|
```
|
||||||
|
[FaceMask Server] Startup Diagnostics
|
||||||
|
======================================================================
|
||||||
|
[Python Environment]
|
||||||
|
Python Version: 3.12.12
|
||||||
|
Virtual Environment: Yes
|
||||||
|
|
||||||
|
[ROCm Environment Variables]
|
||||||
|
ROCM_PATH: /nix/store/.../clr-7.1.1
|
||||||
|
HSA_OVERRIDE_GFX_VERSION: 11.0.0
|
||||||
|
|
||||||
|
[GPU Detection]
|
||||||
|
torch.cuda.is_available(): True
|
||||||
|
GPU Device 0: AMD Radeon Graphics
|
||||||
|
ROCm Version (HIP): 7.0.51831
|
||||||
|
======================================================================
|
||||||
|
```
|
||||||
|
|
||||||
### 処理プロセスの単体デバッグ
|
### 処理プロセスの単体デバッグ
|
||||||
|
|
||||||
顔検出処理をBlenderから独立してテストできます。
|
顔検出処理をBlenderから独立してテストできます。
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,8 @@ class InferenceClient:
|
||||||
cwd=root_dir,
|
cwd=root_dir,
|
||||||
text=True,
|
text=True,
|
||||||
env=server_env,
|
env=server_env,
|
||||||
|
stdout=subprocess.PIPE, # Capture stdout
|
||||||
|
stderr=subprocess.PIPE, # Capture stderr
|
||||||
preexec_fn=os.setsid, # Create new process group
|
preexec_fn=os.setsid, # Create new process group
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -83,10 +85,43 @@ class InferenceClient:
|
||||||
|
|
||||||
# Check if process died
|
# Check if process died
|
||||||
if self.server_process.poll() is not None:
|
if self.server_process.poll() is not None:
|
||||||
raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})")
|
# Capture and display error output
|
||||||
|
try:
|
||||||
|
stdout, stderr = self.server_process.communicate(timeout=1)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
stdout, stderr = "", ""
|
||||||
|
|
||||||
|
error_msg = f"Server failed to start (exit code: {self.server_process.returncode})"
|
||||||
|
print(f"[FaceMask] ERROR: {error_msg}")
|
||||||
|
|
||||||
|
if stdout and stdout.strip():
|
||||||
|
print("[FaceMask] Server stdout:")
|
||||||
|
print(stdout.strip())
|
||||||
|
|
||||||
|
if stderr and stderr.strip():
|
||||||
|
print("[FaceMask] Server stderr:")
|
||||||
|
print(stderr.strip())
|
||||||
|
|
||||||
|
self.server_process = None
|
||||||
|
raise RuntimeError(error_msg)
|
||||||
|
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
# If we get here, startup timed out
|
||||||
|
# Try to capture any partial output
|
||||||
|
if self.server_process:
|
||||||
|
try:
|
||||||
|
# Non-blocking read with short timeout
|
||||||
|
stdout, stderr = self.server_process.communicate(timeout=0.1)
|
||||||
|
if stdout and stdout.strip():
|
||||||
|
print("[FaceMask] Server stdout (partial):")
|
||||||
|
print(stdout.strip())
|
||||||
|
if stderr and stderr.strip():
|
||||||
|
print("[FaceMask] Server stderr (partial):")
|
||||||
|
print(stderr.strip())
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
pass # Process still running but not responding
|
||||||
|
|
||||||
raise RuntimeError("Server startup timed out")
|
raise RuntimeError("Server startup timed out")
|
||||||
|
|
||||||
def stop_server(self):
|
def stop_server(self):
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
#!/bin/bash
|
#!/usr/bin/env bash
|
||||||
# 推論サーバーの単体起動スクリプト
|
# 推論サーバーの単体起動スクリプト
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
|
||||||
144
server/main.py
144
server/main.py
|
|
@ -7,6 +7,7 @@ GPU-accelerated face detection using ONNX Runtime.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import platform
|
||||||
import threading
|
import threading
|
||||||
import uuid
|
import uuid
|
||||||
import queue
|
import queue
|
||||||
|
|
@ -27,6 +28,9 @@ from server.detector import YOLOFaceDetector, get_detector
|
||||||
|
|
||||||
app = FastAPI(title="Face Mask Inference Server")
|
app = FastAPI(title="Face Mask Inference Server")
|
||||||
|
|
||||||
|
# GPU status cache
|
||||||
|
_gpu_status_cache = None
|
||||||
|
|
||||||
# Task storage
|
# Task storage
|
||||||
class TaskStatus:
|
class TaskStatus:
|
||||||
PENDING = "pending"
|
PENDING = "pending"
|
||||||
|
|
@ -146,9 +150,146 @@ def process_video_task(task_id: str, req: GenerateRequest):
|
||||||
if task_id in cancel_events:
|
if task_id in cancel_events:
|
||||||
del cancel_events[task_id]
|
del cancel_events[task_id]
|
||||||
|
|
||||||
|
def check_gpu_available() -> dict:
    """
    Check if GPU is available for inference.

    The probe runs at most once per process: the first result (including the
    result of a failed probe) is stored in the module-level
    ``_gpu_status_cache`` and that same dict object is returned by every
    later call.

    Returns a dict with GPU information:
    {
        "available": bool,
        "device_name": str or None,
        "device_count": int,
        "rocm_version": str or None
    }

    ``device_name``, ``device_count`` and ``rocm_version`` are only populated
    when a GPU is detected. This function does not raise: any error during
    detection (including a missing ``torch`` package) is printed as a warning
    and reported as "no GPU" with every field left at its default.
    """
    global _gpu_status_cache

    # Return cached result if available
    if _gpu_status_cache is not None:
        return _gpu_status_cache

    result = {
        "available": False,
        "device_name": None,
        "device_count": 0,
        "rocm_version": None,
    }

    try:
        # Imported lazily so the server module can still be imported (and
        # report "no GPU") on machines without PyTorch.
        import torch

        if torch.cuda.is_available():
            device_count = torch.cuda.device_count()
            device_name = (
                torch.cuda.get_device_name(0) if device_count > 0 else None
            )
            # getattr: the attribute may be missing or None depending on the
            # PyTorch build; either case is reported as None.
            rocm_version = getattr(torch.version, "hip", None)

            # Commit all fields together only after every query succeeded, so
            # a failure part-way through cannot leave a half-filled result
            # (e.g. available=False with a non-zero device_count).
            result.update(
                available=True,
                device_name=device_name,
                device_count=device_count,
                rocm_version=rocm_version,
            )
    except Exception as e:
        # Best-effort probe: never let GPU detection take the server down.
        print(f"[FaceMask] Warning: GPU detection failed: {e}")

    # Cache the result
    _gpu_status_cache = result

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def log_startup_diagnostics():
    """Log diagnostic information about the environment and GPU.

    Prints three sections to stdout, framed by 70-character ``=`` rules:

    * Python environment: version, executable, platform, working directory
      and whether a virtual environment is active.
    * ROCm-related environment variables; values longer than 200 characters
      are truncated, unset or empty ones are shown as "(not set)".
    * GPU detection via PyTorch: availability, device count, name of device
      0, and the HIP / CUDA versions when the PyTorch build reports them.

    Returns nothing and does not raise for GPU-probe problems: a missing
    ``torch`` package or any other detection error is printed as an ERROR
    line instead.
    """
    rule = "=" * 70
    max_value_len = 200

    print(rule)
    print("[FaceMask Server] Startup Diagnostics")
    print(rule)

    # Python Environment
    print("\n[Python Environment]")
    print(f" Python Version: {sys.version.split()[0]}")
    print(f" Python Executable: {sys.executable}")
    print(f" Platform: {platform.platform()}")
    print(f" Working Directory: {os.getcwd()}")

    # Inside a venv, sys.prefix points at the venv while sys.base_prefix
    # still points at the base interpreter installation.
    in_venv = sys.prefix != sys.base_prefix
    print(f" Virtual Environment: {'Yes' if in_venv else 'No'}")
    if in_venv:
        print(f" venv path: {sys.prefix}")

    # ROCm Environment Variables
    print("\n[ROCm Environment Variables]")
    rocm_vars = (
        'ROCM_PATH',
        'HSA_OVERRIDE_GFX_VERSION',
        'PYTORCH_ROCM_ARCH',
        'ROCBLAS_TENSILE_LIBPATH',
        'LD_LIBRARY_PATH',
    )

    for var in rocm_vars:
        value = os.environ.get(var)
        if not value:
            print(f" {var}: (not set)")
            continue
        # Truncate very long values (LD_LIBRARY_PATH in particular) so the
        # diagnostics stay readable.
        if len(value) > max_value_len:
            value = value[:max_value_len] + "... (truncated)"
        print(f" {var}: {value}")

    # GPU Detection
    print("\n[GPU Detection]")
    try:
        # Imported lazily so a missing PyTorch is reported below instead of
        # failing at module import time.
        import torch

        cuda_available = torch.cuda.is_available()
        print(f" torch.cuda.is_available(): {cuda_available}")

        if cuda_available:
            device_count = torch.cuda.device_count()
            print(f" GPU Device Count: {device_count}")

            if device_count > 0:
                device_name = torch.cuda.get_device_name(0)
                print(f" GPU Device 0: {device_name}")

            # ROCm version: the attribute can exist with a None value on
            # builds without HIP support, so test the value rather than the
            # attribute's presence to avoid printing "None".
            hip_version = getattr(torch.version, "hip", None)
            if hip_version:
                print(f" ROCm Version (HIP): {hip_version}")

            # CUDA version (might be emulated by ROCm)
            cuda_version = getattr(torch.version, "cuda", None)
            if cuda_version:
                print(f" CUDA Version: {cuda_version}")
        else:
            print(" WARNING: GPU not detected!")
            print(" Server will use CPU for inference (slower)")
            print(" Troubleshooting:")
            print(" - Check ROCm environment variables above")
            print(" - Run: python -c 'import torch; print(torch.cuda.is_available())'")

    except ImportError as e:
        print(f" ERROR: Cannot import torch: {e}")
        print(" PyTorch must be installed for inference")
    except Exception as e:
        # Diagnostics are best-effort; never block server start-up on them.
        print(f" ERROR during GPU detection: {e}")

    print(rule)
    print()
|
||||||
|
|
||||||
|
|
||||||
@app.get("/status")
|
@app.get("/status")
|
||||||
def get_status():
|
def get_status():
|
||||||
return {"status": "running", "gpu_available": True} # TODO: check GPU
|
gpu_info = check_gpu_available()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "running",
|
||||||
|
"gpu_available": gpu_info["available"],
|
||||||
|
"gpu_device": gpu_info["device_name"],
|
||||||
|
"gpu_count": gpu_info["device_count"],
|
||||||
|
"rocm_version": gpu_info["rocm_version"]
|
||||||
|
}
|
||||||
|
|
||||||
@app.post("/generate", response_model=Task)
|
@app.post("/generate", response_model=Task)
|
||||||
def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
|
def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
|
||||||
|
|
@ -177,4 +318,5 @@ def cancel_task(task_id: str):
|
||||||
return {"message": "Cancellation requested"}
|
return {"message": "Cancellation requested"}
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
log_startup_diagnostics()
|
||||||
uvicorn.run(app, host="127.0.0.1", port=8181)
|
uvicorn.run(app, host="127.0.0.1", port=8181)
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -30,6 +30,12 @@ def check_status():
|
||||||
print("✓ サーバーは稼働中です")
|
print("✓ サーバーは稼働中です")
|
||||||
print(f" Status: {data.get('status')}")
|
print(f" Status: {data.get('status')}")
|
||||||
print(f" GPU Available: {data.get('gpu_available')}")
|
print(f" GPU Available: {data.get('gpu_available')}")
|
||||||
|
if data.get('gpu_device'):
|
||||||
|
print(f" GPU Device: {data.get('gpu_device')}")
|
||||||
|
if data.get('gpu_count'):
|
||||||
|
print(f" GPU Count: {data.get('gpu_count')}")
|
||||||
|
if data.get('rocm_version'):
|
||||||
|
print(f" ROCm Version: {data.get('rocm_version')}")
|
||||||
return True
|
return True
|
||||||
except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e:
|
except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e:
|
||||||
print("✗ サーバーに接続できません")
|
print("✗ サーバーに接続できません")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user