Addonに適用
This commit is contained in:
parent
0b6c31501e
commit
a2131a962b
35
README.md
35
README.md
|
|
@ -6,6 +6,41 @@
|
|||
|
||||
## 開発者向け情報
|
||||
|
||||
### GPU環境の確認
|
||||
|
||||
推論サーバーは起動時に環境診断情報を出力します:
|
||||
|
||||
- Python環境(バージョン、仮想環境の検出)
|
||||
- ROCm環境変数(ROCM_PATH、HSA_OVERRIDE_GFX_VERSION等)
|
||||
- GPU検出状況(デバイス名、ROCmバージョン)
|
||||
|
||||
```bash
|
||||
# サーバーを起動して診断ログを確認
|
||||
python server/main.py
|
||||
|
||||
# サーバーのGPU状態を確認
|
||||
python test_server_api.py --status
|
||||
```
|
||||
|
||||
出力例:
|
||||
```
|
||||
[FaceMask Server] Startup Diagnostics
|
||||
======================================================================
|
||||
[Python Environment]
|
||||
Python Version: 3.12.12
|
||||
Virtual Environment: Yes
|
||||
|
||||
[ROCm Environment Variables]
|
||||
ROCM_PATH: /nix/store/.../clr-7.1.1
|
||||
HSA_OVERRIDE_GFX_VERSION: 11.0.0
|
||||
|
||||
[GPU Detection]
|
||||
torch.cuda.is_available(): True
|
||||
GPU Device 0: AMD Radeon Graphics
|
||||
ROCm Version (HIP): 7.0.51831
|
||||
======================================================================
|
||||
```
|
||||
|
||||
### 処理プロセスの単体デバッグ
|
||||
|
||||
顔検出処理をBlenderから独立してテストできます。
|
||||
|
|
|
|||
|
|
@ -72,21 +72,56 @@ class InferenceClient:
|
|||
cwd=root_dir,
|
||||
text=True,
|
||||
env=server_env,
|
||||
stdout=subprocess.PIPE, # Capture stdout
|
||||
stderr=subprocess.PIPE, # Capture stderr
|
||||
preexec_fn=os.setsid, # Create new process group
|
||||
)
|
||||
|
||||
|
||||
# Wait for startup
|
||||
for _ in range(20): # Wait up to 10 seconds
|
||||
if self.is_server_running():
|
||||
print("[FaceMask] Server started successfully")
|
||||
return
|
||||
|
||||
|
||||
# Check if process died
|
||||
if self.server_process.poll() is not None:
|
||||
raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})")
|
||||
|
||||
# Capture and display error output
|
||||
try:
|
||||
stdout, stderr = self.server_process.communicate(timeout=1)
|
||||
except subprocess.TimeoutExpired:
|
||||
stdout, stderr = "", ""
|
||||
|
||||
error_msg = f"Server failed to start (exit code: {self.server_process.returncode})"
|
||||
print(f"[FaceMask] ERROR: {error_msg}")
|
||||
|
||||
if stdout and stdout.strip():
|
||||
print("[FaceMask] Server stdout:")
|
||||
print(stdout.strip())
|
||||
|
||||
if stderr and stderr.strip():
|
||||
print("[FaceMask] Server stderr:")
|
||||
print(stderr.strip())
|
||||
|
||||
self.server_process = None
|
||||
raise RuntimeError(error_msg)
|
||||
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
# If we get here, startup timed out
|
||||
# Try to capture any partial output
|
||||
if self.server_process:
|
||||
try:
|
||||
# Non-blocking read with short timeout
|
||||
stdout, stderr = self.server_process.communicate(timeout=0.1)
|
||||
if stdout and stdout.strip():
|
||||
print("[FaceMask] Server stdout (partial):")
|
||||
print(stdout.strip())
|
||||
if stderr and stderr.strip():
|
||||
print("[FaceMask] Server stderr (partial):")
|
||||
print(stderr.strip())
|
||||
except subprocess.TimeoutExpired:
|
||||
pass # Process still running but not responding
|
||||
|
||||
raise RuntimeError("Server startup timed out")
|
||||
|
||||
def stop_server(self):
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
# 推論サーバーの単体起動スクリプト
|
||||
|
||||
set -e
|
||||
|
|
|
|||
144
server/main.py
144
server/main.py
|
|
@ -7,6 +7,7 @@ GPU-accelerated face detection using ONNX Runtime.
|
|||
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
import threading
|
||||
import uuid
|
||||
import queue
|
||||
|
|
@ -27,6 +28,9 @@ from server.detector import YOLOFaceDetector, get_detector
|
|||
|
||||
app = FastAPI(title="Face Mask Inference Server")
|
||||
|
||||
# GPU status cache
|
||||
_gpu_status_cache = None
|
||||
|
||||
# Task storage
|
||||
class TaskStatus:
|
||||
PENDING = "pending"
|
||||
|
|
@ -146,9 +150,146 @@ def process_video_task(task_id: str, req: GenerateRequest):
|
|||
if task_id in cancel_events:
|
||||
del cancel_events[task_id]
|
||||
|
||||
def check_gpu_available() -> dict:
    """
    Check if GPU is available for inference.

    The probe runs at most once per process: its outcome is stored in the
    module-level ``_gpu_status_cache`` and later calls are answered from
    that cache. Every call returns a fresh copy, so a caller that modifies
    the returned dict cannot alter what subsequent callers see.

    Returns:
        A dict with GPU information:
        {
            "available": bool,
            "device_name": str or None,
            "device_count": int,
            "rocm_version": str or None
        }
        If torch cannot be imported, or any probe call raises, a warning is
        printed and every field carries its "no GPU" default.
    """
    global _gpu_status_cache

    # Serve from the cache when a previous call already probed the GPU.
    if _gpu_status_cache is not None:
        return dict(_gpu_status_cache)

    no_gpu = {
        "available": False,
        "device_name": None,
        "device_count": 0,
        "rocm_version": None,
    }
    result = dict(no_gpu)

    try:
        # Imported lazily so a missing or broken torch install is reported
        # as "no GPU" instead of failing at module import time.
        import torch

        result["available"] = torch.cuda.is_available()

        if result["available"]:
            result["device_count"] = torch.cuda.device_count()
            if result["device_count"] > 0:
                result["device_name"] = torch.cuda.get_device_name(0)

            # The attribute may be missing or None; either way the field
            # stays None.
            result["rocm_version"] = getattr(torch.version, "hip", None)

    except Exception as e:
        print(f"[FaceMask] Warning: GPU detection failed: {e}")
        # Discard partially filled fields so the report never combines
        # available=False with a non-zero device count or a device name.
        result = dict(no_gpu)

    # Cache the result
    _gpu_status_cache = result

    return dict(result)
|
||||
|
||||
|
||||
def log_startup_diagnostics():
    """Log diagnostic information about the environment and GPU.

    Prints three sections to stdout, framed by separator lines:

    * Python environment: interpreter version, executable, platform,
      working directory and whether a virtual environment is active.
    * ROCm-related environment variables, with values longer than 200
      characters truncated.
    * GPU detection through torch: availability, device count, name of
      device 0 and the HIP / CUDA version strings when they are set.

    Returns nothing and does not raise for torch problems: an import
    failure or any error during detection is printed as an ERROR line.
    """
    separator = "=" * 70

    print(separator)
    print("[FaceMask Server] Startup Diagnostics")
    print(separator)

    # Python Environment
    print("\n[Python Environment]")
    print(f" Python Version: {sys.version.split()[0]}")
    print(f" Python Executable: {sys.executable}")
    print(f" Platform: {platform.platform()}")
    print(f" Working Directory: {os.getcwd()}")

    # Inside a virtual environment sys.prefix differs from sys.base_prefix.
    in_venv = sys.prefix != sys.base_prefix
    print(f" Virtual Environment: {'Yes' if in_venv else 'No'}")
    if in_venv:
        print(f" venv path: {sys.prefix}")

    # ROCm Environment Variables
    print("\n[ROCm Environment Variables]")
    rocm_vars = [
        'ROCM_PATH',
        'HSA_OVERRIDE_GFX_VERSION',
        'PYTORCH_ROCM_ARCH',
        'ROCBLAS_TENSILE_LIBPATH',
        'LD_LIBRARY_PATH',
    ]

    for var in rocm_vars:
        value = os.environ.get(var)
        if not value:
            # Unset and empty variables are both reported as not set.
            print(f" {var}: (not set)")
            continue

        # Truncate very long values
        if len(value) > 200:
            display_value = value[:200] + "... (truncated)"
        else:
            display_value = value
        print(f" {var}: {display_value}")

    # GPU Detection
    print("\n[GPU Detection]")
    try:
        import torch

        cuda_available = torch.cuda.is_available()
        print(f" torch.cuda.is_available(): {cuda_available}")

        if cuda_available:
            device_count = torch.cuda.device_count()
            print(f" GPU Device Count: {device_count}")

            if device_count > 0:
                device_name = torch.cuda.get_device_name(0)
                print(f" GPU Device 0: {device_name}")

            # ROCm version. Checked for truthiness, like the CUDA version
            # below, so an attribute that exists but is None is not printed
            # as "ROCm Version (HIP): None".
            hip_version = getattr(torch.version, "hip", None)
            if hip_version:
                print(f" ROCm Version (HIP): {hip_version}")

            # CUDA version (might be emulated by ROCm)
            if torch.version.cuda:
                print(f" CUDA Version: {torch.version.cuda}")
        else:
            print(" WARNING: GPU not detected!")
            print(" Server will use CPU for inference (slower)")
            print(" Troubleshooting:")
            print(" - Check ROCm environment variables above")
            print(" - Run: python -c 'import torch; print(torch.cuda.is_available())'")

    except ImportError as e:
        print(f" ERROR: Cannot import torch: {e}")
        print(" PyTorch must be installed for inference")
    except Exception as e:
        # Diagnostics are best-effort: report the failure and return
        # instead of propagating it.
        print(f" ERROR during GPU detection: {e}")

    print(separator)
    print()
|
||||
|
||||
|
||||
@app.get("/status")
def get_status():
    """Report that the server is running together with the detected GPU state.

    The GPU fields come from ``check_gpu_available()``. No hard-coded
    ``gpu_available: True`` placeholder is returned ahead of that lookup,
    so the response carries the detected values.

    Returns:
        A dict with the keys ``status`` (always ``"running"``),
        ``gpu_available``, ``gpu_device``, ``gpu_count`` and
        ``rocm_version``.
    """
    gpu_info = check_gpu_available()

    return {
        "status": "running",
        "gpu_available": gpu_info["available"],
        "gpu_device": gpu_info["device_name"],
        "gpu_count": gpu_info["device_count"],
        "rocm_version": gpu_info["rocm_version"],
    }
|
||||
|
||||
@app.post("/generate", response_model=Task)
|
||||
def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
|
||||
|
|
@ -177,4 +318,5 @@ def cancel_task(task_id: str):
|
|||
return {"message": "Cancellation requested"}
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print the environment/GPU diagnostics first, so
    # they appear in the output before the server begins serving.
    log_startup_diagnostics()
    # Serve the FastAPI app on the loopback interface, port 8181.
    uvicorn.run(app, host="127.0.0.1", port=8181)
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
|
|
@ -30,6 +30,12 @@ def check_status():
|
|||
print("✓ サーバーは稼働中です")
|
||||
print(f" Status: {data.get('status')}")
|
||||
print(f" GPU Available: {data.get('gpu_available')}")
|
||||
if data.get('gpu_device'):
|
||||
print(f" GPU Device: {data.get('gpu_device')}")
|
||||
if data.get('gpu_count'):
|
||||
print(f" GPU Count: {data.get('gpu_count')}")
|
||||
if data.get('rocm_version'):
|
||||
print(f" ROCm Version: {data.get('rocm_version')}")
|
||||
return True
|
||||
except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e:
|
||||
print("✗ サーバーに接続できません")
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user