diff --git a/README.md b/README.md index ac26cb8..0f64a10 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,41 @@ ## 開発者向け情報 +### GPU環境の確認 + +推論サーバーは起動時に環境診断情報を出力します: + +- Python環境(バージョン、仮想環境の検出) +- ROCm環境変数(ROCM_PATH、HSA_OVERRIDE_GFX_VERSION等) +- GPU検出状況(デバイス名、ROCmバージョン) + +```bash +# サーバーを起動して診断ログを確認 +python server/main.py + +# サーバーのGPU状態を確認 +python test_server_api.py --status +``` + +出力例: +``` +[FaceMask Server] Startup Diagnostics +====================================================================== +[Python Environment] + Python Version: 3.12.12 + Virtual Environment: Yes + +[ROCm Environment Variables] + ROCM_PATH: /nix/store/.../clr-7.1.1 + HSA_OVERRIDE_GFX_VERSION: 11.0.0 + +[GPU Detection] + torch.cuda.is_available(): True + GPU Device 0: AMD Radeon Graphics + ROCm Version (HIP): 7.0.51831 +====================================================================== +``` + ### 処理プロセスの単体デバッグ 顔検出処理をBlenderから独立してテストできます。 diff --git a/core/inference_client.py b/core/inference_client.py index 0a17143..318c9a7 100644 --- a/core/inference_client.py +++ b/core/inference_client.py @@ -72,21 +72,56 @@ class InferenceClient: cwd=root_dir, text=True, env=server_env, + stdout=subprocess.PIPE, # Capture stdout + stderr=subprocess.PIPE, # Capture stderr preexec_fn=os.setsid, # Create new process group ) - + # Wait for startup for _ in range(20): # Wait up to 10 seconds if self.is_server_running(): print("[FaceMask] Server started successfully") return - + # Check if process died if self.server_process.poll() is not None: - raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})") - + # Capture and display error output + try: + stdout, stderr = self.server_process.communicate(timeout=1) + except subprocess.TimeoutExpired: + stdout, stderr = "", "" + + error_msg = f"Server failed to start (exit code: {self.server_process.returncode})" + print(f"[FaceMask] ERROR: {error_msg}") + + if stdout and stdout.strip(): + print("[FaceMask] Server stdout:") + 
# GPU status cache: None until the first detection has run, then the result dict.
_gpu_status_cache = None


def _default_gpu_status() -> dict:
    """Return the status dict that describes "no usable GPU"."""
    return {
        "available": False,
        "device_name": None,
        "device_count": 0,
        "rocm_version": None,
    }


def _detect_gpu() -> dict:
    """Query torch once and describe the first visible GPU.

    Any failure (torch missing, driver error, ...) is reported on stdout and
    yields the clean "no GPU" defaults, never a half-filled dict.
    """
    info = _default_gpu_status()

    try:
        import torch

        info["available"] = torch.cuda.is_available()

        if info["available"]:
            info["device_count"] = torch.cuda.device_count()
            if info["device_count"] > 0:
                info["device_name"] = torch.cuda.get_device_name(0)

            # The attribute exists on non-ROCm builds too (as None), so read
            # its value instead of testing for its presence.
            info["rocm_version"] = getattr(torch.version, "hip", None)

    except Exception as e:
        print(f"[FaceMask] Warning: GPU detection failed: {e}")
        return _default_gpu_status()

    return info


def check_gpu_available() -> dict:
    """
    Check if GPU is available for inference.

    Detection runs on the first call only; the result is kept in the
    module-level ``_gpu_status_cache`` and reused afterwards (a failed
    detection is cached as well).

    Returns a dict with GPU information:
    {
        "available": bool,
        "device_name": str or None,
        "device_count": int,
        "rocm_version": str or None
    }

    A fresh copy is returned on every call, so a caller that modifies the
    result cannot alter what later callers see.
    """
    global _gpu_status_cache

    if _gpu_status_cache is None:
        _gpu_status_cache = _detect_gpu()

    return dict(_gpu_status_cache)


def log_startup_diagnostics():
    """Log diagnostic information about the environment and GPU.

    Prints three sections to stdout: the Python environment, a fixed list of
    ROCm-related environment variables, and what torch reports about the GPU.
    Nothing is returned and no exception from the torch probe escapes; probe
    errors are printed instead.
    """
    print("=" * 70)
    print("[FaceMask Server] Startup Diagnostics")
    print("=" * 70)

    # Python Environment
    print("\n[Python Environment]")
    print(f" Python Version: {sys.version.split()[0]}")
    print(f" Python Executable: {sys.executable}")
    print(f" Platform: {platform.platform()}")
    print(f" Working Directory: {os.getcwd()}")

    # Inside a venv the interpreter prefix differs from the base prefix.
    in_venv = sys.prefix != sys.base_prefix
    print(f" Virtual Environment: {'Yes' if in_venv else 'No'}")
    if in_venv:
        print(f" venv path: {sys.prefix}")

    # ROCm Environment Variables
    print("\n[ROCm Environment Variables]")
    rocm_vars = [
        'ROCM_PATH',
        'HSA_OVERRIDE_GFX_VERSION',
        'PYTORCH_ROCM_ARCH',
        'ROCBLAS_TENSILE_LIBPATH',
        'LD_LIBRARY_PATH'
    ]

    for var in rocm_vars:
        value = os.environ.get(var)
        if value:
            # Truncate very long values so one variable cannot flood the log
            if len(value) > 200:
                display_value = value[:200] + "... (truncated)"
            else:
                display_value = value
            print(f" {var}: {display_value}")
        else:
            print(f" {var}: (not set)")

    # GPU Detection
    print("\n[GPU Detection]")
    try:
        import torch

        cuda_available = torch.cuda.is_available()
        print(f" torch.cuda.is_available(): {cuda_available}")

        if cuda_available:
            device_count = torch.cuda.device_count()
            print(f" GPU Device Count: {device_count}")

            if device_count > 0:
                device_name = torch.cuda.get_device_name(0)
                print(f" GPU Device 0: {device_name}")

            # ROCm version: only printed when torch actually reports one,
            # otherwise non-ROCm builds would log "ROCm Version (HIP): None".
            hip_version = getattr(torch.version, "hip", None)
            if hip_version:
                print(f" ROCm Version (HIP): {hip_version}")

            # CUDA version (might be emulated by ROCm)
            if torch.version.cuda:
                print(f" CUDA Version: {torch.version.cuda}")
        else:
            print(" WARNING: GPU not detected!")
            print(" Server will use CPU for inference (slower)")
            print(" Troubleshooting:")
            print(" - Check ROCm environment variables above")
            print(" - Run: python -c 'import torch; print(torch.cuda.is_available())'")

    except ImportError as e:
        print(f" ERROR: Cannot import torch: {e}")
        print(" PyTorch must be installed for inference")
    except Exception as e:
        print(f" ERROR during GPU detection: {e}")

    print("=" * 70)
    print()
@app.get("/status")
def get_status():
    """Report that the server is running, together with the detected GPU details."""
    gpu = check_gpu_available()

    # Build the response field by field; key order matches the API payload.
    payload = {"status": "running"}
    payload["gpu_available"] = gpu["available"]
    payload["gpu_device"] = gpu["device_name"]
    payload["gpu_count"] = gpu["device_count"]
    payload["rocm_version"] = gpu["rocm_version"]
    return payload
100644 index 3831c87..0000000 Binary files a/test_result_frame0.jpg and /dev/null differ diff --git a/test_result_gpu_frame0.jpg b/test_result_gpu_frame0.jpg deleted file mode 100644 index 3831c87..0000000 Binary files a/test_result_gpu_frame0.jpg and /dev/null differ diff --git a/test_server_api.py b/test_server_api.py index b77c52a..c086053 100755 --- a/test_server_api.py +++ b/test_server_api.py @@ -30,6 +30,12 @@ def check_status(): print("✓ サーバーは稼働中です") print(f" Status: {data.get('status')}") print(f" GPU Available: {data.get('gpu_available')}") + if data.get('gpu_device'): + print(f" GPU Device: {data.get('gpu_device')}") + if data.get('gpu_count'): + print(f" GPU Count: {data.get('gpu_count')}") + if data.get('rocm_version'): + print(f" ROCm Version: {data.get('rocm_version')}") return True except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e: print("✗ サーバーに接続できません")