Addonに適用

2026-02-12 18:52:55 +09:00 · 2026-02-12 18:52:55 +09:00 · a2131a962b
commit a2131a962b
parent 0b6c31501e
7 changed files with 225 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -6,6 +6,41 @@

 ## 開発者向け情報

+### GPU環境の確認
+
+推論サーバーは起動時に環境診断情報を出力します：
+
+- Python環境（バージョン、仮想環境の検出）
+- ROCm環境変数（ROCM_PATH、HSA_OVERRIDE_GFX_VERSION等）
+- GPU検出状況（デバイス名、ROCmバージョン）
+
+```bash
+# サーバーを起動して診断ログを確認
+python server/main.py
+
+# サーバーのGPU状態を確認
+python test_server_api.py --status
+```
+
+出力例：
+```
+======================================================================
+[FaceMask Server] Startup Diagnostics
+======================================================================
+[Python Environment]
+  Python Version: 3.12.12
+  Virtual Environment: Yes
+
+[ROCm Environment Variables]
+  ROCM_PATH: /nix/store/.../clr-7.1.1
+  HSA_OVERRIDE_GFX_VERSION: 11.0.0
+
+[GPU Detection]
+  torch.cuda.is_available(): True
+  GPU Device 0: AMD Radeon Graphics
+  ROCm Version (HIP): 7.0.51831
+======================================================================
+```
+
 ### 処理プロセスの単体デバッグ

 顔検出処理をBlenderから独立してテストできます。
--- a/core/inference_client.py
+++ b/core/inference_client.py
@ -72,21 +72,56 @@ class InferenceClient:
                cwd=root_dir,
                text=True,
                env=server_env,
+                stdout=subprocess.PIPE,  # Capture stdout
+                stderr=subprocess.PIPE,  # Capture stderr
                preexec_fn=os.setsid,  # Create new process group
            )
-            
+
            # Wait for startup
            for _ in range(20):  # Wait up to 10 seconds
                if self.is_server_running():
                    print("[FaceMask] Server started successfully")
                    return
-                
+
                # Check if process died
                if self.server_process.poll() is not None:
-                    raise RuntimeError(f"Server failed to start (rc={self.server_process.returncode})")
-                
+                    # Capture and display error output
+                    try:
+                        stdout, stderr = self.server_process.communicate(timeout=1)
+                    except subprocess.TimeoutExpired:
+                        stdout, stderr = "", ""
+
+                    error_msg = f"Server failed to start (exit code: {self.server_process.returncode})"
+                    print(f"[FaceMask] ERROR: {error_msg}")
+
+                    if stdout and stdout.strip():
+                        print("[FaceMask] Server stdout:")
+                        print(stdout.strip())
+
+                    if stderr and stderr.strip():
+                        print("[FaceMask] Server stderr:")
+                        print(stderr.strip())
+
+                    self.server_process = None
+                    raise RuntimeError(error_msg)
+
                time.sleep(0.5)
-            
+
+            # If we get here, startup timed out
+            # Try to capture any partial output
+            if self.server_process:
+                try:
+                    # Non-blocking read with short timeout
+                    stdout, stderr = self.server_process.communicate(timeout=0.1)
+                    if stdout and stdout.strip():
+                        print("[FaceMask] Server stdout (partial):")
+                        print(stdout.strip())
+                    if stderr and stderr.strip():
+                        print("[FaceMask] Server stderr (partial):")
+                        print(stderr.strip())
+                except subprocess.TimeoutExpired:
+                    pass  # Process still running but not responding
+
            raise RuntimeError("Server startup timed out")
    
    def stop_server(self):
--- a/run_server.sh
+++ b/run_server.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # 推論サーバーの単体起動スクリプト

 set -e
--- a/server/main.py
+++ b/server/main.py
@ -7,6 +7,7 @@ GPU-accelerated face detection using ONNX Runtime.

 import os
 import sys
+import platform
 import threading
 import uuid
 import queue
@ -27,6 +28,9 @@ from server.detector import YOLOFaceDetector, get_detector

 app = FastAPI(title="Face Mask Inference Server")

+# GPU status cache
+_gpu_status_cache = None
+
 # Task storage
 class TaskStatus:
    PENDING = "pending"
@ -146,9 +150,146 @@ def process_video_task(task_id: str, req: GenerateRequest):
        if task_id in cancel_events:
            del cancel_events[task_id]

+def check_gpu_available() -> dict:
+    """
+    Probe PyTorch for a usable GPU and report what it found.
+
+    The outcome of the first call is stored in the module-level
+    ``_gpu_status_cache`` and handed back unchanged on later calls.
+    A failed probe is cached as well.
+
+    Returns:
+        A dict with the keys:
+            "available":    result of ``torch.cuda.is_available()``;
+                            False if anything raised while probing
+            "device_name":  name of device 0, or None when not determined
+            "device_count": ``torch.cuda.device_count()``, or 0
+            "rocm_version": ``torch.version.hip`` when that attribute
+                            exists and a GPU is available, else None
+    """
+    global _gpu_status_cache
+
+    cached = _gpu_status_cache
+    if cached is not None:
+        return cached
+
+    # Pessimistic defaults; overwritten only by successful torch queries.
+    status = dict(
+        available=False,
+        device_name=None,
+        device_count=0,
+        rocm_version=None,
+    )
+
+    try:
+        # Imported inside the try so an import failure is reported below
+        # instead of propagating to the caller.
+        import torch
+
+        status["available"] = torch.cuda.is_available()
+        if status["available"]:
+            count = torch.cuda.device_count()
+            status["device_count"] = count
+            if count > 0:
+                status["device_name"] = torch.cuda.get_device_name(0)
+
+            if hasattr(torch.version, 'hip'):
+                status["rocm_version"] = torch.version.hip
+    except Exception as e:
+        print(f"[FaceMask] Warning: GPU detection failed: {e}")
+        status["available"] = False
+
+    # Remember the answer (including a failed probe) for later callers.
+    _gpu_status_cache = status
+    return status
+
+
+def log_startup_diagnostics():
+    """
+    Print a startup report about the runtime environment to stdout.
+
+    The report has three sections: the Python interpreter (version,
+    executable, platform, working directory, virtual environment), a fixed
+    list of ROCm-related environment variables, and the result of GPU
+    detection through torch. Failures while importing or querying torch
+    are printed as part of the report rather than raised.
+    """
+    banner = "=" * 70
+    print(banner)
+    print("[FaceMask Server] Startup Diagnostics")
+    print(banner)
+
+    # --- Section 1: Python interpreter ---
+    print("\n[Python Environment]")
+    py_version = sys.version.split()[0]
+    print(f"  Python Version: {py_version}")
+    print(f"  Python Executable: {sys.executable}")
+    print(f"  Platform: {platform.platform()}")
+    print(f"  Working Directory: {os.getcwd()}")
+
+    # sys.prefix differs from sys.base_prefix inside a virtual environment.
+    venv_active = sys.prefix != sys.base_prefix
+    print(f"  Virtual Environment: {'Yes' if venv_active else 'No'}")
+    if venv_active:
+        print(f"    venv path: {sys.prefix}")
+
+    # --- Section 2: ROCm-related environment variables ---
+    print("\n[ROCm Environment Variables]")
+    max_shown = 200  # longer values are cut to keep the report readable
+    for name in (
+        'ROCM_PATH',
+        'HSA_OVERRIDE_GFX_VERSION',
+        'PYTORCH_ROCM_ARCH',
+        'ROCBLAS_TENSILE_LIBPATH',
+        'LD_LIBRARY_PATH',
+    ):
+        raw = os.environ.get(name)
+        if not raw:
+            # Unset and empty values are reported the same way.
+            print(f"  {name}: (not set)")
+            continue
+
+        if len(raw) > max_shown:
+            shown = raw[:max_shown] + "... (truncated)"
+        else:
+            shown = raw
+        print(f"  {name}: {shown}")
+
+    # --- Section 3: GPU detection through torch ---
+    print("\n[GPU Detection]")
+    try:
+        import torch
+
+        has_gpu = torch.cuda.is_available()
+        print(f"  torch.cuda.is_available(): {has_gpu}")
+
+        if not has_gpu:
+            print("  WARNING: GPU not detected!")
+            print("  Server will use CPU for inference (slower)")
+            print("  Troubleshooting:")
+            print("    - Check ROCm environment variables above")
+            print("    - Run: python -c 'import torch; print(torch.cuda.is_available())'")
+        else:
+            n_devices = torch.cuda.device_count()
+            print(f"  GPU Device Count: {n_devices}")
+
+            if n_devices > 0:
+                print(f"  GPU Device 0: {torch.cuda.get_device_name(0)}")
+
+                # HIP version string, shown whenever torch exposes it.
+                if hasattr(torch.version, 'hip'):
+                    print(f"  ROCm Version (HIP): {torch.version.hip}")
+
+                # CUDA version, shown only when torch reports one.
+                if torch.version.cuda:
+                    print(f"  CUDA Version: {torch.version.cuda}")
+
+    except ImportError as e:
+        print(f"  ERROR: Cannot import torch: {e}")
+        print("  PyTorch must be installed for inference")
+    except Exception as e:
+        print(f"  ERROR during GPU detection: {e}")
+
+    print(banner)
+    print()
+
+
@app.get("/status")
def get_status():
-    return {"status": "running", "gpu_available": True} # TODO: check GPU
+    """Report that the server is running, together with the detected GPU details."""
+    # check_gpu_available() caches its result, so repeated status polls
+    # do not re-run the torch queries.
+    gpu_info = check_gpu_available()
+
+    # Response keys are renamed from the internal dict for the API:
+    # device_name -> gpu_device, device_count -> gpu_count.
+    return {
+        "status": "running",
+        "gpu_available": gpu_info["available"],
+        "gpu_device": gpu_info["device_name"],
+        "gpu_count": gpu_info["device_count"],
+        "rocm_version": gpu_info["rocm_version"]
+    }

@app.post("/generate", response_model=Task)
 def generate_mask_endpoint(req: GenerateRequest, background_tasks: BackgroundTasks):
@ -177,4 +318,5 @@ def cancel_task(task_id: str):
    return {"message": "Cancellation requested"}

 if __name__ == "__main__":
+    # Print the environment/GPU report before the server starts serving;
+    # this runs only when the module is executed as a script.
+    log_startup_diagnostics()
    uvicorn.run(app, host="127.0.0.1", port=8181)
--- a/test_result_frame0.jpg
+++ b/test_result_frame0.jpg
--- a/test_result_gpu_frame0.jpg
+++ b/test_result_gpu_frame0.jpg
--- a/test_server_api.py
+++ b/test_server_api.py
@ -30,6 +30,12 @@ def check_status():
            print("✓ サーバーは稼働中です")
            print(f"  Status: {data.get('status')}")
            print(f"  GPU Available: {data.get('gpu_available')}")
+            if data.get('gpu_device'):
+                print(f"  GPU Device: {data.get('gpu_device')}")
+            if data.get('gpu_count'):
+                print(f"  GPU Count: {data.get('gpu_count')}")
+            if data.get('rocm_version'):
+                print(f"  ROCm Version: {data.get('rocm_version')}")
            return True
    except (urllib.error.URLError, ConnectionRefusedError, TimeoutError) as e:
        print("✗ サーバーに接続できません")