diff --git a/server/detector.py b/server/detector.py
index 68a6317..abb03db 100644
--- a/server/detector.py
+++ b/server/detector.py
@@ -49,13 +49,12 @@ def _head_bbox_from_pose(
         cx = (kp_x1 + kp_x2) / 2.0
         cy = (kp_y1 + kp_y2) / 2.0
 
-        # Head radius: inter-landmark span ≈ 80% of head width, so expand by ~1.25
-        # Shift center upward slightly to include scalp
-        r = max(span * 1.25, person_w * 0.20)
+        # span is the bounding width of the keypoints (≈ face width), so radius = span/2 makes the box the same size as the face
+        r = max(span * 0.5, person_w * 0.10)
         x1 = int(cx - r)
-        y1 = int(cy - r * 1.15)   # extra margin above (scalp)
+        y1 = int(cy - r)
         x2 = int(cx + r)
-        y2 = int(cy + r * 0.85)   # less margin below (chin)
+        y2 = int(cy + r)
         return x1, y1, x2 - x1, y2 - y1
 
     # --- Step 2: shoulder keypoints ---
@@ -71,8 +70,9 @@ def _head_bbox_from_pose(
             sh_width = abs(visible_shoulder[1][0] - visible_shoulder[0][0])
         else:
             sh_width = person_w * 0.5
-        r = max(sh_width * 0.5, person_w * 0.20)
-        cy = cy_sh - r * 1.3   # head center is above shoulders
+        # Shoulder width is roughly 1.5-2x the face width, so a factor of 0.3 gives a radius close to face size
+        r = max(sh_width * 0.3, person_w * 0.12)
+        cy = cy_sh - r * 1.3   # head center is above shoulders
         x1 = int(cx - r)
         y1 = int(cy - r)
         x2 = int(cx + r)
@@ -80,7 +80,8 @@ def _head_bbox_from_pose(
         return x1, y1, x2 - x1, y2 - y1
 
     # --- Step 3: person bbox top ---
-    r = max(person_w * 0.35, 20.0)
+    # Face width is roughly 20-30% of the person width, so radius = person_w * 0.15 is close to face size
+    r = max(person_w * 0.15, 20.0)
     cx = (person_x1 + person_x2) / 2.0
     x1 = int(cx - r)
     y1 = int(person_y1)
diff --git a/server/main.py b/server/main.py
index 150143e..b1d97b4 100644
--- a/server/main.py
+++ b/server/main.py
@@ -446,36 +446,59 @@ def process_bake_task(task_id: str, req: BakeRequest):
 
         def _reader_worker():
             """Read frames from video."""
+            import time as _time
             cap = cv2.VideoCapture(req.video_path)
             if not cap.isOpened():
                 error_holder["error"] = "Failed to open video in reader"
                 return
 
+            t_read_total = 0.0
+            frame_count = 0
             try:
                 for idx in range(total):
                     if cancel_event and cancel_event.is_set():
                         break
 
+                    t0 = _time.perf_counter()
                     ok, frame = cap.read()
+                    t_read_total += _time.perf_counter() - t0
+
                     if not ok:
                         break
 
                     read_queue.put((idx, frame))
+                    frame_count += 1
             except Exception as e:
                 error_holder["error"] = f"Reader error: {e}"
             finally:
                 cap.release()
                 read_queue.put(None)  # Sentinel
+                if frame_count > 0:
+                    print(
+                        f"[Perf/Reader] FINAL frame={frame_count}"
+                        f"  read_avg={t_read_total/frame_count*1000:.1f}ms"
+                        f"  throughput≈{frame_count/max(t_read_total,1e-9):.1f}fps"
+                    )
 
         def _processor_worker():
             """Process frames with ROI blur."""
+            import time as _time
+            t_wait_total = 0.0
+            t_blur_total = 0.0
+            t_blend_total = 0.0
+            frame_count = 0
+            REPORT_INTERVAL = 50
+
             try:
                 while True:
                     if cancel_event and cancel_event.is_set():
                         process_queue.put(None)
                         break
 
+                    t0 = _time.perf_counter()
                     item = read_queue.get()
+                    t_wait_total += _time.perf_counter() - t0
+
                     if item is None:
                         process_queue.put(None)
                         break
@@ -485,6 +508,7 @@ def process_bake_task(task_id: str, req: BakeRequest):
 
                     if not frame_boxes:
                         process_queue.put((idx, frame))
+                        frame_count += 1
                         continue
 
                     # 各人物ごとに個別ROIで処理（全員まとめると離れた人物間が巨大ROIになるため）
@@ -499,6 +523,7 @@ def process_bake_task(task_id: str, req: BakeRequest):
 
                     if not valid_boxes:
                         process_queue.put((idx, frame))
+                        frame_count += 1
                         continue
 
                     for x, y, w, h in valid_boxes:
@@ -523,7 +548,16 @@ def process_bake_task(task_id: str, req: BakeRequest):
 
                         # ブラーはROI全体で計算（余白があるので端の精度が保証される）
                         roi_src = frame[roi_y1:roi_y2, roi_x1:roi_x2]
-                        roi_blurred = cv2.GaussianBlur(roi_src, (blur_size, blur_size), 0)
+
+                        # Downsample -> blur -> upsample (a comparable blur at roughly 1/4 of the computation)
+                        t1 = _time.perf_counter()
+                        small_w = max(1, roi_width // 2)
+                        small_h = max(1, roi_height // 2)
+                        roi_small = cv2.resize(roi_src, (small_w, small_h), interpolation=cv2.INTER_LINEAR)
+                        small_blur_size = max(3, (blur_size // 2) | 1)
+                        roi_small_blurred = cv2.GaussianBlur(roi_small, (small_blur_size, small_blur_size), 0)
+                        roi_blurred = cv2.resize(roi_small_blurred, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)
+                        t_blur_total += _time.perf_counter() - t1
 
                         # 合成マスクはdisplay_scaleサイズの楕円のみ（featheringなし）
                         roi_mask = np.zeros((roi_height, roi_width), dtype=np.uint8)
@@ -531,18 +565,47 @@ def process_bake_task(task_id: str, req: BakeRequest):
                         axes = (max(1, dw // 2), max(1, dh // 2))
                         cv2.ellipse(roi_mask, center, axes, 0, 0, 360, 255, -1)
 
-                        roi_alpha = (roi_mask.astype(np.float32) / 255.0)[..., np.newaxis]
-                        roi_composed = roi_src.astype(np.float32) * (1.0 - roi_alpha) + roi_blurred.astype(np.float32) * roi_alpha
-                        frame[roi_y1:roi_y2, roi_x1:roi_x2] = np.clip(roi_composed, 0, 255).astype(np.uint8)
+                        # The mask is binary, so composite quickly with copyTo (no float32 conversion needed)
+                        t2 = _time.perf_counter()
+                        result = roi_src.copy()
+                        cv2.copyTo(roi_blurred, roi_mask, result)
+                        frame[roi_y1:roi_y2, roi_x1:roi_x2] = result
+                        t_blend_total += _time.perf_counter() - t2
 
                     process_queue.put((idx, frame))
+                    frame_count += 1
+
+                    if frame_count % REPORT_INTERVAL == 0:
+                        n = max(frame_count, 1)
+                        fps_proc = frame_count / max(t_wait_total + t_blur_total + t_blend_total, 1e-9)
+                        print(
+                            f"[Perf/Processor] frame={frame_count}"
+                            f"  wait={t_wait_total/n*1000:.1f}ms"
+                            f"  blur={t_blur_total/n*1000:.1f}ms"
+                            f"  blend={t_blend_total/n*1000:.1f}ms"
+                            f"  ROI={roi_width}x{roi_height}"
+                            f"  throughput≈{fps_proc:.1f}fps"
+                        )
 
             except Exception as e:
                 error_holder["error"] = f"Processor error: {e}"
                 process_queue.put(None)
+            finally:
+                if frame_count > 0:
+                    n = max(frame_count, 1)
+                    print(
+                        f"[Perf/Processor] FINAL frame={frame_count}"
+                        f"  wait_avg={t_wait_total/n*1000:.1f}ms"
+                        f"  blur_avg={t_blur_total/n*1000:.1f}ms"
+                        f"  blend_avg={t_blend_total/n*1000:.1f}ms"
+                    )
 
         def _writer_worker():
             """Write frames to output."""
+            import time as _time
+            t_wait_total = 0.0
+            t_write_total = 0.0
+            frame_count = 0
             writer = None
             try:
                 writer = _build_video_writer(req.output_path, req.format, src_fps, src_width, src_height)
@@ -551,12 +614,18 @@ def process_bake_task(task_id: str, req: BakeRequest):
                     if cancel_event and cancel_event.is_set():
                         break
 
+                    t0 = _time.perf_counter()
                     item = process_queue.get()
+                    t_wait_total += _time.perf_counter() - t0
+
                     if item is None:
                         break
 
                     idx, frame = item
+                    t1 = _time.perf_counter()
                     writer.write(frame)
+                    t_write_total += _time.perf_counter() - t1
+                    frame_count += 1
 
                     with progress_lock:
                         current_progress[0] = idx + 1
@@ -570,6 +639,13 @@ def process_bake_task(task_id: str, req: BakeRequest):
                         writer.release()
                     except Exception as e:
                         print(f"[FaceMask] Writer release error: {e}")
+                if frame_count > 0:
+                    n = max(frame_count, 1)
+                    print(
+                        f"[Perf/Writer] FINAL frame={frame_count}"
+                        f"  wait_avg={t_wait_total/n*1000:.1f}ms"
+                        f"  write_avg={t_write_total/n*1000:.1f}ms"
+                    )
 
         print(
             f"[FaceMask] Starting blur bake: {req.video_path} + "