update: Consolidationの不要なToken上限の削除

2026-05-02 23:48:33 +09:00 · 2026-05-02 23:48:33 +09:00 · 0e7be01807
commit 0e7be01807
parent 35c8ee3a73
6 changed files with 66 additions and 54 deletions
--- a/TODO.md
+++ b/TODO.md
@ -19,6 +19,7 @@
 - [ ] Manifest: Tool Output / File Upload 上限の分離とデフォルト緩和 → [tickets/manifest-output-upload-limits.md](tickets/manifest-output-upload-limits.md)
 - [ ] メモリ機構
  - [ ] 使用頻度メトリクス + Knowledge 化候補レポート → [tickets/memory-usage-metrics.md](tickets/memory-usage-metrics.md)
+  - [ ] Phase 2 累積入力トークン上限の撤去 → [tickets/memory-consolidation-drop-input-cap.md](tickets/memory-consolidation-drop-input-cap.md)
 - [ ] セッション内 TODO ツール（注意機構付き） → [tickets/session-todo.md](tickets/session-todo.md)
 - ワークスペースのメモリーをLintするヘッドレスCLI
 - system-reminder 注入機構の汎用化（2件目の利用者が出た時に検討。タグ形式と「履歴を汚さない」原則は session-todo で先行確立）
--- a/crates/manifest/src/config.rs
+++ b/crates/manifest/src/config.rs
@ -218,9 +218,6 @@ impl MemoryConfig {
                .extract_worker_max_input_tokens
                .or(self.extract_worker_max_input_tokens),
            consolidation_model: upper.consolidation_model.or(self.consolidation_model),
-            consolidation_worker_max_input_tokens: upper
-                .consolidation_worker_max_input_tokens
-                .or(self.consolidation_worker_max_input_tokens),
            consolidation_threshold_files: upper
                .consolidation_threshold_files
                .or(self.consolidation_threshold_files),
--- a/crates/manifest/src/defaults.rs
+++ b/crates/manifest/src/defaults.rs
@ -50,8 +50,3 @@ pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5;
 /// own LLM calls. Exceeding this aborts the extract run.
 /// See [`crate::MemoryConfig::extract_worker_max_input_tokens`].
 pub const MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS: u64 = 30_000;
-
-/// Cumulative input-token cap for the memory Phase 2 (consolidation)
-/// worker's own LLM calls. Exceeding this aborts the consolidation run.
-/// See [`crate::MemoryConfig::consolidation_worker_max_input_tokens`].
-pub const MEMORY_CONSOLIDATION_WORKER_MAX_INPUT_TOKENS: u64 = 80_000;
--- a/crates/manifest/src/lib.rs
+++ b/crates/manifest/src/lib.rs
@ -90,11 +90,6 @@ pub struct MemoryConfig {
    /// Reasoning-class models are recommended.
    #[serde(default)]
    pub consolidation_model: Option<ModelManifest>,
-    /// Cumulative input-token cap for the consolidation worker's own
-    /// LLM calls. Exceeding this aborts the consolidation run. `None` ⇒
-    /// [`defaults::MEMORY_CONSOLIDATION_WORKER_MAX_INPUT_TOKENS`].
-    #[serde(default)]
-    pub consolidation_worker_max_input_tokens: Option<u64>,
    /// Phase 2 trigger: file-count threshold of `_staging/`. Phase 2
    /// fires when the staging directory has at least this many entries.
    /// Either threshold reaching its limit fires Phase 2 (logical OR).
--- a/crates/pod/src/pod.rs
+++ b/crates/pod/src/pod.rs
@ -1735,9 +1735,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
            Err(e) => return Err(PodError::ConsolidationLock(e)),
        };

-        let cap = memory_cfg
-            .consolidation_worker_max_input_tokens
-            .unwrap_or(manifest::defaults::MEMORY_CONSOLIDATION_WORKER_MAX_INPUT_TOKENS);
        let client = match self.build_consolidator_client(memory_cfg) {
            Ok(c) => c,
            Err(e) => {
@ -1749,20 +1746,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
            Worker::new(client).system_prompt(consolidate::CONSOLIDATION_SYSTEM_PROMPT);
        worker.set_cache_key(Some(self.session_id.to_string()));

-        let input_so_far = Arc::new(std::sync::atomic::AtomicU64::new(0));
-        {
-            let acc = input_so_far.clone();
-            worker.on_usage(move |event| {
-                if let Some(tokens) = event.input_tokens {
-                    acc.fetch_add(tokens, Ordering::Relaxed);
-                }
-            });
-        }
-        worker.set_interceptor(MemoryConsolidationWorkerInterceptor {
-            input_so_far: input_so_far.clone(),
-            max_input_tokens: cap,
-        });
-
        // Memory tools are self-contained — they bypass ScopedFs and write
        // directly under the workspace via WorkspaceLayout. Resident
        // knowledge injection (`Pod::set_resident_knowledge_injection`) is
@ -1843,30 +1826,6 @@ enum ConsolidateDecision {
    Completed,
 }

-/// Pre-request interceptor for the Phase 2 consolidation worker. Same
-/// shape as the extract interceptor; kept separate so the abort message
-/// names the right subsystem.
-struct MemoryConsolidationWorkerInterceptor {
-    input_so_far: Arc<std::sync::atomic::AtomicU64>,
-    max_input_tokens: u64,
-}
-
-#[async_trait]
-impl llm_worker::interceptor::Interceptor for MemoryConsolidationWorkerInterceptor {
-    async fn pre_llm_request(
-        &self,
-        _context: &mut Vec<Item>,
-    ) -> llm_worker::interceptor::PreRequestAction {
-        if self.input_so_far.load(Ordering::Relaxed) > self.max_input_tokens {
-            return llm_worker::interceptor::PreRequestAction::Cancel(format!(
-                "Phase 2 consolidation worker input exceeded {} tokens",
-                self.max_input_tokens
-            ));
-        }
-        llm_worker::interceptor::PreRequestAction::Continue
-    }
-}
-
 impl<St: Store> Pod<Box<dyn LlmClient>, St> {
    /// Create a Pod entirely from a validated manifest.
    ///
--- a/tickets/memory-consolidation-drop-input-cap.md
+++ b/tickets/memory-consolidation-drop-input-cap.md
@ -0,0 +1,65 @@
+# Memory Phase 2: 累積入力トークン上限の撤去
+
+## 背景
+
+Phase 2 (memory.consolidation) の sub-Worker には、累積入力トークンが
+`MEMORY_CONSOLIDATION_WORKER_MAX_INPUT_TOKENS = 80_000` を超えると
+`PreRequestAction::Cancel` でアボートする circuit-breaker
+(`MemoryConsolidationWorkerInterceptor`) が組み込まれている。
+manifest からも `[memory] consolidation_worker_max_input_tokens` で
+上書きできる。
+
+実運用で、このアボートが notice として頻繁に出ることが分かった:
+
+```
+[notice] pod: memory Phase 2 consolidation failed:
+Aborted: Phase 2 consolidation worker input exceeded 80000 tokens
+```
+
+原因は、`build_consolidate_input` が staging 全文 + 既存 memory 全文を
+毎ターン頭で詰めるので、ツール呼び出しを伴う複数ターンのうちに累積入力が
+80K に達してしまうこと。さらに `worker.on_usage` の `input_tokens` を素直に
+加算しているため、キャッシュヒット分も含んだ値が上限を圧迫する。
+
+そもそも Phase 2 は staging + 既存 memory を **全部読まないと判断
+できない** 統合タスクであり、入力サイズで途中アボートしても staging が
+未消費のまま残るので、次回 Phase 2 トリガで同じ理由で再度落ちる。
+ユーザーが手で staging を間引かない限り永遠に notice が出続ける。
+
+入力が context window に乗らない極端なケースは LLM 側で context
+overflow になるため、自前で先回りして止める意味は薄い。runaway 防止は
+別途 `Worker::max_turns` が存在するのでそちらに任せる方が筋がよい。
+
+## ゴール
+
+Phase 2 consolidation worker の累積入力トークン上限を撤去する。
+runaway 対策は `Worker::max_turns` に寄せる。
+
+## 要件
+
+- `MemoryConsolidationWorkerInterceptor` を削除
+- Phase 2 構築箇所 (`Pod::run_consolidate_once`) の interceptor 注入と
+  `on_usage` 累積カウンタを撤去
+- `MemoryConfig::consolidation_worker_max_input_tokens` を削除
+- `manifest::defaults::MEMORY_CONSOLIDATION_WORKER_MAX_INPUT_TOKENS` を削除
+- manifest cascade (`config.rs`) からも当該フィールドを削除
+- 既存 manifest がこのキーを含んでいても、読み込み時にエラーにならないことを確認
+  (`#[serde(deny_unknown_fields)]` を使っていなければ未知キーは無視される
+  はずだが、要確認)
+- 関連テスト・docs から `consolidation_worker_max_input_tokens` への
+  参照を一掃する
+
+## 範囲外
+
+- Phase 1 (extract) 側の `extract_worker_max_input_tokens` の扱い
+  (extract は session history の切り出しサイズに別の意味があるため別件)
+- compact 側の `compact_worker_max_input_tokens` の扱い
+- `Worker::max_turns` のデフォルト値見直し (現状の挙動で問題が出てから検討)
+- 累積入力トークンを cache hit/miss で区別するなどのカウンタ精緻化
+
+## 完了条件
+
+- `cargo check` / `cargo test` が `manifest`, `memory`, `pod` で通る
+- Phase 2 trigger が走った際、入力サイズに起因する Aborted notice が
+  発生しないこと。整合性レビューでは `MemoryConsolidationWorkerInterceptor`
+  および対応する manifest field が完全に消えていることを確認する