From 902083bd3877b463dd51450e18ddf82ea2ad83fa Mon Sep 17 00:00:00 2001 From: Hare Date: Sat, 23 May 2026 09:14:07 +0900 Subject: [PATCH] fix: remove memory extract input cap --- crates/manifest/src/config.rs | 38 +++++++- crates/manifest/src/defaults.rs | 5 - crates/manifest/src/lib.rs | 5 - crates/memory/src/extract/input.rs | 15 +++ crates/pod/src/pod.rs | 117 ------------------------ crates/pod/tests/compact_events_test.rs | 21 +++++ docs/manifest.toml | 4 - docs/plan/memory.md | 3 +- 8 files changed, 73 insertions(+), 135 deletions(-) diff --git a/crates/manifest/src/config.rs b/crates/manifest/src/config.rs index 059ba3f9..8200fb60 100644 --- a/crates/manifest/src/config.rs +++ b/crates/manifest/src/config.rs @@ -156,6 +156,15 @@ pub(crate) fn reject_removed_manifest_fields(s: &str) -> Result<(), toml::de::Er (removed; use compaction.prune_protected_tokens)", )); } + if value + .get("memory") + .and_then(toml::Value::as_table) + .is_some_and(|table| table.contains_key("extract_worker_max_input_tokens")) + { + return Err(toml::de::Error::custom( + "unknown field in manifest: memory.extract_worker_max_input_tokens (removed)", + )); + } Ok(()) } @@ -283,9 +292,6 @@ impl MemoryConfig { language: upper.language.or(self.language), extract_model: upper.extract_model.or(self.extract_model), extract_threshold: upper.extract_threshold.or(self.extract_threshold), - extract_worker_max_input_tokens: upper - .extract_worker_max_input_tokens - .or(self.extract_worker_max_input_tokens), extract_worker_max_turns: upper .extract_worker_max_turns .or(self.extract_worker_max_turns), @@ -1004,6 +1010,32 @@ prune_protected_turns = 3 ); } + #[test] + fn from_toml_rejects_removed_extract_worker_max_input_tokens_field() { + let bad = r#" +[memory] +extract_worker_max_input_tokens = 30000 +"#; + let err = PodManifestConfig::from_toml(bad).unwrap_err(); + assert!( + err.to_string() + .contains("memory.extract_worker_max_input_tokens"), + "unexpected error: {err}" + ); + } + + #[test] + fn from_toml_accepts_extract_worker_max_turns() { + let cfg = PodManifestConfig::from_toml( + r#" +[memory] +extract_worker_max_turns = 2 +"#, + ) + .unwrap(); + assert_eq!(cfg.memory.unwrap().extract_worker_max_turns, Some(2)); + } + #[test] fn from_toml_accepts_worker_reasoning_string_or_integer() { let effort = PodManifestConfig::from_toml( diff --git a/crates/manifest/src/defaults.rs b/crates/manifest/src/defaults.rs index 81f38ec4..4855a4df 100644 --- a/crates/manifest/src/defaults.rs +++ b/crates/manifest/src/defaults.rs @@ -59,11 +59,6 @@ pub const COMPACT_WORKER_MAX_TURNS: Option = Some(20); /// default references. pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5; -/// Current prompt-occupancy cap for the memory extract worker's own -/// LLM requests. Exceeding this aborts the extract run (circuit-breaker -/// path). See [`crate::MemoryConfig::extract_worker_max_input_tokens`]. -pub const MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS: u64 = 30_000; - /// Optional maximum extract-worker tool-loop depth. `None` means unlimited. /// See [`crate::MemoryConfig::extract_worker_max_turns`]. pub const MEMORY_EXTRACT_WORKER_MAX_TURNS: Option = Some(8); diff --git a/crates/manifest/src/lib.rs b/crates/manifest/src/lib.rs index 3a868860..36435cb6 100644 --- a/crates/manifest/src/lib.rs +++ b/crates/manifest/src/lib.rs @@ -114,11 +114,6 @@ pub struct MemoryConfig { /// the auto-extract trigger is dormant. #[serde(default)] pub extract_threshold: Option, - /// Current prompt-occupancy cap for the extract worker's own LLM - /// requests. Exceeding this aborts the extract run. `None` ⇒ - /// [`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`]. - #[serde(default)] - pub extract_worker_max_input_tokens: Option, /// Optional maximum extract-worker tool-loop depth. `None` leaves /// the worker unlimited; the default bounds runaway short-context /// loops. Falls through to diff --git a/crates/memory/src/extract/input.rs b/crates/memory/src/extract/input.rs index 7bba7674..34820694 100644 --- a/crates/memory/src/extract/input.rs +++ b/crates/memory/src/extract/input.rs @@ -69,4 +69,19 @@ mod tests { assert!(s.contains("[ToolResult] ok")); assert!(!s.contains("scratch")); } + + #[test] + fn tool_result_renders_summary_but_not_content() { + let huge_content = "raw-content-should-never-enter-extract-input".repeat(10_000); + let items = vec![Item::tool_result_with_content( + "c1", + "short summary kept for extraction", + huge_content.clone(), + )]; + + let s = build_extract_input(&items); + assert!(s.contains("[ToolResult] short summary kept for extraction")); + assert!(!s.contains("raw-content-should-never-enter-extract-input")); + assert!(!s.contains(&huge_content)); + } } diff --git a/crates/pod/src/pod.rs b/crates/pod/src/pod.rs index acf20833..4c7b61ff 100644 --- a/crates/pod/src/pod.rs +++ b/crates/pod/src/pod.rs @@ -2497,9 +2497,6 @@ impl Pod { .to_vec(); let layout = memory::WorkspaceLayout::resolve(memory_cfg, &self.pwd); - let cap = memory_cfg - .extract_worker_max_input_tokens - .unwrap_or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS); let extract_worker_max_turns = memory_cfg .extract_worker_max_turns .or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS); @@ -2513,21 +2510,6 @@ impl Pod { let mut extract_worker = Worker::new(client).system_prompt(extract_system_prompt); extract_worker.set_cache_key(Some(self.segment_id().to_string())); - // Occupancy-based input-token meter + interceptor. The tracker pairs - // each pre-request history length with the following UsageEvent, then - // the interceptor projects current prompt occupancy with the same - // UsageRecord counter used by the main Pod thresholds. - let extract_usage_tracker = Arc::new(UsageTracker::new()); - { - let tracker = extract_usage_tracker.clone(); - extract_worker.on_usage(move |event| { - tracker.record_usage(event); - }); - } - extract_worker.set_interceptor(MemoryExtractWorkerInterceptor { - usage_tracker: extract_usage_tracker, - max_input_tokens: cap, - }); extract_worker.set_max_turns(extract_worker_max_turns); let ctx = Arc::new(extract::ExtractWorkerContext::new()); @@ -2783,38 +2765,6 @@ enum ExtractDecision { Completed, } -/// Pre-request interceptor for the extract worker. Aborts when current -/// prompt occupancy crosses `max_input_tokens`. Uses the same -/// `UsageRecord` + `llm_worker::token_counter::total_tokens` projection -/// as the main Pod compaction thresholds, so prompt-cache hits are not -/// counted cumulatively across turns. Kept separate from -/// `compact::worker::CompactWorkerInterceptor` so each subsystem can -/// tune its own cancel message and budget. -struct MemoryExtractWorkerInterceptor { - usage_tracker: Arc, - max_input_tokens: u64, -} - -#[async_trait] -impl llm_worker::interceptor::Interceptor for MemoryExtractWorkerInterceptor { - async fn pre_llm_request( - &self, - context: &mut Vec, - ) -> llm_worker::interceptor::PreRequestAction { - let records = self.usage_tracker.records(); - let estimate = llm_worker::token_counter::total_tokens(context, &records); - if estimate.tokens > self.max_input_tokens { - return llm_worker::interceptor::PreRequestAction::Cancel(format!( - "extract worker input occupancy exceeded {} tokens", - self.max_input_tokens - )); - } - - self.usage_tracker.note_request(context.len()); - llm_worker::interceptor::PreRequestAction::Continue - } -} - /// Outcome of a single consolidation iteration. Internal to /// `try_post_run_consolidate` / `run_consolidate_once`. enum ConsolidateDecision { @@ -3770,70 +3720,3 @@ permission = "write" assert!(shadows.iter().all(|s| s.slug.as_str() != "alpha")); } } - -#[cfg(test)] -mod memory_extract_interceptor_tests { - use super::*; - use llm_worker::interceptor::{Interceptor, PreRequestAction}; - use llm_worker::timeline::event::UsageEvent; - - fn make_usage(input: u64) -> UsageEvent { - UsageEvent { - input_tokens: Some(input), - output_tokens: Some(0), - total_tokens: Some(input), - cache_read_input_tokens: None, - cache_creation_input_tokens: None, - } - } - - #[tokio::test] - async fn extract_interceptor_uses_occupancy_not_cumulative_usage() { - let tracker = Arc::new(UsageTracker::new()); - let interceptor = MemoryExtractWorkerInterceptor { - usage_tracker: tracker.clone(), - max_input_tokens: 150, - }; - let mut context = vec![Item::user_message("hello")]; - - assert!(matches!( - interceptor.pre_llm_request(&mut context).await, - PreRequestAction::Continue - )); - tracker.record_usage(&make_usage(100)); - - assert!(matches!( - interceptor.pre_llm_request(&mut context).await, - PreRequestAction::Continue - )); - tracker.record_usage(&make_usage(100)); - - // Two 100-token requests would exceed a cumulative 150-token cap, but - // current occupancy is still the latest 100-token measurement. - assert!(matches!( - interceptor.pre_llm_request(&mut context).await, - PreRequestAction::Continue - )); - } - - #[tokio::test] - async fn extract_interceptor_cancels_when_occupancy_exceeds_cap() { - let tracker = Arc::new(UsageTracker::new()); - let interceptor = MemoryExtractWorkerInterceptor { - usage_tracker: tracker.clone(), - max_input_tokens: 99, - }; - let mut context = vec![Item::user_message("hello")]; - - assert!(matches!( - interceptor.pre_llm_request(&mut context).await, - PreRequestAction::Continue - )); - tracker.record_usage(&make_usage(100)); - - assert!(matches!( - interceptor.pre_llm_request(&mut context).await, - PreRequestAction::Cancel(message) if message.contains("occupancy") - )); - } -} diff --git a/crates/pod/tests/compact_events_test.rs b/crates/pod/tests/compact_events_test.rs index 7ff77687..3a917412 100644 --- a/crates/pod/tests/compact_events_test.rs +++ b/crates/pod/tests/compact_events_test.rs @@ -679,6 +679,27 @@ target = "./" permission = "write" "#; +#[tokio::test] +async fn extract_large_unprocessed_range_does_not_abort_on_input_occupancy() { + let client = MockClient::new(vec![ + text_events_with_usage("recorded", 1000), + write_extracted_tool_use_events("ec-large"), + single_text_events("done"), + ]); + let mut pod = make_pod_with_manifest(EXTRACT_NO_COMPACT_MANIFEST, client).await; + + let large_request = format!("remember this large slice: {}", "x ".repeat(200_000)); + pod.run_text(&large_request).await.unwrap(); + + pod.try_post_run_extract().await.expect( + "large unprocessed extract ranges must reach the extract worker, not abort locally", + ); + assert!( + pod.extract_pointer().is_some(), + "successful extract should advance the pointer even when the input range is large" + ); +} + #[tokio::test] async fn spawn_and_wait_drives_extract_to_completion() { let client = MockClient::new(vec![ diff --git a/docs/manifest.toml b/docs/manifest.toml index ff739af5..30e0a01e 100644 --- a/docs/manifest.toml +++ b/docs/manifest.toml @@ -263,10 +263,6 @@ permission = "write" # # ※ memory tools と resident injection は extract_threshold が None でも動く。 # extract_threshold = 30000 # -# # 任意。デフォルト: 30000 (`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`)。 -# # extract worker 自身の現在占有 token cap (超過で abort)。 -# extract_worker_max_input_tokens = 30000 -# # # 任意。デフォルト: 8 (`defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS`)。 # # extract worker 自身の tool loop 上限。Rust config で None の場合のみ無制限。 # extract_worker_max_turns = 8 diff --git a/docs/plan/memory.md b/docs/plan/memory.md index 08a44a97..cab75e53 100644 --- a/docs/plan/memory.md +++ b/docs/plan/memory.md @@ -122,7 +122,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー - **Trigger**: activity tokens の累積閾値(cumulative input tokens since last pointer)。tool call カウントは不採用(ツールカスタマイズ非依存・大小重みづけのため) - **実行主体**: 既存 compact と同じ Worker spawn 機構を再利用。Pod は立てない -- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない +- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない。Tool result は raw `content` ではなく表示用 `summary` だけを render し、巨大な tool output を extract input に載せない - **出力**: JSON schema で**活動ログ**の候補配列を返す。Knowledge 等の派生物は consolidation が活動ログから導出するので、extract では純粋な「起きたこと」に絞る - `decisions`: 判断したこと(選択肢 + 選んだ + 根拠) - `discussions`: 議論したこと(トピック + 論点) @@ -131,6 +131,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー - **抽出対象がなければ空配列を返してよい**(Hermes の "Nothing to save." と同系。頻繁発火を許容する前提) - **書き込み先**: `memory/_staging/.json` - LLM 出力(活動ログ JSON)は pod 側ラッパーが `source: { session_id, range: [start_entry, end_entry] }` を**機械付与**して wrap。LLM には source を推論させない +- **実行保証**: extract worker 自身の input occupancy cap は設けない。未処理 range が大きい場合でも pointer 以降の最大範囲を渡し、LLM/API/tool failure のときだけ pointer を進めない - **モデル**: `memory.extract_model`。軽量だが文脈理解できる中堅クラス(Haiku / 4o-mini / Flash 相当)を想定 - **Compact との順序**: 同一 turn 完了後の post-run チェックで extract を **compact より前** に走らせる。compact は history を組み替えるので、extract の入力範囲(session log 上の entry index)は compact 前のほうが安定する - **並走防止 (extract 同士)**: Pod 上の `extract_in_flight` フラグで in-flight 中の新規 trigger を skip。完了時点で閾値超過していれば直ちに次回を発火し、新 pointer 以降の最大範囲を回収する(pending 状態は保持しない=完了時の閾値再評価で coalesce 相当の挙動を成立させる)