fix: remove memory extract input cap

This commit is contained in:
Keisuke Hirata 2026-05-23 09:14:07 +09:00
parent cfb5fa89f1
commit d5da95499d
8 changed files with 73 additions and 135 deletions

View File

@ -156,6 +156,15 @@ pub(crate) fn reject_removed_manifest_fields(s: &str) -> Result<(), toml::de::Er
(removed; use compaction.prune_protected_tokens)",
));
}
if value
.get("memory")
.and_then(toml::Value::as_table)
.is_some_and(|table| table.contains_key("extract_worker_max_input_tokens"))
{
return Err(toml::de::Error::custom(
"unknown field in manifest: memory.extract_worker_max_input_tokens (removed)",
));
}
Ok(())
}
@ -283,9 +292,6 @@ impl MemoryConfig {
language: upper.language.or(self.language),
extract_model: upper.extract_model.or(self.extract_model),
extract_threshold: upper.extract_threshold.or(self.extract_threshold),
extract_worker_max_input_tokens: upper
.extract_worker_max_input_tokens
.or(self.extract_worker_max_input_tokens),
extract_worker_max_turns: upper
.extract_worker_max_turns
.or(self.extract_worker_max_turns),
@ -1004,6 +1010,32 @@ prune_protected_turns = 3
);
}
#[test]
fn from_toml_rejects_removed_extract_worker_max_input_tokens_field() {
let bad = r#"
[memory]
extract_worker_max_input_tokens = 30000
"#;
let err = PodManifestConfig::from_toml(bad).unwrap_err();
assert!(
err.to_string()
.contains("memory.extract_worker_max_input_tokens"),
"unexpected error: {err}"
);
}
#[test]
fn from_toml_accepts_extract_worker_max_turns() {
let cfg = PodManifestConfig::from_toml(
r#"
[memory]
extract_worker_max_turns = 2
"#,
)
.unwrap();
assert_eq!(cfg.memory.unwrap().extract_worker_max_turns, Some(2));
}
#[test]
fn from_toml_accepts_worker_reasoning_string_or_integer() {
let effort = PodManifestConfig::from_toml(

View File

@ -59,11 +59,6 @@ pub const COMPACT_WORKER_MAX_TURNS: Option<u32> = Some(20);
/// default references.
pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5;
/// Current prompt-occupancy cap for the memory extract worker's own
/// LLM requests. Exceeding this aborts the extract run (circuit-breaker
/// path). See [`crate::MemoryConfig::extract_worker_max_input_tokens`].
pub const MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS: u64 = 30_000;
/// Optional maximum extract-worker tool-loop depth. `None` means unlimited.
/// See [`crate::MemoryConfig::extract_worker_max_turns`].
pub const MEMORY_EXTRACT_WORKER_MAX_TURNS: Option<u32> = Some(8);

View File

@ -114,11 +114,6 @@ pub struct MemoryConfig {
/// the auto-extract trigger is dormant.
#[serde(default)]
pub extract_threshold: Option<u64>,
/// Current prompt-occupancy cap for the extract worker's own LLM
/// requests. Exceeding this aborts the extract run. `None` ⇒
/// [`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`].
#[serde(default)]
pub extract_worker_max_input_tokens: Option<u64>,
/// Optional maximum extract-worker tool-loop depth. `None` leaves
/// the worker unlimited; the default bounds runaway short-context
/// loops. Falls through to

View File

@ -69,4 +69,19 @@ mod tests {
assert!(s.contains("[ToolResult] ok"));
assert!(!s.contains("scratch"));
}
#[test]
fn tool_result_renders_summary_but_not_content() {
let huge_content = "raw-content-should-never-enter-extract-input".repeat(10_000);
let items = vec![Item::tool_result_with_content(
"c1",
"short summary kept for extraction",
huge_content.clone(),
)];
let s = build_extract_input(&items);
assert!(s.contains("[ToolResult] short summary kept for extraction"));
assert!(!s.contains("raw-content-should-never-enter-extract-input"));
assert!(!s.contains(&huge_content));
}
}

View File

@ -2497,9 +2497,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
.to_vec();
let layout = memory::WorkspaceLayout::resolve(memory_cfg, &self.pwd);
let cap = memory_cfg
.extract_worker_max_input_tokens
.unwrap_or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS);
let extract_worker_max_turns = memory_cfg
.extract_worker_max_turns
.or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS);
@ -2513,21 +2510,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
let mut extract_worker = Worker::new(client).system_prompt(extract_system_prompt);
extract_worker.set_cache_key(Some(self.segment_id().to_string()));
// Occupancy-based input-token meter + interceptor. The tracker pairs
// each pre-request history length with the following UsageEvent, then
// the interceptor projects current prompt occupancy with the same
// UsageRecord counter used by the main Pod thresholds.
let extract_usage_tracker = Arc::new(UsageTracker::new());
{
let tracker = extract_usage_tracker.clone();
extract_worker.on_usage(move |event| {
tracker.record_usage(event);
});
}
extract_worker.set_interceptor(MemoryExtractWorkerInterceptor {
usage_tracker: extract_usage_tracker,
max_input_tokens: cap,
});
extract_worker.set_max_turns(extract_worker_max_turns);
let ctx = Arc::new(extract::ExtractWorkerContext::new());
@ -2783,38 +2765,6 @@ enum ExtractDecision {
Completed,
}
/// Pre-request interceptor for the extract worker. Aborts when current
/// prompt occupancy crosses `max_input_tokens`. Uses the same
/// `UsageRecord` + `llm_worker::token_counter::total_tokens` projection
/// as the main Pod compaction thresholds, so prompt-cache hits are not
/// counted cumulatively across turns. Kept separate from
/// `compact::worker::CompactWorkerInterceptor` so each subsystem can
/// tune its own cancel message and budget.
struct MemoryExtractWorkerInterceptor {
usage_tracker: Arc<UsageTracker>,
max_input_tokens: u64,
}
#[async_trait]
impl llm_worker::interceptor::Interceptor for MemoryExtractWorkerInterceptor {
async fn pre_llm_request(
&self,
context: &mut Vec<Item>,
) -> llm_worker::interceptor::PreRequestAction {
let records = self.usage_tracker.records();
let estimate = llm_worker::token_counter::total_tokens(context, &records);
if estimate.tokens > self.max_input_tokens {
return llm_worker::interceptor::PreRequestAction::Cancel(format!(
"extract worker input occupancy exceeded {} tokens",
self.max_input_tokens
));
}
self.usage_tracker.note_request(context.len());
llm_worker::interceptor::PreRequestAction::Continue
}
}
/// Outcome of a single consolidation iteration. Internal to
/// `try_post_run_consolidate` / `run_consolidate_once`.
enum ConsolidateDecision {
@ -3770,70 +3720,3 @@ permission = "write"
assert!(shadows.iter().all(|s| s.slug.as_str() != "alpha"));
}
}
#[cfg(test)]
mod memory_extract_interceptor_tests {
use super::*;
use llm_worker::interceptor::{Interceptor, PreRequestAction};
use llm_worker::timeline::event::UsageEvent;
fn make_usage(input: u64) -> UsageEvent {
UsageEvent {
input_tokens: Some(input),
output_tokens: Some(0),
total_tokens: Some(input),
cache_read_input_tokens: None,
cache_creation_input_tokens: None,
}
}
#[tokio::test]
async fn extract_interceptor_uses_occupancy_not_cumulative_usage() {
let tracker = Arc::new(UsageTracker::new());
let interceptor = MemoryExtractWorkerInterceptor {
usage_tracker: tracker.clone(),
max_input_tokens: 150,
};
let mut context = vec![Item::user_message("hello")];
assert!(matches!(
interceptor.pre_llm_request(&mut context).await,
PreRequestAction::Continue
));
tracker.record_usage(&make_usage(100));
assert!(matches!(
interceptor.pre_llm_request(&mut context).await,
PreRequestAction::Continue
));
tracker.record_usage(&make_usage(100));
// Two 100-token requests would exceed a cumulative 150-token cap, but
// current occupancy is still the latest 100-token measurement.
assert!(matches!(
interceptor.pre_llm_request(&mut context).await,
PreRequestAction::Continue
));
}
#[tokio::test]
async fn extract_interceptor_cancels_when_occupancy_exceeds_cap() {
let tracker = Arc::new(UsageTracker::new());
let interceptor = MemoryExtractWorkerInterceptor {
usage_tracker: tracker.clone(),
max_input_tokens: 99,
};
let mut context = vec![Item::user_message("hello")];
assert!(matches!(
interceptor.pre_llm_request(&mut context).await,
PreRequestAction::Continue
));
tracker.record_usage(&make_usage(100));
assert!(matches!(
interceptor.pre_llm_request(&mut context).await,
PreRequestAction::Cancel(message) if message.contains("occupancy")
));
}
}

View File

@ -679,6 +679,27 @@ target = "./"
permission = "write"
"#;
#[tokio::test]
async fn extract_large_unprocessed_range_does_not_abort_on_input_occupancy() {
let client = MockClient::new(vec![
text_events_with_usage("recorded", 1000),
write_extracted_tool_use_events("ec-large"),
single_text_events("done"),
]);
let mut pod = make_pod_with_manifest(EXTRACT_NO_COMPACT_MANIFEST, client).await;
let large_request = format!("remember this large slice: {}", "x ".repeat(200_000));
pod.run_text(&large_request).await.unwrap();
pod.try_post_run_extract().await.expect(
"large unprocessed extract ranges must reach the extract worker, not abort locally",
);
assert!(
pod.extract_pointer().is_some(),
"successful extract should advance the pointer even when the input range is large"
);
}
#[tokio::test]
async fn spawn_and_wait_drives_extract_to_completion() {
let client = MockClient::new(vec![

View File

@ -263,10 +263,6 @@ permission = "write"
# # ※ memory tools と resident injection は extract_threshold が None でも動く。
# extract_threshold = 30000
#
# # 任意。デフォルト: 30000 (`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`)。
# # extract worker 自身の現在占有 token cap (超過で abort)。
# extract_worker_max_input_tokens = 30000
#
# # 任意。デフォルト: 8 (`defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS`)。
# # extract worker 自身の tool loop 上限。Rust config で None の場合のみ無制限。
# extract_worker_max_turns = 8

View File

@ -122,7 +122,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
- **Trigger**: activity tokens の累積閾値cumulative input tokens since last pointer。tool call カウントは不採用(ツールカスタマイズ非依存・大小重みづけのため)
- **実行主体**: 既存 compact と同じ Worker spawn 機構を再利用。Pod は立てない
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`domain = `"memory.extract"`に寄せ、session-store は memory ドメインを知らない
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`domain = `"memory.extract"`に寄せ、session-store は memory ドメインを知らない。Tool result は raw `content` ではなく表示用 `summary` だけを render し、巨大な tool output を extract input に載せない
- **出力**: JSON schema で**活動ログ**の候補配列を返す。Knowledge 等の派生物は consolidation が活動ログから導出するので、extract では純粋な「起きたこと」に絞る
- `decisions`: 判断したこと(選択肢 + 選んだ + 根拠)
- `discussions`: 議論したこと(トピック + 論点)
@ -131,6 +131,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
- **抽出対象がなければ空配列を返してよい**Hermes の "Nothing to save." と同系。頻繁発火を許容する前提)
- **書き込み先**: `memory/_staging/<id>.json`
- LLM 出力(活動ログ JSONは pod 側ラッパーが `source: { session_id, range: [start_entry, end_entry] }` を**機械付与**して wrap。LLM には source を推論させない
- **実行保証**: extract worker 自身の input occupancy cap は設けない。未処理 range が大きい場合でも pointer 以降の最大範囲を渡し、LLM/API/tool failure のときだけ pointer を進めない
- **モデル**: `memory.extract_model`。軽量だが文脈理解できる中堅クラスHaiku / 4o-mini / Flash 相当)を想定
- **Compact との順序**: 同一 turn 完了後の post-run チェックで extract を **compact より前** に走らせる。compact は history を組み替えるので、extract の入力範囲session log 上の entry indexは compact 前のほうが安定する
- **並走防止 (extract 同士)**: Pod 上の `extract_in_flight` フラグで in-flight 中の新規 trigger を skip。完了時点で閾値超過していれば直ちに次回を発火し、新 pointer 以降の最大範囲を回収するpending 状態は保持しない=完了時の閾値再評価で coalesce 相当の挙動を成立させる)