merge: memory-extract-remove-input-cap
This commit is contained in:
commit
1feb560ff9
|
|
@ -156,6 +156,15 @@ pub(crate) fn reject_removed_manifest_fields(s: &str) -> Result<(), toml::de::Er
|
||||||
(removed; use compaction.prune_protected_tokens)",
|
(removed; use compaction.prune_protected_tokens)",
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
if value
|
||||||
|
.get("memory")
|
||||||
|
.and_then(toml::Value::as_table)
|
||||||
|
.is_some_and(|table| table.contains_key("extract_worker_max_input_tokens"))
|
||||||
|
{
|
||||||
|
return Err(toml::de::Error::custom(
|
||||||
|
"unknown field in manifest: memory.extract_worker_max_input_tokens (removed)",
|
||||||
|
));
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -283,9 +292,6 @@ impl MemoryConfig {
|
||||||
language: upper.language.or(self.language),
|
language: upper.language.or(self.language),
|
||||||
extract_model: upper.extract_model.or(self.extract_model),
|
extract_model: upper.extract_model.or(self.extract_model),
|
||||||
extract_threshold: upper.extract_threshold.or(self.extract_threshold),
|
extract_threshold: upper.extract_threshold.or(self.extract_threshold),
|
||||||
extract_worker_max_input_tokens: upper
|
|
||||||
.extract_worker_max_input_tokens
|
|
||||||
.or(self.extract_worker_max_input_tokens),
|
|
||||||
extract_worker_max_turns: upper
|
extract_worker_max_turns: upper
|
||||||
.extract_worker_max_turns
|
.extract_worker_max_turns
|
||||||
.or(self.extract_worker_max_turns),
|
.or(self.extract_worker_max_turns),
|
||||||
|
|
@ -1004,6 +1010,32 @@ prune_protected_turns = 3
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn from_toml_rejects_removed_extract_worker_max_input_tokens_field() {
|
||||||
|
let bad = r#"
|
||||||
|
[memory]
|
||||||
|
extract_worker_max_input_tokens = 30000
|
||||||
|
"#;
|
||||||
|
let err = PodManifestConfig::from_toml(bad).unwrap_err();
|
||||||
|
assert!(
|
||||||
|
err.to_string()
|
||||||
|
.contains("memory.extract_worker_max_input_tokens"),
|
||||||
|
"unexpected error: {err}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn from_toml_accepts_extract_worker_max_turns() {
|
||||||
|
let cfg = PodManifestConfig::from_toml(
|
||||||
|
r#"
|
||||||
|
[memory]
|
||||||
|
extract_worker_max_turns = 2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(cfg.memory.unwrap().extract_worker_max_turns, Some(2));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn from_toml_accepts_worker_reasoning_string_or_integer() {
|
fn from_toml_accepts_worker_reasoning_string_or_integer() {
|
||||||
let effort = PodManifestConfig::from_toml(
|
let effort = PodManifestConfig::from_toml(
|
||||||
|
|
|
||||||
|
|
@ -59,11 +59,6 @@ pub const COMPACT_WORKER_MAX_TURNS: Option<u32> = Some(20);
|
||||||
/// default references.
|
/// default references.
|
||||||
pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5;
|
pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5;
|
||||||
|
|
||||||
/// Current prompt-occupancy cap for the memory extract worker's own
|
|
||||||
/// LLM requests. Exceeding this aborts the extract run (circuit-breaker
|
|
||||||
/// path). See [`crate::MemoryConfig::extract_worker_max_input_tokens`].
|
|
||||||
pub const MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS: u64 = 30_000;
|
|
||||||
|
|
||||||
/// Optional maximum extract-worker tool-loop depth. `None` means unlimited.
|
/// Optional maximum extract-worker tool-loop depth. `None` means unlimited.
|
||||||
/// See [`crate::MemoryConfig::extract_worker_max_turns`].
|
/// See [`crate::MemoryConfig::extract_worker_max_turns`].
|
||||||
pub const MEMORY_EXTRACT_WORKER_MAX_TURNS: Option<u32> = Some(8);
|
pub const MEMORY_EXTRACT_WORKER_MAX_TURNS: Option<u32> = Some(8);
|
||||||
|
|
|
||||||
|
|
@ -114,11 +114,6 @@ pub struct MemoryConfig {
|
||||||
/// the auto-extract trigger is dormant.
|
/// the auto-extract trigger is dormant.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub extract_threshold: Option<u64>,
|
pub extract_threshold: Option<u64>,
|
||||||
/// Current prompt-occupancy cap for the extract worker's own LLM
|
|
||||||
/// requests. Exceeding this aborts the extract run. `None` ⇒
|
|
||||||
/// [`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`].
|
|
||||||
#[serde(default)]
|
|
||||||
pub extract_worker_max_input_tokens: Option<u64>,
|
|
||||||
/// Optional maximum extract-worker tool-loop depth. `None` leaves
|
/// Optional maximum extract-worker tool-loop depth. `None` leaves
|
||||||
/// the worker unlimited; the default bounds runaway short-context
|
/// the worker unlimited; the default bounds runaway short-context
|
||||||
/// loops. Falls through to
|
/// loops. Falls through to
|
||||||
|
|
|
||||||
|
|
@ -69,4 +69,19 @@ mod tests {
|
||||||
assert!(s.contains("[ToolResult] ok"));
|
assert!(s.contains("[ToolResult] ok"));
|
||||||
assert!(!s.contains("scratch"));
|
assert!(!s.contains("scratch"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tool_result_renders_summary_but_not_content() {
|
||||||
|
let huge_content = "raw-content-should-never-enter-extract-input".repeat(10_000);
|
||||||
|
let items = vec![Item::tool_result_with_content(
|
||||||
|
"c1",
|
||||||
|
"short summary kept for extraction",
|
||||||
|
huge_content.clone(),
|
||||||
|
)];
|
||||||
|
|
||||||
|
let s = build_extract_input(&items);
|
||||||
|
assert!(s.contains("[ToolResult] short summary kept for extraction"));
|
||||||
|
assert!(!s.contains("raw-content-should-never-enter-extract-input"));
|
||||||
|
assert!(!s.contains(&huge_content));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2497,9 +2497,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
|
||||||
.to_vec();
|
.to_vec();
|
||||||
|
|
||||||
let layout = memory::WorkspaceLayout::resolve(memory_cfg, &self.pwd);
|
let layout = memory::WorkspaceLayout::resolve(memory_cfg, &self.pwd);
|
||||||
let cap = memory_cfg
|
|
||||||
.extract_worker_max_input_tokens
|
|
||||||
.unwrap_or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS);
|
|
||||||
let extract_worker_max_turns = memory_cfg
|
let extract_worker_max_turns = memory_cfg
|
||||||
.extract_worker_max_turns
|
.extract_worker_max_turns
|
||||||
.or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS);
|
.or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS);
|
||||||
|
|
@ -2513,21 +2510,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
|
||||||
let mut extract_worker = Worker::new(client).system_prompt(extract_system_prompt);
|
let mut extract_worker = Worker::new(client).system_prompt(extract_system_prompt);
|
||||||
extract_worker.set_cache_key(Some(self.segment_id().to_string()));
|
extract_worker.set_cache_key(Some(self.segment_id().to_string()));
|
||||||
|
|
||||||
// Occupancy-based input-token meter + interceptor. The tracker pairs
|
|
||||||
// each pre-request history length with the following UsageEvent, then
|
|
||||||
// the interceptor projects current prompt occupancy with the same
|
|
||||||
// UsageRecord counter used by the main Pod thresholds.
|
|
||||||
let extract_usage_tracker = Arc::new(UsageTracker::new());
|
|
||||||
{
|
|
||||||
let tracker = extract_usage_tracker.clone();
|
|
||||||
extract_worker.on_usage(move |event| {
|
|
||||||
tracker.record_usage(event);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
extract_worker.set_interceptor(MemoryExtractWorkerInterceptor {
|
|
||||||
usage_tracker: extract_usage_tracker,
|
|
||||||
max_input_tokens: cap,
|
|
||||||
});
|
|
||||||
extract_worker.set_max_turns(extract_worker_max_turns);
|
extract_worker.set_max_turns(extract_worker_max_turns);
|
||||||
|
|
||||||
let ctx = Arc::new(extract::ExtractWorkerContext::new());
|
let ctx = Arc::new(extract::ExtractWorkerContext::new());
|
||||||
|
|
@ -2783,38 +2765,6 @@ enum ExtractDecision {
|
||||||
Completed,
|
Completed,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pre-request interceptor for the extract worker. Aborts when current
|
|
||||||
/// prompt occupancy crosses `max_input_tokens`. Uses the same
|
|
||||||
/// `UsageRecord` + `llm_worker::token_counter::total_tokens` projection
|
|
||||||
/// as the main Pod compaction thresholds, so prompt-cache hits are not
|
|
||||||
/// counted cumulatively across turns. Kept separate from
|
|
||||||
/// `compact::worker::CompactWorkerInterceptor` so each subsystem can
|
|
||||||
/// tune its own cancel message and budget.
|
|
||||||
struct MemoryExtractWorkerInterceptor {
|
|
||||||
usage_tracker: Arc<UsageTracker>,
|
|
||||||
max_input_tokens: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl llm_worker::interceptor::Interceptor for MemoryExtractWorkerInterceptor {
|
|
||||||
async fn pre_llm_request(
|
|
||||||
&self,
|
|
||||||
context: &mut Vec<Item>,
|
|
||||||
) -> llm_worker::interceptor::PreRequestAction {
|
|
||||||
let records = self.usage_tracker.records();
|
|
||||||
let estimate = llm_worker::token_counter::total_tokens(context, &records);
|
|
||||||
if estimate.tokens > self.max_input_tokens {
|
|
||||||
return llm_worker::interceptor::PreRequestAction::Cancel(format!(
|
|
||||||
"extract worker input occupancy exceeded {} tokens",
|
|
||||||
self.max_input_tokens
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
self.usage_tracker.note_request(context.len());
|
|
||||||
llm_worker::interceptor::PreRequestAction::Continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Outcome of a single consolidation iteration. Internal to
|
/// Outcome of a single consolidation iteration. Internal to
|
||||||
/// `try_post_run_consolidate` / `run_consolidate_once`.
|
/// `try_post_run_consolidate` / `run_consolidate_once`.
|
||||||
enum ConsolidateDecision {
|
enum ConsolidateDecision {
|
||||||
|
|
@ -3770,70 +3720,3 @@ permission = "write"
|
||||||
assert!(shadows.iter().all(|s| s.slug.as_str() != "alpha"));
|
assert!(shadows.iter().all(|s| s.slug.as_str() != "alpha"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod memory_extract_interceptor_tests {
|
|
||||||
use super::*;
|
|
||||||
use llm_worker::interceptor::{Interceptor, PreRequestAction};
|
|
||||||
use llm_worker::timeline::event::UsageEvent;
|
|
||||||
|
|
||||||
fn make_usage(input: u64) -> UsageEvent {
|
|
||||||
UsageEvent {
|
|
||||||
input_tokens: Some(input),
|
|
||||||
output_tokens: Some(0),
|
|
||||||
total_tokens: Some(input),
|
|
||||||
cache_read_input_tokens: None,
|
|
||||||
cache_creation_input_tokens: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn extract_interceptor_uses_occupancy_not_cumulative_usage() {
|
|
||||||
let tracker = Arc::new(UsageTracker::new());
|
|
||||||
let interceptor = MemoryExtractWorkerInterceptor {
|
|
||||||
usage_tracker: tracker.clone(),
|
|
||||||
max_input_tokens: 150,
|
|
||||||
};
|
|
||||||
let mut context = vec![Item::user_message("hello")];
|
|
||||||
|
|
||||||
assert!(matches!(
|
|
||||||
interceptor.pre_llm_request(&mut context).await,
|
|
||||||
PreRequestAction::Continue
|
|
||||||
));
|
|
||||||
tracker.record_usage(&make_usage(100));
|
|
||||||
|
|
||||||
assert!(matches!(
|
|
||||||
interceptor.pre_llm_request(&mut context).await,
|
|
||||||
PreRequestAction::Continue
|
|
||||||
));
|
|
||||||
tracker.record_usage(&make_usage(100));
|
|
||||||
|
|
||||||
// Two 100-token requests would exceed a cumulative 150-token cap, but
|
|
||||||
// current occupancy is still the latest 100-token measurement.
|
|
||||||
assert!(matches!(
|
|
||||||
interceptor.pre_llm_request(&mut context).await,
|
|
||||||
PreRequestAction::Continue
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn extract_interceptor_cancels_when_occupancy_exceeds_cap() {
|
|
||||||
let tracker = Arc::new(UsageTracker::new());
|
|
||||||
let interceptor = MemoryExtractWorkerInterceptor {
|
|
||||||
usage_tracker: tracker.clone(),
|
|
||||||
max_input_tokens: 99,
|
|
||||||
};
|
|
||||||
let mut context = vec![Item::user_message("hello")];
|
|
||||||
|
|
||||||
assert!(matches!(
|
|
||||||
interceptor.pre_llm_request(&mut context).await,
|
|
||||||
PreRequestAction::Continue
|
|
||||||
));
|
|
||||||
tracker.record_usage(&make_usage(100));
|
|
||||||
|
|
||||||
assert!(matches!(
|
|
||||||
interceptor.pre_llm_request(&mut context).await,
|
|
||||||
PreRequestAction::Cancel(message) if message.contains("occupancy")
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -679,6 +679,27 @@ target = "./"
|
||||||
permission = "write"
|
permission = "write"
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn extract_large_unprocessed_range_does_not_abort_on_input_occupancy() {
|
||||||
|
let client = MockClient::new(vec![
|
||||||
|
text_events_with_usage("recorded", 1000),
|
||||||
|
write_extracted_tool_use_events("ec-large"),
|
||||||
|
single_text_events("done"),
|
||||||
|
]);
|
||||||
|
let mut pod = make_pod_with_manifest(EXTRACT_NO_COMPACT_MANIFEST, client).await;
|
||||||
|
|
||||||
|
let large_request = format!("remember this large slice: {}", "x ".repeat(200_000));
|
||||||
|
pod.run_text(&large_request).await.unwrap();
|
||||||
|
|
||||||
|
pod.try_post_run_extract().await.expect(
|
||||||
|
"large unprocessed extract ranges must reach the extract worker, not abort locally",
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
pod.extract_pointer().is_some(),
|
||||||
|
"successful extract should advance the pointer even when the input range is large"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn spawn_and_wait_drives_extract_to_completion() {
|
async fn spawn_and_wait_drives_extract_to_completion() {
|
||||||
let client = MockClient::new(vec![
|
let client = MockClient::new(vec![
|
||||||
|
|
|
||||||
|
|
@ -263,10 +263,6 @@ permission = "write"
|
||||||
# # ※ memory tools と resident injection は extract_threshold が None でも動く。
|
# # ※ memory tools と resident injection は extract_threshold が None でも動く。
|
||||||
# extract_threshold = 30000
|
# extract_threshold = 30000
|
||||||
#
|
#
|
||||||
# # 任意。デフォルト: 30000 (`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`)。
|
|
||||||
# # extract worker 自身の現在占有 token cap (超過で abort)。
|
|
||||||
# extract_worker_max_input_tokens = 30000
|
|
||||||
#
|
|
||||||
# # 任意。デフォルト: 8 (`defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS`)。
|
# # 任意。デフォルト: 8 (`defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS`)。
|
||||||
# # extract worker 自身の tool loop 上限。Rust config で None の場合のみ無制限。
|
# # extract worker 自身の tool loop 上限。Rust config で None の場合のみ無制限。
|
||||||
# extract_worker_max_turns = 8
|
# extract_worker_max_turns = 8
|
||||||
|
|
|
||||||
|
|
@ -122,7 +122,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
|
||||||
|
|
||||||
- **Trigger**: activity tokens の累積閾値(cumulative input tokens since last pointer)。tool call カウントは不採用(ツールカスタマイズ非依存・大小重みづけのため)
|
- **Trigger**: activity tokens の累積閾値(cumulative input tokens since last pointer)。tool call カウントは不採用(ツールカスタマイズ非依存・大小重みづけのため)
|
||||||
- **実行主体**: 既存 compact と同じ Worker spawn 機構を再利用。Pod は立てない
|
- **実行主体**: 既存 compact と同じ Worker spawn 機構を再利用。Pod は立てない
|
||||||
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない
|
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない。Tool result は raw `content` ではなく表示用 `summary` だけを render し、巨大な tool output を extract input に載せない
|
||||||
- **出力**: JSON schema で**活動ログ**の候補配列を返す。Knowledge 等の派生物は consolidation が活動ログから導出するので、extract では純粋な「起きたこと」に絞る
|
- **出力**: JSON schema で**活動ログ**の候補配列を返す。Knowledge 等の派生物は consolidation が活動ログから導出するので、extract では純粋な「起きたこと」に絞る
|
||||||
- `decisions`: 判断したこと(選択肢 + 選んだ + 根拠)
|
- `decisions`: 判断したこと(選択肢 + 選んだ + 根拠)
|
||||||
- `discussions`: 議論したこと(トピック + 論点)
|
- `discussions`: 議論したこと(トピック + 論点)
|
||||||
|
|
@ -131,6 +131,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
|
||||||
- **抽出対象がなければ空配列を返してよい**(Hermes の "Nothing to save." と同系。頻繁発火を許容する前提)
|
- **抽出対象がなければ空配列を返してよい**(Hermes の "Nothing to save." と同系。頻繁発火を許容する前提)
|
||||||
- **書き込み先**: `memory/_staging/<id>.json`
|
- **書き込み先**: `memory/_staging/<id>.json`
|
||||||
- LLM 出力(活動ログ JSON)は pod 側ラッパーが `source: { session_id, range: [start_entry, end_entry] }` を**機械付与**して wrap。LLM には source を推論させない
|
- LLM 出力(活動ログ JSON)は pod 側ラッパーが `source: { session_id, range: [start_entry, end_entry] }` を**機械付与**して wrap。LLM には source を推論させない
|
||||||
|
- **実行保証**: extract worker 自身の input occupancy cap は設けない。未処理 range が大きい場合でも pointer 以降の最大範囲を渡し、LLM/API/tool failure のときだけ pointer を進めない
|
||||||
- **モデル**: `memory.extract_model`。軽量だが文脈理解できる中堅クラス(Haiku / 4o-mini / Flash 相当)を想定
|
- **モデル**: `memory.extract_model`。軽量だが文脈理解できる中堅クラス(Haiku / 4o-mini / Flash 相当)を想定
|
||||||
- **Compact との順序**: 同一 turn 完了後の post-run チェックで extract を **compact より前** に走らせる。compact は history を組み替えるので、extract の入力範囲(session log 上の entry index)は compact 前のほうが安定する
|
- **Compact との順序**: 同一 turn 完了後の post-run チェックで extract を **compact より前** に走らせる。compact は history を組み替えるので、extract の入力範囲(session log 上の entry index)は compact 前のほうが安定する
|
||||||
- **並走防止 (extract 同士)**: Pod 上の `extract_in_flight` フラグで in-flight 中の新規 trigger を skip。完了時点で閾値超過していれば直ちに次回を発火し、新 pointer 以降の最大範囲を回収する(pending 状態は保持しない=完了時の閾値再評価で coalesce 相当の挙動を成立させる)
|
- **並走防止 (extract 同士)**: Pod 上の `extract_in_flight` フラグで in-flight 中の新規 trigger を skip。完了時点で閾値超過していれば直ちに次回を発火し、新 pointer 以降の最大範囲を回収する(pending 状態は保持しない=完了時の閾値再評価で coalesce 相当の挙動を成立させる)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user