merge: memory-extract-remove-input-cap
This commit is contained in:
commit
cf4ecf8d70
|
|
@ -156,6 +156,15 @@ pub(crate) fn reject_removed_manifest_fields(s: &str) -> Result<(), toml::de::Er
|
|||
(removed; use compaction.prune_protected_tokens)",
|
||||
));
|
||||
}
|
||||
if value
|
||||
.get("memory")
|
||||
.and_then(toml::Value::as_table)
|
||||
.is_some_and(|table| table.contains_key("extract_worker_max_input_tokens"))
|
||||
{
|
||||
return Err(toml::de::Error::custom(
|
||||
"unknown field in manifest: memory.extract_worker_max_input_tokens (removed)",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -283,9 +292,6 @@ impl MemoryConfig {
|
|||
language: upper.language.or(self.language),
|
||||
extract_model: upper.extract_model.or(self.extract_model),
|
||||
extract_threshold: upper.extract_threshold.or(self.extract_threshold),
|
||||
extract_worker_max_input_tokens: upper
|
||||
.extract_worker_max_input_tokens
|
||||
.or(self.extract_worker_max_input_tokens),
|
||||
extract_worker_max_turns: upper
|
||||
.extract_worker_max_turns
|
||||
.or(self.extract_worker_max_turns),
|
||||
|
|
@ -1004,6 +1010,32 @@ prune_protected_turns = 3
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_toml_rejects_removed_extract_worker_max_input_tokens_field() {
|
||||
let bad = r#"
|
||||
[memory]
|
||||
extract_worker_max_input_tokens = 30000
|
||||
"#;
|
||||
let err = PodManifestConfig::from_toml(bad).unwrap_err();
|
||||
assert!(
|
||||
err.to_string()
|
||||
.contains("memory.extract_worker_max_input_tokens"),
|
||||
"unexpected error: {err}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_toml_accepts_extract_worker_max_turns() {
|
||||
let cfg = PodManifestConfig::from_toml(
|
||||
r#"
|
||||
[memory]
|
||||
extract_worker_max_turns = 2
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(cfg.memory.unwrap().extract_worker_max_turns, Some(2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_toml_accepts_worker_reasoning_string_or_integer() {
|
||||
let effort = PodManifestConfig::from_toml(
|
||||
|
|
|
|||
|
|
@ -59,11 +59,6 @@ pub const COMPACT_WORKER_MAX_TURNS: Option<u32> = Some(20);
|
|||
/// default references.
|
||||
pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5;
|
||||
|
||||
/// Current prompt-occupancy cap for the memory extract worker's own
|
||||
/// LLM requests. Exceeding this aborts the extract run (circuit-breaker
|
||||
/// path). See [`crate::MemoryConfig::extract_worker_max_input_tokens`].
|
||||
pub const MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS: u64 = 30_000;
|
||||
|
||||
/// Optional maximum extract-worker tool-loop depth. `None` means unlimited.
|
||||
/// See [`crate::MemoryConfig::extract_worker_max_turns`].
|
||||
pub const MEMORY_EXTRACT_WORKER_MAX_TURNS: Option<u32> = Some(8);
|
||||
|
|
|
|||
|
|
@ -114,11 +114,6 @@ pub struct MemoryConfig {
|
|||
/// the auto-extract trigger is dormant.
|
||||
#[serde(default)]
|
||||
pub extract_threshold: Option<u64>,
|
||||
/// Current prompt-occupancy cap for the extract worker's own LLM
|
||||
/// requests. Exceeding this aborts the extract run. `None` ⇒
|
||||
/// [`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`].
|
||||
#[serde(default)]
|
||||
pub extract_worker_max_input_tokens: Option<u64>,
|
||||
/// Optional maximum extract-worker tool-loop depth. `None` leaves
|
||||
/// the worker unlimited; the default bounds runaway short-context
|
||||
/// loops. Falls through to
|
||||
|
|
|
|||
|
|
@ -69,4 +69,19 @@ mod tests {
|
|||
assert!(s.contains("[ToolResult] ok"));
|
||||
assert!(!s.contains("scratch"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tool_result_renders_summary_but_not_content() {
|
||||
let huge_content = "raw-content-should-never-enter-extract-input".repeat(10_000);
|
||||
let items = vec![Item::tool_result_with_content(
|
||||
"c1",
|
||||
"short summary kept for extraction",
|
||||
huge_content.clone(),
|
||||
)];
|
||||
|
||||
let s = build_extract_input(&items);
|
||||
assert!(s.contains("[ToolResult] short summary kept for extraction"));
|
||||
assert!(!s.contains("raw-content-should-never-enter-extract-input"));
|
||||
assert!(!s.contains(&huge_content));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2497,9 +2497,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
|
|||
.to_vec();
|
||||
|
||||
let layout = memory::WorkspaceLayout::resolve(memory_cfg, &self.pwd);
|
||||
let cap = memory_cfg
|
||||
.extract_worker_max_input_tokens
|
||||
.unwrap_or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS);
|
||||
let extract_worker_max_turns = memory_cfg
|
||||
.extract_worker_max_turns
|
||||
.or(manifest::defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS);
|
||||
|
|
@ -2513,21 +2510,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
|
|||
let mut extract_worker = Worker::new(client).system_prompt(extract_system_prompt);
|
||||
extract_worker.set_cache_key(Some(self.segment_id().to_string()));
|
||||
|
||||
// Occupancy-based input-token meter + interceptor. The tracker pairs
|
||||
// each pre-request history length with the following UsageEvent, then
|
||||
// the interceptor projects current prompt occupancy with the same
|
||||
// UsageRecord counter used by the main Pod thresholds.
|
||||
let extract_usage_tracker = Arc::new(UsageTracker::new());
|
||||
{
|
||||
let tracker = extract_usage_tracker.clone();
|
||||
extract_worker.on_usage(move |event| {
|
||||
tracker.record_usage(event);
|
||||
});
|
||||
}
|
||||
extract_worker.set_interceptor(MemoryExtractWorkerInterceptor {
|
||||
usage_tracker: extract_usage_tracker,
|
||||
max_input_tokens: cap,
|
||||
});
|
||||
extract_worker.set_max_turns(extract_worker_max_turns);
|
||||
|
||||
let ctx = Arc::new(extract::ExtractWorkerContext::new());
|
||||
|
|
@ -2783,38 +2765,6 @@ enum ExtractDecision {
|
|||
Completed,
|
||||
}
|
||||
|
||||
/// Pre-request interceptor for the extract worker. Aborts when current
|
||||
/// prompt occupancy crosses `max_input_tokens`. Uses the same
|
||||
/// `UsageRecord` + `llm_worker::token_counter::total_tokens` projection
|
||||
/// as the main Pod compaction thresholds, so prompt-cache hits are not
|
||||
/// counted cumulatively across turns. Kept separate from
|
||||
/// `compact::worker::CompactWorkerInterceptor` so each subsystem can
|
||||
/// tune its own cancel message and budget.
|
||||
struct MemoryExtractWorkerInterceptor {
|
||||
usage_tracker: Arc<UsageTracker>,
|
||||
max_input_tokens: u64,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl llm_worker::interceptor::Interceptor for MemoryExtractWorkerInterceptor {
|
||||
async fn pre_llm_request(
|
||||
&self,
|
||||
context: &mut Vec<Item>,
|
||||
) -> llm_worker::interceptor::PreRequestAction {
|
||||
let records = self.usage_tracker.records();
|
||||
let estimate = llm_worker::token_counter::total_tokens(context, &records);
|
||||
if estimate.tokens > self.max_input_tokens {
|
||||
return llm_worker::interceptor::PreRequestAction::Cancel(format!(
|
||||
"extract worker input occupancy exceeded {} tokens",
|
||||
self.max_input_tokens
|
||||
));
|
||||
}
|
||||
|
||||
self.usage_tracker.note_request(context.len());
|
||||
llm_worker::interceptor::PreRequestAction::Continue
|
||||
}
|
||||
}
|
||||
|
||||
/// Outcome of a single consolidation iteration. Internal to
|
||||
/// `try_post_run_consolidate` / `run_consolidate_once`.
|
||||
enum ConsolidateDecision {
|
||||
|
|
@ -3770,70 +3720,3 @@ permission = "write"
|
|||
assert!(shadows.iter().all(|s| s.slug.as_str() != "alpha"));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod memory_extract_interceptor_tests {
|
||||
use super::*;
|
||||
use llm_worker::interceptor::{Interceptor, PreRequestAction};
|
||||
use llm_worker::timeline::event::UsageEvent;
|
||||
|
||||
fn make_usage(input: u64) -> UsageEvent {
|
||||
UsageEvent {
|
||||
input_tokens: Some(input),
|
||||
output_tokens: Some(0),
|
||||
total_tokens: Some(input),
|
||||
cache_read_input_tokens: None,
|
||||
cache_creation_input_tokens: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn extract_interceptor_uses_occupancy_not_cumulative_usage() {
|
||||
let tracker = Arc::new(UsageTracker::new());
|
||||
let interceptor = MemoryExtractWorkerInterceptor {
|
||||
usage_tracker: tracker.clone(),
|
||||
max_input_tokens: 150,
|
||||
};
|
||||
let mut context = vec![Item::user_message("hello")];
|
||||
|
||||
assert!(matches!(
|
||||
interceptor.pre_llm_request(&mut context).await,
|
||||
PreRequestAction::Continue
|
||||
));
|
||||
tracker.record_usage(&make_usage(100));
|
||||
|
||||
assert!(matches!(
|
||||
interceptor.pre_llm_request(&mut context).await,
|
||||
PreRequestAction::Continue
|
||||
));
|
||||
tracker.record_usage(&make_usage(100));
|
||||
|
||||
// Two 100-token requests would exceed a cumulative 150-token cap, but
|
||||
// current occupancy is still the latest 100-token measurement.
|
||||
assert!(matches!(
|
||||
interceptor.pre_llm_request(&mut context).await,
|
||||
PreRequestAction::Continue
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn extract_interceptor_cancels_when_occupancy_exceeds_cap() {
|
||||
let tracker = Arc::new(UsageTracker::new());
|
||||
let interceptor = MemoryExtractWorkerInterceptor {
|
||||
usage_tracker: tracker.clone(),
|
||||
max_input_tokens: 99,
|
||||
};
|
||||
let mut context = vec![Item::user_message("hello")];
|
||||
|
||||
assert!(matches!(
|
||||
interceptor.pre_llm_request(&mut context).await,
|
||||
PreRequestAction::Continue
|
||||
));
|
||||
tracker.record_usage(&make_usage(100));
|
||||
|
||||
assert!(matches!(
|
||||
interceptor.pre_llm_request(&mut context).await,
|
||||
PreRequestAction::Cancel(message) if message.contains("occupancy")
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -679,6 +679,27 @@ target = "./"
|
|||
permission = "write"
|
||||
"#;
|
||||
|
||||
#[tokio::test]
|
||||
async fn extract_large_unprocessed_range_does_not_abort_on_input_occupancy() {
|
||||
let client = MockClient::new(vec![
|
||||
text_events_with_usage("recorded", 1000),
|
||||
write_extracted_tool_use_events("ec-large"),
|
||||
single_text_events("done"),
|
||||
]);
|
||||
let mut pod = make_pod_with_manifest(EXTRACT_NO_COMPACT_MANIFEST, client).await;
|
||||
|
||||
let large_request = format!("remember this large slice: {}", "x ".repeat(200_000));
|
||||
pod.run_text(&large_request).await.unwrap();
|
||||
|
||||
pod.try_post_run_extract().await.expect(
|
||||
"large unprocessed extract ranges must reach the extract worker, not abort locally",
|
||||
);
|
||||
assert!(
|
||||
pod.extract_pointer().is_some(),
|
||||
"successful extract should advance the pointer even when the input range is large"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn spawn_and_wait_drives_extract_to_completion() {
|
||||
let client = MockClient::new(vec![
|
||||
|
|
|
|||
|
|
@ -263,10 +263,6 @@ permission = "write"
|
|||
# # ※ memory tools と resident injection は extract_threshold が None でも動く。
|
||||
# extract_threshold = 30000
|
||||
#
|
||||
# # 任意。デフォルト: 30000 (`defaults::MEMORY_EXTRACT_WORKER_MAX_INPUT_TOKENS`)。
|
||||
# # extract worker 自身の現在占有 token cap (超過で abort)。
|
||||
# extract_worker_max_input_tokens = 30000
|
||||
#
|
||||
# # 任意。デフォルト: 8 (`defaults::MEMORY_EXTRACT_WORKER_MAX_TURNS`)。
|
||||
# # extract worker 自身の tool loop 上限。Rust config で None の場合のみ無制限。
|
||||
# extract_worker_max_turns = 8
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
|
|||
|
||||
- **Trigger**: activity tokens の累積閾値(cumulative input tokens since last pointer)。tool call カウントは不採用(ツールカスタマイズ非依存・大小重みづけのため)
|
||||
- **実行主体**: 既存 compact と同じ Worker spawn 機構を再利用。Pod は立てない
|
||||
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない
|
||||
- **入力**: 前回 extract 以降の session log 範囲。処理済み境界の pointer は session log 側に保持し、寿命を session と揃える。session-store のドメイン純度を保つため、汎用拡張点 `LogEntry::Extension { domain, payload }`(domain = `"memory.extract"`)に寄せ、session-store は memory ドメインを知らない。Tool result は raw `content` ではなく表示用 `summary` だけを render し、巨大な tool output を extract input に載せない
|
||||
- **出力**: JSON schema で**活動ログ**の候補配列を返す。Knowledge 等の派生物は consolidation が活動ログから導出するので、extract では純粋な「起きたこと」に絞る
|
||||
- `decisions`: 判断したこと(選択肢 + 選んだ + 根拠)
|
||||
- `discussions`: 議論したこと(トピック + 論点)
|
||||
|
|
@ -131,6 +131,7 @@ Workflow 保護は専用 tool schema のトリックではなく Linter ルー
|
|||
- **抽出対象がなければ空配列を返してよい**(Hermes の "Nothing to save." と同系。頻繁発火を許容する前提)
|
||||
- **書き込み先**: `memory/_staging/<id>.json`
|
||||
- LLM 出力(活動ログ JSON)は pod 側ラッパーが `source: { session_id, range: [start_entry, end_entry] }` を**機械付与**して wrap。LLM には source を推論させない
|
||||
- **実行保証**: extract worker 自身の input occupancy cap は設けない。未処理 range が大きい場合でも pointer 以降の最大範囲を渡し、LLM/API/tool failure のときだけ pointer を進めない
|
||||
- **モデル**: `memory.extract_model`。軽量だが文脈理解できる中堅クラス(Haiku / 4o-mini / Flash 相当)を想定
|
||||
- **Compact との順序**: 同一 turn 完了後の post-run チェックで extract を **compact より前** に走らせる。compact は history を組み替えるので、extract の入力範囲(session log 上の entry index)は compact 前のほうが安定する
|
||||
- **並走防止 (extract 同士)**: Pod 上の `extract_in_flight` フラグで in-flight 中の新規 trigger を skip。完了時点で閾値超過していれば直ちに次回を発火し、新 pointer 以降の最大範囲を回収する(pending 状態は保持しない=完了時の閾値再評価で coalesce 相当の挙動を成立させる)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user