diff --git a/crates/llm-worker/src/interceptor.rs b/crates/llm-worker/src/interceptor.rs index 05afe4ea..1c058fb4 100644 --- a/crates/llm-worker/src/interceptor.rs +++ b/crates/llm-worker/src/interceptor.rs @@ -32,10 +32,16 @@ pub enum PromptAction { } /// Action before an LLM request. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub enum PreRequestAction { /// Proceed normally. Continue, + /// Proceed after appending these items to durable worker history. + /// + /// This is for upper-layer budget/status nudges that the model may react + /// to: the items are committed before the request so later turns can see + /// why the worker changed course. + ContinueWith(Vec), /// Cancel with a reason (treated as an error). Cancel(String), /// Yield control to the caller for external processing. @@ -149,11 +155,12 @@ pub trait Interceptor: Send + Sync { /// Called before each LLM request. The context starts as a clone /// of `worker.history` (after `pending_history_appends` and the - /// Worker's own prune projection have been applied) and can be - /// further modified for that single request only — mutations here - /// are **not** persisted back to history. Use - /// [`Self::pending_history_appends`] for inputs that need to land - /// in history. + /// Worker's own prune projection have been applied). + /// + /// Direct mutations to `context` remain request-local and are not persisted. + /// If an interceptor derives a human/model-visible nudge from the current + /// request context, return [`PreRequestAction::ContinueWith`] so the Worker + /// commits it to history before the request is sent. 
async fn pre_llm_request(&self, _context: &mut Vec) -> PreRequestAction { PreRequestAction::Continue } diff --git a/crates/llm-worker/src/worker.rs b/crates/llm-worker/src/worker.rs index c4313d99..9feb7bea 100644 --- a/crates/llm-worker/src/worker.rs +++ b/crates/llm-worker/src/worker.rs @@ -1169,6 +1169,10 @@ impl Worker { self.last_run_interrupted = true; return Ok(WorkerResult::Yielded); } + PreRequestAction::ContinueWith(items) => { + self.append_history_items(items.clone()); + request_context.extend(items); + } PreRequestAction::Continue => {} } diff --git a/crates/manifest/src/config.rs b/crates/manifest/src/config.rs index cd362713..cf782e91 100644 --- a/crates/manifest/src/config.rs +++ b/crates/manifest/src/config.rs @@ -125,18 +125,34 @@ pub struct CompactionConfigPartial { pub prune_protected_tokens: Option, #[serde(default)] pub prune_min_savings: Option, + #[serde(default, alias = "compact_threshold")] + pub threshold: Option, + #[serde(default, alias = "compact_request_threshold")] + pub request_threshold: Option, + #[serde(default, alias = "compact_retained_tokens")] + pub retained_tokens: Option, #[serde(default)] - pub compact_threshold: Option, + pub overview_target_tokens: Option, #[serde(default)] - pub compact_request_threshold: Option, + pub overview_warning_tokens: Option, #[serde(default)] - pub compact_retained_tokens: Option, + pub overview_deadline_tokens: Option, + #[serde(default, alias = "compact_worker_max_input_tokens")] + pub worker_context_max_tokens: Option, #[serde(default)] - pub compact_auto_read_budget: Option, + pub finish_warning_remaining_tokens: Option, #[serde(default)] - pub compact_worker_max_input_tokens: Option, + pub final_reserve_tokens: Option, + #[serde(default, alias = "compact_worker_max_turns")] + pub worker_max_turns: Option, #[serde(default)] - pub compact_worker_max_turns: Option, + pub summary_target_tokens: Option, + #[serde(default)] + pub summary_max_tokens: Option, + #[serde(default, alias = 
"compact_auto_read_budget")] + pub auto_read_budget_tokens: Option, + #[serde(default)] + pub result_context_max_tokens: Option, #[serde(default)] pub model: Option, } @@ -386,22 +402,32 @@ impl CompactionConfigPartial { Self { prune_protected_tokens: upper.prune_protected_tokens.or(self.prune_protected_tokens), prune_min_savings: upper.prune_min_savings.or(self.prune_min_savings), - compact_threshold: upper.compact_threshold.or(self.compact_threshold), - compact_request_threshold: upper - .compact_request_threshold - .or(self.compact_request_threshold), - compact_retained_tokens: upper - .compact_retained_tokens - .or(self.compact_retained_tokens), - compact_auto_read_budget: upper - .compact_auto_read_budget - .or(self.compact_auto_read_budget), - compact_worker_max_input_tokens: upper - .compact_worker_max_input_tokens - .or(self.compact_worker_max_input_tokens), - compact_worker_max_turns: upper - .compact_worker_max_turns - .or(self.compact_worker_max_turns), + threshold: upper.threshold.or(self.threshold), + request_threshold: upper.request_threshold.or(self.request_threshold), + retained_tokens: upper.retained_tokens.or(self.retained_tokens), + overview_target_tokens: upper.overview_target_tokens.or(self.overview_target_tokens), + overview_warning_tokens: upper + .overview_warning_tokens + .or(self.overview_warning_tokens), + overview_deadline_tokens: upper + .overview_deadline_tokens + .or(self.overview_deadline_tokens), + worker_context_max_tokens: upper + .worker_context_max_tokens + .or(self.worker_context_max_tokens), + finish_warning_remaining_tokens: upper + .finish_warning_remaining_tokens + .or(self.finish_warning_remaining_tokens), + final_reserve_tokens: upper.final_reserve_tokens.or(self.final_reserve_tokens), + worker_max_turns: upper.worker_max_turns.or(self.worker_max_turns), + summary_target_tokens: upper.summary_target_tokens.or(self.summary_target_tokens), + summary_max_tokens: upper.summary_max_tokens.or(self.summary_max_tokens), + 
auto_read_budget_tokens: upper + .auto_read_budget_tokens + .or(self.auto_read_budget_tokens), + result_context_max_tokens: upper + .result_context_max_tokens + .or(self.result_context_max_tokens), model: merge_option(self.model, upper.model, ModelManifest::merge), } } @@ -544,20 +570,42 @@ impl TryFrom for PodManifest { .prune_protected_tokens .unwrap_or(defaults::PRUNE_PROTECTED_TOKENS), prune_min_savings: c.prune_min_savings.unwrap_or(defaults::PRUNE_MIN_SAVINGS), - compact_threshold: c.compact_threshold, - compact_request_threshold: c.compact_request_threshold, - compact_retained_tokens: c - .compact_retained_tokens + threshold: c.threshold, + request_threshold: c.request_threshold, + retained_tokens: c + .retained_tokens .unwrap_or(defaults::COMPACT_RETAINED_TOKENS), - compact_auto_read_budget: c - .compact_auto_read_budget - .unwrap_or(defaults::COMPACT_AUTO_READ_BUDGET), - compact_worker_max_input_tokens: c - .compact_worker_max_input_tokens + overview_target_tokens: c + .overview_target_tokens + .unwrap_or(defaults::COMPACT_OVERVIEW_TARGET_TOKENS), + overview_warning_tokens: c + .overview_warning_tokens + .unwrap_or(defaults::COMPACT_OVERVIEW_WARNING_TOKENS), + overview_deadline_tokens: c + .overview_deadline_tokens + .unwrap_or(defaults::COMPACT_OVERVIEW_DEADLINE_TOKENS), + worker_context_max_tokens: c + .worker_context_max_tokens .unwrap_or(defaults::COMPACT_WORKER_MAX_INPUT_TOKENS), - compact_worker_max_turns: c - .compact_worker_max_turns - .or(defaults::COMPACT_WORKER_MAX_TURNS), + finish_warning_remaining_tokens: c + .finish_warning_remaining_tokens + .unwrap_or(defaults::COMPACT_FINISH_WARNING_REMAINING_TOKENS), + final_reserve_tokens: c + .final_reserve_tokens + .unwrap_or(defaults::COMPACT_FINAL_RESERVE_TOKENS), + worker_max_turns: c.worker_max_turns.or(defaults::COMPACT_WORKER_MAX_TURNS), + summary_target_tokens: c + .summary_target_tokens + .unwrap_or(defaults::COMPACT_SUMMARY_TARGET_TOKENS), + summary_max_tokens: c + .summary_max_tokens + 
.unwrap_or(defaults::COMPACT_SUMMARY_MAX_TOKENS), + auto_read_budget_tokens: c + .auto_read_budget_tokens + .unwrap_or(defaults::COMPACT_AUTO_READ_BUDGET), + result_context_max_tokens: c + .result_context_max_tokens + .unwrap_or(defaults::COMPACT_RESULT_CONTEXT_MAX_TOKENS), model: c.model, }) }) @@ -984,7 +1032,7 @@ mod tests { fn merge_option_struct_field_wise() { let lower = PodManifestConfig { compaction: Some(CompactionConfigPartial { - compact_threshold: Some(50_000), + threshold: Some(50_000), prune_protected_tokens: Some(5_000), ..Default::default() }), @@ -992,14 +1040,14 @@ mod tests { }; let upper = PodManifestConfig { compaction: Some(CompactionConfigPartial { - compact_threshold: Some(80_000), + threshold: Some(80_000), ..Default::default() }), ..Default::default() }; let merged = lower.merge(upper); let c = merged.compaction.unwrap(); - assert_eq!(c.compact_threshold, Some(80_000)); + assert_eq!(c.threshold, Some(80_000)); // field from lower retained when upper has None assert_eq!(c.prune_protected_tokens, Some(5_000)); } @@ -1122,27 +1170,27 @@ stop_sequences = ["\n\n", ""] } #[test] - fn from_toml_accepts_compact_worker_max_turns() { + fn from_toml_accepts_worker_max_turns() { let cfg = PodManifestConfig::from_toml( r#" [compaction] -compact_worker_max_turns = 7 +worker_max_turns = 7 "#, ) .unwrap(); - assert_eq!(cfg.compaction.unwrap().compact_worker_max_turns, Some(7)); + assert_eq!(cfg.compaction.unwrap().worker_max_turns, Some(7)); } #[test] - fn try_from_compaction_defaults_compact_worker_max_turns() { + fn try_from_compaction_defaults_worker_max_turns() { let mut cfg = minimal_valid(); cfg.compaction = Some(CompactionConfigPartial::default()); let manifest = PodManifest::try_from(cfg).unwrap(); assert_eq!( - manifest.compaction.unwrap().compact_worker_max_turns, + manifest.compaction.unwrap().worker_max_turns, defaults::COMPACT_WORKER_MAX_TURNS ); } diff --git a/crates/manifest/src/defaults.rs b/crates/manifest/src/defaults.rs index 
4855a4df..ce0c41ff 100644 --- a/crates/manifest/src/defaults.rs +++ b/crates/manifest/src/defaults.rs @@ -25,9 +25,23 @@ pub const PRUNE_MIN_SAVINGS: u64 = 4096; /// Token budget retained (unchanged) at the tail of the history across /// a compact. Items whose cumulative token count fits within this budget /// starting from the end are kept verbatim; the rest are summarised. -/// See [`crate::CompactionConfig::compact_retained_tokens`]. +/// See [`crate::CompactionConfig::retained_tokens`]. pub const COMPACT_RETAINED_TOKENS: u64 = 8000; +/// Target size for the deterministic compact overview/index fed to the +/// compact worker. Exceeding this target is tolerated. +/// See [`crate::CompactionConfig::overview_target_tokens`]. +pub const COMPACT_OVERVIEW_TARGET_TOKENS: u64 = 8_000; + +/// Warning threshold for compact overview/index size. Compaction continues. +/// See [`crate::CompactionConfig::overview_warning_tokens`]. +pub const COMPACT_OVERVIEW_WARNING_TOKENS: u64 = 16_000; + +/// Hard deterministic-overview deadline. When exceeded, overview generation +/// falls back to a coarser index before the compact worker is started. +/// See [`crate::CompactionConfig::overview_deadline_tokens`]. +pub const COMPACT_OVERVIEW_DEADLINE_TOKENS: u64 = 40_000; + /// Default instruction asset reference used when `worker.instruction` /// is omitted. See the `PromptLoader` prefix addressing scheme for the /// `$insomnia/` / `$user/` / `$workspace/` namespaces. @@ -42,19 +56,39 @@ pub const WORKER_LANGUAGE: &str = /// session after compaction. Limits how much raw file text the /// compact worker can pull into the compacted context via /// `mark_read_required`. See -/// [`crate::CompactionConfig::compact_auto_read_budget`]. +/// [`crate::CompactionConfig::auto_read_budget_tokens`]. pub const COMPACT_AUTO_READ_BUDGET: u64 = 8000; /// Current prompt-occupancy cap for the compact worker's own LLM /// calls. Exceeding this aborts the compact run (circuit-breaker -/// path). 
See -/// [`crate::CompactionConfig::compact_worker_max_input_tokens`]. +/// path). See [`crate::CompactionConfig::worker_context_max_tokens`]. pub const COMPACT_WORKER_MAX_INPUT_TOKENS: u64 = 50_000; +/// Remaining compact-worker context threshold that triggers an instruction +/// to stop exploring and call `write_summary`. +/// See [`crate::CompactionConfig::finish_warning_remaining_tokens`]. +pub const COMPACT_FINISH_WARNING_REMAINING_TOKENS: u64 = 8_000; + +/// Context reserve preserved for final summary/tool closing turns. +/// See [`crate::CompactionConfig::final_reserve_tokens`]. +pub const COMPACT_FINAL_RESERVE_TOKENS: u64 = 4_000; + /// Optional maximum compact-worker tool-loop depth. `None` means unlimited. -/// See [`crate::CompactionConfig::compact_worker_max_turns`]. +/// See [`crate::CompactionConfig::worker_max_turns`]. pub const COMPACT_WORKER_MAX_TURNS: Option = Some(20); +/// Target size for the `write_summary` text. Used in prompt/nudge text. +/// See [`crate::CompactionConfig::summary_target_tokens`]. +pub const COMPACT_SUMMARY_TARGET_TOKENS: u64 = 2_000; + +/// Hard validation cap for the final `write_summary` text. +/// See [`crate::CompactionConfig::summary_max_tokens`]. +pub const COMPACT_SUMMARY_MAX_TOKENS: u64 = 4_000; + +/// Dry-run cap for the compacted session's initial request context. +/// See [`crate::CompactionConfig::result_context_max_tokens`]. +pub const COMPACT_RESULT_CONTEXT_MAX_TOKENS: u64 = 60_000; + /// Number of recently-touched files fed to the compact worker as /// default references. pub const COMPACT_DEFAULT_REFERENCE_COUNT: usize = 5; diff --git a/crates/manifest/src/lib.rs b/crates/manifest/src/lib.rs index 2ad132cf..abe9d58e 100644 --- a/crates/manifest/src/lib.rs +++ b/crates/manifest/src/lib.rs @@ -363,8 +363,8 @@ pub struct CompactionConfig { /// Checked by the Controller after each run. When current occupancy /// exceeds this value, compact runs before the next turn. `None` /// disables the between-turns check. 
- #[serde(default)] - pub compact_threshold: Option, + #[serde(default, alias = "compact_threshold")] + pub threshold: Option, /// Safety-net (between-requests) compaction threshold. /// @@ -373,32 +373,76 @@ pub struct CompactionConfig { /// Controller can compact before the next LLM request. `None` /// disables the between-requests check. /// - /// Expected relation: `compact_threshold < compact_request_threshold` - /// (proactive triggers before safety net). A reversed configuration - /// is accepted but logged as a warning. - #[serde(default)] - pub compact_request_threshold: Option, + /// Expected relation: `threshold < request_threshold` (proactive triggers + /// before safety net). A reversed configuration is accepted but logged as + /// a warning. + #[serde(default, alias = "compact_request_threshold")] + pub request_threshold: Option, /// Token budget retained verbatim at the tail of the history after /// compaction. Measured against the occupancy estimate from /// `UsageRecord` history; turn boundaries are ignored. - #[serde(default = "default_compact_retained_tokens")] - pub compact_retained_tokens: u64, + #[serde(default = "default_retained_tokens", alias = "compact_retained_tokens")] + pub retained_tokens: u64, - /// Aggregate token budget for auto-read file contents injected into - /// the compacted session by the compact worker. - #[serde(default = "default_compact_auto_read_budget")] - pub compact_auto_read_budget: u64, + /// Target size for the deterministic overview/index fed to the compact + /// worker. Overshooting this target is not an error. + #[serde(default = "default_overview_target_tokens")] + pub overview_target_tokens: u64, + + /// Warning threshold for deterministic overview/index size. + #[serde(default = "default_overview_warning_tokens")] + pub overview_warning_tokens: u64, + + /// Deadline threshold for deterministic overview/index generation. + /// Oversized overviews fall back to a coarser deterministic index. 
+ #[serde(default = "default_overview_deadline_tokens")] + pub overview_deadline_tokens: u64, /// Current prompt-occupancy cap for the compact worker's own LLM /// requests. Exceeding this aborts the compact run. - #[serde(default = "default_compact_worker_max_input_tokens")] - pub compact_worker_max_input_tokens: u64, + #[serde( + default = "default_worker_context_max_tokens", + alias = "compact_worker_max_input_tokens" + )] + pub worker_context_max_tokens: u64, + + /// Remaining compact-worker context threshold that triggers a warning and + /// an instruction to stop exploring and call `write_summary`. + #[serde(default = "default_finish_warning_remaining_tokens")] + pub finish_warning_remaining_tokens: u64, + + /// Context reserve preserved for final summary/tool closing turns. + #[serde(default = "default_final_reserve_tokens")] + pub final_reserve_tokens: u64, /// Optional maximum compact-worker tool-loop depth. `None` leaves the /// worker unlimited; the default bounds runaway short-context loops. - #[serde(default = "default_compact_worker_max_turns")] - pub compact_worker_max_turns: Option, + #[serde( + default = "default_worker_max_turns", + alias = "compact_worker_max_turns" + )] + pub worker_max_turns: Option, + + /// Target size for the `write_summary` text. Used in prompt/nudge text. + #[serde(default = "default_summary_target_tokens")] + pub summary_target_tokens: u64, + + /// Hard validation cap for the final `write_summary` text. + #[serde(default = "default_summary_max_tokens")] + pub summary_max_tokens: u64, + + /// Aggregate token budget for auto-read file contents injected into + /// the compacted session by the compact worker. + #[serde( + default = "default_auto_read_budget_tokens", + alias = "compact_auto_read_budget" + )] + pub auto_read_budget_tokens: u64, + + /// Dry-run cap for the compacted session's initial request context. 
+ #[serde(default = "default_result_context_max_tokens")] + pub result_context_max_tokens: u64, /// Optional model for the compactor (summary) LLM. /// If omitted, the main model is cloned via `clone_boxed()`. @@ -412,30 +456,62 @@ fn default_prune_protected_tokens() -> u64 { fn default_prune_min_savings() -> u64 { defaults::PRUNE_MIN_SAVINGS } -fn default_compact_retained_tokens() -> u64 { +fn default_retained_tokens() -> u64 { defaults::COMPACT_RETAINED_TOKENS } -fn default_compact_auto_read_budget() -> u64 { - defaults::COMPACT_AUTO_READ_BUDGET +fn default_overview_target_tokens() -> u64 { + defaults::COMPACT_OVERVIEW_TARGET_TOKENS } -fn default_compact_worker_max_input_tokens() -> u64 { +fn default_overview_warning_tokens() -> u64 { + defaults::COMPACT_OVERVIEW_WARNING_TOKENS +} +fn default_overview_deadline_tokens() -> u64 { + defaults::COMPACT_OVERVIEW_DEADLINE_TOKENS +} +fn default_worker_context_max_tokens() -> u64 { defaults::COMPACT_WORKER_MAX_INPUT_TOKENS } -fn default_compact_worker_max_turns() -> Option { +fn default_finish_warning_remaining_tokens() -> u64 { + defaults::COMPACT_FINISH_WARNING_REMAINING_TOKENS +} +fn default_final_reserve_tokens() -> u64 { + defaults::COMPACT_FINAL_RESERVE_TOKENS +} +fn default_worker_max_turns() -> Option { defaults::COMPACT_WORKER_MAX_TURNS } +fn default_summary_target_tokens() -> u64 { + defaults::COMPACT_SUMMARY_TARGET_TOKENS +} +fn default_summary_max_tokens() -> u64 { + defaults::COMPACT_SUMMARY_MAX_TOKENS +} +fn default_auto_read_budget_tokens() -> u64 { + defaults::COMPACT_AUTO_READ_BUDGET +} +fn default_result_context_max_tokens() -> u64 { + defaults::COMPACT_RESULT_CONTEXT_MAX_TOKENS +} impl Default for CompactionConfig { fn default() -> Self { Self { prune_protected_tokens: default_prune_protected_tokens(), prune_min_savings: default_prune_min_savings(), - compact_threshold: None, - compact_request_threshold: None, - compact_retained_tokens: default_compact_retained_tokens(), - compact_auto_read_budget: 
default_compact_auto_read_budget(), - compact_worker_max_input_tokens: default_compact_worker_max_input_tokens(), - compact_worker_max_turns: default_compact_worker_max_turns(), + threshold: None, + request_threshold: None, + retained_tokens: default_retained_tokens(), + overview_target_tokens: default_overview_target_tokens(), + overview_warning_tokens: default_overview_warning_tokens(), + overview_deadline_tokens: default_overview_deadline_tokens(), + worker_context_max_tokens: default_worker_context_max_tokens(), + finish_warning_remaining_tokens: default_finish_warning_remaining_tokens(), + final_reserve_tokens: default_final_reserve_tokens(), + worker_max_turns: default_worker_max_turns(), + summary_target_tokens: default_summary_target_tokens(), + summary_max_tokens: default_summary_max_tokens(), + auto_read_budget_tokens: default_auto_read_budget_tokens(), + result_context_max_tokens: default_result_context_max_tokens(), model: None, } } @@ -592,15 +668,15 @@ model_id = "claude-sonnet-4-20250514" #[test] fn parse_compaction_config() { - let toml = format!("{MINIMAL_REQUIRED}\n[compaction]\ncompact_threshold = 80000\n"); + let toml = format!("{MINIMAL_REQUIRED}\n[compaction]\nthreshold = 80000\n"); let manifest = PodManifest::from_toml(&toml).unwrap(); let c = manifest.compaction.unwrap(); assert_eq!(c.prune_protected_tokens, 8000); assert_eq!(c.prune_min_savings, 4096); - assert_eq!(c.compact_threshold, Some(80000)); - assert_eq!(c.compact_request_threshold, None); - assert_eq!(c.compact_retained_tokens, 8000); - assert_eq!(c.compact_worker_max_turns, Some(20)); + assert_eq!(c.threshold, Some(80000)); + assert_eq!(c.request_threshold, None); + assert_eq!(c.retained_tokens, 8000); + assert_eq!(c.worker_max_turns, Some(20)); } #[test] @@ -618,11 +694,11 @@ model_id = "claude-sonnet-4-20250514" let toml = format!( "{MINIMAL_REQUIRED}\n\ [compaction]\n\ - compact_worker_max_turns = 7\n" + worker_max_turns = 7\n" ); let manifest = 
PodManifest::from_toml(&toml).unwrap(); let c = manifest.compaction.unwrap(); - assert_eq!(c.compact_worker_max_turns, Some(7)); + assert_eq!(c.worker_max_turns, Some(7)); } #[test] @@ -630,13 +706,13 @@ model_id = "claude-sonnet-4-20250514" let toml = format!( "{MINIMAL_REQUIRED}\n\ [compaction]\n\ - compact_threshold = 80000\n\ - compact_request_threshold = 90000\n" + threshold = 80000\n\ + request_threshold = 90000\n" ); let manifest = PodManifest::from_toml(&toml).unwrap(); let c = manifest.compaction.unwrap(); - assert_eq!(c.compact_threshold, Some(80000)); - assert_eq!(c.compact_request_threshold, Some(90000)); + assert_eq!(c.threshold, Some(80000)); + assert_eq!(c.request_threshold, Some(90000)); } #[test] @@ -644,12 +720,12 @@ model_id = "claude-sonnet-4-20250514" let toml = format!( "{MINIMAL_REQUIRED}\n\ [compaction]\n\ - compact_request_threshold = 90000\n" + request_threshold = 90000\n" ); let manifest = PodManifest::from_toml(&toml).unwrap(); let c = manifest.compaction.unwrap(); - assert_eq!(c.compact_threshold, None); - assert_eq!(c.compact_request_threshold, Some(90000)); + assert_eq!(c.threshold, None); + assert_eq!(c.request_threshold, Some(90000)); } #[test] @@ -657,7 +733,7 @@ model_id = "claude-sonnet-4-20250514" let toml = format!( "{MINIMAL_REQUIRED}\n\ [compaction]\n\ - compact_threshold = 80000\n\n\ + threshold = 80000\n\n\ [compaction.model]\n\ scheme = \"gemini\"\n\ model_id = \"gemini-2.0-flash\"\n" diff --git a/crates/manifest/src/paths.rs b/crates/manifest/src/paths.rs index f8cccc75..f17ae616 100644 --- a/crates/manifest/src/paths.rs +++ b/crates/manifest/src/paths.rs @@ -281,10 +281,7 @@ mod tests { ("HOME", Some("/h")), ("XDG_RUNTIME_DIR", Some("/run/user/1000")), ]); - assert_eq!( - runtime_dir().unwrap(), - PathBuf::from("") - ); + assert_eq!(runtime_dir().unwrap(), PathBuf::from("")); } #[test] diff --git a/crates/pod/src/compact/worker.rs b/crates/pod/src/compact/worker.rs index 7660a170..7b134832 100644 --- 
a/crates/pod/src/compact/worker.rs +++ b/crates/pod/src/compact/worker.rs @@ -18,12 +18,13 @@ //! compacted session's opening system messages. use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use async_trait::async_trait; use llm_worker::Item; -use llm_worker::interceptor::{Interceptor, PreRequestAction}; -use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput}; +use llm_worker::interceptor::{Interceptor, PreRequestAction, PreToolAction, ToolCallInfo}; +use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput, ToolResult}; use serde::Deserialize; use tools::ScopedFs; @@ -246,14 +247,63 @@ pub(crate) fn write_summary_tool(ctx: Arc>) -> ToolD }) } -/// Interceptor that aborts the compact worker when its current prompt -/// occupancy estimate crosses `max_input_tokens`. The estimate uses the same -/// `UsageRecord` + `llm_worker::token_counter::total_tokens` path as the main -/// Pod compaction thresholds, so prompt-cache hits are not counted cumulatively -/// across turns. +/// Interceptor that monitors compact-worker context occupancy. +/// +/// `max_input_tokens` remains the hard circuit breaker. Before that point, +/// the interceptor can persist a system warning into worker history telling +/// the model to stop broad exploration and call `write_summary`, and can block +/// additional exploratory tool calls once the final reserve is reached. 
pub(crate) struct CompactWorkerInterceptor { pub usage_tracker: Arc, pub max_input_tokens: u64, + pub finish_warning_remaining_tokens: u64, + pub final_reserve_tokens: u64, + pub on_warning: Option>, + warning_sent: AtomicBool, + last_remaining_tokens: AtomicU64, +} + +impl CompactWorkerInterceptor { + pub(crate) fn new( + usage_tracker: Arc, + max_input_tokens: u64, + finish_warning_remaining_tokens: u64, + final_reserve_tokens: u64, + on_warning: Option>, + ) -> Self { + Self { + usage_tracker, + max_input_tokens, + finish_warning_remaining_tokens, + final_reserve_tokens, + on_warning, + warning_sent: AtomicBool::new(false), + last_remaining_tokens: AtomicU64::new(max_input_tokens), + } + } + + fn maybe_emit_warning(&self, remaining: u64) -> Option { + let warning_threshold = self.finish_warning_remaining_tokens; + let reserve_threshold = self.final_reserve_tokens; + let should_warn = (warning_threshold > 0 && remaining <= warning_threshold) + || (reserve_threshold > 0 && remaining <= reserve_threshold); + if !should_warn || self.warning_sent.swap(true, Ordering::AcqRel) { + return None; + } + + let message = format!( + "compact worker context budget is low ({remaining}/{} tokens remaining). 
\ + Stop broad exploration now, read only if absolutely necessary, then call \ + `write_summary` with the final structured summary.", + self.max_input_tokens + ); + if let Some(cb) = self.on_warning.as_ref() { + cb(message.clone()); + } + Some(Item::system_message(format!( + "[Compact worker budget warning]\n\n{message}" + ))) + } } #[async_trait] @@ -268,9 +318,31 @@ impl Interceptor for CompactWorkerInterceptor { )); } + let remaining = self.max_input_tokens.saturating_sub(estimate.tokens); + self.last_remaining_tokens + .store(remaining, Ordering::Release); + if let Some(item) = self.maybe_emit_warning(remaining) { + self.usage_tracker.note_request(context.len() + 1); + return PreRequestAction::ContinueWith(vec![item]); + } + self.usage_tracker.note_request(context.len()); PreRequestAction::Continue } + + async fn pre_tool_call(&self, info: &mut ToolCallInfo) -> PreToolAction { + if self.final_reserve_tokens == 0 || info.call.name == "write_summary" { + return PreToolAction::Continue; + } + let remaining = self.last_remaining_tokens.load(Ordering::Acquire); + if remaining > self.final_reserve_tokens { + return PreToolAction::Continue; + } + PreToolAction::SyntheticResult(ToolResult::error( + info.call.id.clone(), + "compact worker final reserve reached; do not perform more exploratory tool reads. Call `write_summary` now.", + )) + } } /// Crude bytes→tokens estimate; good enough for budget accounting. 
@@ -301,10 +373,7 @@ mod tests { #[tokio::test] async fn compact_worker_interceptor_uses_occupancy_not_cumulative_usage() { let tracker = Arc::new(UsageTracker::new()); - let interceptor = CompactWorkerInterceptor { - usage_tracker: tracker.clone(), - max_input_tokens: 150, - }; + let interceptor = CompactWorkerInterceptor::new(tracker.clone(), 150, 0, 0, None); let mut context = vec![Item::user_message("hello")]; assert!(matches!( @@ -327,13 +396,40 @@ mod tests { )); } + #[tokio::test] + async fn compact_worker_interceptor_warns_before_hard_cap() { + let tracker = Arc::new(UsageTracker::new()); + let warnings = Arc::new(Mutex::new(Vec::new())); + let captured = warnings.clone(); + let interceptor = CompactWorkerInterceptor::new( + tracker.clone(), + 150, + 60, + 20, + Some(Arc::new(move |message| { + captured.lock().unwrap().push(message); + })), + ); + let mut context = vec![Item::user_message("hello")]; + + assert!(matches!( + interceptor.pre_llm_request(&mut context).await, + PreRequestAction::Continue + )); + tracker.record_usage(&make_usage(100)); + + assert!(matches!( + interceptor.pre_llm_request(&mut context).await, + PreRequestAction::ContinueWith(items) + if items.len() == 1 && items[0].as_text().unwrap_or_default().contains("write_summary") + )); + assert_eq!(warnings.lock().unwrap().len(), 1); + } + #[tokio::test] async fn compact_worker_interceptor_cancels_when_occupancy_exceeds_cap() { let tracker = Arc::new(UsageTracker::new()); - let interceptor = CompactWorkerInterceptor { - usage_tracker: tracker.clone(), - max_input_tokens: 99, - }; + let interceptor = CompactWorkerInterceptor::new(tracker.clone(), 99, 0, 0, None); let mut context = vec![Item::user_message("hello")]; assert!(matches!( diff --git a/crates/pod/src/pod.rs b/crates/pod/src/pod.rs index 44b97ff7..046932ce 100644 --- a/crates/pod/src/pod.rs +++ b/crates/pod/src/pod.rs @@ -241,7 +241,7 @@ pub struct Pod { scope: SharedScope, hook_builder: HookRegistryBuilder, interceptor_installed: 
bool, - /// Shared compaction state (present when compact_threshold is configured). + /// Shared compaction state (present when threshold is configured). compact_state: Option>, /// Per-LLM-request Usage tracker. Always present after construction. /// Captures `(history_len, UsageEvent)` pairs during a run; drained @@ -1121,8 +1121,8 @@ impl Pod { /// Install the hook-based interceptor on the Worker if not already done. /// - /// When either compaction threshold (`compact_threshold` or - /// `compact_request_threshold`) is configured in the manifest, allocates + /// When either compaction threshold (`threshold` or + /// `request_threshold`) is configured in the manifest, allocates /// a shared [`CompactState`] and wires the interceptor to read current /// occupancy through the `UsageRecord` timeline. fn ensure_interceptor_installed(&mut self) { @@ -1141,13 +1141,7 @@ impl Pod { .manifest .compaction .as_ref() - .map(|c| { - ( - c.compact_threshold, - c.compact_request_threshold, - c.compact_retained_tokens, - ) - }) + .map(|c| (c.threshold, c.request_threshold, c.retained_tokens)) .unwrap_or((None, None, manifest::defaults::COMPACT_RETAINED_TOKENS)); let tracker_for_usage = self.usage_tracker.clone(); @@ -1161,7 +1155,7 @@ impl Pod { warn!( post_run_threshold = post, request_threshold = req, - "compact_threshold > compact_request_threshold; \ + "threshold > request_threshold; \ proactive check will never fire before the safety net" ); } @@ -2124,12 +2118,7 @@ impl Pod { let retained = state .as_ref() .map(|s| s.retained_tokens()) - .or_else(|| { - self.manifest - .compaction - .as_ref() - .map(|c| c.compact_retained_tokens) - }) + .or_else(|| self.manifest.compaction.as_ref().map(|c| c.retained_tokens)) .unwrap_or(manifest::defaults::COMPACT_RETAINED_TOKENS); let current_tokens = self.total_tokens().tokens; let cut = self.split_for_retained(retained); @@ -2324,21 +2313,49 @@ impl Pod { // Compaction-related knobs. 
Fall through to manifest defaults when // `[compaction]` is omitted entirely. - let (auto_read_budget, compact_worker_max_input_tokens, compact_worker_max_turns) = self + let ( + auto_read_budget, + worker_context_max_tokens, + finish_warning_remaining_tokens, + final_reserve_tokens, + worker_max_turns, + overview_target_tokens, + overview_warning_tokens, + overview_deadline_tokens, + summary_target_tokens, + summary_max_tokens, + result_context_max_tokens, + ) = self .manifest .compaction .as_ref() .map(|c| { ( - c.compact_auto_read_budget, - c.compact_worker_max_input_tokens, - c.compact_worker_max_turns, + c.auto_read_budget_tokens, + c.worker_context_max_tokens, + c.finish_warning_remaining_tokens, + c.final_reserve_tokens, + c.worker_max_turns, + c.overview_target_tokens, + c.overview_warning_tokens, + c.overview_deadline_tokens, + c.summary_target_tokens, + c.summary_max_tokens, + c.result_context_max_tokens, ) }) .unwrap_or(( manifest::defaults::COMPACT_AUTO_READ_BUDGET, manifest::defaults::COMPACT_WORKER_MAX_INPUT_TOKENS, + manifest::defaults::COMPACT_FINISH_WARNING_REMAINING_TOKENS, + manifest::defaults::COMPACT_FINAL_RESERVE_TOKENS, manifest::defaults::COMPACT_WORKER_MAX_TURNS, + manifest::defaults::COMPACT_OVERVIEW_TARGET_TOKENS, + manifest::defaults::COMPACT_OVERVIEW_WARNING_TOKENS, + manifest::defaults::COMPACT_OVERVIEW_DEADLINE_TOKENS, + manifest::defaults::COMPACT_SUMMARY_TARGET_TOKENS, + manifest::defaults::COMPACT_SUMMARY_MAX_TOKENS, + manifest::defaults::COMPACT_RESULT_CONTEXT_MAX_TOKENS, )); // Default references: the N most-recently-touched files in the @@ -2358,7 +2375,33 @@ impl Pod { &items_to_summarise, &default_refs, Some(task_snapshot_text.as_str()), + SummaryInputOptions { + overview_target_tokens, + overview_warning_tokens, + overview_deadline_tokens, + summary_target_tokens, + }, ); + if summary_input.warning_exceeded { + self.alert( + AlertLevel::Warn, + AlertSource::Compactor, + format!( + "compact overview is larger than expected 
(≈{} tokens; warning threshold {})", + summary_input.overview_tokens, overview_warning_tokens + ), + ); + } + if summary_input.deadline_fallback_used { + self.alert( + AlertLevel::Warn, + AlertSource::Compactor, + format!( + "compact overview exceeded deadline ({} tokens); using coarse fallback", + overview_deadline_tokens + ), + ); + } // Worker-side state collected by the compact worker's tool calls. let ctx = Arc::new(std::sync::Mutex::new(CompactWorkerContext::with_budget( @@ -2390,11 +2433,19 @@ impl Pod { tracker.record_usage(event); }); } - summary_worker.set_interceptor(CompactWorkerInterceptor { - usage_tracker: summary_usage_tracker, - max_input_tokens: compact_worker_max_input_tokens, + let compactor_warning_cb = self.alerter.clone().map(|alerter| { + Arc::new(move |message: String| { + alerter.alert(AlertLevel::Warn, AlertSource::Compactor, message); + }) as Arc }); - summary_worker.set_max_turns(compact_worker_max_turns); + summary_worker.set_interceptor(CompactWorkerInterceptor::new( + summary_usage_tracker, + worker_context_max_tokens, + finish_warning_remaining_tokens, + final_reserve_tokens, + compactor_warning_cb, + )); + summary_worker.set_max_turns(worker_max_turns); // Tools: read_file (shared scope, fresh tracker) + the three // compact-specific tools that populate `ctx`. 
@@ -2404,7 +2455,7 @@ impl Pod { summary_worker.register_tool(write_summary_tool(ctx.clone())); let out = summary_worker - .run(summary_input) + .run(summary_input.text) .await .map_err(PodError::Worker)?; let mut locked_worker = out.worker; @@ -2439,11 +2490,32 @@ impl Pod { let _ = locked_worker.run(prompt).await.map_err(PodError::Worker)?; } - let final_ctx = ctx.lock().expect("compact ctx poisoned").clone(); - let summary_text = final_ctx + let mut final_ctx = ctx.lock().expect("compact ctx poisoned").clone(); + let mut summary_text = final_ctx .summary .clone() .ok_or(PodError::CompactSummaryMissing)?; + let mut summary_tokens = estimate_text_tokens(summary_text.len()); + if summary_max_tokens > 0 && summary_tokens > summary_max_tokens { + let prompt = format!( + "Your `write_summary` output is too large (≈{summary_tokens} tokens; max \ + {summary_max_tokens}). Rewrite it now with `write_summary`, preserving the \ + same five sections but making it concise. Target ≈{summary_target_tokens} tokens." + ); + let _ = locked_worker.run(prompt).await.map_err(PodError::Worker)?; + final_ctx = ctx.lock().expect("compact ctx poisoned").clone(); + summary_text = final_ctx + .summary + .clone() + .ok_or(PodError::CompactSummaryMissing)?; + summary_tokens = estimate_text_tokens(summary_text.len()); + if summary_tokens > summary_max_tokens { + return Err(PodError::CompactSummaryTooLarge { + tokens: summary_tokens, + max: summary_max_tokens, + }); + } + } // Re-read each auto-read target via the Pod FS view. 
/// Output of [`build_summary_input`]: the prompt text for the compact worker
/// plus bookkeeping about how the embedded overview compared to its budgets.
#[derive(Debug)]
struct SummaryInputBuild {
    /// Complete prompt text (instructions, default references, task snapshot,
    /// overview) that is handed to the compact worker's `run`.
    text: String,
    /// Estimated token size of the overview that was actually embedded in
    /// `text`, i.e. measured *after* any coarse fallback was applied.
    overview_tokens: u64,
    /// `true` when the first (non-coarse) overview exceeded a non-zero
    /// `overview_warning_tokens`. This is evaluated *before* the fallback, so
    /// it can be set even when `overview_tokens` ends up below the threshold.
    warning_exceeded: bool,
    /// `true` when the first overview exceeded a non-zero
    /// `overview_deadline_tokens` and was replaced by the coarse overview.
    deadline_fallback_used: bool,
}
fn build_summary_input( items: &[Item], default_refs: &[PathBuf], task_snapshot: Option<&str>, -) -> String { - let mut out = String::new(); - out.push_str( - "Summarise the conversation below into a structured summary and nominate \ - files the next session needs.\n\n", + options: SummaryInputOptions, +) -> SummaryInputBuild { + let overview = build_summary_overview( + items, + options.overview_target_tokens, + options.overview_deadline_tokens, ); + let overview_tokens = estimate_text_tokens(overview.len()); + let warning_exceeded = + options.overview_warning_tokens > 0 && overview_tokens > options.overview_warning_tokens; + let deadline_fallback_used = + options.overview_deadline_tokens > 0 && overview_tokens > options.overview_deadline_tokens; + let overview = if deadline_fallback_used { + build_coarse_summary_overview(items, options.overview_deadline_tokens) + } else { + overview + }; + let overview_tokens = estimate_text_tokens(overview.len()); + + let mut out = String::new(); + out.push_str(&format!( + "Summarise this session into a structured summary of about {} tokens and \ + nominate files the next session needs. The conversation below is a \ + bounded overview/index, not the full transcript. Use tools to inspect \ + current files when deciding auto-read/reference output.\n\n", + options.summary_target_tokens + )); if !default_refs.is_empty() { out.push_str( "These files were touched recently in this session. 
Use `read_file` \ @@ -4045,47 +4161,166 @@ fn build_summary_input( out.push_str(task_snapshot); out.push_str("\n\n"); } - out.push_str("## Conversation\n"); - out.push_str(&build_summary_prompt(items)); + out.push_str("## Conversation overview/index\n"); + out.push_str(&overview); out.push_str("\n\nWhen you are done, call `write_summary` with the final 5-section text."); + + SummaryInputBuild { + text: out, + overview_tokens, + warning_exceeded, + deadline_fallback_used, + } +} + +fn build_summary_overview(items: &[Item], target_tokens: u64, deadline_tokens: u64) -> String { + let target_bytes = token_budget_bytes(target_tokens).max(1024); + let deadline_bytes = token_budget_bytes(deadline_tokens).max(target_bytes); + let mut out = String::new(); + write_overview_header(items, &mut out); + out.push_str("\n## Recent user/assistant/system messages\n"); + + let mut selected = Vec::new(); + let mut omitted_messages = 0usize; + for (idx, item) in items.iter().enumerate().rev() { + let Some(entry) = message_overview_entry(idx, item, 2_000) else { + continue; + }; + let projected = out + .len() + .saturating_add(selected.iter().map(String::len).sum::()) + .saturating_add(entry.len()) + .saturating_add(2); + if projected > target_bytes && !selected.is_empty() { + omitted_messages += 1; + continue; + } + selected.push(entry); + if projected >= target_bytes { + break; + } + } + selected.reverse(); + for entry in selected { + out.push_str(&entry); + out.push_str("\n\n"); + } + if omitted_messages > 0 { + out.push_str(&format!( + "[Overview omitted {omitted_messages} older message(s) to stay near target.]\n\n" + )); + } + + append_tool_index(items, &mut out, target_bytes, deadline_bytes); out } -/// Format conversation items into a text prompt for the summary Worker. -/// -/// The summary should capture decisions and user intent, not recreate code. -/// File contents and tool IO belong in auto-read / references, not in the -/// summary input. 
So this strips: -/// - `ToolCall.arguments` (keep only the tool name) -/// - `ToolResult.content` (keep only the summary line) -/// - `Reasoning` entirely (intermediate thought, superseded by decisions) -fn build_summary_prompt(items: &[Item]) -> String { - let mut lines = Vec::new(); +fn build_coarse_summary_overview(items: &[Item], deadline_tokens: u64) -> String { + let deadline_bytes = token_budget_bytes(deadline_tokens).max(1024); + let mut out = String::new(); + write_overview_header(items, &mut out); + out.push_str("\n## Coarse recent message index\n"); + for (idx, item) in items.iter().enumerate().rev() { + let Some(entry) = message_overview_entry(idx, item, 240) else { + continue; + }; + if out.len().saturating_add(entry.len()).saturating_add(2) > deadline_bytes { + break; + } + out.push_str(&entry); + out.push_str("\n\n"); + } + out +} + +fn write_overview_header(items: &[Item], out: &mut String) { + let mut messages = 0usize; + let mut tool_calls = 0usize; + let mut tool_results = 0usize; + let mut reasoning = 0usize; for item in items { match item { - Item::Message { role, content, .. } => { - let role_label = match role { - llm_worker::Role::User => "User", - llm_worker::Role::Assistant => "Assistant", - llm_worker::Role::System => "System", - }; - let text: String = content - .iter() - .map(|p| p.as_text()) - .collect::>() - .join(""); - lines.push(format!("[{role_label}] {text}")); - } - Item::ToolCall { name, .. } => { - lines.push(format!("[ToolCall] {name}")); - } - Item::ToolResult { summary, .. } => { - lines.push(format!("[ToolResult] {summary}")); - } - Item::Reasoning { .. } => {} + Item::Message { .. } => messages += 1, + Item::ToolCall { .. } => tool_calls += 1, + Item::ToolResult { .. } => tool_results += 1, + Item::Reasoning { .. 
} => reasoning += 1, } } - lines.join("\n\n") + out.push_str(&format!( + "Items summarized: {} total; {messages} message(s), {tool_calls} tool call(s), \ + {tool_results} tool result(s), {reasoning} reasoning item(s). Tool call \ + arguments, tool result full content, and reasoning bodies are omitted from \ + this initial input.\n", + items.len() + )); +} + +fn append_tool_index(items: &[Item], out: &mut String, target_bytes: usize, deadline_bytes: usize) { + let mut entries = Vec::new(); + for (idx, item) in items.iter().enumerate().rev() { + match item { + Item::ToolCall { name, .. } => entries.push(format!("[{idx} ToolCall] {name}")), + Item::ToolResult { summary, .. } => entries.push(format!( + "[{idx} ToolResult] {}", + truncate_chars(summary, 240) + )), + _ => {} + } + if entries.len() >= 24 { + break; + } + } + if entries.is_empty() { + return; + } + entries.reverse(); + out.push_str("## Recent tool index (content omitted)\n"); + for entry in entries { + let projected = out.len().saturating_add(entry.len()).saturating_add(1); + if projected > deadline_bytes || (projected > target_bytes && out.contains("ToolResult")) { + out.push_str("[Additional tool index entries omitted.]\n"); + break; + } + out.push_str(&entry); + out.push('\n'); + } +} + +fn message_overview_entry(idx: usize, item: &Item, max_chars: usize) -> Option { + let Item::Message { role, content, .. 
/// Return `text` limited to at most `max_chars` Unicode scalar values.
///
/// Text that already fits is returned unchanged. Longer text is cut after
/// `max_chars` characters (always on a character boundary) and suffixed with
/// `"… [truncated]"`.
fn truncate_chars(text: &str, max_chars: usize) -> String {
    const SUFFIX: &str = "… [truncated]";

    // `nth(max_chars)` yields the first character that does NOT fit, if any.
    // This stops scanning at the cut point instead of counting every character
    // of a potentially very long text and then walking it a second time.
    match text.char_indices().nth(max_chars) {
        None => text.to_string(),
        Some((cut, _)) => {
            let mut out = String::with_capacity(cut + SUFFIX.len());
            out.push_str(&text[..cut]);
            out.push_str(SUFFIX);
            out
        }
    }
}
read_file"); + let prompt = test_summary_input(&items); + assert!(prompt.contains("[0 ToolCall] read_file")); assert!(!prompt.contains("src/main.rs")); } @@ -4428,8 +4684,8 @@ mod build_summary_prompt_tests { "read 3 lines", "fn main() { println!(\"hello\"); }", )]; - let prompt = build_summary_prompt(&items); - assert_eq!(prompt, "[ToolResult] read 3 lines"); + let prompt = test_summary_input(&items); + assert!(prompt.contains("[0 ToolResult] read 3 lines")); assert!(!prompt.contains("println")); } @@ -4440,13 +4696,50 @@ mod build_summary_prompt_tests { Item::reasoning("internal deliberation"), Item::assistant_message("hello"), ]; - let prompt = build_summary_prompt(&items); - assert!(prompt.contains("[User] hi")); - assert!(prompt.contains("[Assistant] hello")); + let prompt = test_summary_input(&items); + assert!(prompt.contains("[0 User] hi")); + assert!(prompt.contains("[2 Assistant] hello")); assert!(!prompt.contains("Reasoning")); assert!(!prompt.contains("deliberation")); } + #[test] + fn overview_warning_does_not_drop_input() { + let items = vec![Item::user_message("x".repeat(4_000))]; + let built = build_summary_input( + &items, + &[], + None, + SummaryInputOptions { + overview_target_tokens: 10, + overview_warning_tokens: 100, + overview_deadline_tokens: 2_000, + summary_target_tokens: 256, + }, + ); + assert!(built.warning_exceeded); + assert!(!built.deadline_fallback_used); + assert!(built.text.contains("[0 User]")); + } + + #[test] + fn overview_deadline_falls_back_to_coarse_index() { + let items = vec![Item::user_message("x".repeat(4_000))]; + let built = build_summary_input( + &items, + &[], + None, + SummaryInputOptions { + overview_target_tokens: 10, + overview_warning_tokens: 10, + overview_deadline_tokens: 100, + summary_target_tokens: 256, + }, + ); + assert!(built.deadline_fallback_used); + assert!(built.text.contains("## Coarse recent message index")); + } + #[test] fn worker_manifest_generation_settings_become_request_config() { let 
manifest = WorkerManifest { @@ -4478,8 +4771,9 @@ mod build_summary_prompt_tests { Item::user_message("fix the bug"), Item::assistant_message("done"), ]; - let prompt = build_summary_prompt(&items); - assert_eq!(prompt, "[User] fix the bug\n\n[Assistant] done"); + let prompt = test_summary_input(&items); + assert!(prompt.contains("[0 User] fix the bug")); + assert!(prompt.contains("[1 Assistant] done")); } #[derive(Clone)] diff --git a/docs/compaction.md b/docs/compaction.md index 63da259f..cb63f47b 100644 --- a/docs/compaction.md +++ b/docs/compaction.md @@ -72,11 +72,11 @@ pub struct ToolOutput { **ターンの合間が proactive (小さい閾値)**: turn が完了した地点はタスクの自然な区切り。ここで先を見越して早めに compact する。 -マニフェストの `compact_threshold` が対応。 +マニフェストの `threshold` が対応。 **リクエストの合間は safety net (大きい閾値)**: turn 内部でリクエストの合間にチェックするのは「暴走的に膨張した場合のみ止める」用途。 -マニフェストの `compact_request_threshold` が対応。通常は発動しない。 +マニフェストの `request_threshold` が対応。通常は発動しない。 **両閾値は manifest で個別指定する**。過去の設計では 9/8 倍で自動導出していたが、 比率に根拠がなかったため廃止。両方が `Option` で、片方だけの設定も可能 @@ -137,15 +137,29 @@ compact は fork と同じ構造。旧セッションを保全し、新 SessionI ```toml [compaction] -compact_threshold = 80000 # ターンの合間 (proactive) -compact_request_threshold = 90000 # リクエストの合間 (safety net) -prune_protected_tokens = 8000 # prune から保護する末尾 token budget -compact_retained_tokens = 8000 # compact 後に生のまま残す末尾 token budget -compact_auto_read_budget = 8000 # compact worker の mark_read_required 合計上限 -compact_worker_max_input_tokens = 50000 # compact worker 自身の現在占有トークン上限 -compact_worker_max_turns = 20 # compact worker 自身の tool loop 上限 +threshold = 80000 # ターンの合間 (proactive) +request_threshold = 90000 # リクエストの合間 (safety net) +prune_protected_tokens = 8000 # prune から保護する末尾 token budget +retained_tokens = 8000 # compact 後に生のまま残す末尾 token budget + +overview_target_tokens = 8000 # compact worker 初期 overview の通常目標 +overview_warning_tokens = 16000 # 超えたら警告・trace、compact は続行 +overview_deadline_tokens = 40000 # 超えたら粗い overview へ fallback + +worker_context_max_tokens = 50000 # compact worker 
session 全体の hard limit +finish_warning_remaining_tokens = 8000 # 残りが少ないため write_summary へ進める勧告 +final_reserve_tokens = 4000 # 最終 summary/closing turn 用 reserve +worker_max_turns = 20 # compact worker 自身の tool loop 上限 + +summary_target_tokens = 1500 # write_summary の目標サイズ +summary_max_tokens = 3000 # write_summary の hard validation +auto_read_budget_tokens = 8000 # compact 後に注入する file content 合計上限 +result_context_max_tokens = 24000 # 新 session 初期 context の dry-run validation ``` +`compact_*` prefix の旧 key は互換 alias として読み取るが、`[compaction]` 内の新規 key は prefix なしを正とする。 +初期 overview の target/warning は効率のための目安で、通常は hard error にしない。deadline 超過時も、可能なら deterministic に粗い overview へ fallback して compact の完走を優先する。 + ### Auto-Read とリファレンス 2段階のファイル参照: @@ -176,8 +190,9 @@ auto-read も通常の history 内 system message なので、将来の Prune/Co ## compact worker -要約生成とファイル選定を行う使い捨て Worker。ツールなし・1リクエストの現行実装から、 -ツール付きマルチターンに改善する。 +要約生成とファイル選定を行う使い捨て Worker。Pod は compact 対象 prefix を全文投入せず、User / Assistant / System を優先した bounded overview と tool index を初期 input として渡す。Tool call arguments、tool result full content、reasoning body は初期 input には載せない。 + +初期 overview は `overview_target_tokens` を目標にする。`overview_warning_tokens` を超えた場合は警告・trace を記録して続行し、`overview_deadline_tokens` を超えた場合は粗い deterministic overview へ fallback する。Compact の目的は完走なので、初期 input が少し大きいだけでは hard error にしない。 ### ツール @@ -192,13 +207,19 @@ write_summary(text) — 構造化要約を出力/上書き 1. Pod が `tools::Tracker::recent_files(5)` で最近触られたファイルを抽出(デフォルトリファレンス) 2. compact worker にプロンプトとして渡す: - - pruned history(summary only、arguments/reasoning 除去) + - bounded overview / index(User / Assistant / System 優先) - デフォルトリファレンスの一覧 + - TaskStore snapshot 3. compact worker が自律的に: - read_file で各ファイルを読み、必要性を判断 - mark_read_required / add_reference で指定 - write_summary で構造化要約を出力(呼び直し可) -4. ターン終了時に write_summary 未呼び出し or read_required 空(かつファイル操作履歴がある場合)→ 追加プロンプトで促す +4. 
CompactWorkerInterceptor が worker session 全体の context occupancy を監視する: + - `finish_warning_remaining_tokens` 到達時に「探索を切り上げて write_summary へ進め」と Worker history に永続化される warning を挿入し、人間向け warning も出す + - `final_reserve_tokens` を割った後は `write_summary` 以外の探索 tool call に synthetic error を返し、最終 summary の余白を守る + - `worker_context_max_tokens` 超過は最後の hard stop +5. ターン終了時に write_summary 未呼び出し or read_required 空(かつファイル操作履歴がある場合)→ 追加プロンプトで促す +6. `summary_max_tokens` と `result_context_max_tokens` で compact 結果を検証してから新 session を作る ### 構造化要約の要件 diff --git a/resources/prompts/internal/compact_system.md b/resources/prompts/internal/compact_system.md index 2951033b..fff10a16 100644 --- a/resources/prompts/internal/compact_system.md +++ b/resources/prompts/internal/compact_system.md @@ -1,13 +1,16 @@ You are a context compaction assistant. Your job is to hand the next session a structured summary plus pointers to the files it actually needs — not a narrative transcript of the conversation. +The conversation input is a bounded overview/index, not the full transcript. Treat tool result bodies and reasoning as intentionally omitted unless a tool exposes more detail. If you receive a compact worker budget warning, stop broad exploration immediately, read only if absolutely necessary, and call `write_summary`. + ## Workflow -1. Use `read_file` to inspect referenced files before deciding what the next session needs. Prefer skimming over blind inclusion. -2. For files whose current contents are load-bearing for the active work, call `mark_read_required` to inject them into the next session. These count against the auto-read token budget — spend it deliberately. -3. For files the next session should know about but can fetch on demand, call `add_reference` to record the path without embedding contents. -4. Finish with `write_summary` carrying the final text. You may call it multiple times; only the last call is kept. +1. Read the provided overview/index and current TaskStore snapshot. +2. 
Use `read_file` to inspect referenced files before deciding what the next session needs. Prefer skimming over blind inclusion. +3. For files whose current contents are load-bearing for the active work, call `mark_read_required` to inject them into the next session. These count against the auto-read token budget — spend it deliberately. +4. For files the next session should know about but can fetch on demand, call `add_reference` to record the path without embedding contents. +5. Finish with `write_summary` carrying the final text. You may call it multiple times; only the last call is kept. -Stop nominating and close out with `write_summary` as soon as the auto-read budget is exhausted, or whenever further nominations would not change the next session's next step. +Stop nominating and close out with `write_summary` as soon as the auto-read budget is exhausted, when a compact worker budget warning arrives, or whenever further exploration would not change the next session's next step. ## Summary format @@ -36,4 +39,4 @@ Produce the summary in this exact format: ## Constraints - Keep code snippets and raw tool output OUT of the summary — that is what auto-read and references are for. -- Target 1000–2000 tokens for the summary text itself. +- Follow the summary target stated in the run input; if asked to shrink, call `write_summary` again with a shorter version.