yoi/crates/pod/src/spawn/tool.rs

572 lines
21 KiB
Rust

//! `SpawnPod` tool — launch a new Pod process as a child of this one.
//!
//! Wires pod-registry delegation, child manifest-config construction, subprocess
//! launch, and socket handoff into a single `Tool` implementation. When
//! the LLM calls `SpawnPod`, a fresh `insomnia-pod` binary is exec'd in its own
//! process group, the pod-registry is updated atomically, and the child's
//! first turn is kicked off by handing its socket a `Method::Run`.
use std::path::{Path, PathBuf};
use std::process::Stdio;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput};
use manifest::{
ModelManifest, Permission, PodManifestConfig, PodMetaConfig, ScopeConfig, ScopeRule,
SharedScope, WorkerManifestConfig,
};
use serde::Deserialize;
use session_store::PodScopeSnapshot;
use tokio::net::UnixStream;
use tokio::process::Command;
use tokio::time::sleep;
use crate::ipc::event;
use crate::runtime::dir::SpawnedPodRecord;
use crate::runtime::pod_registry::{self, LockFileGuard, ScopeLockError};
use crate::spawn::comm_tools::{SendRunError, send_run_and_confirm};
use crate::spawn::registry::SpawnedPodRegistry;
use protocol::PodEvent;
const DESCRIPTION: &str = "Spawn a new Pod process to work on a delegated task. \
The spawner's write scope is reduced by the scope passed here; the spawned \
Pod receives its own socket and starts running `task` immediately. The \
spawned Pod outlives the spawner's current turn and can be contacted again \
through its socket path.";
const DEFAULT_INSTRUCTION: &str = "$insomnia/default";
/// How long we will wait for the spawned Pod's socket to become
/// connectable before treating the spawn as failed.
const SOCKET_WAIT_TIMEOUT: Duration = Duration::from_secs(10);
#[derive(Debug, Deserialize, schemars::JsonSchema)]
struct SpawnPodInput {
/// Identifier for the spawned Pod. Must be unique machine-wide.
name: String,
/// Instruction-file reference (e.g. `$insomnia/default`, `$user/my-agent`).
#[serde(default)]
instruction: Option<String>,
/// First message sent to the spawned Pod via `Method::Run`.
task: String,
/// Allow rules delegated to the spawned Pod. Must be a subset of the
/// spawner's effective write scope.
scope: Vec<ScopeRuleInput>,
}
#[derive(Debug, Deserialize, schemars::JsonSchema)]
struct ScopeRuleInput {
/// Absolute target path. Relative paths are rejected.
target: PathBuf,
/// `"read"` or `"write"`.
permission: PermissionInput,
/// When `false`, the rule matches the target itself and its direct
/// children only. Defaults to `true`.
#[serde(default = "default_true")]
recursive: bool,
}
#[derive(Debug, Deserialize, schemars::JsonSchema, Clone, Copy)]
#[serde(rename_all = "lowercase")]
enum PermissionInput {
Read,
Write,
}
fn default_true() -> bool {
true
}
impl From<PermissionInput> for Permission {
fn from(p: PermissionInput) -> Self {
match p {
PermissionInput::Read => Permission::Read,
PermissionInput::Write => Permission::Write,
}
}
}
/// Runtime dependencies the `SpawnPod` tool needs in order to launch a
/// child Pod and record the handoff locally. Constructed by the Pod
/// controller once per Pod lifetime.
pub struct SpawnPodTool {
/// Spawner's own pod name — becomes the spawned Pod's
/// `delegated_from` in the pod-registry.
spawner_name: String,
/// Path to the spawner's Unix socket. Handed to the child via
/// `--callback` so its `PodEvent` callbacks have somewhere to land.
callback_socket: PathBuf,
/// Root of the `$XDG_RUNTIME_DIR/insomnia/` tree, used to predict
/// the spawned Pod's socket path before the child has bound it.
runtime_base: PathBuf,
/// Directory the spawned Pod should run in when the LLM did not
/// override it. Defaults to the spawner's pwd — see module docs.
spawner_pwd: PathBuf,
/// Shared registry of spawned children, also used by the
/// pod-comm tools (`SendToPod` / `ReadPodOutput` / `StopPod` /
/// `ListPods`). Writes the list to runtime and durable Pod state on
/// each add.
registry: Arc<SpawnedPodRegistry>,
/// THIS Pod's own parent-callback socket, if any. After a
/// successful spawn we fire `PodEvent::ScopeSubDelegated` upward
/// so the grandparent can register the grandchild directly.
/// `None` for top-level Pods — in that case the re-emission is a
/// no-op.
parent_socket: Option<PathBuf>,
/// Spawner's resolved provider config — copied into every spawned
/// Pod's internal manifest config so the child does not need its own provider
/// configuration. Per-spawn override is
/// out of scope here (see `tickets/spawn-inherit-provider.md`).
spawner_model: ModelManifest,
/// Spawner's runtime scope. After a successful spawn, the
/// `Permission::Write` rules in the delegated scope are revoked
/// from the spawner's in-memory view (a `deny(Write, target)` is
/// pushed on top, downgrading the spawner's effective access on
/// those paths to `Read`). Mirrors the pod-registry's
/// `effective_write` semantics: Write is the only permission
/// tracked across Pods, so revocation only touches Write.
spawner_scope: SharedScope,
/// Called after the spawner scope has been updated so the new
/// effective scope can be persisted to the session log.
scope_changed: Arc<dyn Fn(PodScopeSnapshot) + Send + Sync>,
}
impl SpawnPodTool {
pub fn new(
spawner_name: String,
callback_socket: PathBuf,
runtime_base: PathBuf,
spawner_pwd: PathBuf,
registry: Arc<SpawnedPodRegistry>,
parent_socket: Option<PathBuf>,
spawner_model: ModelManifest,
spawner_scope: SharedScope,
scope_changed: Arc<dyn Fn(PodScopeSnapshot) + Send + Sync>,
) -> Self {
Self {
spawner_name,
callback_socket,
runtime_base,
spawner_pwd,
registry,
parent_socket,
spawner_model,
spawner_scope,
scope_changed,
}
}
}
#[async_trait]
impl Tool for SpawnPodTool {
async fn execute(&self, input_json: &str) -> Result<ToolOutput, ToolError> {
let input: SpawnPodInput = serde_json::from_str(input_json)
.map_err(|e| ToolError::InvalidArgument(format!("invalid SpawnPod input: {e}")))?;
// `delegate_scope` catches this too (as `DuplicatePodName`), but
// the dedicated message is kinder to the LLM — which gets the
// error back verbatim — than the generic duplicate-name error.
if input.name == self.spawner_name {
return Err(ToolError::InvalidArgument(format!(
"spawned pod name `{}` collides with spawner's own name",
input.name
)));
}
let scope_allow = parse_scope(&input.scope)?;
let instruction = input
.instruction
.clone()
.unwrap_or_else(|| DEFAULT_INSTRUCTION.to_string());
let predicted_socket = self.runtime_base.join(&input.name).join("sock");
let lock_path = pod_registry::default_registry_path()
.map_err(|e| ToolError::ExecutionFailed(format!("pod-registry path: {e}")))?;
// Reserve the allocation up front. Spawner's pid is a live
// placeholder; the child will rewrite it via `adopt_allocation`.
{
let mut guard = LockFileGuard::open(&lock_path)
.map_err(|e| ToolError::ExecutionFailed(format!("pod-registry open: {e}")))?;
pod_registry::delegate_scope(
&mut guard,
&self.spawner_name,
input.name.clone(),
std::process::id(),
predicted_socket.clone(),
scope_allow.clone(),
)
.map_err(pod_registry_err_to_tool)?;
}
// `start_outcome` covers steps that happen before the child is
// observably alive (exec + socket bind). Once its socket is
// listening, the child owns the allocation and we must not roll
// it back — even if later steps (Method::Run delivery, record
// write) fail, the child is running and will release its own
// entry on exit.
let spawn_config_json = match build_spawn_config_json(
&input.name,
&instruction,
&scope_allow,
&self.spawner_model,
) {
Ok(s) => s,
Err(e) => {
self.release_reservation(&lock_path, &input.name);
return Err(ToolError::ExecutionFailed(format!(
"spawn config serialisation: {e}"
)));
}
};
let start_outcome = self
.exec_child(&input.name, &spawn_config_json, &predicted_socket)
.await;
if let Err(e) = start_outcome {
self.release_reservation(&lock_path, &input.name);
return Err(e);
}
// Child is live. Post-start errors propagate but do not roll
// back the scope allocation — the child already owns it.
//
// Mirror that ownership transfer in the spawner's in-memory
// scope: every `Permission::Write` rule in the delegated scope
// is shadowed by a `deny(Write, target)` so subsequent tool
// calls (Edit/Write) on the delegated paths fail with
// `ReadOnly`. Read access is left intact — the registry only
// arbitrates Write, and keeping Read lets the spawner observe
// the child's intermediate output through Read/Glob/Grep.
let revoke_write: Vec<ScopeRule> = scope_allow
.iter()
.filter(|r| r.permission == Permission::Write)
.cloned()
.collect();
if !revoke_write.is_empty() {
self.spawner_scope
.update(|cur| cur.with_added_deny_rules(revoke_write.clone()))
.map_err(|e| ToolError::ExecutionFailed(format!("revoke spawner scope: {e}")))?;
let current = self.spawner_scope.snapshot();
(self.scope_changed)(PodScopeSnapshot {
allow: current.allow_rules(),
deny: current.deny_rules(),
});
}
let record = SpawnedPodRecord {
pod_name: input.name.clone(),
socket_path: predicted_socket.clone(),
scope_delegated: scope_allow.clone(),
callback_address: self.callback_socket.clone(),
};
self.registry
.add(record)
.await
.map_err(|e| ToolError::ExecutionFailed(format!("write spawned pod registry: {e}")))?;
// Notify this Pod's own parent so the grandparent can register
// the new grandchild directly. Fire-and-forget; top-level Pods
// (with no parent) skip the send inside `fire_and_forget`.
event::fire_and_forget(
self.parent_socket.clone(),
PodEvent::ScopeSubDelegated {
parent_pod: self.spawner_name.clone(),
sub_pod: input.name.clone(),
sub_socket: predicted_socket.clone(),
scope: scope_allow,
},
);
send_run_and_confirm(&predicted_socket, input.task.clone())
.await
.map_err(|err| spawn_delivery_error(&input.name, err))?;
Ok(ToolOutput {
summary: format!(
"spawned pod `{}` listening on {}",
input.name,
predicted_socket.display()
),
content: None,
})
}
}
impl SpawnPodTool {
async fn exec_child(
&self,
pod_name: &str,
spawn_config_json: &str,
predicted_socket: &Path,
) -> Result<(), ToolError> {
let pod_command =
std::env::var("INSOMNIA_POD_COMMAND").unwrap_or_else(|_| "insomnia-pod".into());
// Pre-create the child's runtime dir so we have a stable place to
// capture its stderr before it has had a chance to bind anything.
// The child's own `RuntimeDir::create` will `create_dir_all` the
// same path again — that's idempotent. On clean exit the child's
// RuntimeDir Drop tears the dir (and this log) down with it.
let pod_runtime_dir = self.runtime_base.join(pod_name);
tokio::fs::create_dir_all(&pod_runtime_dir)
.await
.map_err(|e| {
ToolError::ExecutionFailed(format!(
"create runtime dir {}: {e}",
pod_runtime_dir.display()
))
})?;
let stderr_path = pod_runtime_dir.join("stderr.log");
let stderr_file = std::fs::File::create(&stderr_path).map_err(|e| {
ToolError::ExecutionFailed(format!("open {}: {e}", stderr_path.display()))
})?;
let mut cmd = Command::new(&pod_command);
cmd.arg("--adopt")
.arg("--callback")
.arg(&self.callback_socket)
.arg("--spawn-config-json")
.arg(spawn_config_json)
.current_dir(&self.spawner_pwd)
.stdin(Stdio::null())
.stdout(Stdio::null())
.stderr(Stdio::from(stderr_file))
.process_group(0);
let child = cmd.spawn().map_err(|e| {
ToolError::ExecutionFailed(format!("failed to spawn `{pod_command}`: {e}"))
})?;
// Default `kill_on_drop = false` keeps the process alive after
// the `Child` is dropped. We intentionally do not `.wait()` —
// when the spawner later exits, init adopts any remaining
// orphans. Lifecycle tracking lives in `spawned_pods.json`.
drop(child);
match wait_for_socket(predicted_socket, SOCKET_WAIT_TIMEOUT).await {
Ok(()) => Ok(()),
Err(e) => Err(annotate_with_stderr(e, &stderr_path).await),
}
}
fn release_reservation(&self, lock_path: &Path, pod_name: &str) {
if let Ok(mut g) = LockFileGuard::open(lock_path) {
let _ = pod_registry::release_pod(&mut g, pod_name);
}
}
}
fn parse_scope(rules: &[ScopeRuleInput]) -> Result<Vec<ScopeRule>, ToolError> {
if rules.is_empty() {
return Err(ToolError::InvalidArgument("scope must not be empty".into()));
}
rules
.iter()
.map(|r| {
if !r.target.is_absolute() {
return Err(ToolError::InvalidArgument(format!(
"scope.target must be absolute: {}",
r.target.display()
)));
}
Ok(ScopeRule {
target: r.target.clone(),
permission: r.permission.into(),
recursive: r.recursive,
})
})
.collect()
}
/// Serialise the internal manifest config that gets handed to the child
/// `insomnia-pod` binary via the hidden `--spawn-config-json` flag.
/// `PodManifestConfig`'s `Serialize` impl is the single source of truth for the
/// internal handoff shape.
///
/// The child's working directory is set separately via
/// `Command::current_dir` (see [`SpawnPodTool::exec_child`]) — it is
/// not part of the manifest.
fn build_spawn_config_json(
name: &str,
instruction: &str,
scope_allow: &[ScopeRule],
model: &ModelManifest,
) -> Result<String, serde_json::Error> {
let config = PodManifestConfig {
pod: PodMetaConfig {
name: Some(name.to_string()),
prompt_pack: None,
},
model: model.clone(),
worker: WorkerManifestConfig {
instruction: Some(instruction.to_string()),
..Default::default()
},
scope: ScopeConfig {
allow: scope_allow.to_vec(),
deny: Vec::new(),
},
..Default::default()
};
serde_json::to_string(&config)
}
/// Tail of the spawned child's `stderr.log` to splice into a startup
/// failure message. Capped so a chatty child can't blow up the LLM's
/// tool-result budget — debugging beyond this should read the file
/// directly.
const STDERR_TAIL_BYTES: usize = 4 * 1024;
async fn annotate_with_stderr(err: ToolError, stderr_path: &Path) -> ToolError {
let tail = match tokio::fs::read(stderr_path).await {
Ok(bytes) => {
let start = bytes.len().saturating_sub(STDERR_TAIL_BYTES);
String::from_utf8_lossy(&bytes[start..]).into_owned()
}
Err(_) => return err,
};
let trimmed = tail.trim();
if trimmed.is_empty() {
return err;
}
match err {
ToolError::ExecutionFailed(msg) => ToolError::ExecutionFailed(format!(
"{msg}\n--- child stderr ({}) ---\n{trimmed}",
stderr_path.display()
)),
other => other,
}
}
async fn wait_for_socket(path: &Path, timeout: Duration) -> Result<(), ToolError> {
let deadline = tokio::time::Instant::now() + timeout;
loop {
if path.exists() {
if let Ok(stream) = UnixStream::connect(path).await {
drop(stream);
return Ok(());
}
}
if tokio::time::Instant::now() >= deadline {
return Err(ToolError::ExecutionFailed(format!(
"spawned pod socket did not appear within {timeout:?}: {}",
path.display()
)));
}
sleep(Duration::from_millis(50)).await;
}
}
fn spawn_delivery_error(pod_name: &str, err: SendRunError) -> ToolError {
match err {
SendRunError::AlreadyRunning => ToolError::ExecutionFailed(format!(
"spawned pod `{pod_name}` rejected its initial task as already running; the pod remains registered and can be inspected or stopped"
)),
SendRunError::Rejected { code, message } => ToolError::ExecutionFailed(format!(
"spawned pod `{pod_name}` rejected its initial task with {code:?}: {message}; the pod remains registered and can be inspected or stopped"
)),
SendRunError::Io(msg) => ToolError::ExecutionFailed(format!(
"spawned pod `{pod_name}` did not confirm initial task delivery: {msg}; the pod remains registered and can be inspected or stopped"
)),
}
}
fn pod_registry_err_to_tool(e: ScopeLockError) -> ToolError {
match e {
ScopeLockError::NotSubset { .. }
| ScopeLockError::WriteConflict { .. }
| ScopeLockError::DuplicatePodName(_)
| ScopeLockError::UnknownPod(_)
| ScopeLockError::SegmentConflict { .. } => ToolError::InvalidArgument(e.to_string()),
ScopeLockError::Io(_) => ToolError::ExecutionFailed(e.to_string()),
}
}
/// Factory for the `SpawnPod` tool.
pub fn spawn_pod_tool(
spawner_name: String,
callback_socket: PathBuf,
runtime_base: PathBuf,
spawner_pwd: PathBuf,
registry: Arc<SpawnedPodRegistry>,
parent_socket: Option<PathBuf>,
spawner_model: ModelManifest,
spawner_scope: SharedScope,
scope_changed: Arc<dyn Fn(PodScopeSnapshot) + Send + Sync>,
) -> ToolDefinition {
Arc::new(move || {
let schema = schemars::schema_for!(SpawnPodInput);
let schema_value = serde_json::to_value(schema).unwrap_or(serde_json::json!({}));
let meta = ToolMeta::new("SpawnPod")
.description(DESCRIPTION)
.input_schema(schema_value);
let tool: Arc<dyn Tool> = Arc::new(SpawnPodTool::new(
spawner_name.clone(),
callback_socket.clone(),
runtime_base.clone(),
spawner_pwd.clone(),
registry.clone(),
parent_socket.clone(),
spawner_model.clone(),
spawner_scope.clone(),
scope_changed.clone(),
));
(meta, tool)
})
}
#[cfg(test)]
mod tests {
use super::*;
use manifest::{AuthRef, SchemeKind};
#[test]
fn spawn_config_inherits_inline_spawner_model() {
let model = ModelManifest {
scheme: Some(SchemeKind::Anthropic),
base_url: Some("https://example.test".into()),
model_id: Some("claude-sonnet-4".into()),
auth: Some(AuthRef::ApiKey {
env: None,
file: Some(PathBuf::from("/etc/keys/anthropic")),
}),
..Default::default()
};
let config_json =
build_spawn_config_json("child", "$insomnia/default", &[], &model).unwrap();
let parsed: PodManifestConfig = serde_json::from_str(&config_json).unwrap();
assert_eq!(parsed.model.scheme, Some(SchemeKind::Anthropic));
assert_eq!(parsed.model.model_id.as_deref(), Some("claude-sonnet-4"));
assert_eq!(
parsed.model.base_url.as_deref(),
Some("https://example.test")
);
let file = match parsed.model.auth {
Some(AuthRef::ApiKey { file, .. }) => file,
_ => panic!("expected ApiKey"),
};
assert_eq!(file.as_deref(), Some(Path::new("/etc/keys/anthropic")));
}
#[test]
fn spawn_config_inherits_ref_spawner_model() {
let model = ModelManifest {
ref_: Some("anthropic/claude-sonnet-4-6".into()),
..Default::default()
};
let config_json =
build_spawn_config_json("child", "$insomnia/default", &[], &model).unwrap();
let parsed: PodManifestConfig = serde_json::from_str(&config_json).unwrap();
assert_eq!(
parsed.model.ref_.as_deref(),
Some("anthropic/claude-sonnet-4-6")
);
}
}