fix: confirm SpawnPod initial run delivery

This commit is contained in:
Keisuke Hirata 2026-05-26 08:37:24 +09:00
parent f56ef010a8
commit c101b42619
3 changed files with 120 additions and 37 deletions

View File

@ -17,7 +17,7 @@ use async_trait::async_trait;
use llm_worker::llm_client::types::{ContentPart, Item, Role};
use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput};
use protocol::stream::{JsonLineReader, JsonLineWriter};
use protocol::{ErrorCode, Event, Method};
use protocol::{ErrorCode, Event, InvokeKind, Method};
use serde::Deserialize;
use session_store::LogEntry;
use tokio::net::UnixStream;
@ -365,7 +365,8 @@ where
}
/// Failure modes distinguished by `SendToPod`.
enum SendRunError {
#[derive(Debug)]
pub(crate) enum SendRunError {
/// Target Pod responded with `Error { AlreadyRunning }` — the
/// caller can retry once the current turn ends.
AlreadyRunning,
@ -374,10 +375,12 @@ enum SendRunError {
}
/// Write `Method::Run` to the target and read back events until we see
/// either `TurnStart` (accepted) or `Error { AlreadyRunning }`
/// (rejected). Any replayed alerts that precede the response are
/// skipped. Times out per-read so a stuck Pod doesn't hang the tool.
async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRunError> {
/// evidence that the controller accepted the run (`UserMessage`,
/// `TurnStart`, or a user-send `InvokeStart`) or rejected it with
/// `Error { AlreadyRunning }`. Any connect-time Snapshot or replayed alerts
/// that precede the response are skipped. Times out per-read so a stuck Pod
/// doesn't hang the tool.
pub(crate) async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRunError> {
let stream = tokio::time::timeout(SOCKET_OP_TIMEOUT, UnixStream::connect(socket))
.await
.map_err(|_| SendRunError::Io("connect timed out".into()))?
@ -404,10 +407,19 @@ async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRu
code: ErrorCode::AlreadyRunning,
..
}) => return Err(SendRunError::AlreadyRunning),
Some(Event::TurnStart { .. }) => return Ok(()),
// Alerts and other pre-turn events are replayed to new
// subscribers; keep reading until the controller's response
// to our `Run` shows up.
Some(Event::Error { code, message }) => {
return Err(SendRunError::Io(format!(
"pod returned {code:?}: {message}"
)));
}
Some(Event::InvokeStart {
kind: InvokeKind::UserSend,
})
| Some(Event::UserMessage { .. })
| Some(Event::TurnStart { .. }) => return Ok(()),
// Alerts, Snapshot, and other pre-turn events can precede the
// controller's response; keep reading until the Run is accepted
// or rejected.
Some(_) => continue,
None => return Err(SendRunError::Io("connection closed before response".into())),
}
@ -555,6 +567,78 @@ mod tests {
})
}
/// Test helper that plays the Pod side of a single connection.
///
/// The spawned task accepts one connection on `listener`, writes every
/// event in `initial_events` (in order), reads one `Method` line from the
/// peer, answers with `ack`, and resolves to the method it received.
///
/// The task resolves to `None` if accepting, any write, or the read fails,
/// or if the reader yields no method (presumably the peer closed first).
fn serve_initial_events_then_run_ack(
    listener: UnixListener,
    initial_events: Vec<Event>,
    ack: Event,
) -> JoinHandle<Option<Method>> {
    tokio::spawn(async move {
        let (conn, _peer_addr) = listener.accept().await.ok()?;
        let (read_half, write_half) = conn.into_split();
        let mut events_out = JsonLineWriter::new(write_half);
        let mut methods_in = JsonLineReader::new(read_half);

        // Emit the preamble before looking at anything the client sent, so
        // the client has to skip past these events to find its answer.
        for preamble_event in initial_events {
            events_out.write(&preamble_event).await.ok()?;
        }

        // Only answer once a method line has actually arrived.
        let received = match methods_in.next::<Method>().await {
            Ok(Some(method)) => method,
            Ok(None) | Err(_) => return None,
        };
        events_out.write(&ack).await.ok()?;
        Some(received)
    })
}
/// A `UserMessage` that arrives after an alert and a snapshot must be
/// treated as confirmation, and the mock Pod must have received
/// `Method::Run` carrying the original text.
#[tokio::test]
async fn send_run_and_confirm_keeps_connection_open_until_user_message_ack() {
    let dir = TempDir::new().unwrap();
    let socket_path = dir.path().join("pod.sock");
    let listener = UnixListener::bind(&socket_path).unwrap();

    // Events the mock Pod sends before it reads the method.
    let replayed_alert = Event::Alert(Alert {
        level: AlertLevel::Warn,
        source: AlertSource::Pod,
        message: "replayed alert".into(),
        timestamp_ms: 0,
    });
    let preamble = vec![replayed_alert, snapshot(Vec::new())];
    let ack = Event::UserMessage {
        segments: vec![protocol::Segment::text("hello")],
    };
    let server = serve_initial_events_then_run_ack(listener, preamble, ack);

    send_run_and_confirm(&socket_path, "hello".into())
        .await
        .unwrap();

    match server.await.unwrap().expect("expected method") {
        Method::Run { input } => {
            assert_eq!(protocol::Segment::flatten_to_text(&input), "hello");
        }
        other => panic!("expected Run, got {other:?}"),
    }
}
#[tokio::test]
async fn send_run_and_confirm_reports_already_running() {
let tmp = TempDir::new().unwrap();
let socket = tmp.path().join("pod.sock");
let listener = UnixListener::bind(&socket).unwrap();
let received = serve_initial_events_then_run_ack(
listener,
vec![snapshot(Vec::new())],
Event::Error {
code: ErrorCode::AlreadyRunning,
message: "busy".into(),
},
);
let err = send_run_and_confirm(&socket, "hello".into())
.await
.expect_err("expected AlreadyRunning");
assert!(matches!(err, SendRunError::AlreadyRunning));
assert!(matches!(received.await.unwrap(), Some(Method::Run { .. })));
}
#[tokio::test]
async fn connect_and_send_drains_initial_alert_and_snapshot_before_method() {
let tmp = TempDir::new().unwrap();

View File

@ -17,8 +17,6 @@ use manifest::{
ModelManifest, Permission, PodManifestConfig, PodMetaConfig, ScopeConfig, ScopeRule,
SharedScope, WorkerManifestConfig,
};
use protocol::Method;
use protocol::stream::JsonLineWriter;
use serde::Deserialize;
use session_store::PodScopeSnapshot;
use tokio::net::UnixStream;
@ -28,6 +26,7 @@ use tokio::time::sleep;
use crate::ipc::event;
use crate::runtime::dir::SpawnedPodRecord;
use crate::runtime::pod_registry::{self, LockFileGuard, ScopeLockError};
use crate::spawn::comm_tools::{SendRunError, send_run_and_confirm};
use crate::spawn::registry::SpawnedPodRegistry;
use protocol::PodEvent;
@ -258,8 +257,6 @@ impl Tool for SpawnPodTool {
});
}
send_run(&predicted_socket, &input.task).await?;
let record = SpawnedPodRecord {
pod_name: input.name.clone(),
socket_path: predicted_socket.clone(),
@ -284,6 +281,10 @@ impl Tool for SpawnPodTool {
},
);
send_run_and_confirm(&predicted_socket, input.task.clone())
.await
.map_err(|err| spawn_delivery_error(&input.name, err))?;
Ok(ToolOutput {
summary: format!(
"spawned pod `{}` listening on {}",
@ -458,23 +459,15 @@ async fn wait_for_socket(path: &Path, timeout: Duration) -> Result<(), ToolError
}
}
async fn send_run(socket: &Path, task: &str) -> Result<(), ToolError> {
let stream = UnixStream::connect(socket)
.await
.map_err(|e| ToolError::ExecutionFailed(format!("connect {}: {e}", socket.display())))?;
let (_reader, writer) = stream.into_split();
let mut w = JsonLineWriter::new(writer);
w.write(&Method::Run {
input: vec![protocol::Segment::text(task)],
})
.await
.map_err(|e| ToolError::ExecutionFailed(format!("send Method::Run: {e}")))?;
// Drop the writer to close the socket's write half. The flush
// inside `JsonLineWriter::write` has already pushed the bytes
// across, so the child will see a complete method line followed by
// EOF.
drop(w);
Ok(())
/// Convert a failed initial-task delivery into a `ToolError`.
///
/// Both variants become `ToolError::ExecutionFailed`; only the message
/// differs. Each message names the pod and states that it remains
/// registered, so the caller knows it can still be inspected or stopped.
fn spawn_delivery_error(pod_name: &str, err: SendRunError) -> ToolError {
    let message = match err {
        SendRunError::AlreadyRunning => format!(
            "spawned pod `{pod_name}` rejected its initial task as already running; the pod remains registered and can be inspected or stopped"
        ),
        SendRunError::Io(msg) => format!(
            "spawned pod `{pod_name}` did not confirm initial task delivery: {msg}; the pod remains registered and can be inspected or stopped"
        ),
    };
    ToolError::ExecutionFailed(message)
}
fn pod_registry_err_to_tool(e: ScopeLockError) -> ToolError {

View File

@ -16,8 +16,8 @@ use pod::runtime::dir::{RuntimeDir, SpawnedPodRecord};
use pod::runtime::pod_registry::{self, LockFileGuard};
use pod::spawn::registry::SpawnedPodRegistry;
use pod::spawn::tool::spawn_pod_tool;
use protocol::Method;
use protocol::stream::JsonLineReader;
use protocol::stream::{JsonLineReader, JsonLineWriter};
use protocol::{Event, Method};
use serde_json::json;
use std::sync::Arc;
use tempfile::TempDir;
@ -97,16 +97,22 @@ async fn bind_mock_pod_socket(runtime_base: &Path, pod_name: &str) -> (PathBuf,
}
/// Launch a tokio task that accepts connections until one carries a
/// `Method` line, then returns it. `wait_for_socket` inside the tool
/// makes a probe connection that carries no data, so the task must
/// tolerate an empty connection and keep listening.
/// `Method` line, then acknowledges it and returns it. `wait_for_socket`
/// inside the tool makes a probe connection that carries no data, so the
/// task must tolerate an empty connection and keep listening.
fn accept_one_method(listener: UnixListener) -> tokio::task::JoinHandle<Option<Method>> {
tokio::spawn(async move {
loop {
let (stream, _) = listener.accept().await.ok()?;
let (reader, _writer) = stream.into_split();
let (reader, writer) = stream.into_split();
let mut r = JsonLineReader::new(reader);
let mut w = JsonLineWriter::new(writer);
if let Ok(Some(method)) = r.next::<Method>().await {
w.write(&Event::UserMessage {
segments: vec![protocol::Segment::text("accepted")],
})
.await
.ok()?;
return Some(method);
}
}