fix: confirm SpawnPod initial run delivery
This commit is contained in:
parent
0a07c50be4
commit
28ad8f01ec
|
|
@ -17,7 +17,7 @@ use async_trait::async_trait;
|
||||||
use llm_worker::llm_client::types::{ContentPart, Item, Role};
|
use llm_worker::llm_client::types::{ContentPart, Item, Role};
|
||||||
use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput};
|
use llm_worker::tool::{Tool, ToolDefinition, ToolError, ToolMeta, ToolOutput};
|
||||||
use protocol::stream::{JsonLineReader, JsonLineWriter};
|
use protocol::stream::{JsonLineReader, JsonLineWriter};
|
||||||
use protocol::{ErrorCode, Event, Method};
|
use protocol::{ErrorCode, Event, InvokeKind, Method};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use session_store::LogEntry;
|
use session_store::LogEntry;
|
||||||
use tokio::net::UnixStream;
|
use tokio::net::UnixStream;
|
||||||
|
|
@ -365,7 +365,8 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Failure modes distinguished by `SendToPod`.
|
/// Failure modes distinguished by `SendToPod`.
|
||||||
enum SendRunError {
|
#[derive(Debug)]
|
||||||
|
pub(crate) enum SendRunError {
|
||||||
/// Target Pod responded with `Error { AlreadyRunning }` — the
|
/// Target Pod responded with `Error { AlreadyRunning }` — the
|
||||||
/// caller can retry once the current turn ends.
|
/// caller can retry once the current turn ends.
|
||||||
AlreadyRunning,
|
AlreadyRunning,
|
||||||
|
|
@ -374,10 +375,12 @@ enum SendRunError {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write `Method::Run` to the target and read back events until we see
|
/// Write `Method::Run` to the target and read back events until we see
|
||||||
/// either `TurnStart` (accepted) or `Error { AlreadyRunning }`
|
/// evidence that the controller accepted the run (`UserMessage`,
|
||||||
/// (rejected). Any replayed alerts that precede the response are
|
/// `TurnStart`, or a user-send `InvokeStart`) or rejected it with
|
||||||
/// skipped. Times out per-read so a stuck Pod doesn't hang the tool.
|
/// `Error { AlreadyRunning }`. Any connect-time Snapshot or replayed alerts
|
||||||
async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRunError> {
|
/// that precede the response are skipped. Times out per-read so a stuck Pod
|
||||||
|
/// doesn't hang the tool.
|
||||||
|
pub(crate) async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRunError> {
|
||||||
let stream = tokio::time::timeout(SOCKET_OP_TIMEOUT, UnixStream::connect(socket))
|
let stream = tokio::time::timeout(SOCKET_OP_TIMEOUT, UnixStream::connect(socket))
|
||||||
.await
|
.await
|
||||||
.map_err(|_| SendRunError::Io("connect timed out".into()))?
|
.map_err(|_| SendRunError::Io("connect timed out".into()))?
|
||||||
|
|
@ -404,10 +407,19 @@ async fn send_run_and_confirm(socket: &Path, input: String) -> Result<(), SendRu
|
||||||
code: ErrorCode::AlreadyRunning,
|
code: ErrorCode::AlreadyRunning,
|
||||||
..
|
..
|
||||||
}) => return Err(SendRunError::AlreadyRunning),
|
}) => return Err(SendRunError::AlreadyRunning),
|
||||||
Some(Event::TurnStart { .. }) => return Ok(()),
|
Some(Event::Error { code, message }) => {
|
||||||
// Alerts and other pre-turn events are replayed to new
|
return Err(SendRunError::Io(format!(
|
||||||
// subscribers; keep reading until the controller's response
|
"pod returned {code:?}: {message}"
|
||||||
// to our `Run` shows up.
|
)));
|
||||||
|
}
|
||||||
|
Some(Event::InvokeStart {
|
||||||
|
kind: InvokeKind::UserSend,
|
||||||
|
})
|
||||||
|
| Some(Event::UserMessage { .. })
|
||||||
|
| Some(Event::TurnStart { .. }) => return Ok(()),
|
||||||
|
// Alerts, Snapshot, and other pre-turn events can precede the
|
||||||
|
// controller's response; keep reading until the Run is accepted
|
||||||
|
// or rejected.
|
||||||
Some(_) => continue,
|
Some(_) => continue,
|
||||||
None => return Err(SendRunError::Io("connection closed before response".into())),
|
None => return Err(SendRunError::Io("connection closed before response".into())),
|
||||||
}
|
}
|
||||||
|
|
@ -555,6 +567,78 @@ mod tests {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test helper: spawns a task acting as a minimal mock pod on `listener`.
///
/// The task accepts a single connection, writes every event in
/// `initial_events` to the client, reads one `Method` line, replies with
/// `ack`, and resolves to the method it received. If accepting, writing or
/// reading fails, or the stream ends before a method arrives, the task
/// resolves to `None`.
fn serve_initial_events_then_run_ack(
|
||||||
|
listener: UnixListener,
|
||||||
|
initial_events: Vec<Event>,
|
||||||
|
ack: Event,
|
||||||
|
) -> JoinHandle<Option<Method>> {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let (stream, _) = listener.accept().await.ok()?;
|
||||||
|
let (r, w) = stream.into_split();
|
||||||
|
let mut reader = JsonLineReader::new(r);
|
||||||
|
let mut writer = JsonLineWriter::new(w);
|
||||||
|
// Push the pre-response events first, before the client's method is read.
for event in initial_events {
|
||||||
|
writer.write(&event).await.ok()?;
|
||||||
|
}
|
||||||
|
// A read error and a clean end-of-stream both end the task with `None`.
let method = reader.next::<Method>().await.ok().flatten()?;
|
||||||
|
// The acknowledgement is only written once a method has been received.
writer.write(&ack).await.ok()?;
|
||||||
|
Some(method)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `send_run_and_confirm` must return `Ok` when the mock pod first sends an
/// alert plus the event built by `snapshot(Vec::new())`, and only then answers
/// the request with a `UserMessage`. The test also checks that the mock
/// received `Method::Run` whose input flattens to the text "hello".
#[tokio::test]
|
||||||
|
async fn send_run_and_confirm_keeps_connection_open_until_user_message_ack() {
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let socket = tmp.path().join("pod.sock");
|
||||||
|
let listener = UnixListener::bind(&socket).unwrap();
|
||||||
|
// Mock pod: two events sent up front, then a `UserMessage` as the ack.
let received = serve_initial_events_then_run_ack(
|
||||||
|
listener,
|
||||||
|
vec![
|
||||||
|
Event::Alert(Alert {
|
||||||
|
level: AlertLevel::Warn,
|
||||||
|
source: AlertSource::Pod,
|
||||||
|
message: "replayed alert".into(),
|
||||||
|
timestamp_ms: 0,
|
||||||
|
}),
|
||||||
|
snapshot(Vec::new()),
|
||||||
|
],
|
||||||
|
Event::UserMessage {
|
||||||
|
segments: vec![protocol::Segment::text("hello")],
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
send_run_and_confirm(&socket, "hello".into()).await.unwrap();
|
||||||
|
|
||||||
|
// The mock task resolves to the method it read from the connection.
let method = received.await.unwrap().expect("expected method");
|
||||||
|
match method {
|
||||||
|
Method::Run { input } => {
|
||||||
|
assert_eq!(protocol::Segment::flatten_to_text(&input), "hello");
|
||||||
|
}
|
||||||
|
other => panic!("expected Run, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `send_run_and_confirm` must return `SendRunError::AlreadyRunning` when the
/// mock pod answers the request with `Event::Error` carrying
/// `ErrorCode::AlreadyRunning`. The test also checks that the mock still
/// received a `Method::Run`.
#[tokio::test]
|
||||||
|
async fn send_run_and_confirm_reports_already_running() {
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let socket = tmp.path().join("pod.sock");
|
||||||
|
let listener = UnixListener::bind(&socket).unwrap();
|
||||||
|
// Mock pod: one event sent up front, then an `AlreadyRunning` error as the reply.
let received = serve_initial_events_then_run_ack(
|
||||||
|
listener,
|
||||||
|
vec![snapshot(Vec::new())],
|
||||||
|
Event::Error {
|
||||||
|
code: ErrorCode::AlreadyRunning,
|
||||||
|
message: "busy".into(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let err = send_run_and_confirm(&socket, "hello".into())
|
||||||
|
.await
|
||||||
|
.expect_err("expected AlreadyRunning");
|
||||||
|
assert!(matches!(err, SendRunError::AlreadyRunning));
|
||||||
|
assert!(matches!(received.await.unwrap(), Some(Method::Run { .. })));
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn connect_and_send_drains_initial_alert_and_snapshot_before_method() {
|
async fn connect_and_send_drains_initial_alert_and_snapshot_before_method() {
|
||||||
let tmp = TempDir::new().unwrap();
|
let tmp = TempDir::new().unwrap();
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,6 @@ use manifest::{
|
||||||
ModelManifest, Permission, PodManifestConfig, PodMetaConfig, ScopeConfig, ScopeRule,
|
ModelManifest, Permission, PodManifestConfig, PodMetaConfig, ScopeConfig, ScopeRule,
|
||||||
SharedScope, WorkerManifestConfig,
|
SharedScope, WorkerManifestConfig,
|
||||||
};
|
};
|
||||||
use protocol::Method;
|
|
||||||
use protocol::stream::JsonLineWriter;
|
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use session_store::PodScopeSnapshot;
|
use session_store::PodScopeSnapshot;
|
||||||
use tokio::net::UnixStream;
|
use tokio::net::UnixStream;
|
||||||
|
|
@ -28,6 +26,7 @@ use tokio::time::sleep;
|
||||||
use crate::ipc::event;
|
use crate::ipc::event;
|
||||||
use crate::runtime::dir::SpawnedPodRecord;
|
use crate::runtime::dir::SpawnedPodRecord;
|
||||||
use crate::runtime::pod_registry::{self, LockFileGuard, ScopeLockError};
|
use crate::runtime::pod_registry::{self, LockFileGuard, ScopeLockError};
|
||||||
|
use crate::spawn::comm_tools::{SendRunError, send_run_and_confirm};
|
||||||
use crate::spawn::registry::SpawnedPodRegistry;
|
use crate::spawn::registry::SpawnedPodRegistry;
|
||||||
use protocol::PodEvent;
|
use protocol::PodEvent;
|
||||||
|
|
||||||
|
|
@ -258,8 +257,6 @@ impl Tool for SpawnPodTool {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
send_run(&predicted_socket, &input.task).await?;
|
|
||||||
|
|
||||||
let record = SpawnedPodRecord {
|
let record = SpawnedPodRecord {
|
||||||
pod_name: input.name.clone(),
|
pod_name: input.name.clone(),
|
||||||
socket_path: predicted_socket.clone(),
|
socket_path: predicted_socket.clone(),
|
||||||
|
|
@ -284,6 +281,10 @@ impl Tool for SpawnPodTool {
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
|
send_run_and_confirm(&predicted_socket, input.task.clone())
|
||||||
|
.await
|
||||||
|
.map_err(|err| spawn_delivery_error(&input.name, err))?;
|
||||||
|
|
||||||
Ok(ToolOutput {
|
Ok(ToolOutput {
|
||||||
summary: format!(
|
summary: format!(
|
||||||
"spawned pod `{}` listening on {}",
|
"spawned pod `{}` listening on {}",
|
||||||
|
|
@ -458,23 +459,15 @@ async fn wait_for_socket(path: &Path, timeout: Duration) -> Result<(), ToolError
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn send_run(socket: &Path, task: &str) -> Result<(), ToolError> {
|
fn spawn_delivery_error(pod_name: &str, err: SendRunError) -> ToolError {
|
||||||
let stream = UnixStream::connect(socket)
|
match err {
|
||||||
.await
|
SendRunError::AlreadyRunning => ToolError::ExecutionFailed(format!(
|
||||||
.map_err(|e| ToolError::ExecutionFailed(format!("connect {}: {e}", socket.display())))?;
|
"spawned pod `{pod_name}` rejected its initial task as already running; the pod remains registered and can be inspected or stopped"
|
||||||
let (_reader, writer) = stream.into_split();
|
)),
|
||||||
let mut w = JsonLineWriter::new(writer);
|
SendRunError::Io(msg) => ToolError::ExecutionFailed(format!(
|
||||||
w.write(&Method::Run {
|
"spawned pod `{pod_name}` did not confirm initial task delivery: {msg}; the pod remains registered and can be inspected or stopped"
|
||||||
input: vec![protocol::Segment::text(task)],
|
)),
|
||||||
})
|
}
|
||||||
.await
|
|
||||||
.map_err(|e| ToolError::ExecutionFailed(format!("send Method::Run: {e}")))?;
|
|
||||||
// Drop the writer to close the socket's write half. The flush
|
|
||||||
// inside `JsonLineWriter::write` has already pushed the bytes
|
|
||||||
// across, so the child will see a complete method line followed by
|
|
||||||
// EOF.
|
|
||||||
drop(w);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pod_registry_err_to_tool(e: ScopeLockError) -> ToolError {
|
fn pod_registry_err_to_tool(e: ScopeLockError) -> ToolError {
|
||||||
|
|
|
||||||
|
|
@ -16,8 +16,8 @@ use pod::runtime::dir::{RuntimeDir, SpawnedPodRecord};
|
||||||
use pod::runtime::pod_registry::{self, LockFileGuard};
|
use pod::runtime::pod_registry::{self, LockFileGuard};
|
||||||
use pod::spawn::registry::SpawnedPodRegistry;
|
use pod::spawn::registry::SpawnedPodRegistry;
|
||||||
use pod::spawn::tool::spawn_pod_tool;
|
use pod::spawn::tool::spawn_pod_tool;
|
||||||
use protocol::Method;
|
use protocol::stream::{JsonLineReader, JsonLineWriter};
|
||||||
use protocol::stream::JsonLineReader;
|
use protocol::{Event, Method};
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
@ -97,16 +97,22 @@ async fn bind_mock_pod_socket(runtime_base: &Path, pod_name: &str) -> (PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launch a tokio task that accepts connections until one carries a
|
/// Launch a tokio task that accepts connections until one carries a
|
||||||
/// `Method` line, then returns it. `wait_for_socket` inside the tool
|
/// `Method` line, then acknowledges it and returns it. `wait_for_socket`
|
||||||
/// makes a probe connection that carries no data, so the task must
|
/// inside the tool makes a probe connection that carries no data, so the
|
||||||
/// tolerate an empty connection and keep listening.
|
/// task must tolerate an empty connection and keep listening.
|
||||||
fn accept_one_method(listener: UnixListener) -> tokio::task::JoinHandle<Option<Method>> {
|
fn accept_one_method(listener: UnixListener) -> tokio::task::JoinHandle<Option<Method>> {
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
loop {
|
loop {
|
||||||
let (stream, _) = listener.accept().await.ok()?;
|
let (stream, _) = listener.accept().await.ok()?;
|
||||||
let (reader, _writer) = stream.into_split();
|
let (reader, writer) = stream.into_split();
|
||||||
let mut r = JsonLineReader::new(reader);
|
let mut r = JsonLineReader::new(reader);
|
||||||
|
let mut w = JsonLineWriter::new(writer);
|
||||||
if let Ok(Some(method)) = r.next::<Method>().await {
|
if let Ok(Some(method)) = r.next::<Method>().await {
|
||||||
|
w.write(&Event::UserMessage {
|
||||||
|
segments: vec![protocol::Segment::text("accepted")],
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.ok()?;
|
||||||
return Some(method);
|
return Some(method);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user