Compare commits

...

9 Commits

15 changed files with 692 additions and 48 deletions

View File

@ -900,28 +900,15 @@ async fn controller_loop<C, St>(
Method::ListCompletions { .. } => {} Method::ListCompletions { .. } => {}
Method::PodEvent(event) => { Method::PodEvent(event) => {
// For agent-visible PodEvents, live echo travels through the if handle_inbound_pod_event(
// SystemItem lane: once the interceptor drains the notify buffer, event,
// the typed `SystemItem::PodEvent` lands as a
// `LogEntry::SystemItem` entry and the sink forwards it
// to clients as `Event::SystemItem`. Control-plane-only
// PodEvents use this same receive path only for side effects.
//
// (1) system side effects — idempotent and tolerant of
// out-of-order delivery (e.g. `TurnEnded` arriving
// after `ShutDown`).
crate::ipc::event::apply_event_side_effects(
&event,
&spawned_registry, &spawned_registry,
&spawner_name, &spawner_name,
&self_parent_socket, self_parent_socket.as_ref(),
&notify_buffer,
) )
.await; .await
// (2) agent-visible events enter the notification/history lane. {
// Control-plane-only events (currently ScopeSubDelegated)
// stop after side effects so they do not wake or notify the LLM.
if event.should_notify_agent() {
pod.push_pod_event_notify(event);
// Auto-kick a turn if the Pod is idle so the // Auto-kick a turn if the Pod is idle so the
// notification is not stranded. Matches the // notification is not stranded. Matches the
// `Method::Notify` idle path. // `Method::Notify` idle path.
@ -961,6 +948,35 @@ async fn controller_loop<C, St>(
let _ = shutdown_tx.send(()); let _ = shutdown_tx.send(());
} }
/// Processes one inbound child `PodEvent`: applies its side effects, then
/// stages it for the agent when the event is agent-visible.
///
/// `apply_event_side_effects` is awaited for every event. The event is pushed
/// onto `notify_buffer` only when `should_notify_agent()` reports `true`.
///
/// Returns `true` when the event was staged on `notify_buffer` and `false`
/// otherwise, so the caller can decide on lifecycle-dependent follow-up such
/// as the idle `RunForNotification` auto-kick.
async fn handle_inbound_pod_event(
    event: protocol::PodEvent,
    spawned_registry: &Arc<SpawnedPodRegistry>,
    self_name: &str,
    parent_socket: Option<&PathBuf>,
    notify_buffer: &NotifyBuffer,
) -> bool {
    // `apply_event_side_effects` is handed a `&Option<PathBuf>`, so build an
    // owned option from the borrowed one.
    let owned_parent_socket: Option<PathBuf> = parent_socket.cloned();
    crate::ipc::event::apply_event_side_effects(
        &event,
        spawned_registry,
        self_name,
        &owned_parent_socket,
    )
    .await;

    // Events that are not agent-visible stop after their side effects.
    if !event.should_notify_agent() {
        return false;
    }
    notify_buffer.push_pod_event(event);
    true
}
/// Drives a Pod future (one in-flight turn) while concurrently /// Drives a Pod future (one in-flight turn) while concurrently
/// processing incoming methods through an inner select! arm. Returns /// processing incoming methods through an inner select! arm. Returns
/// `(final_status, shutdown_requested)`. /// `(final_status, shutdown_requested)`.
@ -1095,23 +1111,17 @@ where
// mpsc is consume-once, so we cannot defer this // mpsc is consume-once, so we cannot defer this
// to the next main-loop iteration — drop here // to the next main-loop iteration — drop here
// would lose the event entirely (children fire // would lose the event entirely (children fire
// and forget). Apply the side effects inline // and forget). Auto-kick remains unnecessary here:
// and, for agent-visible variants, stage the typed // the in-flight turn will drain agent-visible events
// event on the notification buffer so the in-flight // from the notify buffer on its next history append.
// turn's next `pending_history_appends` surfaces it handle_inbound_pod_event(
// as a typed `SystemItem::PodEvent`. Control-plane-only event,
// variants stop after side effects.
let self_parent_socket = parent_socket.cloned();
crate::ipc::event::apply_event_side_effects(
&event,
spawned_registry, spawned_registry,
self_name, self_name,
&self_parent_socket, parent_socket,
notify_buffer,
) )
.await; .await;
if event.should_notify_agent() {
notify_buffer.push_pod_event(event);
}
} }
None => { None => {
let _ = cancel_tx.try_send(()); let _ = cancel_tx.try_send(());

View File

@ -656,7 +656,7 @@ fn row_status_label(entry: &PodListEntry) -> (&'static str, Style) {
.fg(Color::Cyan) .fg(Color::Cyan)
.add_modifier(Modifier::BOLD), .add_modifier(Modifier::BOLD),
), ),
None => ("live unknown", Style::default().fg(Color::DarkGray)), None => ("live", Style::default().fg(Color::DarkGray)),
}; };
} }
if entry if entry
@ -1194,6 +1194,31 @@ mod tests {
assert!(app.selected_send_disabled_reason().is_none()); assert!(app.selected_send_disabled_reason().is_none());
} }
/// A live row whose `status` is `None` renders with the plain `live` label.
#[test]
fn multi_status_label_for_live_without_reported_status_is_softened() {
    let mut unreported = live_info("probing", PodStatus::Idle);
    unreported.status = None;

    let app = test_app(vec![unreported]);
    let selected = app.list.selected_entry().unwrap();
    let (label, _style) = row_status_label(selected);

    assert_eq!(label, "live");
}
/// Explicitly reported statuses keep their `live <status>` labels.
#[test]
fn multi_status_labels_preserve_explicit_live_statuses() {
    let cases = [
        (PodStatus::Idle, "live idle"),
        (PodStatus::Running, "live running"),
        (PodStatus::Paused, "live paused"),
    ];

    for (status, expected_label) in cases {
        let app = test_app(vec![live_info("pod", status)]);
        let selected = app.list.selected_entry().unwrap();
        let (label, _style) = row_status_label(selected);
        assert_eq!(label, expected_label);
    }
}
#[test] #[test]
fn multi_running_paused_and_stopped_targets_are_direct_send_disabled() { fn multi_running_paused_and_stopped_targets_are_direct_send_disabled() {
let mut app = test_app(vec![ let mut app = test_app(vec![

View File

@ -291,19 +291,39 @@ pub(crate) async fn read_reachable_live_pod_infos(
store: &FsStore, store: &FsStore,
) -> Result<Vec<LivePodInfo>, io::Error> { ) -> Result<Vec<LivePodInfo>, io::Error> {
let records = read_live_pod_infos()?; let records = read_live_pod_infos()?;
let mut reachable = Vec::new(); probe_reachable_live_pod_infos(store, records).await
for mut record in records { }
let Ok(status) = probe_live_status(&record.socket_path).await else {
async fn probe_reachable_live_pod_infos(
store: &FsStore,
records: Vec<LivePodInfo>,
) -> Result<Vec<LivePodInfo>, io::Error> {
let mut handles = Vec::with_capacity(records.len());
for record in records {
handles.push(tokio::spawn(probe_live_pod_info(record)));
}
let mut reachable = Vec::with_capacity(handles.len());
for handle in handles {
let result = handle
.await
.map_err(|e| io::Error::other(format!("live status probe task failed: {e}")))?;
let Ok(mut record) = result else {
continue; continue;
}; };
record.reachable = true;
record.status = status;
record.summary = summarize_live_pod(store, &record); record.summary = summarize_live_pod(store, &record);
reachable.push(record); reachable.push(record);
} }
Ok(reachable) Ok(reachable)
} }
/// Probes the socket of a single live Pod record and returns the updated record.
///
/// On a successful probe the record is marked reachable and its `status` is
/// set to whatever `probe_live_status` returned (which may be `None`).
///
/// # Errors
///
/// Propagates the `io::Error` returned by `probe_live_status`; the record is
/// dropped in that case.
async fn probe_live_pod_info(mut record: LivePodInfo) -> Result<LivePodInfo, io::Error> {
    record.status = probe_live_status(&record.socket_path).await?;
    record.reachable = true;
    Ok(record)
}
pub(crate) fn live_socket_for_pod(pod_name: &str) -> Option<PathBuf> { pub(crate) fn live_socket_for_pod(pod_name: &str) -> Option<PathBuf> {
read_live_pod_infos() read_live_pod_infos()
.ok()? .ok()?
@ -343,7 +363,7 @@ fn corrupt_stored_info(pod_name: String, message: String) -> StoredPodInfo {
} }
} }
const LIVE_STATUS_PROBE_TIMEOUT: Duration = Duration::from_millis(25); const LIVE_STATUS_PROBE_TIMEOUT: Duration = Duration::from_millis(200);
async fn probe_live_status(socket_path: &Path) -> Result<Option<PodStatus>, io::Error> { async fn probe_live_status(socket_path: &Path) -> Result<Option<PodStatus>, io::Error> {
let mut client = PodClient::connect(socket_path).await?; let mut client = PodClient::connect(socket_path).await?;
@ -561,11 +581,16 @@ fn trim_one_line(s: &str, max_chars: usize) -> String {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use std::sync::Arc;
use llm_worker::llm_client::types::RequestConfig; use llm_worker::llm_client::types::RequestConfig;
use pod_store::FsPodStore; use pod_store::FsPodStore;
use pod_store::{PodActiveSegmentRef, PodMetadataStore}; use pod_store::{PodActiveSegmentRef, PodMetadataStore};
use protocol::stream::JsonLineWriter;
use session_store::{new_segment_id, new_session_id}; use session_store::{new_segment_id, new_session_id};
use tempfile::tempdir; use tempfile::tempdir;
use tokio::net::UnixListener;
use tokio::sync::Barrier;
const SOURCE: PodVisibilitySource = PodVisibilitySource::ResumePicker; const SOURCE: PodVisibilitySource = PodVisibilitySource::ResumePicker;
@ -752,6 +777,30 @@ mod tests {
); );
} }
/// A reachable live row with `status == None` can be opened and attached, but
/// offers neither restore nor any send action and carries no
/// `LiveUnreachable` diagnostic.
#[test]
fn live_reachable_row_without_reported_status_can_open_but_not_send_now() {
    let mut unreported = live_info("live", PodStatus::Idle);
    unreported.reachable = true;
    unreported.status = None;

    let list = PodList::from_sources(SOURCE, vec![], vec![unreported], None, 10);
    let entry = single_entry(list);

    assert!(entry.actions.can_open);
    assert!(!entry.actions.can_restore);
    assert!(!entry.actions.can_send_now);
    assert!(!entry.actions.can_queue_send);
    assert_eq!(
        entry.attach_socket_path(),
        Some(Path::new("/tmp/live.sock"))
    );
    assert!(
        entry
            .diagnostics
            .iter()
            .all(|diagnostic| diagnostic.kind != PodEntryDiagnosticKind::LiveUnreachable)
    );
}
#[test] #[test]
fn live_running_reachable_row_can_open_but_not_send_now() { fn live_running_reachable_row_can_open_but_not_send_now() {
let entry = single_entry(PodList::from_sources( let entry = single_entry(PodList::from_sources(
@ -811,6 +860,82 @@ mod tests {
assert_eq!(status, Some(PodStatus::Idle)); assert_eq!(status, Some(PodStatus::Idle));
} }
/// Verifies that live status probes overlap instead of running one at a time.
///
/// Each fake server accepts one connection and then waits on a barrier shared
/// by all servers before writing `Event::Status`, so no server replies until
/// every server has accepted its probe connection.
#[tokio::test]
async fn live_status_probes_run_concurrently() {
    const PROBE_COUNT: usize = 3;

    let store_dir = tempdir().unwrap();
    let store = FsStore::new(store_dir.path()).unwrap();
    let socket_dir = tempdir().unwrap();
    let all_connected = Arc::new(Barrier::new(PROBE_COUNT));

    let mut pending_records = Vec::new();
    let mut server_tasks = Vec::new();
    for index in 0..PROBE_COUNT {
        let pod_name = format!("pod-{index}");
        let socket_path = socket_dir.path().join(format!("{pod_name}.sock"));
        let listener = UnixListener::bind(&socket_path).unwrap();
        let all_connected = Arc::clone(&all_connected);

        let server = async move {
            let (stream, _peer) = listener.accept().await.unwrap();
            all_connected.wait().await;
            let status_event = Event::Status {
                status: PodStatus::Idle,
            };
            let mut writer = JsonLineWriter::new(stream);
            writer.write(&status_event).await.unwrap();
        };
        server_tasks.push(tokio::spawn(server));
        pending_records.push(live_probe_record(&pod_name, socket_path));
    }

    let deadline = LIVE_STATUS_PROBE_TIMEOUT * 3;
    let probed = tokio::time::timeout(
        deadline,
        probe_reachable_live_pod_infos(&store, pending_records),
    )
    .await
    .expect("status probes should complete")
    .unwrap();

    assert_eq!(probed.len(), PROBE_COUNT);
    assert!(probed.iter().all(|record| record.reachable));
    assert!(
        probed
            .iter()
            .all(|record| record.status == Some(PodStatus::Idle))
    );

    for task in server_tasks {
        task.await.unwrap();
    }
}
/// A socket that accepts the connection but never sends anything still yields
/// a reachable record, with `status` left as `None`.
#[tokio::test]
async fn live_status_probe_timeout_still_marks_socket_reachable() {
    let store_dir = tempdir().unwrap();
    let store = FsStore::new(store_dir.path()).unwrap();
    let socket_dir = tempdir().unwrap();
    let socket_path = socket_dir.path().join("silent.sock");
    let listener = UnixListener::bind(&socket_path).unwrap();

    // The server holds the accepted stream open and never writes to it.
    let silent_server = tokio::spawn(async move {
        let (_held_stream, _peer) = listener.accept().await.unwrap();
        std::future::pending::<()>().await;
    });

    let input = vec![live_probe_record("silent", socket_path.clone())];
    let probed = probe_reachable_live_pod_infos(&store, input)
        .await
        .unwrap();

    assert_eq!(probed.len(), 1);
    let record = &probed[0];
    assert_eq!(record.pod_name, "silent");
    assert!(record.reachable);
    assert_eq!(record.status, None);
    assert_eq!(record.socket_path, socket_path);

    silent_server.abort();
}
#[test] #[test]
fn corrupt_stored_metadata_has_diagnostic() { fn corrupt_stored_metadata_has_diagnostic() {
let entry = single_entry(PodList::from_sources( let entry = single_entry(PodList::from_sources(
@ -985,6 +1110,17 @@ mod tests {
} }
} }
/// Builds a not-yet-probed `LivePodInfo` for tests: no status, not reachable,
/// no segment id, and a default summary.
fn live_probe_record(pod_name: &str, socket_path: PathBuf) -> LivePodInfo {
    let pod_name = pod_name.to_owned();
    LivePodInfo {
        pod_name,
        socket_path,
        reachable: false,
        status: None,
        segment_id: None,
        summary: Default::default(),
    }
}
fn test_greeting() -> protocol::Greeting { fn test_greeting() -> protocol::Greeting {
protocol::Greeting { protocol::Greeting {
pod_name: "live".to_string(), pod_name: "live".to_string(),

View File

@ -2,12 +2,12 @@
id: 20260527-000007-pod-inbound-pod-event-dedup id: 20260527-000007-pod-inbound-pod-event-dedup
slug: pod-inbound-pod-event-dedup slug: pod-inbound-pod-event-dedup
title: Inbound PodEvent ハンドリングの重複を統合する title: Inbound PodEvent ハンドリングの重複を統合する
status: open status: closed
kind: task kind: task
priority: P2 priority: P2
labels: [migrated] labels: [migrated]
created_at: 2026-05-27T00:00:07Z created_at: 2026-05-27T00:00:07Z
updated_at: 2026-05-27T00:00:07Z updated_at: 2026-05-30T05:37:00Z
assignee: null assignee: null
legacy_ticket: tickets/pod-inbound-pod-event-dedup.md legacy_ticket: tickets/pod-inbound-pod-event-dedup.md
--- ---

View File

@ -0,0 +1,76 @@
---
id: 20260527-000007-pod-inbound-pod-event-dedup
slug: pod-inbound-pod-event-dedup
title: Inbound PodEvent ハンドリングの重複を統合する
status: closed
kind: task
priority: P2
labels: [migrated]
created_at: 2026-05-27T00:00:07Z
updated_at: 2026-05-30T05:37:00Z
assignee: null
legacy_ticket: tickets/pod-inbound-pod-event-dedup.md
---
## Migration reference
- legacy_ticket: tickets/pod-inbound-pod-event-dedup.md
- migrated_from: TODO.md / tickets directory migration on 2026-05-27
# Inbound PodEvent ハンドリングの重複を統合する
## 背景
子 Pod から `Method::PodEvent(event)` を受けたときの処理が `controller_loop` と `drive_turn` の 2 箇所にコピーされている。
`controller.rs:693-720`(idle / paused 中):
```rust
Method::PodEvent(event) => {
crate::ipc::event::apply_event_side_effects(
&event, &spawned_registry, &spawner_name, &self_parent_socket,
).await;
pod.push_pod_event_notify(event);
if shared_state.get_status() == PodStatus::Idle {
pending = Some(PendingRun::RunForNotification);
}
}
```
`controller.rs:861-879`(in-flight turn 中):
```rust
Some(Method::PodEvent(event)) => {
let self_parent_socket = parent_socket.cloned();
crate::ipc::event::apply_event_side_effects(
&event, spawned_registry, self_name, &self_parent_socket,
).await;
notify_buffer.push_pod_event(event);
}
```
差分は 2 点:
1. **buffer への push 経路**: `pod.push_pod_event_notify(event)` vs `notify_buffer.push_pod_event(event)`。両者は同じ `NotifyBuffer` を叩く(`pod.rs:845-846` は `self.pending_notifies.push_pod_event(event)` を呼ぶだけで、`notify_buffer_handle()` はその `pending_notifies.clone()` を返す)。**完全に等価**。
2. **auto-kick**: idle 経路だけ `PendingRun::RunForNotification` を stage する。in-flight 経路は in-flight 自体が消化するので不要。
つまり「event の処理本体」(side-effects + notify buffer への push)は同一で、後段の auto-kick だけが state-dependent な分岐。にもかかわらず関数化されておらず、片方をいじってもう片方を忘れると挙動が割れる。
## 要件
- side-effects 適用 + NotifyBuffer への typed push の流れを単一関数 `handle_inbound_pod_event` に切り出す。
- `controller_loop` / `drive_turn` の両方からこのヘルパーを呼ぶ形に置き換える。
- auto-kick (`PendingRun::RunForNotification` の stage) は呼び出し側の責務として残す。これは Pod のライフサイクル状態に依存した判断で、ヘルパー内には押し込めない。
- 関数シグネチャは引数を最小化する。`event`、`spawned_registry`、`self_name: &str`、`self_parent_socket: &Option<PathBuf>` または `Option<&PathBuf>`、`notify_buffer: &NotifyBuffer` の 5 つで足りる前提。`Pod` への可変参照は不要(`notify_buffer` で代用可能)。
- 動作変化なし。既存の `Method::PodEvent` 挙動(in-flight / idle 両方)が完全に同一で続行すること。
## 完了条件
- `controller.rs` 内に `apply_event_side_effects` 呼び出しが 1 箇所だけ残り、`controller_loop` と `drive_turn` の `Method::PodEvent` アームはどちらも `handle_inbound_pod_event(...)` 呼び出し + idle 経路のみ auto-kick stage、という形になる。
- 既存の inbound PodEvent 関連テスト(特に `apply_event_side_effects` の idempotency や `notify_buffer` への typed push)が通る。
## 範囲外
- `apply_event_side_effects` 自体の中身変更。
- `NotifyBuffer` API のリネーム / 統合。
- `pod.push_pod_event_notify` の削除([[pod-interrupt-prep-internalize]] と同じく将来の整理対象だが、本チケットでは外向き API は触らない)。

View File

@ -0,0 +1,91 @@
<!-- event: migration author: tickets.sh-migration at: 2026-05-27T00:00:07Z -->
## Migrated
Migrated from tickets/pod-inbound-pod-event-dedup.md. No legacy review file was present at migration time.
---
<!-- event: close author: hare at: 2026-05-30T05:37:00Z status: closed -->
## Closed
---
id: 20260527-000007-pod-inbound-pod-event-dedup
slug: pod-inbound-pod-event-dedup
title: Inbound PodEvent ハンドリングの重複を統合する
status: closed
kind: task
priority: P2
labels: [migrated]
created_at: 2026-05-27T00:00:07Z
updated_at: 2026-05-30T05:37:00Z
assignee: null
legacy_ticket: tickets/pod-inbound-pod-event-dedup.md
---
## Migration reference
- legacy_ticket: tickets/pod-inbound-pod-event-dedup.md
- migrated_from: TODO.md / tickets directory migration on 2026-05-27
# Inbound PodEvent ハンドリングの重複を統合する
## 背景
子 Pod から `Method::PodEvent(event)` を受けたときの処理が `controller_loop` と `drive_turn` の 2 箇所にコピーされている。
`controller.rs:693-720`(idle / paused 中):
```rust
Method::PodEvent(event) => {
crate::ipc::event::apply_event_side_effects(
&event, &spawned_registry, &spawner_name, &self_parent_socket,
).await;
pod.push_pod_event_notify(event);
if shared_state.get_status() == PodStatus::Idle {
pending = Some(PendingRun::RunForNotification);
}
}
```
`controller.rs:861-879`(in-flight turn 中):
```rust
Some(Method::PodEvent(event)) => {
let self_parent_socket = parent_socket.cloned();
crate::ipc::event::apply_event_side_effects(
&event, spawned_registry, self_name, &self_parent_socket,
).await;
notify_buffer.push_pod_event(event);
}
```
差分は 2 点:
1. **buffer への push 経路**: `pod.push_pod_event_notify(event)` vs `notify_buffer.push_pod_event(event)`。両者は同じ `NotifyBuffer` を叩く(`pod.rs:845-846` は `self.pending_notifies.push_pod_event(event)` を呼ぶだけで、`notify_buffer_handle()` はその `pending_notifies.clone()` を返す)。**完全に等価**。
2. **auto-kick**: idle 経路だけ `PendingRun::RunForNotification` を stage する。in-flight 経路は in-flight 自体が消化するので不要。
つまり「event の処理本体」(side-effects + notify buffer への push)は同一で、後段の auto-kick だけが state-dependent な分岐。にもかかわらず関数化されておらず、片方をいじってもう片方を忘れると挙動が割れる。
## 要件
- side-effects 適用 + NotifyBuffer への typed push の流れを単一関数 `handle_inbound_pod_event` に切り出す。
- `controller_loop` / `drive_turn` の両方からこのヘルパーを呼ぶ形に置き換える。
- auto-kick (`PendingRun::RunForNotification` の stage) は呼び出し側の責務として残す。これは Pod のライフサイクル状態に依存した判断で、ヘルパー内には押し込めない。
- 関数シグネチャは引数を最小化する。`event`、`spawned_registry`、`self_name: &str`、`self_parent_socket: &Option<PathBuf>` または `Option<&PathBuf>`、`notify_buffer: &NotifyBuffer` の 5 つで足りる前提。`Pod` への可変参照は不要(`notify_buffer` で代用可能)。
- 動作変化なし。既存の `Method::PodEvent` 挙動(in-flight / idle 両方)が完全に同一で続行すること。
## 完了条件
- `controller.rs` 内に `apply_event_side_effects` 呼び出しが 1 箇所だけ残り、`controller_loop` と `drive_turn` の `Method::PodEvent` アームはどちらも `handle_inbound_pod_event(...)` 呼び出し + idle 経路のみ auto-kick stage、という形になる。
- 既存の inbound PodEvent 関連テスト(特に `apply_event_side_effects` の idempotency や `notify_buffer` への typed push)が通る。
## 範囲外
- `apply_event_side_effects` 自体の中身変更。
- `NotifyBuffer` API のリネーム / 統合。
- `pod.push_pod_event_notify` の削除([[pod-interrupt-prep-internalize]] と同じく将来の整理対象だが、本チケットでは外向き API は触らない)。
---

View File

@ -0,0 +1,45 @@
---
id: 20260530-053259-multi-pod-parallel-status-probes
slug: multi-pod-parallel-status-probes
title: Parallelize multi-Pod live status probes
status: closed
kind: task
priority: P2
labels: [tui, pod-dashboard, performance]
created_at: 2026-05-30T05:32:59Z
updated_at: 2026-05-30T05:45:37Z
assignee: null
legacy_ticket: null
---
## Background
The `--multi` dashboard frequently shows `[live unknown]` for reachable Pods. Current code probes each runtime-registry socket with a very short `LIVE_STATUS_PROBE_TIMEOUT` of 25ms in `crates/tui/src/pod_list.rs`. A live row becomes `status = None` when the socket connects but no `Event::Snapshot` / `Event::Status` is read before that deadline.
That label is misleading: the Pod is reachable, but status probing timed out or did not receive a status event quickly enough. Raising the timeout alone risks making dashboard reload latency scale linearly with the number of live Pods, because status probes are currently performed sequentially.
## Requirements
- Increase the live status probe timeout to a more realistic value, likely in the 150ms–250ms range.
- Run live status probes concurrently so reload latency does not become the sum of all per-Pod timeouts.
- Keep reachable Pods with missing status as live/attachable; do not treat status timeout as unreachable.
- Keep restoreability separate from live attachability; this ticket must not make runtime-only Pods restorable.
- Replace or soften the `live unknown` label in `--multi` so it communicates reachable-live-with-unreported-status rather than broken state. Candidate labels: `live`, `live probing`, or similar.
- Keep the implementation in shared `PodList` / live probe code where possible; avoid duplicating dashboard-specific discovery logic.
- Preserve existing behavior for explicitly reported `Idle`, `Running`, and `Paused` statuses.
## Non-goals
- Do not redesign Pod notification or run completion delivery.
- Do not persist last-known status in pod-store.
- Do not change `AttachOrRestorePod` or restore semantics.
- Do not make unreachable registry allocations appear attachable.
## Acceptance criteria
- Multiple live Pod status probes wait concurrently, not strictly one after another.
- The per-Pod timeout is long enough to significantly reduce false `status = None` cases compared to 25ms.
- A reachable Pod whose status probe times out remains displayed as live and openable/attachable.
- The multi-Pod row label for `status = None` is less misleading than `live unknown`.
- Tests cover concurrent probing behavior, timeout/none-status handling, and label rendering.
- `cargo test -p tui pod_list`, `cargo test -p tui multi_pod`, `cargo test -p tui`, `cargo fmt --check`, and `./tickets.sh doctor` pass.

View File

@ -0,0 +1,45 @@
---
id: 20260530-053259-multi-pod-parallel-status-probes
slug: multi-pod-parallel-status-probes
title: Parallelize multi-Pod live status probes
status: closed
kind: task
priority: P2
labels: [tui, pod-dashboard, performance]
created_at: 2026-05-30T05:32:59Z
updated_at: 2026-05-30T05:45:37Z
assignee: null
legacy_ticket: null
---
## Background
The `--multi` dashboard frequently shows `[live unknown]` for reachable Pods. Current code probes each runtime-registry socket with a very short `LIVE_STATUS_PROBE_TIMEOUT` of 25ms in `crates/tui/src/pod_list.rs`. A live row becomes `status = None` when the socket connects but no `Event::Snapshot` / `Event::Status` is read before that deadline.
That label is misleading: the Pod is reachable, but status probing timed out or did not receive a status event quickly enough. Raising the timeout alone risks making dashboard reload latency scale linearly with the number of live Pods, because status probes are currently performed sequentially.
## Requirements
- Increase the live status probe timeout to a more realistic value, likely in the 150ms–250ms range.
- Run live status probes concurrently so reload latency does not become the sum of all per-Pod timeouts.
- Keep reachable Pods with missing status as live/attachable; do not treat status timeout as unreachable.
- Keep restoreability separate from live attachability; this ticket must not make runtime-only Pods restorable.
- Replace or soften the `live unknown` label in `--multi` so it communicates reachable-live-with-unreported-status rather than broken state. Candidate labels: `live`, `live probing`, or similar.
- Keep the implementation in shared `PodList` / live probe code where possible; avoid duplicating dashboard-specific discovery logic.
- Preserve existing behavior for explicitly reported `Idle`, `Running`, and `Paused` statuses.
## Non-goals
- Do not redesign Pod notification or run completion delivery.
- Do not persist last-known status in pod-store.
- Do not change `AttachOrRestorePod` or restore semantics.
- Do not make unreachable registry allocations appear attachable.
## Acceptance criteria
- Multiple live Pod status probes wait concurrently, not strictly one after another.
- The per-Pod timeout is long enough to significantly reduce false `status = None` cases compared to 25ms.
- A reachable Pod whose status probe times out remains displayed as live and openable/attachable.
- The multi-Pod row label for `status = None` is less misleading than `live unknown`.
- Tests cover concurrent probing behavior, timeout/none-status handling, and label rendering.
- `cargo test -p tui pod_list`, `cargo test -p tui multi_pod`, `cargo test -p tui`, `cargo fmt --check`, and `./tickets.sh doctor` pass.

View File

@ -0,0 +1,116 @@
<!-- event: create author: tickets.sh at: 2026-05-30T05:32:59Z -->
## Created
Created by tickets.sh create.
---
<!-- event: plan author: hare at: 2026-05-30T05:33:43Z -->
## Plan
## Preflight
Classification: implementation-ready.
The issue is localized to TUI live status probing and dashboard labeling. The product decision is settled: reachable Pods with missing status remain live/attachable, while status probe timeout should be less frequent and less alarming. No restore semantics or Pod runtime protocol changes are required.
Current code map:
- `crates/tui/src/pod_list.rs`: `LIVE_STATUS_PROBE_TIMEOUT`, `load_live_pods`, `probe_live_status`, `LivePodInfo`, `PodList` merge/sort tests.
- `crates/tui/src/multi_pod.rs`: row label rendering for live status, currently `live unknown` when `LivePodInfo.status` is `None`.
Critical risks:
- Do not make reload latency scale as N * timeout.
- Do not treat `status = None` as unreachable.
- Do not change restoreability/attachability semantics.
- Avoid duplicating discovery logic outside shared `PodList`.
---
<!-- event: review author: hare at: 2026-05-30T05:45:14Z status: approve -->
## Review: approve
Approve.
The implementation addresses the ticket in the shared `PodList` live-probe path. Live status probes now run concurrently with a more realistic per-Pod timeout, reachable sockets without reported status remain live/attachable, and the multi-Pod label is softened from `live unknown` to `live` while explicit statuses remain unchanged.
Blocker findings: none.
Requirement coverage:
- Per-Pod timeout increased to 200ms.
- Probes are concurrent and bounded by the slowest bounded probe rather than `N * timeout`.
- Reachable sockets with no status remain reachable/live/attachable.
- Restoreability remains separate and unchanged.
- No last-known status is persisted to pod-store.
- Explicit `live idle`, `live running`, and `live paused` labels remain.
- `status = None` renders as `live`.
- The implementation changes shared `PodList` live-probe logic rather than duplicating dashboard-only logic.
- Tests cover concurrency, timeout/no-status reachable handling, and label rendering.
Validation reviewed:
- `cargo fmt --check` — passed.
- `cargo test -p tui pod_list` — passed.
- `cargo test -p tui multi_pod` — passed.
- `cargo test -p tui` — passed.
- Reviewer additionally ran `./tickets.sh doctor` — passed.
Final verdict: approve.
---
<!-- event: close author: hare at: 2026-05-30T05:45:37Z status: closed -->
## Closed
---
id: 20260530-053259-multi-pod-parallel-status-probes
slug: multi-pod-parallel-status-probes
title: Parallelize multi-Pod live status probes
status: closed
kind: task
priority: P2
labels: [tui, pod-dashboard, performance]
created_at: 2026-05-30T05:32:59Z
updated_at: 2026-05-30T05:45:37Z
assignee: null
legacy_ticket: null
---
## Background
The `--multi` dashboard frequently shows `[live unknown]` for reachable Pods. Current code probes each runtime-registry socket with a very short `LIVE_STATUS_PROBE_TIMEOUT` of 25ms in `crates/tui/src/pod_list.rs`. A live row becomes `status = None` when the socket connects but no `Event::Snapshot` / `Event::Status` is read before that deadline.
That label is misleading: the Pod is reachable, but status probing timed out or did not receive a status event quickly enough. Raising the timeout alone risks making dashboard reload latency scale linearly with the number of live Pods, because status probes are currently performed sequentially.
## Requirements
- Increase the live status probe timeout to a more realistic value, likely in the 150ms–250ms range.
- Run live status probes concurrently so reload latency does not become the sum of all per-Pod timeouts.
- Keep reachable Pods with missing status as live/attachable; do not treat status timeout as unreachable.
- Keep restoreability separate from live attachability; this ticket must not make runtime-only Pods restorable.
- Replace or soften the `live unknown` label in `--multi` so it communicates reachable-live-with-unreported-status rather than broken state. Candidate labels: `live`, `live probing`, or similar.
- Keep the implementation in shared `PodList` / live probe code where possible; avoid duplicating dashboard-specific discovery logic.
- Preserve existing behavior for explicitly reported `Idle`, `Running`, and `Paused` statuses.
## Non-goals
- Do not redesign Pod notification or run completion delivery.
- Do not persist last-known status in pod-store.
- Do not change `AttachOrRestorePod` or restore semantics.
- Do not make unreachable registry allocations appear attachable.
## Acceptance criteria
- Multiple live Pod status probes wait concurrently, not strictly one after another.
- The per-Pod timeout is long enough to significantly reduce false `status = None` cases compared to 25ms.
- A reachable Pod whose status probe times out remains displayed as live and openable/attachable.
- The multi-Pod row label for `status = None` is less misleading than `live unknown`.
- Tests cover concurrent probing behavior, timeout/none-status handling, and label rendering.
- `cargo test -p tui pod_list`, `cargo test -p tui multi_pod`, `cargo test -p tui`, `cargo fmt --check`, and `./tickets.sh doctor` pass.
---

View File

@ -1,7 +0,0 @@
<!-- event: migration author: tickets.sh-migration at: 2026-05-27T00:00:07Z -->
## Migrated
Migrated from tickets/pod-inbound-pod-event-dedup.md. No legacy review file was present at migration time.
---

View File

@ -0,0 +1,74 @@
---
id: 20260530-053721-tui-inflight-composer-injection
slug: tui-inflight-composer-injection
title: Support immediate in-flight TUI composer injection
status: open
kind: feature
priority: P2
labels: [tui, worker, interrupt, ux]
created_at: 2026-05-30T05:37:21Z
updated_at: 2026-05-30T05:38:11Z
assignee: null
legacy_ticket: null
---
## Background
The TUI currently lets the user press Enter while a Pod is executing, but that input is queued for the next turn. This is useful when the user wants to continue the task after the current run finishes.
There is a separate UX need: while the model is in the middle of a long run with tool calls, the user may want to send urgent supplemental context that should be seen as soon as possible, ideally between tool calls / LLM calls during the current run. This is different from ordinary queued input.
We want both modes:
- **after-run queue**: “when this task finishes, continue with this next request.”
- **in-flight injection**: “while you are still working, please incorporate this additional context as soon as safe.”
This ticket is for designing and implementing an explicit TUI path for the second mode without breaking the existing queued-input behavior.
## Requirements
- Preserve the current Enter-while-running behavior as the after-run queue.
- Add an explicit user action / keybinding / command for immediate in-flight injection while a run is active.
- In-flight injected text must be delivered through the Pod/Worker history path, not as hidden context-only injection. It must satisfy the project principle that new input placed into LLM context is first appended to `worker.history` / persisted history.
- In-flight injection should be consumed at safe boundaries, such as before the next LLM request or between tool-call cycles, not by mutating an already-open provider stream.
- The UI must make the distinction visible: queued-for-next-turn vs injected-into-current-run.
- If no run is active, the immediate-injection action should either behave like normal submit or clearly report that there is no in-flight run to inject into.
- If the current turn cannot accept in-flight input at a safe boundary, the UI should fail closed or fall back to explicit queued mode with a visible notice; do not silently drop input.
- Preserve TUI-local input history behavior for submitted/queued text.
## Non-goals
- Do not interrupt/cancel the current run as part of this ticket.
- Do not mutate provider streams already in progress.
- Do not introduce hidden system-reminder/context-only messages that are not recorded in history.
- Do not remove the existing queued composer behavior.
- Do not redesign the entire Pod notification/input protocol unless a small typed Method/Event extension is required.
## Open design questions
- What should the TUI action be?
- Separate command such as `:inject`?
- Modified Enter keybinding such as Ctrl+Enter / Alt+Enter?
- Action menu entry?
- What Pod protocol shape is best?
- Existing `Method::Notify` may already represent in-flight user-visible context, but semantics must be checked.
- A new typed method such as `Method::InjectInput` may be clearer if `Notify` is too generic.
- What history item should represent the injected text?
- User item?
- System item with user-originated note?
- Existing Notify / PodEvent item?
- What exact safe boundaries are supported in `Worker` / controller today?
- before the next LLM request;
- before resuming after tool results;
- while a tool call is running;
- while provider stream is open.
- How should the UI display pending in-flight injection versus after-run queue?
## Acceptance criteria
- TUI users can choose between after-run queued submit and immediate in-flight injection while a Pod is running.
- In-flight injected input is recorded in history before it can influence an LLM request.
- In-flight injection is consumed only at safe boundaries and never mutates an active provider stream.
- The TUI visibly distinguishes queued-next-turn input from injected-current-run input.
- Existing queued Enter behavior remains intact.
- Tests cover TUI input routing, protocol/controller handling, worker history append behavior, and safe-boundary behavior.

View File

@ -0,0 +1,33 @@
<!-- event: create author: tickets.sh at: 2026-05-30T05:37:21Z -->
## Created
Created by tickets.sh create.
---
<!-- event: plan author: hare at: 2026-05-30T05:38:11Z -->
## Plan
## Initial preflight
Classification: requirements-sync-needed.
The user requirement is clear at the UX level: Enter while running remains an after-run queue, and a separate action should inject supplemental context into the current in-flight run as soon as safe. The exact protocol/history representation is not decided yet and must be designed before implementation.
Critical constraints:
- Do not place injected text into LLM context unless it has first been appended to Worker history / persisted history.
- Do not mutate an active provider stream.
- Consume injected text only at safe boundaries such as before a later LLM request or between tool-call cycles.
- Do not silently drop text; if the active turn cannot accept injection, report/fail closed or explicitly queue.
Design questions to settle before coding:
- TUI action/keybinding/command name.
- Whether existing `Method::Notify` is semantically sufficient or a new typed method is needed.
- Which history item represents user-originated in-flight supplemental context.
- Which Worker/controller boundaries can actually observe injected input before the next LLM call.
- How queued-next-turn vs injected-current-run is displayed.
---