fix: harden StopPod registry cleanup

This commit is contained in:
Keisuke Hirata 2026-06-24 00:23:10 +09:00
parent b28d64c3c6
commit 1ca5663298
No known key found for this signature in database
6 changed files with 94 additions and 31 deletions

View File

@ -31,6 +31,7 @@ Podの状態から純粋に再現可能で、且つ揮発性の無い操作で
明示的に指示されない限り、読み取り以外の操作は控えること。
基本はworktree上の一時的なブランチでコミットを重ね、メインブランチに取り込む運用をしている。
Orchestrator の cwd が orchestration 用ブランチ/worktree の場合、通常作業では親ブランチの dirty state を気にしない。
コミットメッセージは適当に`<prefix>: *簡潔な1行*`で書いている。
外部の参考プロジェクトは必要に応じてローカルの外部 checkout からReadすること。

View File

@ -547,9 +547,6 @@ fn append_operation_targets(out: &mut String, context: &TicketRoleLaunchContext)
if context.role != TicketRole::Orchestrator {
return;
}
if context.original_workspace_root.is_none() && context.target_workspace_root.is_none() {
return;
}
out.push_str("\nOrchestrator operation targets:\n");
push_bounded_bullet(
@ -557,13 +554,6 @@ fn append_operation_targets(out: &mut String, context: &TicketRoleLaunchContext)
"implementation_worktree_root",
&context.implementation_worktree_root().display().to_string(),
);
if context.target_workspace_root.is_some() {
push_bounded_bullet(
out,
"merge_target_workspace_root",
&context.target_workspace_root().display().to_string(),
);
}
}
fn default_pod_name(role: TicketRole, ticket: Option<&TicketRef>) -> String {
@ -706,6 +696,7 @@ mod tests {
context_tokens: 0,
},
status: PodStatus::Idle,
in_flight: protocol::InFlightSnapshot::default(),
}
}
@ -1152,7 +1143,7 @@ workflow = "ticket-review-workflow"
assert!(text.contains("Orchestrator operation targets:"));
assert!(text.contains("implementation_worktree_root"));
assert!(text.contains("merge_target_workspace_root"));
assert!(!text.contains("merge_target_workspace_root"));
assert!(!text.contains("Workspace routing context:"));
assert!(!text.contains("role_workspace_root"));
assert!(!text.contains("role_cwd"));

View File

@ -4,12 +4,17 @@ use std::fs::{DirBuilder, File, OpenOptions};
use std::io::{self, Read, Seek, SeekFrom, Write};
use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt};
use std::path::{Path, PathBuf};
use std::thread;
use std::time::{Duration, Instant};
use fs4::fs_std::FileExt;
use manifest::{ScopeRule, paths};
use serde::{Deserialize, Serialize};
use session_store::SegmentId;
const LOCK_WAIT_TIMEOUT: Duration = Duration::from_secs(10);
const LOCK_WAIT_POLL_INTERVAL: Duration = Duration::from_millis(25);
/// On-disk representation of the allocation table.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LockFile {
@ -119,7 +124,37 @@ impl LockFileGuard {
.truncate(false)
.mode(0o600)
.open(path)?;
FileExt::lock_exclusive(&file)?;
let started = Instant::now();
loop {
match FileExt::try_lock_exclusive(&file) {
Ok(true) => break,
Ok(false) => {
if started.elapsed() >= LOCK_WAIT_TIMEOUT {
return Err(io::Error::new(
io::ErrorKind::TimedOut,
format!(
"timed out waiting for pod registry lock `{}`",
path.display()
),
));
}
thread::sleep(LOCK_WAIT_POLL_INTERVAL);
}
Err(error) if error.kind() == io::ErrorKind::WouldBlock => {
if started.elapsed() >= LOCK_WAIT_TIMEOUT {
return Err(io::Error::new(
io::ErrorKind::TimedOut,
format!(
"timed out waiting for pod registry lock `{}`",
path.display()
),
));
}
thread::sleep(LOCK_WAIT_POLL_INTERVAL);
}
Err(error) => return Err(error),
}
}
let mut this = Self {
file,
data: LockFile::default(),

View File

@ -35,10 +35,12 @@ type RegistryStateWriter = Arc<dyn Fn(&[SpawnedPodRecord]) -> io::Result<()> + S
type RegistryReclaimWriter = Arc<dyn Fn(&SpawnedPodRecord) -> io::Result<()> + Send + Sync>;
const RESTORE_REACHABILITY_TIMEOUT: Duration = Duration::from_millis(500);
const REGISTRY_CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
pub struct SpawnedPodRegistry {
records: Mutex<Vec<SpawnedPodRecord>>,
cursors: Mutex<HashMap<String, usize>>,
mutations: Mutex<()>,
runtime_dir: Arc<RuntimeDir>,
state_writer: Option<RegistryStateWriter>,
reclaim_writer: Option<RegistryReclaimWriter>,
@ -56,6 +58,7 @@ impl SpawnedPodRegistry {
Arc::new(Self {
records: Mutex::new(Vec::new()),
cursors: Mutex::new(HashMap::new()),
mutations: Mutex::new(()),
runtime_dir,
state_writer: None,
reclaim_writer: None,
@ -164,6 +167,7 @@ impl SpawnedPodRegistry {
registry: Arc::new(Self {
records: Mutex::new(records),
cursors: Mutex::new(HashMap::new()),
mutations: Mutex::new(()),
runtime_dir,
state_writer: Some(state_writer),
reclaim_writer: Some(reclaim_writer),
@ -178,9 +182,13 @@ impl SpawnedPodRegistry {
/// error if either persisted write fails; the in-memory state is still
/// updated in that case — the next successful write will reconcile.
pub async fn add(&self, record: SpawnedPodRecord) -> io::Result<()> {
let mut records = self.records.lock().await;
records.push(record);
self.persist_records(records.as_slice()).await
let _mutation = self.mutations.lock().await;
let snapshot = {
let mut records = self.records.lock().await;
records.push(record);
records.clone()
};
self.persist_records(&snapshot).await
}
/// Look up a record by pod name. Cloned so callers can drop the lock.
@ -201,29 +209,39 @@ impl SpawnedPodRegistry {
/// reclaim any delegated Write scope owned by that child. Returns the
/// removed record (if any).
pub async fn remove(&self, pod_name: &str) -> io::Result<Option<SpawnedPodRecord>> {
let removed = {
let _mutation = self.mutations.lock().await;
let (removed, snapshot) = {
let mut records = self.records.lock().await;
let idx = records.iter().position(|r| r.pod_name == pod_name);
let removed = idx.map(|i| records.remove(i));
self.persist_records(records.as_slice()).await?;
removed
let snapshot = records.clone();
(removed, snapshot)
};
self.persist_records(&snapshot).await?;
self.cursors.lock().await.remove(pod_name);
if let Some(record) = &removed {
self.reclaim_record(record)?;
if let Some(write_reclaim) = &self.reclaim_writer {
write_reclaim(record)?;
}
self.reclaim_removed_record(record.clone()).await?;
}
Ok(removed)
}
fn reclaim_record(&self, record: &SpawnedPodRecord) -> io::Result<()> {
let Some(parent_name) = &self.parent_name else {
release_child_allocation(&record.pod_name)?;
return Ok(());
};
reclaim_record(parent_name, self.parent_scope.as_ref(), record)
async fn reclaim_removed_record(&self, record: SpawnedPodRecord) -> io::Result<()> {
let parent_name = self.parent_name.clone();
let parent_scope = self.parent_scope.clone();
let reclaim_writer = self.reclaim_writer.clone();
let pod_name = record.pod_name.clone();
let reclaim = tokio::task::spawn_blocking(move || {
reclaim_removed_record_blocking(parent_name, parent_scope, reclaim_writer, record)
});
tokio::time::timeout(REGISTRY_CLEANUP_TIMEOUT, reclaim)
.await
.map_err(|_| {
io::Error::new(
io::ErrorKind::TimedOut,
format!("timed out reclaiming spawned pod `{pod_name}`"),
)
})?
.map_err(|err| io::Error::other(format!("spawned-pod reclaim task failed: {err}")))?
}
/// Read-only cursor lookup. Returns 0 when no cursor has been set.
@ -288,6 +306,23 @@ where
})
}
fn reclaim_removed_record_blocking(
parent_name: Option<String>,
parent_scope: Option<SharedScope>,
reclaim_writer: Option<RegistryReclaimWriter>,
record: SpawnedPodRecord,
) -> io::Result<()> {
if let Some(parent_name) = parent_name {
reclaim_record(&parent_name, parent_scope.as_ref(), &record)?;
} else {
release_child_allocation(&record.pod_name)?;
}
if let Some(write_reclaim) = reclaim_writer {
write_reclaim(&record)?;
}
Ok(())
}
fn reclaim_record(
parent_name: &str,
parent_scope: Option<&SharedScope>,

View File

@ -2890,7 +2890,7 @@ fn build_orchestrator_launch_context(
pod_name: &str,
) -> TicketRoleLaunchContext {
let mut context = TicketRoleLaunchContext::new(
orchestration_workspace_root.to_path_buf(),
original_workspace_root.to_path_buf(),
TicketRole::Orchestrator,
)
.with_cwd(orchestration_workspace_root.to_path_buf())

View File

@ -16,14 +16,15 @@ fn orchestration_worktree_layout_is_stable_under_original_workspace_root() {
}
#[test]
fn orchestrator_launch_context_uses_orchestration_root_for_runtime_workspace() {
fn orchestrator_launch_context_uses_original_root_for_runtime_workspace_and_worktree_cwd() {
let original = PathBuf::from("/repo/yoi");
let orchestration = original
.join(".worktree")
.join("orchestration")
.join("yoi-orchestrator");
let context = build_orchestrator_launch_context(&original, &orchestration, "yoi-orchestrator");
assert_eq!(context.workspace_root, orchestration);
assert_eq!(context.workspace_root, original);
assert_eq!(context.cwd.as_deref(), Some(orchestration.as_path()));
assert_eq!(
context.original_workspace_root.as_deref(),
Some(original.as_path())