fix: harden StopPod registry cleanup
This commit is contained in:
parent
b28d64c3c6
commit
1ca5663298
|
|
@ -31,6 +31,7 @@ Podの状態から純粋に再現可能で、且つ揮発性の無い操作で
|
||||||
|
|
||||||
明示的に指示されない限り、読み取り以外の操作は控えること。
|
明示的に指示されない限り、読み取り以外の操作は控えること。
|
||||||
基本はworktree上の一時的なブランチでコミットを重ね、メインブランチに取り込む運用をしている。
|
基本はworktree上の一時的なブランチでコミットを重ね、メインブランチに取り込む運用をしている。
|
||||||
|
Orchestrator の cwd が orchestration 用ブランチ/worktree の場合、通常作業では親ブランチの dirty state を気にしない。
|
||||||
コミットメッセージは適当に`<prefix>: *簡潔な1行*`で書いている。
|
コミットメッセージは適当に`<prefix>: *簡潔な1行*`で書いている。
|
||||||
|
|
||||||
外部の参考プロジェクトは必要に応じてローカルの外部 checkout からReadすること。
|
外部の参考プロジェクトは必要に応じてローカルの外部 checkout からReadすること。
|
||||||
|
|
|
||||||
|
|
@ -547,9 +547,6 @@ fn append_operation_targets(out: &mut String, context: &TicketRoleLaunchContext)
|
||||||
if context.role != TicketRole::Orchestrator {
|
if context.role != TicketRole::Orchestrator {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if context.original_workspace_root.is_none() && context.target_workspace_root.is_none() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
out.push_str("\nOrchestrator operation targets:\n");
|
out.push_str("\nOrchestrator operation targets:\n");
|
||||||
push_bounded_bullet(
|
push_bounded_bullet(
|
||||||
|
|
@ -557,13 +554,6 @@ fn append_operation_targets(out: &mut String, context: &TicketRoleLaunchContext)
|
||||||
"implementation_worktree_root",
|
"implementation_worktree_root",
|
||||||
&context.implementation_worktree_root().display().to_string(),
|
&context.implementation_worktree_root().display().to_string(),
|
||||||
);
|
);
|
||||||
if context.target_workspace_root.is_some() {
|
|
||||||
push_bounded_bullet(
|
|
||||||
out,
|
|
||||||
"merge_target_workspace_root",
|
|
||||||
&context.target_workspace_root().display().to_string(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_pod_name(role: TicketRole, ticket: Option<&TicketRef>) -> String {
|
fn default_pod_name(role: TicketRole, ticket: Option<&TicketRef>) -> String {
|
||||||
|
|
@ -706,6 +696,7 @@ mod tests {
|
||||||
context_tokens: 0,
|
context_tokens: 0,
|
||||||
},
|
},
|
||||||
status: PodStatus::Idle,
|
status: PodStatus::Idle,
|
||||||
|
in_flight: protocol::InFlightSnapshot::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1152,7 +1143,7 @@ workflow = "ticket-review-workflow"
|
||||||
|
|
||||||
assert!(text.contains("Orchestrator operation targets:"));
|
assert!(text.contains("Orchestrator operation targets:"));
|
||||||
assert!(text.contains("implementation_worktree_root"));
|
assert!(text.contains("implementation_worktree_root"));
|
||||||
assert!(text.contains("merge_target_workspace_root"));
|
assert!(!text.contains("merge_target_workspace_root"));
|
||||||
assert!(!text.contains("Workspace routing context:"));
|
assert!(!text.contains("Workspace routing context:"));
|
||||||
assert!(!text.contains("role_workspace_root"));
|
assert!(!text.contains("role_workspace_root"));
|
||||||
assert!(!text.contains("role_cwd"));
|
assert!(!text.contains("role_cwd"));
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,17 @@ use std::fs::{DirBuilder, File, OpenOptions};
|
||||||
use std::io::{self, Read, Seek, SeekFrom, Write};
|
use std::io::{self, Read, Seek, SeekFrom, Write};
|
||||||
use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt};
|
use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::thread;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use fs4::fs_std::FileExt;
|
use fs4::fs_std::FileExt;
|
||||||
use manifest::{ScopeRule, paths};
|
use manifest::{ScopeRule, paths};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use session_store::SegmentId;
|
use session_store::SegmentId;
|
||||||
|
|
||||||
|
const LOCK_WAIT_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
const LOCK_WAIT_POLL_INTERVAL: Duration = Duration::from_millis(25);
|
||||||
|
|
||||||
/// On-disk representation of the allocation table.
|
/// On-disk representation of the allocation table.
|
||||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
pub struct LockFile {
|
pub struct LockFile {
|
||||||
|
|
@ -119,7 +124,37 @@ impl LockFileGuard {
|
||||||
.truncate(false)
|
.truncate(false)
|
||||||
.mode(0o600)
|
.mode(0o600)
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
FileExt::lock_exclusive(&file)?;
|
let started = Instant::now();
|
||||||
|
loop {
|
||||||
|
match FileExt::try_lock_exclusive(&file) {
|
||||||
|
Ok(true) => break,
|
||||||
|
Ok(false) => {
|
||||||
|
if started.elapsed() >= LOCK_WAIT_TIMEOUT {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::TimedOut,
|
||||||
|
format!(
|
||||||
|
"timed out waiting for pod registry lock `{}`",
|
||||||
|
path.display()
|
||||||
|
),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
thread::sleep(LOCK_WAIT_POLL_INTERVAL);
|
||||||
|
}
|
||||||
|
Err(error) if error.kind() == io::ErrorKind::WouldBlock => {
|
||||||
|
if started.elapsed() >= LOCK_WAIT_TIMEOUT {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::TimedOut,
|
||||||
|
format!(
|
||||||
|
"timed out waiting for pod registry lock `{}`",
|
||||||
|
path.display()
|
||||||
|
),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
thread::sleep(LOCK_WAIT_POLL_INTERVAL);
|
||||||
|
}
|
||||||
|
Err(error) => return Err(error),
|
||||||
|
}
|
||||||
|
}
|
||||||
let mut this = Self {
|
let mut this = Self {
|
||||||
file,
|
file,
|
||||||
data: LockFile::default(),
|
data: LockFile::default(),
|
||||||
|
|
|
||||||
|
|
@ -35,10 +35,12 @@ type RegistryStateWriter = Arc<dyn Fn(&[SpawnedPodRecord]) -> io::Result<()> + S
|
||||||
type RegistryReclaimWriter = Arc<dyn Fn(&SpawnedPodRecord) -> io::Result<()> + Send + Sync>;
|
type RegistryReclaimWriter = Arc<dyn Fn(&SpawnedPodRecord) -> io::Result<()> + Send + Sync>;
|
||||||
|
|
||||||
const RESTORE_REACHABILITY_TIMEOUT: Duration = Duration::from_millis(500);
|
const RESTORE_REACHABILITY_TIMEOUT: Duration = Duration::from_millis(500);
|
||||||
|
const REGISTRY_CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
|
||||||
|
|
||||||
pub struct SpawnedPodRegistry {
|
pub struct SpawnedPodRegistry {
|
||||||
records: Mutex<Vec<SpawnedPodRecord>>,
|
records: Mutex<Vec<SpawnedPodRecord>>,
|
||||||
cursors: Mutex<HashMap<String, usize>>,
|
cursors: Mutex<HashMap<String, usize>>,
|
||||||
|
mutations: Mutex<()>,
|
||||||
runtime_dir: Arc<RuntimeDir>,
|
runtime_dir: Arc<RuntimeDir>,
|
||||||
state_writer: Option<RegistryStateWriter>,
|
state_writer: Option<RegistryStateWriter>,
|
||||||
reclaim_writer: Option<RegistryReclaimWriter>,
|
reclaim_writer: Option<RegistryReclaimWriter>,
|
||||||
|
|
@ -56,6 +58,7 @@ impl SpawnedPodRegistry {
|
||||||
Arc::new(Self {
|
Arc::new(Self {
|
||||||
records: Mutex::new(Vec::new()),
|
records: Mutex::new(Vec::new()),
|
||||||
cursors: Mutex::new(HashMap::new()),
|
cursors: Mutex::new(HashMap::new()),
|
||||||
|
mutations: Mutex::new(()),
|
||||||
runtime_dir,
|
runtime_dir,
|
||||||
state_writer: None,
|
state_writer: None,
|
||||||
reclaim_writer: None,
|
reclaim_writer: None,
|
||||||
|
|
@ -164,6 +167,7 @@ impl SpawnedPodRegistry {
|
||||||
registry: Arc::new(Self {
|
registry: Arc::new(Self {
|
||||||
records: Mutex::new(records),
|
records: Mutex::new(records),
|
||||||
cursors: Mutex::new(HashMap::new()),
|
cursors: Mutex::new(HashMap::new()),
|
||||||
|
mutations: Mutex::new(()),
|
||||||
runtime_dir,
|
runtime_dir,
|
||||||
state_writer: Some(state_writer),
|
state_writer: Some(state_writer),
|
||||||
reclaim_writer: Some(reclaim_writer),
|
reclaim_writer: Some(reclaim_writer),
|
||||||
|
|
@ -178,9 +182,13 @@ impl SpawnedPodRegistry {
|
||||||
/// error if either persisted write fails; the in-memory state is still
|
/// error if either persisted write fails; the in-memory state is still
|
||||||
/// updated in that case — the next successful write will reconcile.
|
/// updated in that case — the next successful write will reconcile.
|
||||||
pub async fn add(&self, record: SpawnedPodRecord) -> io::Result<()> {
|
pub async fn add(&self, record: SpawnedPodRecord) -> io::Result<()> {
|
||||||
|
let _mutation = self.mutations.lock().await;
|
||||||
|
let snapshot = {
|
||||||
let mut records = self.records.lock().await;
|
let mut records = self.records.lock().await;
|
||||||
records.push(record);
|
records.push(record);
|
||||||
self.persist_records(records.as_slice()).await
|
records.clone()
|
||||||
|
};
|
||||||
|
self.persist_records(&snapshot).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Look up a record by pod name. Cloned so callers can drop the lock.
|
/// Look up a record by pod name. Cloned so callers can drop the lock.
|
||||||
|
|
@ -201,29 +209,39 @@ impl SpawnedPodRegistry {
|
||||||
/// reclaim any delegated Write scope owned by that child. Returns the
|
/// reclaim any delegated Write scope owned by that child. Returns the
|
||||||
/// removed record (if any).
|
/// removed record (if any).
|
||||||
pub async fn remove(&self, pod_name: &str) -> io::Result<Option<SpawnedPodRecord>> {
|
pub async fn remove(&self, pod_name: &str) -> io::Result<Option<SpawnedPodRecord>> {
|
||||||
let removed = {
|
let _mutation = self.mutations.lock().await;
|
||||||
|
let (removed, snapshot) = {
|
||||||
let mut records = self.records.lock().await;
|
let mut records = self.records.lock().await;
|
||||||
let idx = records.iter().position(|r| r.pod_name == pod_name);
|
let idx = records.iter().position(|r| r.pod_name == pod_name);
|
||||||
let removed = idx.map(|i| records.remove(i));
|
let removed = idx.map(|i| records.remove(i));
|
||||||
self.persist_records(records.as_slice()).await?;
|
let snapshot = records.clone();
|
||||||
removed
|
(removed, snapshot)
|
||||||
};
|
};
|
||||||
|
self.persist_records(&snapshot).await?;
|
||||||
self.cursors.lock().await.remove(pod_name);
|
self.cursors.lock().await.remove(pod_name);
|
||||||
if let Some(record) = &removed {
|
if let Some(record) = &removed {
|
||||||
self.reclaim_record(record)?;
|
self.reclaim_removed_record(record.clone()).await?;
|
||||||
if let Some(write_reclaim) = &self.reclaim_writer {
|
|
||||||
write_reclaim(record)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(removed)
|
Ok(removed)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reclaim_record(&self, record: &SpawnedPodRecord) -> io::Result<()> {
|
async fn reclaim_removed_record(&self, record: SpawnedPodRecord) -> io::Result<()> {
|
||||||
let Some(parent_name) = &self.parent_name else {
|
let parent_name = self.parent_name.clone();
|
||||||
release_child_allocation(&record.pod_name)?;
|
let parent_scope = self.parent_scope.clone();
|
||||||
return Ok(());
|
let reclaim_writer = self.reclaim_writer.clone();
|
||||||
};
|
let pod_name = record.pod_name.clone();
|
||||||
reclaim_record(parent_name, self.parent_scope.as_ref(), record)
|
let reclaim = tokio::task::spawn_blocking(move || {
|
||||||
|
reclaim_removed_record_blocking(parent_name, parent_scope, reclaim_writer, record)
|
||||||
|
});
|
||||||
|
tokio::time::timeout(REGISTRY_CLEANUP_TIMEOUT, reclaim)
|
||||||
|
.await
|
||||||
|
.map_err(|_| {
|
||||||
|
io::Error::new(
|
||||||
|
io::ErrorKind::TimedOut,
|
||||||
|
format!("timed out reclaiming spawned pod `{pod_name}`"),
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.map_err(|err| io::Error::other(format!("spawned-pod reclaim task failed: {err}")))?
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read-only cursor lookup. Returns 0 when no cursor has been set.
|
/// Read-only cursor lookup. Returns 0 when no cursor has been set.
|
||||||
|
|
@ -288,6 +306,23 @@ where
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn reclaim_removed_record_blocking(
|
||||||
|
parent_name: Option<String>,
|
||||||
|
parent_scope: Option<SharedScope>,
|
||||||
|
reclaim_writer: Option<RegistryReclaimWriter>,
|
||||||
|
record: SpawnedPodRecord,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
if let Some(parent_name) = parent_name {
|
||||||
|
reclaim_record(&parent_name, parent_scope.as_ref(), &record)?;
|
||||||
|
} else {
|
||||||
|
release_child_allocation(&record.pod_name)?;
|
||||||
|
}
|
||||||
|
if let Some(write_reclaim) = reclaim_writer {
|
||||||
|
write_reclaim(&record)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn reclaim_record(
|
fn reclaim_record(
|
||||||
parent_name: &str,
|
parent_name: &str,
|
||||||
parent_scope: Option<&SharedScope>,
|
parent_scope: Option<&SharedScope>,
|
||||||
|
|
|
||||||
|
|
@ -2890,7 +2890,7 @@ fn build_orchestrator_launch_context(
|
||||||
pod_name: &str,
|
pod_name: &str,
|
||||||
) -> TicketRoleLaunchContext {
|
) -> TicketRoleLaunchContext {
|
||||||
let mut context = TicketRoleLaunchContext::new(
|
let mut context = TicketRoleLaunchContext::new(
|
||||||
orchestration_workspace_root.to_path_buf(),
|
original_workspace_root.to_path_buf(),
|
||||||
TicketRole::Orchestrator,
|
TicketRole::Orchestrator,
|
||||||
)
|
)
|
||||||
.with_cwd(orchestration_workspace_root.to_path_buf())
|
.with_cwd(orchestration_workspace_root.to_path_buf())
|
||||||
|
|
|
||||||
|
|
@ -16,14 +16,15 @@ fn orchestration_worktree_layout_is_stable_under_original_workspace_root() {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn orchestrator_launch_context_uses_orchestration_root_for_runtime_workspace() {
|
fn orchestrator_launch_context_uses_original_root_for_runtime_workspace_and_worktree_cwd() {
|
||||||
let original = PathBuf::from("/repo/yoi");
|
let original = PathBuf::from("/repo/yoi");
|
||||||
let orchestration = original
|
let orchestration = original
|
||||||
.join(".worktree")
|
.join(".worktree")
|
||||||
.join("orchestration")
|
.join("orchestration")
|
||||||
.join("yoi-orchestrator");
|
.join("yoi-orchestrator");
|
||||||
let context = build_orchestrator_launch_context(&original, &orchestration, "yoi-orchestrator");
|
let context = build_orchestrator_launch_context(&original, &orchestration, "yoi-orchestrator");
|
||||||
assert_eq!(context.workspace_root, orchestration);
|
assert_eq!(context.workspace_root, original);
|
||||||
|
assert_eq!(context.cwd.as_deref(), Some(orchestration.as_path()));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
context.original_workspace_root.as_deref(),
|
context.original_workspace_root.as_deref(),
|
||||||
Some(original.as_path())
|
Some(original.as_path())
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user