fix: reconcile missing delegated children

This commit is contained in:
Keisuke Hirata 2026-05-30 07:46:09 +09:00
parent e10b4ad4f0
commit d2e80871ce
No known key found for this signature in database
3 changed files with 142 additions and 46 deletions

View File

@ -45,9 +45,9 @@ pub fn register_pod(
/// and the registration proceeds. The check is structural (deny ⊇ /// and the registration proceeds. The check is structural (deny ⊇
/// competitor.rule), not relational — it does not verify that the /// competitor.rule), not relational — it does not verify that the
/// competitor actually descends from this Pod's prior delegations. /// competitor actually descends from this Pod's prior delegations.
/// In practice this is safe because the canonical caller is `restore`, /// In practice this is safe because the canonical restore caller derives
/// which derives `scope_deny` from the session's own snapshot, so any /// `scope_deny` from outstanding `pod-store` child delegations, so any
/// covered competitor is guaranteed to be a descendant of the original /// covered competitor is expected to be a descendant of the original
/// allocation. Direct callers must uphold the same invariant. /// allocation. Direct callers must uphold the same invariant.
pub fn register_pod_with_deny( pub fn register_pod_with_deny(
guard: &mut LockFileGuard, guard: &mut LockFileGuard,
@ -180,10 +180,11 @@ pub fn release_pod(guard: &mut LockFileGuard, pod_name: &str) -> Result<(), Scop
/// Reclaim a child delegation back into its parent allocation. /// Reclaim a child delegation back into its parent allocation.
/// ///
/// This is idempotent: missing child allocations and missing deny entries are /// This is idempotent for missing deny entries. For each delegated Write rule,
/// ignored. For each delegated Write rule, at most one exact matching deny rule /// at most one exact matching deny rule is removed from the parent's `scope_deny`
/// is removed from the parent's `scope_deny`, preserving any duplicate explicit /// even when the child allocation is already absent; restore reconciliation uses
/// base deny that was not owned by this child delegation. /// that case when durable Pod-state still records an outstanding delegation but
/// the live lock file no longer has a child allocation.
pub fn reclaim_delegated_scope( pub fn reclaim_delegated_scope(
guard: &mut LockFileGuard, guard: &mut LockFileGuard,
parent: &str, parent: &str,
@ -199,17 +200,13 @@ pub fn reclaim_delegated_scope(
.map(|idx| guard.data().allocations[idx].delegated_from.clone()) .map(|idx| guard.data().allocations[idx].delegated_from.clone())
.unwrap_or(None); .unwrap_or(None);
let child_exists = child_idx.is_some(); if let Some(parent_alloc) = guard.data_mut().find_mut(parent) {
for rule in delegated_scope
if child_exists { .iter()
if let Some(parent_alloc) = guard.data_mut().find_mut(parent) { .filter(|rule| rule.permission == Permission::Write)
for rule in delegated_scope {
.iter() if let Some(idx) = parent_alloc.scope_deny.iter().position(|deny| deny == rule) {
.filter(|rule| rule.permission == Permission::Write) parent_alloc.scope_deny.remove(idx);
{
if let Some(idx) = parent_alloc.scope_deny.iter().position(|deny| deny == rule) {
parent_alloc.scope_deny.remove(idx);
}
} }
} }
} }
@ -516,15 +513,43 @@ mod tests {
assert_eq!(a.scope_deny, vec![delegated_rule.clone()]); assert_eq!(a.scope_deny, vec![delegated_rule.clone()]);
assert!(g.data().find("b").is_none()); assert!(g.data().find("b").is_none());
reclaim_delegated_scope(&mut g, "a", "b", &[delegated_rule.clone()]).unwrap(); reclaim_delegated_scope(&mut g, "a", "b", std::slice::from_ref(&delegated_rule)).unwrap();
let a = g.data().find("a").unwrap(); let a = g.data().find("a").unwrap();
assert_eq!( assert!(
a.scope_deny, a.scope_deny.is_empty(),
vec![delegated_rule], "a missing child allocation still reclaims one matching parent deny"
"a repeated reclaim with no child allocation must not broaden an explicit duplicate base deny"
); );
} }
#[test]
fn reclaim_delegated_scope_removes_parent_deny_when_child_allocation_missing() {
let dir = TempDir::new().unwrap();
let path = dir.path().join("pods.json");
let mut g = open_empty(&path);
let delegated_rule = write_rule("/src/core", true);
register_pod_with_deny(
&mut g,
"a".into(),
std::process::id(),
sock("a"),
vec![write_rule("/src", true)],
vec![delegated_rule.clone()],
sid(),
)
.unwrap();
reclaim_delegated_scope(
&mut g,
"a",
"missing",
std::slice::from_ref(&delegated_rule),
)
.unwrap();
let a = g.data().find("a").unwrap();
assert!(a.scope_deny.is_empty());
}
#[test] #[test]
fn reclaim_stale_reparents_and_removes_dead_entries() { fn reclaim_stale_reparents_and_removes_dead_entries() {
let dir = TempDir::new().unwrap(); let dir = TempDir::new().unwrap();

View File

@ -1,6 +1,7 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::time::Duration;
use arc_swap::ArcSwap; use arc_swap::ArcSwap;
use llm_worker::Item; use llm_worker::Item;
@ -10,7 +11,8 @@ use llm_worker::llm_client::types::Role;
use llm_worker::state::Mutable; use llm_worker::state::Mutable;
use llm_worker::{ToolOutputLimits, UsageRecord, Worker, WorkerError, WorkerResult}; use llm_worker::{ToolOutputLimits, UsageRecord, Worker, WorkerError, WorkerResult};
use pod_store::{ use pod_store::{
PodActiveSegmentRef, PodMetadata, PodMetadataStore, PodSpawnedScopeRule, PodStoreError, PodActiveSegmentRef, PodMetadata, PodMetadataStore, PodReclaimedChild, PodSpawnedChild,
PodSpawnedScopeRule, PodStoreError,
}; };
use session_store::{ use session_store::{
LogEntry, SegmentId, SessionId, Store, StoreError, SystemItem, segment_log, to_logged, LogEntry, SegmentId, SessionId, Store, StoreError, SystemItem, segment_log, to_logged,
@ -45,9 +47,12 @@ use llm_worker::interceptor::PreRequestAction;
use protocol::{ use protocol::{
AlertLevel, AlertSource, Event, RewindSummary, RewindTarget, RewindTargetId, Segment, AlertLevel, AlertSource, Event, RewindSummary, RewindTarget, RewindTargetId, Segment,
}; };
use tokio::net::UnixStream;
use tokio::sync::broadcast; use tokio::sync::broadcast;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
const RESTORE_RECONCILIATION_REACHABILITY_TIMEOUT: Duration = Duration::from_millis(500);
/// `(SessionId, SegmentId)` pair the Pod is currently writing to. /// `(SessionId, SegmentId)` pair the Pod is currently writing to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SegmentLocation { pub struct SegmentLocation {
@ -4048,10 +4053,61 @@ where
session_id, session_id,
segment_id, segment_id,
})?; })?;
pod.reconcile_restored_delegations().await?;
drain_skill_shadows(&pod, skill_shadows); drain_skill_shadows(&pod, skill_shadows);
Ok(pod) Ok(pod)
} }
async fn reconcile_restored_delegations(&mut self) -> Result<(), PodError> {
let pod_name = self.manifest.pod.name.clone();
let Some(metadata) = self.store.read_by_name(&pod_name)? else {
return Ok(());
};
let mut reclaimed = Vec::new();
for child in metadata.spawned_children {
if restored_child_reachable(&child).await {
continue;
}
let delegated_scope = spawned_child_scope_rules(&child);
if !delegated_scope.is_empty() {
let lock_path =
pod_registry::default_registry_path().map_err(ScopeLockError::from)?;
let mut guard =
pod_registry::LockFileGuard::open(&lock_path).map_err(ScopeLockError::from)?;
pod_registry::reclaim_delegated_scope(
&mut guard,
&pod_name,
&child.pod_name,
&delegated_scope,
)?;
let write_rules = delegated_scope
.iter()
.filter(|rule| rule.permission == Permission::Write)
.cloned()
.collect::<Vec<_>>();
self.scope
.update(|current| current.with_removed_deny_rules(write_rules))
.map_err(PodError::Scope)?;
}
reclaimed.push(PodReclaimedChild {
pod_name: child.pod_name,
scope_delegated: child.scope_delegated,
});
}
if reclaimed.is_empty() {
return Ok(());
}
self.store.reclaim_spawned_children(&pod_name, reclaimed)?;
self.push_notify(
"Restored Pod state contained missing or unreachable delegated child Pods; their delegated write scopes were reclaimed before resume."
.to_string(),
);
Ok(())
}
/// Convenience: build a Pod from a single-layer TOML manifest string. /// Convenience: build a Pod from a single-layer TOML manifest string.
/// ///
/// Parses the TOML into a [`PodManifestConfig`], converts to a /// Parses the TOML into a [`PodManifestConfig`], converts to a
@ -4594,6 +4650,40 @@ struct PodCommon {
skill_shadows: Vec<workflow_crate::ShadowedSkill>, skill_shadows: Vec<workflow_crate::ShadowedSkill>,
} }
async fn restored_child_reachable(child: &PodSpawnedChild) -> bool {
tokio::time::timeout(
RESTORE_RECONCILIATION_REACHABILITY_TIMEOUT,
UnixStream::connect(&child.socket_path),
)
.await
.map(|result| result.is_ok())
.unwrap_or(false)
}
fn spawned_child_scope_rules(child: &PodSpawnedChild) -> Vec<ScopeRule> {
child
.scope_delegated
.iter()
.filter_map(|rule| delegated_scope_rule_to_scope_rule(rule.clone()))
.collect()
}
fn delegated_scope_rule_to_scope_rule(rule: PodSpawnedScopeRule) -> Option<ScopeRule> {
let permission = match rule.permission.as_str() {
"read" => Permission::Read,
"write" => Permission::Write,
other => {
warn!(permission = %other, "ignoring invalid delegated child scope permission");
return None;
}
};
Some(ScopeRule {
target: rule.target,
permission,
recursive: rule.recursive,
})
}
fn effective_restore_scope_config<St>( fn effective_restore_scope_config<St>(
store: &St, store: &St,
manifest: &PodManifest, manifest: &PodManifest,
@ -4616,18 +4706,8 @@ where
} }
fn delegated_write_rule_to_deny(rule: PodSpawnedScopeRule) -> Option<ScopeRule> { fn delegated_write_rule_to_deny(rule: PodSpawnedScopeRule) -> Option<ScopeRule> {
match rule.permission.as_str() { let rule = delegated_scope_rule_to_scope_rule(rule)?;
"write" => Some(ScopeRule { (rule.permission == Permission::Write).then_some(rule)
target: rule.target,
permission: Permission::Write,
recursive: rule.recursive,
}),
"read" => None,
other => {
warn!(permission = %other, "ignoring invalid delegated child scope permission");
None
}
}
} }
/// Resolve pwd / scope / LLM client / prompt catalog from a validated /// Resolve pwd / scope / LLM client / prompt catalog from a validated

View File

@ -633,7 +633,7 @@ async fn load_from_pod_state_prunes_runtime_children_and_reclaims_durable_delega
} }
#[tokio::test] #[tokio::test]
async fn load_from_pod_state_reclaims_pruned_child_scope_and_records_history() { async fn load_from_pod_state_reclaims_missing_child_scope_and_records_history() {
let _env = EnvGuard::acquire(); let _env = EnvGuard::acquire();
let runtime_tmp = TempDir::new().unwrap(); let runtime_tmp = TempDir::new().unwrap();
let store_tmp = TempDir::new().unwrap(); let store_tmp = TempDir::new().unwrap();
@ -667,15 +667,6 @@ async fn load_from_pod_state_reclaims_pruned_child_scope_and_records_history() {
session_store::new_segment_id(), session_store::new_segment_id(),
) )
.unwrap(); .unwrap();
pod_registry::register_pod(
&mut g,
"missing".into(),
std::process::id(),
"/tmp/missing.sock".into(),
vec![missing_rule.clone()],
session_store::new_segment_id(),
)
.unwrap();
} }
let parent_scope = SharedScope::new( let parent_scope = SharedScope::new(