yoi/crates/tools/src/tracker.rs

351 lines
12 KiB
Rust

//! Pod-lifetime tracker for file operations performed by the builtin
//! file-manipulation tools.
//!
//! A `Tracker` serves two orthogonal purposes:
//!
//! 1. **Read-before-edit policy.** It records a SHA-256 hash of each
//! file's contents at the moment it was observed via `Read` (or
//! mutated via `Write` / `Edit`), and lets `Write` / `Edit` later
//! verify that the file has not been externally modified since then.
//!
//! 2. **Recency of touched files.** It keeps an LRU-ordered list of
//! files that have been touched by any of the tools, so the Pod
//! layer can ask "which files did the agent recently look at?" —
//! used e.g. as a default reference set passed to context compaction.
//!
//! Despite its historic name (`ReadTracker`), the tracker watches all three
//! of Read / Write / Edit; the rename to `Tracker` reflects this.
//!
//! # Lifetime
//!
//! A `Tracker` is **Pod-process scoped**: the Pod layer creates a fresh
//! instance at the start of each Pod run (including resume) and discards
//! it when the process exits — it is not persisted, so a resumed
//! conversation starts with an empty read/edit history. The `ScopedFs`
//! write boundary is likewise Pod-process scoped (derived from the
//! manifest). The two are orthogonal and the Pod wires them together
//! when registering builtin tools.
//!
//! ```no_run
//! # use std::path::PathBuf;
//! # use manifest::Scope;
//! # use tools::{ScopedFs, Tracker, core_builtin_tools};
//! let scope = Scope::writable("/workspace").unwrap();
//! let fs = ScopedFs::new(scope, PathBuf::from("/workspace")); // pod lifetime
//! let tracker = Tracker::new(); // Pod-process lifetime
//! let bash_outputs = PathBuf::from("/run/yoi/bash-output");
//! let defs = core_builtin_tools(fs, tracker, bash_outputs, None);
//! ```
use std::collections::{HashMap, VecDeque};
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use sha2::{Digest, Sha256};
use crate::error::ToolsError;
/// Fixed-size content hash recorded per file.
///
/// 32 bytes is the length of the SHA-256 digest produced by `hash_bytes`.
type ContentHash = [u8; 32];
/// How many distinct paths the recency list keeps before evicting the
/// least-recently-touched entry.
///
/// This bounds only the recency list: `Tracker::record` does not drop a
/// file's recorded content hash when that file falls off the list.
const RECENCY_CAPACITY: usize = 20;
/// Compute the SHA-256 digest of `bytes` and return it as a fixed-size
/// [`ContentHash`].
fn hash_bytes(bytes: &[u8]) -> ContentHash {
    // One-shot digest: equivalent to new() + update(bytes) + finalize().
    Sha256::digest(bytes).into()
}
/// Mutable state behind a `Tracker`'s mutex; shared by every clone of
/// that tracker.
#[derive(Debug, Default)]
struct Inner {
/// Hash of each file's last observed contents, keyed by canonical path
/// (or by the path as given when canonicalization fails).
hashes: HashMap<PathBuf, ContentHash>,
/// LRU list of touched files. Front = most recently touched; the back
/// entry is the next one to be evicted.
recency: VecDeque<PathBuf>,
}
/// Canonical-path keyed tracker of file observations and their recency.
///
/// Cheap to clone: internally an `Arc<Mutex<Inner>>`, so every clone
/// shares the same underlying state. Handing a clone to each builtin
/// tool within a Pod process is therefore effectively free and keeps
/// their views consistent.
#[derive(Debug, Clone, Default)]
pub struct Tracker {
/// Shared, mutex-guarded state; all clones point at the same `Inner`.
inner: Arc<Mutex<Inner>>,
}
impl Tracker {
    /// Create an empty tracker.
    ///
    /// Typically called once per Pod process; clones of the returned value
    /// share the same underlying state.
    pub fn new() -> Self {
        Self::default()
    }

    /// Record that `path` has been observed with the given content bytes.
    ///
    /// Called by the `Read` tool after a successful read, and by the
    /// `Write` / `Edit` tools after a successful modification (so that
    /// subsequent edits see a clean history).
    ///
    /// Also bumps `path` to the front of the recency list. If the list
    /// grows past [`RECENCY_CAPACITY`], the oldest entry is evicted from
    /// the list; its recorded hash is kept.
    pub fn record(&self, path: &Path, bytes: &[u8]) {
        let key = canonicalize_or_owned(path);
        // Hash before taking the lock so the critical section stays short.
        let hash = hash_bytes(bytes);

        // A poisoned lock is recovered rather than propagated as a panic.
        let mut inner = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        inner.hashes.insert(key.clone(), hash);

        // LRU bump: remove an existing entry for this path then push to
        // the front. We intentionally compare by the canonical key so
        // symlink/real-path pairs collapse into a single slot.
        inner.recency.retain(|p| p != &key);
        inner.recency.push_front(key);
        if inner.recency.len() > RECENCY_CAPACITY {
            inner.recency.pop_back();
        }
    }

    /// Verify that `path` was previously recorded and its current bytes
    /// match the recorded hash.
    ///
    /// # Errors
    ///
    /// - If the path has no history entry, returns [`ToolsError::NotRead`].
    /// - If the current content hashes differ from the recorded value,
    ///   returns [`ToolsError::ExternallyModified`].
    pub fn verify(&self, path: &Path, current_bytes: &[u8]) -> Result<(), ToolsError> {
        let key = canonicalize_or_owned(path);

        // Copy the 32-byte hash out and release the lock immediately, so
        // hashing a potentially large buffer never happens while other
        // users of the tracker are blocked on the mutex.
        let recorded = {
            let guard = self.inner.lock().unwrap_or_else(|e| e.into_inner());
            guard.hashes.get(&key).copied()
        };
        let recorded = recorded.ok_or_else(|| ToolsError::NotRead(path.to_path_buf()))?;

        if recorded != hash_bytes(current_bytes) {
            return Err(ToolsError::ExternallyModified(path.to_path_buf()));
        }
        Ok(())
    }

    /// Return up to `n` most recently touched file paths, most-recent first.
    ///
    /// Intended for callers like the Pod's context-compaction path, which
    /// wants to know which files the agent has been working with so it
    /// can pass them as default references to the compaction worker.
    ///
    /// The returned paths are the keys stored by [`Tracker::record`], i.e.
    /// canonicalized where canonicalization succeeded.
    pub fn recent_files(&self, n: usize) -> Vec<PathBuf> {
        let inner = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        inner.recency.iter().take(n).cloned().collect()
    }

    /// Returns true if `path` has a history entry. Test-only.
    #[cfg(test)]
    pub(crate) fn has(&self, path: &Path) -> bool {
        let key = canonicalize_or_owned(path);
        self.inner
            .lock()
            .unwrap_or_else(|e| e.into_inner())
            .hashes
            .contains_key(&key)
    }

    /// Number of distinct files in the history. Test-only.
    #[cfg(test)]
    pub(crate) fn len(&self) -> usize {
        self.inner
            .lock()
            .unwrap_or_else(|e| e.into_inner())
            .hashes
            .len()
    }
}
/// Resolve `path` to its canonical form for use as a map key.
///
/// When canonicalization fails (for example because the path does not
/// exist), the path is returned unchanged as an owned `PathBuf`.
fn canonicalize_or_owned(path: &Path) -> PathBuf {
    match path.canonicalize() {
        Ok(resolved) => resolved,
        // Best effort: fall back to the path exactly as the caller gave it.
        Err(_) => path.to_path_buf(),
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the hash-verification and recency behavior of
    //! `Tracker`. Every test works on real files inside a fresh temporary
    //! directory so that path canonicalization succeeds.

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A recorded file verifies cleanly against identical bytes.
    #[test]
    fn record_then_verify_clean_ok() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("a.txt");
        fs::write(&file, b"hello").unwrap();

        let tracker = Tracker::new();
        tracker.record(&file, b"hello");

        assert!(tracker.has(&file));
        assert_eq!(tracker.len(), 1);
        tracker.verify(&file, b"hello").unwrap();
    }

    /// Verifying a path that was never recorded yields `NotRead`.
    #[test]
    fn verify_without_record_returns_not_read() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("a.txt");
        fs::write(&file, b"x").unwrap();

        let tracker = Tracker::new();
        let err = tracker.verify(&file, b"x").unwrap_err();
        assert!(matches!(err, ToolsError::NotRead(_)));
    }

    /// Differing bytes for a recorded path yield `ExternallyModified`.
    #[test]
    fn verify_mismatch_returns_externally_modified() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("a.txt");
        fs::write(&file, b"original").unwrap();

        let tracker = Tracker::new();
        tracker.record(&file, b"original");

        let err = tracker.verify(&file, b"tampered").unwrap_err();
        assert!(matches!(err, ToolsError::ExternallyModified(_)));
    }

    /// A second `record` replaces the hash stored by the first.
    #[test]
    fn record_overwrites_previous_hash() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("a.txt");
        fs::write(&file, b"v1").unwrap();

        let tracker = Tracker::new();
        tracker.record(&file, b"v1");
        tracker.record(&file, b"v2");

        tracker.verify(&file, b"v2").unwrap();
        assert!(tracker.verify(&file, b"v1").is_err());
    }

    /// A symlink and its target share a single history entry.
    // Gated at the item level so non-Unix targets skip this test instead of
    // reporting a vacuous pass from an empty body.
    #[cfg(unix)]
    #[test]
    fn canonical_keys_collapse_symlink_variants() {
        use std::os::unix::fs::symlink;

        let dir = TempDir::new().unwrap();
        let real = dir.path().join("real.txt");
        fs::write(&real, b"data").unwrap();
        let link = dir.path().join("link.txt");
        symlink(&real, &link).unwrap();

        let tracker = Tracker::new();
        tracker.record(&real, b"data");

        // Looking up via the symlink should hit the same entry.
        tracker.verify(&link, b"data").unwrap();
        // Exactly one entry.
        assert_eq!(tracker.len(), 1);
    }

    /// Clones observe records made through any other clone.
    #[test]
    fn clone_shares_state() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("a.txt");
        fs::write(&file, b"x").unwrap();

        let t1 = Tracker::new();
        let t2 = t1.clone();
        t1.record(&file, b"x");

        t2.verify(&file, b"x").unwrap();
    }

    /// Empty content is recorded and verified like any other content.
    #[test]
    fn empty_bytes_hash_stable() {
        let tracker = Tracker::new();
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("empty.txt");
        fs::write(&file, b"").unwrap();

        tracker.record(&file, b"");

        tracker.verify(&file, b"").unwrap();
        assert!(tracker.verify(&file, b"x").is_err());
    }

    // --- recency ---

    /// `recent_files` lists paths most-recently-recorded first.
    #[test]
    fn recent_files_returns_in_lru_order() {
        let dir = TempDir::new().unwrap();
        let a = dir.path().join("a.txt");
        let b = dir.path().join("b.txt");
        let c = dir.path().join("c.txt");
        fs::write(&a, b"").unwrap();
        fs::write(&b, b"").unwrap();
        fs::write(&c, b"").unwrap();

        let tracker = Tracker::new();
        tracker.record(&a, b"");
        tracker.record(&b, b"");
        tracker.record(&c, b"");

        let recent = tracker.recent_files(10);
        // Most recent first.
        assert_eq!(recent.len(), 3);
        assert!(recent[0].ends_with("c.txt"));
        assert!(recent[1].ends_with("b.txt"));
        assert!(recent[2].ends_with("a.txt"));
    }

    /// `recent_files(n)` returns at most `n` entries.
    #[test]
    fn recent_files_respects_n_limit() {
        let dir = TempDir::new().unwrap();
        let tracker = Tracker::new();
        for i in 0..5 {
            let p = dir.path().join(format!("f{i}.txt"));
            fs::write(&p, b"").unwrap();
            tracker.record(&p, b"");
        }

        assert_eq!(tracker.recent_files(3).len(), 3);
        assert_eq!(tracker.recent_files(0).len(), 0);
        assert_eq!(tracker.recent_files(100).len(), 5);
    }

    /// Recording an already-listed path moves it to the front without
    /// duplicating it.
    #[test]
    fn re_recording_moves_entry_to_front() {
        let dir = TempDir::new().unwrap();
        let a = dir.path().join("a.txt");
        let b = dir.path().join("b.txt");
        let c = dir.path().join("c.txt");
        fs::write(&a, b"").unwrap();
        fs::write(&b, b"").unwrap();
        fs::write(&c, b"").unwrap();

        let tracker = Tracker::new();
        tracker.record(&a, b"");
        tracker.record(&b, b"");
        tracker.record(&c, b"");
        // Touching `a` again promotes it to the front.
        tracker.record(&a, b"");

        let recent = tracker.recent_files(10);
        assert_eq!(recent.len(), 3);
        assert!(recent[0].ends_with("a.txt"));
        assert!(recent[1].ends_with("c.txt"));
        assert!(recent[2].ends_with("b.txt"));
    }

    /// Once more than `RECENCY_CAPACITY` paths are recorded, the oldest
    /// ones drop off the recency list.
    #[test]
    fn recency_capacity_evicts_oldest() {
        let dir = TempDir::new().unwrap();
        let tracker = Tracker::new();
        // Record five more files than the capacity.
        for i in 0..(RECENCY_CAPACITY + 5) {
            let p = dir.path().join(format!("f{i:02}.txt"));
            fs::write(&p, b"").unwrap();
            tracker.record(&p, b"");
        }

        let recent = tracker.recent_files(RECENCY_CAPACITY + 100);
        assert_eq!(recent.len(), RECENCY_CAPACITY);

        // Newest-first: the last file recorded down to f05. f00..f04 must
        // be evicted.
        assert!(recent[0].ends_with(&format!("f{:02}.txt", RECENCY_CAPACITY + 4)));
        let last = recent.last().unwrap();
        assert!(last.ends_with("f05.txt"), "oldest surviving: {last:?}");

        // The evicted oldest ones must not appear.
        for i in 0..5 {
            let name = format!("f{i:02}.txt");
            assert!(recent.iter().all(|p| !p.ends_with(&name)));
        }
    }
}