feat: add web search and fetch tools

This commit is contained in:
Keisuke Hirata 2026-05-29 17:58:11 +09:00
parent 7a6d9c33f3
commit 2be3a5bd36
No known key found for this signature in database
12 changed files with 1232 additions and 5 deletions

1
Cargo.lock generated
View File

@ -3503,6 +3503,7 @@ dependencies = [
"ignore", "ignore",
"llm-worker", "llm-worker",
"manifest", "manifest",
"reqwest",
"schemars", "schemars",
"serde", "serde",
"serde_json", "serde_json",

View File

@ -18,7 +18,7 @@ use crate::model::{AuthRef, ModelManifest, ReasoningControl};
use crate::{ use crate::{
CompactionConfig, FileUploadLimits, MemoryConfig, PodManifest, PodMeta, ScopeConfig, CompactionConfig, FileUploadLimits, MemoryConfig, PodManifest, PodMeta, ScopeConfig,
SessionConfig, SkillsConfig, ToolOutputLimits, ToolPermissionConfig, ToolPermissionRule, SessionConfig, SkillsConfig, ToolOutputLimits, ToolPermissionConfig, ToolPermissionRule,
WorkerManifest, WebConfig, WorkerManifest,
}; };
/// Partial-form Pod manifest. Every field is optional; one or more /// Partial-form Pod manifest. Every field is optional; one or more
@ -46,6 +46,9 @@ pub struct PodManifestConfig {
pub permissions: Option<PermissionConfigPartial>, pub permissions: Option<PermissionConfigPartial>,
#[serde(default)] #[serde(default)]
pub compaction: Option<CompactionConfigPartial>, pub compaction: Option<CompactionConfigPartial>,
/// First-class web tool opt-in. See [`WebConfig`].
#[serde(default)]
pub web: Option<WebConfig>,
/// Memory subsystem opt-in. See [`MemoryConfig`]. /// Memory subsystem opt-in. See [`MemoryConfig`].
#[serde(default)] #[serde(default)]
pub memory: Option<MemoryConfig>, pub memory: Option<MemoryConfig>,
@ -296,6 +299,7 @@ impl PodManifestConfig {
upper.compaction, upper.compaction,
CompactionConfigPartial::merge, CompactionConfigPartial::merge,
), ),
web: merge_option(self.web, upper.web, WebConfig::merge),
memory: merge_option(self.memory, upper.memory, MemoryConfig::merge), memory: merge_option(self.memory, upper.memory, MemoryConfig::merge),
skills: merge_option(self.skills, upper.skills, SkillsConfig::merge), skills: merge_option(self.skills, upper.skills, SkillsConfig::merge),
} }
@ -309,6 +313,49 @@ impl SkillsConfig {
} }
} }
impl WebConfig {
    /// Layers `upper` over `self` and returns the combined `[web]` section.
    ///
    /// The two scalar switches take `upper`'s value when it is `Some` and
    /// otherwise keep the one from `self`. The nested `search` and `fetch`
    /// sections are handed to `merge_option` together with the matching
    /// section-level `merge`, with `self`'s section as the first argument and
    /// `upper`'s as the second.
    fn merge(self, upper: Self) -> Self {
        let Self {
            enabled: base_enabled,
            allow_private_addresses: base_allow_private,
            search: base_search,
            fetch: base_fetch,
        } = self;
        let Self {
            enabled,
            allow_private_addresses,
            search,
            fetch,
        } = upper;

        Self {
            enabled: enabled.or(base_enabled),
            allow_private_addresses: allow_private_addresses.or(base_allow_private),
            search: merge_option(base_search, search, crate::WebSearchConfig::merge),
            fetch: merge_option(base_fetch, fetch, crate::WebFetchConfig::merge),
        }
    }
}
impl crate::WebSearchConfig {
    /// Overlays `upper` on top of `self`, one field at a time.
    ///
    /// A `Some` in `upper` replaces the corresponding value from `self`;
    /// a `None` in `upper` leaves `self`'s value untouched.
    fn merge(self, upper: Self) -> Self {
        let Self {
            enabled,
            provider,
            api_key_env,
            base_url,
            country,
            search_lang,
            ui_lang,
            safesearch,
        } = upper;

        Self {
            enabled: enabled.or(self.enabled),
            provider: provider.or(self.provider),
            api_key_env: api_key_env.or(self.api_key_env),
            base_url: base_url.or(self.base_url),
            country: country.or(self.country),
            search_lang: search_lang.or(self.search_lang),
            ui_lang: ui_lang.or(self.ui_lang),
            safesearch: safesearch.or(self.safesearch),
        }
    }
}
impl crate::WebFetchConfig {
    /// Overlays `upper` on top of `self`, one field at a time.
    ///
    /// Every limit and switch is resolved independently: `upper`'s value is
    /// used when it is `Some`, otherwise the value from `self` is kept.
    fn merge(self, upper: Self) -> Self {
        let Self {
            enabled,
            timeout_secs,
            redirect_limit,
            max_response_bytes,
            max_output_bytes,
            allow_private_addresses,
        } = upper;

        Self {
            enabled: enabled.or(self.enabled),
            timeout_secs: timeout_secs.or(self.timeout_secs),
            redirect_limit: redirect_limit.or(self.redirect_limit),
            max_response_bytes: max_response_bytes.or(self.max_response_bytes),
            max_output_bytes: max_output_bytes.or(self.max_output_bytes),
            allow_private_addresses: allow_private_addresses.or(self.allow_private_addresses),
        }
    }
}
impl MemoryConfig { impl MemoryConfig {
fn merge(self, upper: Self) -> Self { fn merge(self, upper: Self) -> Self {
Self { Self {
@ -625,6 +672,7 @@ impl TryFrom<PodManifestConfig> for PodManifest {
session, session,
permissions, permissions,
compaction, compaction,
web: cfg.web,
memory: cfg.memory, memory: cfg.memory,
skills: cfg.skills, skills: cfg.skills,
}) })
@ -671,6 +719,7 @@ mod tests {
permissions: None, permissions: None,
session: None, session: None,
compaction: None, compaction: None,
web: None,
memory: None, memory: None,
skills: None, skills: None,
} }

View File

@ -53,6 +53,11 @@ pub struct PodManifest {
/// memory tools registered. /// memory tools registered.
#[serde(default)] #[serde(default)]
pub memory: Option<MemoryConfig>, pub memory: Option<MemoryConfig>,
/// First-class web tools configuration. Absent or `enabled = false` keeps
/// WebSearch/WebFetch registered but disabled, so no network access occurs
/// unless a manifest explicitly opts in.
#[serde(default)]
pub web: Option<WebConfig>,
/// External Agent Skills (`SKILL.md`) directories to ingest as /// External Agent Skills (`SKILL.md`) directories to ingest as
/// Workflows. Each entry is a path to a skills *root* (i.e. a /// Workflows. Each entry is a path to a skills *root* (i.e. a
/// directory whose children are individual `<name>/SKILL.md` skill /// directory whose children are individual `<name>/SKILL.md` skill
@ -79,6 +84,75 @@ pub struct SkillsConfig {
pub directories: Vec<PathBuf>, pub directories: Vec<PathBuf>,
} }
/// Configuration for the WebSearch and WebFetch built-in tools (the `[web]`
/// manifest section).
///
/// Network tools are fail-closed: absent config or `enabled = false` disables
/// both tools. Per-tool `enabled = false` can disable a tool under an enabled
/// global section.
///
/// Every field is optional so that a partially specified section can be
/// layered over another one field by field.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct WebConfig {
/// Global opt-in for web tools. Defaults to false when omitted.
#[serde(default)]
pub enabled: Option<bool>,
/// Escape hatch for tests / trusted local deployments. Defaults to false.
/// Also serves as the fallback for `[web.fetch] allow_private_addresses`
/// when that per-fetch setting is absent.
#[serde(default)]
pub allow_private_addresses: Option<bool>,
/// Settings for the WebSearch tool (`[web.search]`).
#[serde(default)]
pub search: Option<WebSearchConfig>,
/// Settings for the WebFetch tool (`[web.fetch]`).
#[serde(default)]
pub fetch: Option<WebFetchConfig>,
}
/// Search backend selected by `[web.search] provider`.
///
/// Variants are (de)serialized in snake_case, so `Brave` is written as
/// `"brave"` in a manifest.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum WebSearchProvider {
/// The Brave web search provider.
Brave,
}
/// WebSearch provider configuration (the `[web.search]` manifest section).
///
/// Every field is optional; omitted fields deserialize to `None`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct WebSearchConfig {
/// Per-tool switch. `false` disables WebSearch even when the enclosing
/// `[web]` section is enabled.
#[serde(default)]
pub enabled: Option<bool>,
/// Which search provider to use.
#[serde(default)]
pub provider: Option<WebSearchProvider>,
/// Environment variable that stores the provider API key. Raw secrets do
/// not belong in manifest files.
#[serde(default)]
pub api_key_env: Option<String>,
/// Optional provider endpoint override for tests/proxies. Defaults to the
/// Brave web search endpoint for the Brave provider.
#[serde(default)]
pub base_url: Option<String>,
/// NOTE(review): presumably forwarded to the provider as its `country`
/// request parameter; confirm against the WebSearch implementation.
#[serde(default)]
pub country: Option<String>,
/// NOTE(review): presumably the provider's search-language parameter;
/// confirm against the WebSearch implementation.
#[serde(default)]
pub search_lang: Option<String>,
/// NOTE(review): presumably the provider's UI-language parameter;
/// confirm against the WebSearch implementation.
#[serde(default)]
pub ui_lang: Option<String>,
/// NOTE(review): presumably the provider's safe-search level, passed
/// through as a free-form string; accepted values are not validated in
/// this type.
#[serde(default)]
pub safesearch: Option<String>,
}
/// WebFetch HTTP client limits and policy (the `[web.fetch]` manifest
/// section).
///
/// Every field is optional; omitted fields deserialize to `None`. The values
/// applied for omitted limits are chosen by the WebFetch implementation and
/// are not defined in this type.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct WebFetchConfig {
/// Per-tool switch. `false` disables WebFetch even when the enclosing
/// `[web]` section is enabled.
#[serde(default)]
pub enabled: Option<bool>,
/// Fetch timeout; the unit is seconds per the field name.
#[serde(default)]
pub timeout_secs: Option<u64>,
/// Limit on HTTP redirects for a single fetch.
/// NOTE(review): presumably the maximum number of hops followed; confirm
/// against the WebFetch implementation.
#[serde(default)]
pub redirect_limit: Option<usize>,
/// Byte limit applied to the HTTP response.
/// NOTE(review): whether exceeding it truncates or fails the fetch is
/// decided by the WebFetch implementation.
#[serde(default)]
pub max_response_bytes: Option<usize>,
/// Byte limit applied to the tool's output.
#[serde(default)]
pub max_output_bytes: Option<usize>,
/// Per-fetch escape hatch; when absent falls back to `[web]`
/// `allow_private_addresses`, then false.
#[serde(default)]
pub allow_private_addresses: Option<bool>,
}
/// Memory subsystem configuration. Presence in the manifest enables /// Memory subsystem configuration. Presence in the manifest enables
/// memory; the workspace root defaults to the Pod's pwd unless an /// memory; the workspace root defaults to the Pod's pwd unless an
/// explicit override is given. /// explicit override is given.
@ -560,6 +634,24 @@ permission = "write"
assert!(manifest.worker.top_p.is_none()); assert!(manifest.worker.top_p.is_none());
assert!(manifest.worker.top_k.is_none()); assert!(manifest.worker.top_k.is_none());
assert!(manifest.worker.stop_sequences.is_empty()); assert!(manifest.worker.stop_sequences.is_empty());
assert!(manifest.web.is_none());
}
// Checks that a manifest with `[web]`, `[web.search]` and `[web.fetch]`
// tables parses into the matching `WebConfig` fields.
#[test]
fn parse_web_config() {
// Append the web tables to the minimal valid manifest used by the other
// tests in this module.
let toml = format!(
"{}\n[web]\nenabled = true\n\n[web.search]\nprovider = \"brave\"\napi_key_env = \"BRAVE_SEARCH_API_KEY\"\n\n[web.fetch]\ntimeout_secs = 7\nredirect_limit = 3\nmax_response_bytes = 12345\nmax_output_bytes = 2048\n",
MINIMAL_REQUIRED
);
let manifest = PodManifest::from_toml(&toml).unwrap();
let web = manifest.web.unwrap();
assert_eq!(web.enabled, Some(true));
// The snake_case string "brave" must map to the `Brave` variant.
assert_eq!(web.search.unwrap().provider, Some(WebSearchProvider::Brave));
let fetch = web.fetch.unwrap();
assert_eq!(fetch.timeout_secs, Some(7));
assert_eq!(fetch.redirect_limit, Some(3));
assert_eq!(fetch.max_response_bytes, Some(12345));
assert_eq!(fetch.max_output_bytes, Some(2048));
} }
#[test] #[test]

View File

@ -498,6 +498,7 @@ where
let session_id_for_usage = pod.segment_id().to_string(); let session_id_for_usage = pod.segment_id().to_string();
let scope_change_sink = pod.scope_change_sink(); let scope_change_sink = pod.scope_change_sink();
let memory_config = pod.manifest().memory.clone(); let memory_config = pod.manifest().memory.clone();
let web_config = pod.manifest().web.clone();
let spawner_name = pod.manifest().pod.name.clone(); let spawner_name = pod.manifest().pod.name.clone();
let spawner_model = pod.manifest().model.clone(); let spawner_model = pod.manifest().model.clone();
let pod_store = pod.store().clone(); let pod_store = pod.store().clone();
@ -521,6 +522,7 @@ where
tracker.clone(), tracker.clone(),
task_store, task_store,
bash_output_dir, bash_output_dir,
web_config,
)); ));
// Memory subsystem opt-in. When `[memory]` is present in the // Memory subsystem opt-in. When `[memory]` is present in the

View File

@ -80,7 +80,7 @@ fn permission_ask_unsupported(input: &ToolCallSummary) -> ToolResult {
fn permission_target(arguments: &Value) -> String { fn permission_target(arguments: &Value) -> String {
if let Value::Object(map) = arguments { if let Value::Object(map) = arguments {
for key in ["command", "file_path", "path", "pattern"] { for key in ["command", "file_path", "path", "pattern", "query", "url"] {
if let Some(value) = map.get(key).and_then(Value::as_str) { if let Some(value) = map.get(key).and_then(Value::as_str) {
return value.to_string(); return value.to_string();
} }

View File

@ -13,6 +13,7 @@ grep-searcher = "0.1.16"
ignore = "0.4.25" ignore = "0.4.25"
llm-worker = { workspace = true } llm-worker = { workspace = true }
manifest = { workspace = true } manifest = { workspace = true }
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
schemars = { workspace = true } schemars = { workspace = true }
serde = { workspace = true, features = ["derive"] } serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true } serde_json = { workspace = true }

View File

@ -28,6 +28,7 @@ mod edit;
mod glob; mod glob;
mod grep; mod grep;
mod read; mod read;
mod web;
mod write; mod write;
pub use bash::bash_tool; pub use bash::bash_tool;
@ -39,6 +40,7 @@ pub use read::read_tool;
pub use scoped_fs::ScopedFs; pub use scoped_fs::ScopedFs;
pub use task::{TaskEntry, TaskSnapshot, TaskStatus, TaskStore, task_tools}; pub use task::{TaskEntry, TaskSnapshot, TaskStatus, TaskStore, task_tools};
pub use tracker::Tracker; pub use tracker::Tracker;
pub use web::{web_fetch_tool, web_search_tool};
pub use write::write_tool; pub use write::write_tool;
/// Register all builtin tools, wiring them to a shared `ScopedFs` /// Register all builtin tools, wiring them to a shared `ScopedFs`
@ -57,6 +59,7 @@ pub fn builtin_tools(
tracker: Tracker, tracker: Tracker,
task_store: TaskStore, task_store: TaskStore,
bash_output_dir: std::path::PathBuf, bash_output_dir: std::path::PathBuf,
web_config: Option<manifest::WebConfig>,
) -> Vec<llm_worker::tool::ToolDefinition> { ) -> Vec<llm_worker::tool::ToolDefinition> {
let mut defs = vec![ let mut defs = vec![
read_tool(fs.clone(), tracker.clone()), read_tool(fs.clone(), tracker.clone()),
@ -65,6 +68,8 @@ pub fn builtin_tools(
glob_tool(fs.clone()), glob_tool(fs.clone()),
grep_tool(fs.clone()), grep_tool(fs.clone()),
bash_tool(fs, bash_output_dir), bash_tool(fs, bash_output_dir),
web_search_tool(web::WebTools::new(web_config.clone())),
web_fetch_tool(web::WebTools::new(web_config)),
]; ];
defs.extend(task_tools(task_store)); defs.extend(task_tools(task_store));
defs defs

View File

@ -35,7 +35,7 @@
//! let tracker = Tracker::new(); // session lifetime //! let tracker = Tracker::new(); // session lifetime
//! let bash_outputs = PathBuf::from("/run/insomnia/bash-output"); //! let bash_outputs = PathBuf::from("/run/insomnia/bash-output");
//! let task_store = tools::TaskStore::new(); //! let task_store = tools::TaskStore::new();
//! let defs = builtin_tools(fs, tracker, task_store, bash_outputs); //! let defs = builtin_tools(fs, tracker, task_store, bash_outputs, None);
//! ``` //! ```
use std::collections::{HashMap, VecDeque}; use std::collections::{HashMap, VecDeque};

1032
crates/tools/src/web.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@ -48,6 +48,7 @@ fn setup() -> (TempDir, TempDir, Registry) {
tracker, tracker,
TaskStore::new(), TaskStore::new(),
spill.path().to_path_buf(), spill.path().to_path_buf(),
None,
)); ));
(dir, spill, reg) (dir, spill, reg)
} }

View File

@ -61,6 +61,7 @@ fn setup() -> (TempDir, TempDir, Registry) {
tracker, tracker,
TaskStore::new(), TaskStore::new(),
spill.path().to_path_buf(), spill.path().to_path_buf(),
None,
)); ));
(dir, spill, reg) (dir, spill, reg)
} }
@ -94,6 +95,8 @@ fn builtin_tools_registers_full_set() {
"TaskGet", "TaskGet",
"TaskList", "TaskList",
"TaskUpdate", "TaskUpdate",
"WebFetch",
"WebSearch",
"Write" "Write"
] ]
); );
@ -289,7 +292,7 @@ async fn edit_requires_read_across_tools() {
#[tokio::test] #[tokio::test]
async fn deterministic_tool_order_is_registration_order() { async fn deterministic_tool_order_is_registration_order() {
let (_dir, _spill, reg) = setup(); let (_dir, _spill, reg) = setup();
// Registration order from builtin_tools(): Read, Write, Edit, Glob, Grep, Bash, TaskCreate, TaskList, TaskGet, TaskUpdate // Registration order from builtin_tools(): Read, Write, Edit, Glob, Grep, Bash, WebSearch, WebFetch, TaskCreate, TaskList, TaskGet, TaskUpdate
let names: Vec<&str> = reg.entries.iter().map(|(m, _)| m.name.as_str()).collect(); let names: Vec<&str> = reg.entries.iter().map(|(m, _)| m.name.as_str()).collect();
assert_eq!( assert_eq!(
names, names,
@ -300,6 +303,8 @@ async fn deterministic_tool_order_is_registration_order() {
"Glob", "Glob",
"Grep", "Grep",
"Bash", "Bash",
"WebSearch",
"WebFetch",
"TaskCreate", "TaskCreate",
"TaskList", "TaskList",
"TaskGet", "TaskGet",
@ -319,6 +324,8 @@ fn tool_names_match_reference_spec() {
"Glob", "Glob",
"Grep", "Grep",
"Bash", "Bash",
"WebSearch",
"WebFetch",
"TaskCreate", "TaskCreate",
"TaskList", "TaskList",
"TaskGet", "TaskGet",
@ -344,6 +351,7 @@ async fn tracker_recent_files_tracks_read_write_edit() {
tracker.clone(), tracker.clone(),
TaskStore::new(), TaskStore::new(),
spill.path().to_path_buf(), spill.path().to_path_buf(),
None,
)); ));
let a = dir.path().join("a.txt"); let a = dir.path().join("a.txt");

View File

@ -178,6 +178,19 @@ tool = "Write"
pattern = "*.env" pattern = "*.env"
action = "deny" action = "deny"
[web]
enabled = true
[web.search]
provider = "brave"
api_key_env = "BRAVE_SEARCH_API_KEY"
[web.fetch]
timeout_secs = 20
redirect_limit = 5
max_response_bytes = 2097152
max_output_bytes = 65536
[compaction] [compaction]
prune_protected_tokens = 8000 prune_protected_tokens = 8000
prune_min_savings = 4096 prune_min_savings = 4096
@ -220,6 +233,29 @@ scheme 側が吸収する。
生成設定は provider 別の値域検証を行わない。型が TOML と合わない場合は manifest 生成設定は provider 別の値域検証を行わない。型が TOML と合わない場合は manifest
parse error になるが、provider が受け付けない値や組み合わせは API 応答で検出する。 parse error になるが、provider が受け付けない値や組み合わせは API 応答で検出する。
## `[web]` 設定
`WebSearch` / `WebFetch` は通常の built-in function tool として登録されるが、manifest で明示的に有効化されるまでネットワークアクセスしない。無効または未設定の場合、tool call は「設定されていない」旨の明示的なエラーを返す。
```toml
[web]
enabled = true
[web.search]
provider = "brave"
api_key_env = "BRAVE_SEARCH_API_KEY" # API key は env 参照に置き、manifest に raw secret を書かない
[web.fetch]
timeout_secs = 20
redirect_limit = 5
max_response_bytes = 2097152
max_output_bytes = 65536
```
`WebSearch` の最初の provider は Brave Search API(`https://api.search.brave.com/res/v1/web/search`)で、入力は `query` と任意の `limit` / `offset`。Brave の制約に合わせて `query` は 400 文字 / 50 words まで、`limit` は 1-20、`offset` は 0-9 に制限される。
`WebFetch` は http/https URL のみを fetch し、timeout・redirect・response/output byte limit を適用する。localhost / private / link-local などの host/IP は fetch 前と各 redirect で拒否される。テストや明示的に信頼した環境では `[web] allow_private_addresses = true` または `[web.fetch] allow_private_addresses = true` を指定できる。
## `[permissions]` 設定 ## `[permissions]` 設定
`[permissions]` が無い場合、ツール permission 層は無効で従来通り実行する。`[permissions]` を書く場合は `default_action = "allow" | "deny" | "ask"` が必須で、`[[permissions.rule]]` は宣言順に最初に一致した rule が採用される。一致しなければ `default_action` を使う。 `[permissions]` が無い場合、ツール permission 層は無効で従来通り実行する。`[permissions]` を書く場合は `default_action = "allow" | "deny" | "ask"` が必須で、`[[permissions.rule]]` は宣言順に最初に一致した rule が採用される。一致しなければ `default_action` を使う。
@ -234,7 +270,7 @@ pattern = "rm *"
action = "deny" action = "deny"
``` ```
`tool` は実行時に登録されているツール名(`Bash`, `Read`, `Write`, `Edit`, `Glob`, `Grep` 等)に対して大小文字を無視して照合する。`pattern` は built-in tool では主に `command` / `file_path` / `path` / `pattern` 引数に対する `*` / `?` ワイルドカードとして評価される。 `tool` は実行時に登録されているツール名(`Bash`, `Read`, `Write`, `Edit`, `Glob`, `Grep`, `WebSearch`, `WebFetch` 等)に対して大小文字を無視して照合する。`pattern` は built-in tool では主に `command` / `file_path` / `path` / `pattern` / `query` / `url` 引数に対する `*` / `?` ワイルドカードとして評価される。
`allow` は通常実行、`deny` はその tool call を実行せず `is_error = true` の synthetic tool result を履歴へ追加してターンを継続する。`ask` は型として受け付けるが、承認 protocol は未実装のため現在は headless に待機せず fail-closed(synthetic error result)になる。 `allow` は通常実行、`deny` はその tool call を実行せず `is_error = true` の synthetic tool result を履歴へ追加してターンを継続する。`ask` は型として受け付けるが、承認 protocol は未実装のため現在は headless に待機せず fail-closed(synthetic error result)になる。