web: extract readable html content

This commit is contained in:
Keisuke Hirata 2026-05-31 05:49:20 +09:00
parent dc5ce2ba72
commit 7906ca5326
No known key found for this signature in database
4 changed files with 427 additions and 39 deletions

222
Cargo.lock generated
View File

@ -531,7 +531,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf" checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf"
dependencies = [ dependencies = [
"lab", "lab",
"phf", "phf 0.11.3",
] ]
[[package]] [[package]]
@ -876,6 +876,16 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.32" version = "0.3.32"
@ -1127,6 +1137,20 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]] [[package]]
name = "http" name = "http"
version = "1.4.0" version = "1.4.0"
@ -1744,6 +1768,12 @@ dependencies = [
"which", "which",
] ]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]] [[package]]
name = "mac_address" name = "mac_address"
version = "1.1.8" version = "1.1.8"
@ -1771,6 +1801,32 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen 0.10.0",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]] [[package]]
name = "matchers" name = "matchers"
version = "0.2.0" version = "0.2.0"
@ -1922,6 +1978,12 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "nix" name = "nix"
version = "0.29.0" version = "0.29.0"
@ -2151,6 +2213,15 @@ dependencies = [
"sha2 0.10.9", "sha2 0.10.9",
] ]
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]] [[package]]
name = "phf" name = "phf"
version = "0.11.3" version = "0.11.3"
@ -2158,7 +2229,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [ dependencies = [
"phf_macros", "phf_macros",
"phf_shared", "phf_shared 0.11.3",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
] ]
[[package]] [[package]]
@ -2167,8 +2248,18 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [ dependencies = [
"phf_generator", "phf_generator 0.11.3",
"phf_shared", "phf_shared 0.11.3",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.5",
] ]
[[package]] [[package]]
@ -2177,7 +2268,7 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [ dependencies = [
"phf_shared", "phf_shared 0.11.3",
"rand 0.8.5", "rand 0.8.5",
] ]
@ -2187,20 +2278,29 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
dependencies = [ dependencies = [
"phf_generator", "phf_generator 0.11.3",
"phf_shared", "phf_shared 0.11.3",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher 0.3.11",
]
[[package]] [[package]]
name = "phf_shared" name = "phf_shared"
version = "0.11.3" version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [ dependencies = [
"siphasher", "siphasher 1.0.2",
] ]
[[package]] [[package]]
@ -2312,6 +2412,12 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "prettyplease" name = "prettyplease"
version = "0.2.37" version = "0.2.37"
@ -2456,6 +2562,8 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [ dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.4", "rand_core 0.6.4",
] ]
@ -2465,10 +2573,20 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
dependencies = [ dependencies = [
"rand_chacha", "rand_chacha 0.9.0",
"rand_core 0.9.5", "rand_core 0.9.5",
] ]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.4",
]
[[package]] [[package]]
name = "rand_chacha" name = "rand_chacha"
version = "0.9.0" version = "0.9.0"
@ -2484,6 +2602,9 @@ name = "rand_core"
version = "0.6.4" version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.17",
]
[[package]] [[package]]
name = "rand_core" name = "rand_core"
@ -2591,6 +2712,21 @@ dependencies = [
"unicode-width", "unicode-width",
] ]
[[package]]
name = "readability-rs"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a17841ca2fc1c3e2aed7c44b29121ab099176923c0ac55d9906edea8ab025bc"
dependencies = [
"html5ever",
"lazy_static",
"log",
"markup5ever_rcdom",
"regex",
"thiserror 2.0.18",
"url",
]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.5.18" version = "0.5.18"
@ -3155,6 +3291,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]] [[package]]
name = "siphasher" name = "siphasher"
version = "1.0.2" version = "1.0.2"
@ -3195,6 +3337,31 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared 0.11.3",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.11.1" version = "0.11.1"
@ -3310,6 +3477,17 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]] [[package]]
name = "termcolor" name = "termcolor"
version = "1.4.1" version = "1.4.1"
@ -3327,8 +3505,8 @@ checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
dependencies = [ dependencies = [
"fnv", "fnv",
"nom", "nom",
"phf", "phf 0.11.3",
"phf_codegen", "phf_codegen 0.11.3",
] ]
[[package]] [[package]]
@ -3374,10 +3552,10 @@ dependencies = [
"ordered-float 4.6.0", "ordered-float 4.6.0",
"pest", "pest",
"pest_derive", "pest_derive",
"phf", "phf 0.11.3",
"sha2 0.10.9", "sha2 0.10.9",
"signal-hook", "signal-hook",
"siphasher", "siphasher 1.0.2",
"terminfo", "terminfo",
"termios", "termios",
"thiserror 1.0.69", "thiserror 1.0.69",
@ -3600,6 +3778,7 @@ dependencies = [
"ignore", "ignore",
"llm-worker", "llm-worker",
"manifest", "manifest",
"readability-rs",
"reqwest", "reqwest",
"schemars", "schemars",
"serde", "serde",
@ -3845,6 +4024,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
@ -4549,6 +4734,17 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
[[package]]
name = "xml5ever"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
dependencies = [
"log",
"mac",
"markup5ever",
]
[[package]] [[package]]
name = "yoke" name = "yoke"
version = "0.8.2" version = "0.8.2"

View File

@ -13,6 +13,7 @@ grep-searcher = "0.1.16"
ignore = "0.4.25" ignore = "0.4.25"
llm-worker = { workspace = true } llm-worker = { workspace = true }
manifest = { workspace = true } manifest = { workspace = true }
readability = { package = "readability-rs", version = "0.5.0" }
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] } reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
schemars = { workspace = true } schemars = { workspace = true }
serde = { workspace = true, features = ["derive"] } serde = { workspace = true, features = ["derive"] }

View File

@ -1,3 +1,4 @@
use std::io::Cursor;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
@ -8,7 +9,7 @@ use manifest::{WebConfig, WebFetchConfig, WebSearchConfig, WebSearchProvider};
use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION}; use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION};
use reqwest::{Client, Url}; use reqwest::{Client, Url};
use schemars::JsonSchema; use schemars::JsonSchema;
use serde::Deserialize; use serde::{Deserialize, Serialize};
use serde_json::{Value, json}; use serde_json::{Value, json};
use tokio::net::lookup_host; use tokio::net::lookup_host;
@ -24,6 +25,8 @@ const WEB_FETCH_DEFAULT_MAX_RESPONSE_BYTES: usize = 2 * 1024 * 1024;
const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024; const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024;
const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024; const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024;
const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512; const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512;
const WEB_FETCH_READABILITY_MIN_TEXT_CHARS: usize = 40;
const WEB_FETCH_TRUNCATION_MARKER: &str = "\n[truncated]";
#[derive(Clone)] #[derive(Clone)]
pub struct WebTools { pub struct WebTools {
@ -429,10 +432,11 @@ async fn fetch_url(
))); )));
} }
let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?; let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?;
let (text, transformed_as) = render_content( let rendered = render_content(
&bytes, &bytes,
media_kind, media_kind,
content_type.as_deref(), content_type.as_deref(),
&url,
limits.max_output_bytes, limits.max_output_bytes,
)?; )?;
return Ok(json_output(json!({ return Ok(json_output(json!({
@ -440,13 +444,15 @@ async fn fetch_url(
"url": url.as_str(), "url": url.as_str(),
"status": status.as_u16(), "status": status.as_u16(),
"content_type": content_type, "content_type": content_type,
"transformed_as": transformed_as, "transformed_as": rendered.transformed_as,
"html_extraction": rendered.html_extraction,
"bytes_read": bytes.len(), "bytes_read": bytes.len(),
"truncated": response_truncated, "truncated": response_truncated,
"output_truncated": rendered.output_truncated,
"max_response_bytes": limits.max_response_bytes, "max_response_bytes": limits.max_response_bytes,
"max_output_bytes": limits.max_output_bytes, "max_output_bytes": limits.max_output_bytes,
"redirects": redirects, "redirects": redirects,
"text": text, "text": rendered.text,
}))); })));
} }
unreachable!("redirect loop exits through return or error") unreachable!("redirect loop exits through return or error")
@ -635,12 +641,36 @@ fn classify_content_type(content_type: Option<&str>) -> Result<MediaKind, ToolEr
} }
} }
/// Outcome of `render_content`: the text to return plus a description of how it was produced.
#[derive(Debug)]
struct RenderedContent {
/// Rendered body after `clean_text` and truncation to the output byte limit.
text: String,
/// Label of the transformation applied: "json_pretty", "xml_text", "text", or, for HTML,
/// the extraction method recorded in the HTML metadata.
transformed_as: &'static str,
/// Extraction details; `Some` only when the content was handled as HTML.
html_extraction: Option<HtmlExtractionMetadata>,
/// Whether `truncate_to_bytes` reported that the rendered text had to be cut.
output_truncated: bool,
}
/// Describes how the text of an HTML page was obtained; serialized into the fetch result
/// under the "html_extraction" key.
#[derive(Debug, Serialize)]
struct HtmlExtractionMetadata {
/// Name of the extractor that produced the text: "readability" or "html_to_text".
method: &'static str,
/// `true` when the text came from the `html_to_text` fallback rather than readability.
fallback: bool,
/// Why the fallback was used; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
fallback_reason: Option<String>,
/// Page title reported by readability, if non-empty; left out of the serialized output
/// when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
}
/// Text extracted from an HTML page together with the metadata describing the extraction.
struct HtmlDocument {
/// Extracted text, before the output byte limit is applied by `render_content`.
text: String,
/// Which extractor produced `text` and whether a fallback was needed.
metadata: HtmlExtractionMetadata,
}
fn render_content( fn render_content(
bytes: &[u8], bytes: &[u8],
kind: MediaKind, kind: MediaKind,
content_type: Option<&str>, content_type: Option<&str>,
base_url: &Url,
max_output_bytes: usize, max_output_bytes: usize,
) -> Result<(String, &'static str), ToolError> { ) -> Result<RenderedContent, ToolError> {
reject_binary(bytes)?; reject_binary(bytes)?;
let raw = String::from_utf8(bytes.to_vec()).map_err(|err| { let raw = String::from_utf8(bytes.to_vec()).map_err(|err| {
ToolError::ExecutionFailed(format!( ToolError::ExecutionFailed(format!(
@ -648,16 +678,75 @@ fn render_content(
content_type.unwrap_or("unknown") content_type.unwrap_or("unknown")
)) ))
})?; })?;
let rendered = match kind { let (text, transformed_as, html_extraction) = match kind {
MediaKind::Html => (html_to_text(&raw), "html_to_text"), MediaKind::Html => {
MediaKind::Json => (json_to_text(&raw)?, "json_pretty"), let document = extract_html_document(&raw, base_url);
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text"), (
MediaKind::Text | MediaKind::Unknown => (raw, "text"), document.text,
document.metadata.method,
Some(document.metadata),
)
}
MediaKind::Json => (json_to_text(&raw)?, "json_pretty", None),
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text", None),
MediaKind::Text | MediaKind::Unknown => (raw, "text", None),
}; };
Ok(( let (text, output_truncated) = truncate_to_bytes(clean_text(text), max_output_bytes);
truncate_to_bytes(clean_text(rendered.0), max_output_bytes), Ok(RenderedContent {
rendered.1, text,
)) transformed_as,
html_extraction,
output_truncated,
})
}
/// Extracts the readable main text of an HTML page, with a tag-stripping fallback.
///
/// The markup is passed to `readability::extract` together with `base_url`. The result is used
/// when extraction succeeds and the cleaned text holds at least
/// `WEB_FETCH_READABILITY_MIN_TEXT_CHARS` characters. In every other case the document comes
/// from `html_fallback_document`, carrying a reason string and, when readability produced
/// one, the page title.
fn extract_html_document(html: &str, base_url: &Url) -> HtmlDocument {
    let mut reader = Cursor::new(html.as_bytes());
    let readable = match readability::extract(&mut reader, base_url, Default::default()) {
        Ok(readable) => readable,
        Err(err) => {
            return html_fallback_document(
                html,
                None,
                Some(format!("readability extraction failed: {err}")),
            );
        }
    };

    let text = clean_text(readable.text);
    let title = non_empty_string(clean_text(readable.title));

    // Very short results usually mean readability found no real article body.
    if text.chars().count() < WEB_FETCH_READABILITY_MIN_TEXT_CHARS {
        let reason = format!(
            "readability text shorter than {WEB_FETCH_READABILITY_MIN_TEXT_CHARS} characters"
        );
        return html_fallback_document(html, title, Some(reason));
    }

    HtmlDocument {
        text,
        metadata: HtmlExtractionMetadata {
            method: "readability",
            fallback: false,
            fallback_reason: None,
            title,
        },
    }
}
fn html_fallback_document(
html: &str,
title: Option<String>,
fallback_reason: Option<String>,
) -> HtmlDocument {
HtmlDocument {
text: html_to_text(html),
metadata: HtmlExtractionMetadata {
method: "html_to_text",
fallback: true,
fallback_reason,
title,
},
}
} }
fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> { fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> {
@ -772,17 +861,31 @@ fn decode_basic_entities(input: &str) -> String {
.replace("&#39;", "'") .replace("&#39;", "'")
} }
/// Maps an empty string to `None` and wraps any other string in `Some` unchanged.
fn non_empty_string(input: String) -> Option<String> {
    if input.is_empty() {
        None
    } else {
        Some(input)
    }
}
fn truncate_to_bytes(mut s: String, max: usize) -> (String, bool) {
if s.len() <= max { if s.len() <= max {
return s; return (s, false);
} }
let mut end = max;
while !s.is_char_boundary(end) { if max <= WEB_FETCH_TRUNCATION_MARKER.len() {
let mut end = max;
while end > 0 && !s.is_char_boundary(end) {
end -= 1;
}
s.truncate(end);
return (s, true);
}
let mut end = max - WEB_FETCH_TRUNCATION_MARKER.len();
while end > 0 && !s.is_char_boundary(end) {
end -= 1; end -= 1;
} }
s.truncate(end); s.truncate(end);
s.push_str("\n[truncated]"); s.push_str(WEB_FETCH_TRUNCATION_MARKER);
s (s, true)
} }
fn bounded_lossy(bytes: &[u8], max: usize) -> String { fn bounded_lossy(bytes: &[u8], max: usize) -> String {
@ -875,6 +978,16 @@ mod tests {
addr addr
} }
/// Builds a complete `200 OK` HTTP/1.1 response carrying `body` as UTF-8 HTML.
///
/// `Content-Length` is the byte length of `body`. The response is leaked to obtain a
/// `'static` string, which is acceptable for short-lived test processes.
fn html_response(body: &str) -> &'static str {
    let content_length = body.len();
    let response = format!(
        "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\n\r\n{}",
        content_length, body
    );
    Box::leak(response.into_boxed_str())
}
async fn read_request(stream: &mut TcpStream) -> String { async fn read_request(stream: &mut TcpStream) -> String {
let mut buf = vec![0; 4096]; let mut buf = vec![0; 4096];
let n = stream.read(&mut buf).await.unwrap(); let n = stream.read(&mut buf).await.unwrap();
@ -882,6 +995,10 @@ mod tests {
} }
fn enabled_web_fetch() -> WebTools { fn enabled_web_fetch() -> WebTools {
enabled_web_fetch_with_output(2048)
}
fn enabled_web_fetch_with_output(max_output_bytes: usize) -> WebTools {
WebTools::new(Some(WebConfig { WebTools::new(Some(WebConfig {
enabled: Some(true), enabled: Some(true),
allow_private_addresses: Some(true), allow_private_addresses: Some(true),
@ -891,7 +1008,7 @@ mod tests {
timeout_secs: Some(5), timeout_secs: Some(5),
redirect_limit: Some(2), redirect_limit: Some(2),
max_response_bytes: Some(4096), max_response_bytes: Some(4096),
max_output_bytes: Some(2048), max_output_bytes: Some(max_output_bytes),
allow_private_addresses: None, allow_private_addresses: None,
}), }),
})) }))
@ -942,10 +1059,10 @@ mod tests {
} }
#[tokio::test] #[tokio::test]
async fn fetches_html_as_bounded_text() { async fn fetches_short_html_with_fallback_metadata() {
let addr = serve_once( let addr = serve_once(html_response(
"HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: 93\r\n\r\n<html><body><h1>Hello &amp; welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>", "<html><body><h1>Hello &amp; welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>",
) ))
.await; .await;
let tools = enabled_web_fetch(); let tools = enabled_web_fetch();
let result = tools let result = tools
@ -959,6 +1076,80 @@ mod tests {
assert!(text.contains("Hello & welcome")); assert!(text.contains("Hello & welcome"));
assert!(text.contains("Readable text.")); assert!(text.contains("Readable text."));
assert!(!text.contains("ignore")); assert!(!text.contains("ignore"));
assert_eq!(value["transformed_as"], "html_to_text");
assert_eq!(value["html_extraction"]["method"], "html_to_text");
assert_eq!(value["html_extraction"]["fallback"], true);
assert!(
value["html_extraction"]["fallback_reason"]
.as_str()
.unwrap()
.contains("shorter")
);
}
/// An article wrapped in navigation and footer boilerplate should be returned as the
/// article text only, labelled as a readability extraction with the page title.
#[tokio::test]
async fn fetches_html_with_readability_main_text() {
    let page = r#"
<html>
<head><title>Example Readable Article</title></head>
<body>
<nav>Home Products Pricing unrelated navigation</nav>
<main>
<article>
<h1>Example Readable Article</h1>
<p>The useful article opens with a distinct sentence about careful Rust web fetching and reader mode extraction.</p>
<p>It continues with enough focused prose to make the main document body clearly longer than boilerplate around it.</p>
<p>A final paragraph mentions durable safety bounds and untrusted web content handling for the fetched page.</p>
</article>
</main>
<footer>Copyright boilerplate and social links should not be part of the article.</footer>
</body>
</html>
"#;
    let addr = serve_once(html_response(page)).await;
    let tools = enabled_web_fetch();
    let request = WebFetchInput {
        url: format!("http://{addr}/article"),
    };
    let result = tools.run_fetch(request).await.unwrap();
    let payload: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
    let extracted = payload.get("text").unwrap().as_str().unwrap();

    // Article prose must survive extraction.
    for expected in ["careful Rust web fetching", "durable safety bounds"] {
        assert!(extracted.contains(expected));
    }
    // Surrounding page chrome must not.
    for boilerplate in ["Home Products Pricing", "Copyright boilerplate"] {
        assert!(!extracted.contains(boilerplate));
    }

    assert_eq!(payload["transformed_as"], "readability");
    let extraction = &payload["html_extraction"];
    assert_eq!(extraction["method"], "readability");
    assert_eq!(extraction["fallback"], false);
    assert_eq!(
        extraction["title"].as_str().unwrap(),
        "Example Readable Article"
    );
}
#[tokio::test]
async fn fetches_readable_html_with_bounded_output() {
let repeated =
"Reader-mode extracted paragraph with enough content for truncation. ".repeat(30);
let body = format!(
"<html><head><title>Long Article</title></head><body><article><h1>Long Article</h1><p>{repeated}</p></article></body></html>"
);
let addr = serve_once(html_response(&body)).await;
let tools = enabled_web_fetch_with_output(WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
let result = tools
.run_fetch(WebFetchInput {
url: format!("http://{addr}/long"),
})
.await
.unwrap();
let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
let text = value.get("text").unwrap().as_str().unwrap();
assert!(text.len() <= WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
assert!(text.ends_with(WEB_FETCH_TRUNCATION_MARKER));
assert_eq!(value["output_truncated"], true);
assert_eq!(value["html_extraction"]["fallback"], false);
} }
#[tokio::test] #[tokio::test]

View File

@ -40,7 +40,7 @@ rustPlatform.buildRustPackage rec {
filter = sourceFilter; filter = sourceFilter;
}; };
cargoHash = "sha256-8ZT5moKFxj/5vbp5rsUG7UkPLY1fvQKhYTyjRWQ58xk="; cargoHash = "sha256-VzVFqOWJHfgX92Qw84995ICQu2uvQPeYm6AotU4/LR0=";
depsExtraArgs = { depsExtraArgs = {
# nixpkgs 25.11's fetchCargoVendor still uses crates.io's API # nixpkgs 25.11's fetchCargoVendor still uses crates.io's API