web: extract readable html content
This commit is contained in:
parent
dc5ce2ba72
commit
7906ca5326
222
Cargo.lock
generated
222
Cargo.lock
generated
|
|
@ -531,7 +531,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf"
|
||||
dependencies = [
|
||||
"lab",
|
||||
"phf",
|
||||
"phf 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -876,6 +876,16 @@ version = "1.3.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||
dependencies = [
|
||||
"mac",
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.32"
|
||||
|
|
@ -1127,6 +1137,20 @@ version = "0.4.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.4.0"
|
||||
|
|
@ -1744,6 +1768,12 @@ dependencies = [
|
|||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mac_address"
|
||||
version = "1.1.8"
|
||||
|
|
@ -1771,6 +1801,32 @@ dependencies = [
|
|||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf 0.10.1",
|
||||
"phf_codegen 0.10.0",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever_rcdom"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
|
||||
dependencies = [
|
||||
"html5ever",
|
||||
"markup5ever",
|
||||
"tendril",
|
||||
"xml5ever",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
|
|
@ -1922,6 +1978,12 @@ dependencies = [
|
|||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.29.0"
|
||||
|
|
@ -2151,6 +2213,15 @@ dependencies = [
|
|||
"sha2 0.10.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.11.3"
|
||||
|
|
@ -2158,7 +2229,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -2167,8 +2248,18 @@ version = "0.11.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -2177,7 +2268,7 @@ version = "0.11.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"phf_shared 0.11.3",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
|
|
@ -2187,20 +2278,29 @@ version = "0.11.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
dependencies = [
|
||||
"siphasher 0.3.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
"siphasher 1.0.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -2312,6 +2412,12 @@ dependencies = [
|
|||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "precomputed-hash"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.2.37"
|
||||
|
|
@ -2456,6 +2562,8 @@ version = "0.8.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
|
|
@ -2465,10 +2573,20 @@ version = "0.9.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
||||
dependencies = [
|
||||
"rand_chacha",
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
|
|
@ -2484,6 +2602,9 @@ name = "rand_core"
|
|||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
|
|
@ -2591,6 +2712,21 @@ dependencies = [
|
|||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "readability-rs"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a17841ca2fc1c3e2aed7c44b29121ab099176923c0ac55d9906edea8ab025bc"
|
||||
dependencies = [
|
||||
"html5ever",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"markup5ever_rcdom",
|
||||
"regex",
|
||||
"thiserror 2.0.18",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.18"
|
||||
|
|
@ -3155,6 +3291,12 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.2"
|
||||
|
|
@ -3195,6 +3337,31 @@ version = "1.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.8.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"parking_lot",
|
||||
"phf_shared 0.11.3",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache_codegen"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
|
||||
dependencies = [
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
|
|
@ -3310,6 +3477,17 @@ dependencies = [
|
|||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||
dependencies = [
|
||||
"futf",
|
||||
"mac",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.4.1"
|
||||
|
|
@ -3327,8 +3505,8 @@ checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
|
|||
dependencies = [
|
||||
"fnv",
|
||||
"nom",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"phf 0.11.3",
|
||||
"phf_codegen 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -3374,10 +3552,10 @@ dependencies = [
|
|||
"ordered-float 4.6.0",
|
||||
"pest",
|
||||
"pest_derive",
|
||||
"phf",
|
||||
"phf 0.11.3",
|
||||
"sha2 0.10.9",
|
||||
"signal-hook",
|
||||
"siphasher",
|
||||
"siphasher 1.0.2",
|
||||
"terminfo",
|
||||
"termios",
|
||||
"thiserror 1.0.69",
|
||||
|
|
@ -3600,6 +3778,7 @@ dependencies = [
|
|||
"ignore",
|
||||
"llm-worker",
|
||||
"manifest",
|
||||
"readability-rs",
|
||||
"reqwest",
|
||||
"schemars",
|
||||
"serde",
|
||||
|
|
@ -3845,6 +4024,12 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
|
|
@ -4549,6 +4734,17 @@ version = "0.6.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
|
||||
|
||||
[[package]]
|
||||
name = "xml5ever"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.8.2"
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ grep-searcher = "0.1.16"
|
|||
ignore = "0.4.25"
|
||||
llm-worker = { workspace = true }
|
||||
manifest = { workspace = true }
|
||||
readability = { package = "readability-rs", version = "0.5.0" }
|
||||
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
|
||||
schemars = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
use std::io::Cursor;
|
||||
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
|
@ -8,7 +9,7 @@ use manifest::{WebConfig, WebFetchConfig, WebSearchConfig, WebSearchProvider};
|
|||
use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION};
|
||||
use reqwest::{Client, Url};
|
||||
use schemars::JsonSchema;
|
||||
use serde::Deserialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
use tokio::net::lookup_host;
|
||||
|
||||
|
|
@ -24,6 +25,8 @@ const WEB_FETCH_DEFAULT_MAX_RESPONSE_BYTES: usize = 2 * 1024 * 1024;
|
|||
const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024;
|
||||
const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024;
|
||||
const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512;
|
||||
const WEB_FETCH_READABILITY_MIN_TEXT_CHARS: usize = 40;
|
||||
const WEB_FETCH_TRUNCATION_MARKER: &str = "\n[truncated]";
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct WebTools {
|
||||
|
|
@ -429,10 +432,11 @@ async fn fetch_url(
|
|||
)));
|
||||
}
|
||||
let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?;
|
||||
let (text, transformed_as) = render_content(
|
||||
let rendered = render_content(
|
||||
&bytes,
|
||||
media_kind,
|
||||
content_type.as_deref(),
|
||||
&url,
|
||||
limits.max_output_bytes,
|
||||
)?;
|
||||
return Ok(json_output(json!({
|
||||
|
|
@ -440,13 +444,15 @@ async fn fetch_url(
|
|||
"url": url.as_str(),
|
||||
"status": status.as_u16(),
|
||||
"content_type": content_type,
|
||||
"transformed_as": transformed_as,
|
||||
"transformed_as": rendered.transformed_as,
|
||||
"html_extraction": rendered.html_extraction,
|
||||
"bytes_read": bytes.len(),
|
||||
"truncated": response_truncated,
|
||||
"output_truncated": rendered.output_truncated,
|
||||
"max_response_bytes": limits.max_response_bytes,
|
||||
"max_output_bytes": limits.max_output_bytes,
|
||||
"redirects": redirects,
|
||||
"text": text,
|
||||
"text": rendered.text,
|
||||
})));
|
||||
}
|
||||
unreachable!("redirect loop exits through return or error")
|
||||
|
|
@ -635,12 +641,36 @@ fn classify_content_type(content_type: Option<&str>) -> Result<MediaKind, ToolEr
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RenderedContent {
|
||||
text: String,
|
||||
transformed_as: &'static str,
|
||||
html_extraction: Option<HtmlExtractionMetadata>,
|
||||
output_truncated: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct HtmlExtractionMetadata {
|
||||
method: &'static str,
|
||||
fallback: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fallback_reason: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
title: Option<String>,
|
||||
}
|
||||
|
||||
struct HtmlDocument {
|
||||
text: String,
|
||||
metadata: HtmlExtractionMetadata,
|
||||
}
|
||||
|
||||
fn render_content(
|
||||
bytes: &[u8],
|
||||
kind: MediaKind,
|
||||
content_type: Option<&str>,
|
||||
base_url: &Url,
|
||||
max_output_bytes: usize,
|
||||
) -> Result<(String, &'static str), ToolError> {
|
||||
) -> Result<RenderedContent, ToolError> {
|
||||
reject_binary(bytes)?;
|
||||
let raw = String::from_utf8(bytes.to_vec()).map_err(|err| {
|
||||
ToolError::ExecutionFailed(format!(
|
||||
|
|
@ -648,16 +678,75 @@ fn render_content(
|
|||
content_type.unwrap_or("unknown")
|
||||
))
|
||||
})?;
|
||||
let rendered = match kind {
|
||||
MediaKind::Html => (html_to_text(&raw), "html_to_text"),
|
||||
MediaKind::Json => (json_to_text(&raw)?, "json_pretty"),
|
||||
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text"),
|
||||
MediaKind::Text | MediaKind::Unknown => (raw, "text"),
|
||||
let (text, transformed_as, html_extraction) = match kind {
|
||||
MediaKind::Html => {
|
||||
let document = extract_html_document(&raw, base_url);
|
||||
(
|
||||
document.text,
|
||||
document.metadata.method,
|
||||
Some(document.metadata),
|
||||
)
|
||||
}
|
||||
MediaKind::Json => (json_to_text(&raw)?, "json_pretty", None),
|
||||
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text", None),
|
||||
MediaKind::Text | MediaKind::Unknown => (raw, "text", None),
|
||||
};
|
||||
Ok((
|
||||
truncate_to_bytes(clean_text(rendered.0), max_output_bytes),
|
||||
rendered.1,
|
||||
))
|
||||
let (text, output_truncated) = truncate_to_bytes(clean_text(text), max_output_bytes);
|
||||
Ok(RenderedContent {
|
||||
text,
|
||||
transformed_as,
|
||||
html_extraction,
|
||||
output_truncated,
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_html_document(html: &str, base_url: &Url) -> HtmlDocument {
|
||||
let mut input = Cursor::new(html.as_bytes());
|
||||
match readability::extract(&mut input, base_url, Default::default()) {
|
||||
Ok(readable) => {
|
||||
let text = clean_text(readable.text);
|
||||
let title = non_empty_string(clean_text(readable.title));
|
||||
if text.chars().count() >= WEB_FETCH_READABILITY_MIN_TEXT_CHARS {
|
||||
return HtmlDocument {
|
||||
text,
|
||||
metadata: HtmlExtractionMetadata {
|
||||
method: "readability",
|
||||
fallback: false,
|
||||
fallback_reason: None,
|
||||
title,
|
||||
},
|
||||
};
|
||||
}
|
||||
html_fallback_document(
|
||||
html,
|
||||
title,
|
||||
Some(format!(
|
||||
"readability text shorter than {WEB_FETCH_READABILITY_MIN_TEXT_CHARS} characters"
|
||||
)),
|
||||
)
|
||||
}
|
||||
Err(err) => html_fallback_document(
|
||||
html,
|
||||
None,
|
||||
Some(format!("readability extraction failed: {err}")),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn html_fallback_document(
|
||||
html: &str,
|
||||
title: Option<String>,
|
||||
fallback_reason: Option<String>,
|
||||
) -> HtmlDocument {
|
||||
HtmlDocument {
|
||||
text: html_to_text(html),
|
||||
metadata: HtmlExtractionMetadata {
|
||||
method: "html_to_text",
|
||||
fallback: true,
|
||||
fallback_reason,
|
||||
title,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> {
|
||||
|
|
@ -772,17 +861,31 @@ fn decode_basic_entities(input: &str) -> String {
|
|||
.replace("'", "'")
|
||||
}
|
||||
|
||||
fn truncate_to_bytes(mut s: String, max: usize) -> String {
|
||||
if s.len() <= max {
|
||||
return s;
|
||||
fn non_empty_string(input: String) -> Option<String> {
|
||||
if input.is_empty() { None } else { Some(input) }
|
||||
}
|
||||
|
||||
fn truncate_to_bytes(mut s: String, max: usize) -> (String, bool) {
|
||||
if s.len() <= max {
|
||||
return (s, false);
|
||||
}
|
||||
|
||||
if max <= WEB_FETCH_TRUNCATION_MARKER.len() {
|
||||
let mut end = max;
|
||||
while !s.is_char_boundary(end) {
|
||||
while end > 0 && !s.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
s.truncate(end);
|
||||
s.push_str("\n[truncated]");
|
||||
s
|
||||
return (s, true);
|
||||
}
|
||||
|
||||
let mut end = max - WEB_FETCH_TRUNCATION_MARKER.len();
|
||||
while end > 0 && !s.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
s.truncate(end);
|
||||
s.push_str(WEB_FETCH_TRUNCATION_MARKER);
|
||||
(s, true)
|
||||
}
|
||||
|
||||
fn bounded_lossy(bytes: &[u8], max: usize) -> String {
|
||||
|
|
@ -875,6 +978,16 @@ mod tests {
|
|||
addr
|
||||
}
|
||||
|
||||
fn html_response(body: &str) -> &'static str {
|
||||
Box::leak(
|
||||
format!(
|
||||
"HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\n\r\n{}",
|
||||
body.len(), body
|
||||
)
|
||||
.into_boxed_str(),
|
||||
)
|
||||
}
|
||||
|
||||
async fn read_request(stream: &mut TcpStream) -> String {
|
||||
let mut buf = vec![0; 4096];
|
||||
let n = stream.read(&mut buf).await.unwrap();
|
||||
|
|
@ -882,6 +995,10 @@ mod tests {
|
|||
}
|
||||
|
||||
fn enabled_web_fetch() -> WebTools {
|
||||
enabled_web_fetch_with_output(2048)
|
||||
}
|
||||
|
||||
fn enabled_web_fetch_with_output(max_output_bytes: usize) -> WebTools {
|
||||
WebTools::new(Some(WebConfig {
|
||||
enabled: Some(true),
|
||||
allow_private_addresses: Some(true),
|
||||
|
|
@ -891,7 +1008,7 @@ mod tests {
|
|||
timeout_secs: Some(5),
|
||||
redirect_limit: Some(2),
|
||||
max_response_bytes: Some(4096),
|
||||
max_output_bytes: Some(2048),
|
||||
max_output_bytes: Some(max_output_bytes),
|
||||
allow_private_addresses: None,
|
||||
}),
|
||||
}))
|
||||
|
|
@ -942,10 +1059,10 @@ mod tests {
|
|||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn fetches_html_as_bounded_text() {
|
||||
let addr = serve_once(
|
||||
"HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: 93\r\n\r\n<html><body><h1>Hello & welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>",
|
||||
)
|
||||
async fn fetches_short_html_with_fallback_metadata() {
|
||||
let addr = serve_once(html_response(
|
||||
"<html><body><h1>Hello & welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>",
|
||||
))
|
||||
.await;
|
||||
let tools = enabled_web_fetch();
|
||||
let result = tools
|
||||
|
|
@ -959,6 +1076,80 @@ mod tests {
|
|||
assert!(text.contains("Hello & welcome"));
|
||||
assert!(text.contains("Readable text."));
|
||||
assert!(!text.contains("ignore"));
|
||||
assert_eq!(value["transformed_as"], "html_to_text");
|
||||
assert_eq!(value["html_extraction"]["method"], "html_to_text");
|
||||
assert_eq!(value["html_extraction"]["fallback"], true);
|
||||
assert!(
|
||||
value["html_extraction"]["fallback_reason"]
|
||||
.as_str()
|
||||
.unwrap()
|
||||
.contains("shorter")
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn fetches_html_with_readability_main_text() {
|
||||
let body = r#"
|
||||
<html>
|
||||
<head><title>Example Readable Article</title></head>
|
||||
<body>
|
||||
<nav>Home Products Pricing unrelated navigation</nav>
|
||||
<main>
|
||||
<article>
|
||||
<h1>Example Readable Article</h1>
|
||||
<p>The useful article opens with a distinct sentence about careful Rust web fetching and reader mode extraction.</p>
|
||||
<p>It continues with enough focused prose to make the main document body clearly longer than boilerplate around it.</p>
|
||||
<p>A final paragraph mentions durable safety bounds and untrusted web content handling for the fetched page.</p>
|
||||
</article>
|
||||
</main>
|
||||
<footer>Copyright boilerplate and social links should not be part of the article.</footer>
|
||||
</body>
|
||||
</html>
|
||||
"#;
|
||||
let addr = serve_once(html_response(body)).await;
|
||||
let tools = enabled_web_fetch();
|
||||
let result = tools
|
||||
.run_fetch(WebFetchInput {
|
||||
url: format!("http://{addr}/article"),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
|
||||
let text = value.get("text").unwrap().as_str().unwrap();
|
||||
assert!(text.contains("careful Rust web fetching"));
|
||||
assert!(text.contains("durable safety bounds"));
|
||||
assert!(!text.contains("Home Products Pricing"));
|
||||
assert!(!text.contains("Copyright boilerplate"));
|
||||
assert_eq!(value["transformed_as"], "readability");
|
||||
assert_eq!(value["html_extraction"]["method"], "readability");
|
||||
assert_eq!(value["html_extraction"]["fallback"], false);
|
||||
assert_eq!(
|
||||
value["html_extraction"]["title"].as_str().unwrap(),
|
||||
"Example Readable Article"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn fetches_readable_html_with_bounded_output() {
|
||||
let repeated =
|
||||
"Reader-mode extracted paragraph with enough content for truncation. ".repeat(30);
|
||||
let body = format!(
|
||||
"<html><head><title>Long Article</title></head><body><article><h1>Long Article</h1><p>{repeated}</p></article></body></html>"
|
||||
);
|
||||
let addr = serve_once(html_response(&body)).await;
|
||||
let tools = enabled_web_fetch_with_output(WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
|
||||
let result = tools
|
||||
.run_fetch(WebFetchInput {
|
||||
url: format!("http://{addr}/long"),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
|
||||
let text = value.get("text").unwrap().as_str().unwrap();
|
||||
assert!(text.len() <= WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
|
||||
assert!(text.ends_with(WEB_FETCH_TRUNCATION_MARKER));
|
||||
assert_eq!(value["output_truncated"], true);
|
||||
assert_eq!(value["html_extraction"]["fallback"], false);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ rustPlatform.buildRustPackage rec {
|
|||
filter = sourceFilter;
|
||||
};
|
||||
|
||||
cargoHash = "sha256-8ZT5moKFxj/5vbp5rsUG7UkPLY1fvQKhYTyjRWQ58xk=";
|
||||
cargoHash = "sha256-VzVFqOWJHfgX92Qw84995ICQu2uvQPeYm6AotU4/LR0=";
|
||||
|
||||
depsExtraArgs = {
|
||||
# nixpkgs 25.11's fetchCargoVendor still uses crates.io's API
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user