web: extract readable html content
This commit is contained in:
parent
dc5ce2ba72
commit
7906ca5326
222
Cargo.lock
generated
222
Cargo.lock
generated
|
|
@ -531,7 +531,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf"
|
checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"lab",
|
"lab",
|
||||||
"phf",
|
"phf 0.11.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -876,6 +876,16 @@ version = "1.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "futf"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||||
|
dependencies = [
|
||||||
|
"mac",
|
||||||
|
"new_debug_unreachable",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "futures"
|
name = "futures"
|
||||||
version = "0.3.32"
|
version = "0.3.32"
|
||||||
|
|
@ -1127,6 +1137,20 @@ version = "0.4.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html5ever"
|
||||||
|
version = "0.26.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"mac",
|
||||||
|
"markup5ever",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 1.0.109",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
|
|
@ -1744,6 +1768,12 @@ dependencies = [
|
||||||
"which",
|
"which",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mac"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mac_address"
|
name = "mac_address"
|
||||||
version = "1.1.8"
|
version = "1.1.8"
|
||||||
|
|
@ -1771,6 +1801,32 @@ dependencies = [
|
||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "markup5ever"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"phf 0.10.1",
|
||||||
|
"phf_codegen 0.10.0",
|
||||||
|
"string_cache",
|
||||||
|
"string_cache_codegen",
|
||||||
|
"tendril",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "markup5ever_rcdom"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
|
||||||
|
dependencies = [
|
||||||
|
"html5ever",
|
||||||
|
"markup5ever",
|
||||||
|
"tendril",
|
||||||
|
"xml5ever",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "matchers"
|
name = "matchers"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
|
|
@ -1922,6 +1978,12 @@ dependencies = [
|
||||||
"tempfile",
|
"tempfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "new_debug_unreachable"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nix"
|
name = "nix"
|
||||||
version = "0.29.0"
|
version = "0.29.0"
|
||||||
|
|
@ -2151,6 +2213,15 @@ dependencies = [
|
||||||
"sha2 0.10.9",
|
"sha2 0.10.9",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf"
|
||||||
|
version = "0.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf"
|
name = "phf"
|
||||||
version = "0.11.3"
|
version = "0.11.3"
|
||||||
|
|
@ -2158,7 +2229,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"phf_macros",
|
"phf_macros",
|
||||||
"phf_shared",
|
"phf_shared 0.11.3",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_codegen"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.10.0",
|
||||||
|
"phf_shared 0.10.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -2167,8 +2248,18 @@ version = "0.11.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"phf_generator",
|
"phf_generator 0.11.3",
|
||||||
"phf_shared",
|
"phf_shared 0.11.3",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_generator"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
"rand 0.8.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -2177,7 +2268,7 @@ version = "0.11.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"phf_shared",
|
"phf_shared 0.11.3",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -2187,20 +2278,29 @@ version = "0.11.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
|
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"phf_generator",
|
"phf_generator 0.11.3",
|
||||||
"phf_shared",
|
"phf_shared 0.11.3",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.117",
|
"syn 2.0.117",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_shared"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||||
|
dependencies = [
|
||||||
|
"siphasher 0.3.11",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf_shared"
|
name = "phf_shared"
|
||||||
version = "0.11.3"
|
version = "0.11.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"siphasher",
|
"siphasher 1.0.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -2312,6 +2412,12 @@ dependencies = [
|
||||||
"zerocopy",
|
"zerocopy",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "precomputed-hash"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "prettyplease"
|
name = "prettyplease"
|
||||||
version = "0.2.37"
|
version = "0.2.37"
|
||||||
|
|
@ -2456,6 +2562,8 @@ version = "0.8.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"rand_chacha 0.3.1",
|
||||||
"rand_core 0.6.4",
|
"rand_core 0.6.4",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -2465,10 +2573,20 @@ version = "0.9.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"rand_chacha",
|
"rand_chacha 0.9.0",
|
||||||
"rand_core 0.9.5",
|
"rand_core 0.9.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_chacha"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||||
|
dependencies = [
|
||||||
|
"ppv-lite86",
|
||||||
|
"rand_core 0.6.4",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rand_chacha"
|
name = "rand_chacha"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
|
|
@ -2484,6 +2602,9 @@ name = "rand_core"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.2.17",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rand_core"
|
name = "rand_core"
|
||||||
|
|
@ -2591,6 +2712,21 @@ dependencies = [
|
||||||
"unicode-width",
|
"unicode-width",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "readability-rs"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a17841ca2fc1c3e2aed7c44b29121ab099176923c0ac55d9906edea8ab025bc"
|
||||||
|
dependencies = [
|
||||||
|
"html5ever",
|
||||||
|
"lazy_static",
|
||||||
|
"log",
|
||||||
|
"markup5ever_rcdom",
|
||||||
|
"regex",
|
||||||
|
"thiserror 2.0.18",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redox_syscall"
|
name = "redox_syscall"
|
||||||
version = "0.5.18"
|
version = "0.5.18"
|
||||||
|
|
@ -3155,6 +3291,12 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "siphasher"
|
||||||
|
version = "0.3.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "siphasher"
|
name = "siphasher"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
|
|
@ -3195,6 +3337,31 @@ version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "string_cache"
|
||||||
|
version = "0.8.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
|
||||||
|
dependencies = [
|
||||||
|
"new_debug_unreachable",
|
||||||
|
"parking_lot",
|
||||||
|
"phf_shared 0.11.3",
|
||||||
|
"precomputed-hash",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "string_cache_codegen"
|
||||||
|
version = "0.5.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.11.3",
|
||||||
|
"phf_shared 0.11.3",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.11.1"
|
version = "0.11.1"
|
||||||
|
|
@ -3310,6 +3477,17 @@ dependencies = [
|
||||||
"windows-sys 0.61.2",
|
"windows-sys 0.61.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tendril"
|
||||||
|
version = "0.4.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||||
|
dependencies = [
|
||||||
|
"futf",
|
||||||
|
"mac",
|
||||||
|
"utf-8",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termcolor"
|
name = "termcolor"
|
||||||
version = "1.4.1"
|
version = "1.4.1"
|
||||||
|
|
@ -3327,8 +3505,8 @@ checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fnv",
|
"fnv",
|
||||||
"nom",
|
"nom",
|
||||||
"phf",
|
"phf 0.11.3",
|
||||||
"phf_codegen",
|
"phf_codegen 0.11.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -3374,10 +3552,10 @@ dependencies = [
|
||||||
"ordered-float 4.6.0",
|
"ordered-float 4.6.0",
|
||||||
"pest",
|
"pest",
|
||||||
"pest_derive",
|
"pest_derive",
|
||||||
"phf",
|
"phf 0.11.3",
|
||||||
"sha2 0.10.9",
|
"sha2 0.10.9",
|
||||||
"signal-hook",
|
"signal-hook",
|
||||||
"siphasher",
|
"siphasher 1.0.2",
|
||||||
"terminfo",
|
"terminfo",
|
||||||
"termios",
|
"termios",
|
||||||
"thiserror 1.0.69",
|
"thiserror 1.0.69",
|
||||||
|
|
@ -3600,6 +3778,7 @@ dependencies = [
|
||||||
"ignore",
|
"ignore",
|
||||||
"llm-worker",
|
"llm-worker",
|
||||||
"manifest",
|
"manifest",
|
||||||
|
"readability-rs",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"schemars",
|
"schemars",
|
||||||
"serde",
|
"serde",
|
||||||
|
|
@ -3845,6 +4024,12 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf-8"
|
||||||
|
version = "0.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8_iter"
|
name = "utf8_iter"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
|
|
@ -4549,6 +4734,17 @@ version = "0.6.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
|
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xml5ever"
|
||||||
|
version = "0.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"mac",
|
||||||
|
"markup5ever",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "yoke"
|
name = "yoke"
|
||||||
version = "0.8.2"
|
version = "0.8.2"
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ grep-searcher = "0.1.16"
|
||||||
ignore = "0.4.25"
|
ignore = "0.4.25"
|
||||||
llm-worker = { workspace = true }
|
llm-worker = { workspace = true }
|
||||||
manifest = { workspace = true }
|
manifest = { workspace = true }
|
||||||
|
readability = { package = "readability-rs", version = "0.5.0" }
|
||||||
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
|
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
|
||||||
schemars = { workspace = true }
|
schemars = { workspace = true }
|
||||||
serde = { workspace = true, features = ["derive"] }
|
serde = { workspace = true, features = ["derive"] }
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
use std::io::Cursor;
|
||||||
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
@ -8,7 +9,7 @@ use manifest::{WebConfig, WebFetchConfig, WebSearchConfig, WebSearchProvider};
|
||||||
use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION};
|
use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION};
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
use schemars::JsonSchema;
|
use schemars::JsonSchema;
|
||||||
use serde::Deserialize;
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
use tokio::net::lookup_host;
|
use tokio::net::lookup_host;
|
||||||
|
|
||||||
|
|
@ -24,6 +25,8 @@ const WEB_FETCH_DEFAULT_MAX_RESPONSE_BYTES: usize = 2 * 1024 * 1024;
|
||||||
const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024;
|
const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024;
|
||||||
const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024;
|
const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024;
|
||||||
const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512;
|
const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512;
|
||||||
|
const WEB_FETCH_READABILITY_MIN_TEXT_CHARS: usize = 40;
|
||||||
|
const WEB_FETCH_TRUNCATION_MARKER: &str = "\n[truncated]";
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct WebTools {
|
pub struct WebTools {
|
||||||
|
|
@ -429,10 +432,11 @@ async fn fetch_url(
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?;
|
let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?;
|
||||||
let (text, transformed_as) = render_content(
|
let rendered = render_content(
|
||||||
&bytes,
|
&bytes,
|
||||||
media_kind,
|
media_kind,
|
||||||
content_type.as_deref(),
|
content_type.as_deref(),
|
||||||
|
&url,
|
||||||
limits.max_output_bytes,
|
limits.max_output_bytes,
|
||||||
)?;
|
)?;
|
||||||
return Ok(json_output(json!({
|
return Ok(json_output(json!({
|
||||||
|
|
@ -440,13 +444,15 @@ async fn fetch_url(
|
||||||
"url": url.as_str(),
|
"url": url.as_str(),
|
||||||
"status": status.as_u16(),
|
"status": status.as_u16(),
|
||||||
"content_type": content_type,
|
"content_type": content_type,
|
||||||
"transformed_as": transformed_as,
|
"transformed_as": rendered.transformed_as,
|
||||||
|
"html_extraction": rendered.html_extraction,
|
||||||
"bytes_read": bytes.len(),
|
"bytes_read": bytes.len(),
|
||||||
"truncated": response_truncated,
|
"truncated": response_truncated,
|
||||||
|
"output_truncated": rendered.output_truncated,
|
||||||
"max_response_bytes": limits.max_response_bytes,
|
"max_response_bytes": limits.max_response_bytes,
|
||||||
"max_output_bytes": limits.max_output_bytes,
|
"max_output_bytes": limits.max_output_bytes,
|
||||||
"redirects": redirects,
|
"redirects": redirects,
|
||||||
"text": text,
|
"text": rendered.text,
|
||||||
})));
|
})));
|
||||||
}
|
}
|
||||||
unreachable!("redirect loop exits through return or error")
|
unreachable!("redirect loop exits through return or error")
|
||||||
|
|
@ -635,12 +641,36 @@ fn classify_content_type(content_type: Option<&str>) -> Result<MediaKind, ToolEr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct RenderedContent {
|
||||||
|
text: String,
|
||||||
|
transformed_as: &'static str,
|
||||||
|
html_extraction: Option<HtmlExtractionMetadata>,
|
||||||
|
output_truncated: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct HtmlExtractionMetadata {
|
||||||
|
method: &'static str,
|
||||||
|
fallback: bool,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
fallback_reason: Option<String>,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
title: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct HtmlDocument {
|
||||||
|
text: String,
|
||||||
|
metadata: HtmlExtractionMetadata,
|
||||||
|
}
|
||||||
|
|
||||||
fn render_content(
|
fn render_content(
|
||||||
bytes: &[u8],
|
bytes: &[u8],
|
||||||
kind: MediaKind,
|
kind: MediaKind,
|
||||||
content_type: Option<&str>,
|
content_type: Option<&str>,
|
||||||
|
base_url: &Url,
|
||||||
max_output_bytes: usize,
|
max_output_bytes: usize,
|
||||||
) -> Result<(String, &'static str), ToolError> {
|
) -> Result<RenderedContent, ToolError> {
|
||||||
reject_binary(bytes)?;
|
reject_binary(bytes)?;
|
||||||
let raw = String::from_utf8(bytes.to_vec()).map_err(|err| {
|
let raw = String::from_utf8(bytes.to_vec()).map_err(|err| {
|
||||||
ToolError::ExecutionFailed(format!(
|
ToolError::ExecutionFailed(format!(
|
||||||
|
|
@ -648,16 +678,75 @@ fn render_content(
|
||||||
content_type.unwrap_or("unknown")
|
content_type.unwrap_or("unknown")
|
||||||
))
|
))
|
||||||
})?;
|
})?;
|
||||||
let rendered = match kind {
|
let (text, transformed_as, html_extraction) = match kind {
|
||||||
MediaKind::Html => (html_to_text(&raw), "html_to_text"),
|
MediaKind::Html => {
|
||||||
MediaKind::Json => (json_to_text(&raw)?, "json_pretty"),
|
let document = extract_html_document(&raw, base_url);
|
||||||
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text"),
|
(
|
||||||
MediaKind::Text | MediaKind::Unknown => (raw, "text"),
|
document.text,
|
||||||
|
document.metadata.method,
|
||||||
|
Some(document.metadata),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
MediaKind::Json => (json_to_text(&raw)?, "json_pretty", None),
|
||||||
|
MediaKind::Xml => (xmlish_to_text(&raw), "xml_text", None),
|
||||||
|
MediaKind::Text | MediaKind::Unknown => (raw, "text", None),
|
||||||
};
|
};
|
||||||
Ok((
|
let (text, output_truncated) = truncate_to_bytes(clean_text(text), max_output_bytes);
|
||||||
truncate_to_bytes(clean_text(rendered.0), max_output_bytes),
|
Ok(RenderedContent {
|
||||||
rendered.1,
|
text,
|
||||||
))
|
transformed_as,
|
||||||
|
html_extraction,
|
||||||
|
output_truncated,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_html_document(html: &str, base_url: &Url) -> HtmlDocument {
|
||||||
|
let mut input = Cursor::new(html.as_bytes());
|
||||||
|
match readability::extract(&mut input, base_url, Default::default()) {
|
||||||
|
Ok(readable) => {
|
||||||
|
let text = clean_text(readable.text);
|
||||||
|
let title = non_empty_string(clean_text(readable.title));
|
||||||
|
if text.chars().count() >= WEB_FETCH_READABILITY_MIN_TEXT_CHARS {
|
||||||
|
return HtmlDocument {
|
||||||
|
text,
|
||||||
|
metadata: HtmlExtractionMetadata {
|
||||||
|
method: "readability",
|
||||||
|
fallback: false,
|
||||||
|
fallback_reason: None,
|
||||||
|
title,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
html_fallback_document(
|
||||||
|
html,
|
||||||
|
title,
|
||||||
|
Some(format!(
|
||||||
|
"readability text shorter than {WEB_FETCH_READABILITY_MIN_TEXT_CHARS} characters"
|
||||||
|
)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
Err(err) => html_fallback_document(
|
||||||
|
html,
|
||||||
|
None,
|
||||||
|
Some(format!("readability extraction failed: {err}")),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn html_fallback_document(
|
||||||
|
html: &str,
|
||||||
|
title: Option<String>,
|
||||||
|
fallback_reason: Option<String>,
|
||||||
|
) -> HtmlDocument {
|
||||||
|
HtmlDocument {
|
||||||
|
text: html_to_text(html),
|
||||||
|
metadata: HtmlExtractionMetadata {
|
||||||
|
method: "html_to_text",
|
||||||
|
fallback: true,
|
||||||
|
fallback_reason,
|
||||||
|
title,
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> {
|
fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> {
|
||||||
|
|
@ -772,17 +861,31 @@ fn decode_basic_entities(input: &str) -> String {
|
||||||
.replace("'", "'")
|
.replace("'", "'")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn truncate_to_bytes(mut s: String, max: usize) -> String {
|
fn non_empty_string(input: String) -> Option<String> {
|
||||||
|
if input.is_empty() { None } else { Some(input) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn truncate_to_bytes(mut s: String, max: usize) -> (String, bool) {
|
||||||
if s.len() <= max {
|
if s.len() <= max {
|
||||||
return s;
|
return (s, false);
|
||||||
}
|
}
|
||||||
let mut end = max;
|
|
||||||
while !s.is_char_boundary(end) {
|
if max <= WEB_FETCH_TRUNCATION_MARKER.len() {
|
||||||
|
let mut end = max;
|
||||||
|
while end > 0 && !s.is_char_boundary(end) {
|
||||||
|
end -= 1;
|
||||||
|
}
|
||||||
|
s.truncate(end);
|
||||||
|
return (s, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut end = max - WEB_FETCH_TRUNCATION_MARKER.len();
|
||||||
|
while end > 0 && !s.is_char_boundary(end) {
|
||||||
end -= 1;
|
end -= 1;
|
||||||
}
|
}
|
||||||
s.truncate(end);
|
s.truncate(end);
|
||||||
s.push_str("\n[truncated]");
|
s.push_str(WEB_FETCH_TRUNCATION_MARKER);
|
||||||
s
|
(s, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bounded_lossy(bytes: &[u8], max: usize) -> String {
|
fn bounded_lossy(bytes: &[u8], max: usize) -> String {
|
||||||
|
|
@ -875,6 +978,16 @@ mod tests {
|
||||||
addr
|
addr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn html_response(body: &str) -> &'static str {
|
||||||
|
Box::leak(
|
||||||
|
format!(
|
||||||
|
"HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\n\r\n{}",
|
||||||
|
body.len(), body
|
||||||
|
)
|
||||||
|
.into_boxed_str(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
async fn read_request(stream: &mut TcpStream) -> String {
|
async fn read_request(stream: &mut TcpStream) -> String {
|
||||||
let mut buf = vec![0; 4096];
|
let mut buf = vec![0; 4096];
|
||||||
let n = stream.read(&mut buf).await.unwrap();
|
let n = stream.read(&mut buf).await.unwrap();
|
||||||
|
|
@ -882,6 +995,10 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn enabled_web_fetch() -> WebTools {
|
fn enabled_web_fetch() -> WebTools {
|
||||||
|
enabled_web_fetch_with_output(2048)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn enabled_web_fetch_with_output(max_output_bytes: usize) -> WebTools {
|
||||||
WebTools::new(Some(WebConfig {
|
WebTools::new(Some(WebConfig {
|
||||||
enabled: Some(true),
|
enabled: Some(true),
|
||||||
allow_private_addresses: Some(true),
|
allow_private_addresses: Some(true),
|
||||||
|
|
@ -891,7 +1008,7 @@ mod tests {
|
||||||
timeout_secs: Some(5),
|
timeout_secs: Some(5),
|
||||||
redirect_limit: Some(2),
|
redirect_limit: Some(2),
|
||||||
max_response_bytes: Some(4096),
|
max_response_bytes: Some(4096),
|
||||||
max_output_bytes: Some(2048),
|
max_output_bytes: Some(max_output_bytes),
|
||||||
allow_private_addresses: None,
|
allow_private_addresses: None,
|
||||||
}),
|
}),
|
||||||
}))
|
}))
|
||||||
|
|
@ -942,10 +1059,10 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn fetches_html_as_bounded_text() {
|
async fn fetches_short_html_with_fallback_metadata() {
|
||||||
let addr = serve_once(
|
let addr = serve_once(html_response(
|
||||||
"HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: 93\r\n\r\n<html><body><h1>Hello & welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>",
|
"<html><body><h1>Hello & welcome</h1><script>ignore()</script><p>Readable text.</p></body></html>",
|
||||||
)
|
))
|
||||||
.await;
|
.await;
|
||||||
let tools = enabled_web_fetch();
|
let tools = enabled_web_fetch();
|
||||||
let result = tools
|
let result = tools
|
||||||
|
|
@ -959,6 +1076,80 @@ mod tests {
|
||||||
assert!(text.contains("Hello & welcome"));
|
assert!(text.contains("Hello & welcome"));
|
||||||
assert!(text.contains("Readable text."));
|
assert!(text.contains("Readable text."));
|
||||||
assert!(!text.contains("ignore"));
|
assert!(!text.contains("ignore"));
|
||||||
|
assert_eq!(value["transformed_as"], "html_to_text");
|
||||||
|
assert_eq!(value["html_extraction"]["method"], "html_to_text");
|
||||||
|
assert_eq!(value["html_extraction"]["fallback"], true);
|
||||||
|
assert!(
|
||||||
|
value["html_extraction"]["fallback_reason"]
|
||||||
|
.as_str()
|
||||||
|
.unwrap()
|
||||||
|
.contains("shorter")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn fetches_html_with_readability_main_text() {
|
||||||
|
let body = r#"
|
||||||
|
<html>
|
||||||
|
<head><title>Example Readable Article</title></head>
|
||||||
|
<body>
|
||||||
|
<nav>Home Products Pricing unrelated navigation</nav>
|
||||||
|
<main>
|
||||||
|
<article>
|
||||||
|
<h1>Example Readable Article</h1>
|
||||||
|
<p>The useful article opens with a distinct sentence about careful Rust web fetching and reader mode extraction.</p>
|
||||||
|
<p>It continues with enough focused prose to make the main document body clearly longer than boilerplate around it.</p>
|
||||||
|
<p>A final paragraph mentions durable safety bounds and untrusted web content handling for the fetched page.</p>
|
||||||
|
</article>
|
||||||
|
</main>
|
||||||
|
<footer>Copyright boilerplate and social links should not be part of the article.</footer>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"#;
|
||||||
|
let addr = serve_once(html_response(body)).await;
|
||||||
|
let tools = enabled_web_fetch();
|
||||||
|
let result = tools
|
||||||
|
.run_fetch(WebFetchInput {
|
||||||
|
url: format!("http://{addr}/article"),
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
|
||||||
|
let text = value.get("text").unwrap().as_str().unwrap();
|
||||||
|
assert!(text.contains("careful Rust web fetching"));
|
||||||
|
assert!(text.contains("durable safety bounds"));
|
||||||
|
assert!(!text.contains("Home Products Pricing"));
|
||||||
|
assert!(!text.contains("Copyright boilerplate"));
|
||||||
|
assert_eq!(value["transformed_as"], "readability");
|
||||||
|
assert_eq!(value["html_extraction"]["method"], "readability");
|
||||||
|
assert_eq!(value["html_extraction"]["fallback"], false);
|
||||||
|
assert_eq!(
|
||||||
|
value["html_extraction"]["title"].as_str().unwrap(),
|
||||||
|
"Example Readable Article"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn fetches_readable_html_with_bounded_output() {
|
||||||
|
let repeated =
|
||||||
|
"Reader-mode extracted paragraph with enough content for truncation. ".repeat(30);
|
||||||
|
let body = format!(
|
||||||
|
"<html><head><title>Long Article</title></head><body><article><h1>Long Article</h1><p>{repeated}</p></article></body></html>"
|
||||||
|
);
|
||||||
|
let addr = serve_once(html_response(&body)).await;
|
||||||
|
let tools = enabled_web_fetch_with_output(WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
|
||||||
|
let result = tools
|
||||||
|
.run_fetch(WebFetchInput {
|
||||||
|
url: format!("http://{addr}/long"),
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
|
||||||
|
let text = value.get("text").unwrap().as_str().unwrap();
|
||||||
|
assert!(text.len() <= WEB_FETCH_MIN_MAX_OUTPUT_BYTES);
|
||||||
|
assert!(text.ends_with(WEB_FETCH_TRUNCATION_MARKER));
|
||||||
|
assert_eq!(value["output_truncated"], true);
|
||||||
|
assert_eq!(value["html_extraction"]["fallback"], false);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ rustPlatform.buildRustPackage rec {
|
||||||
filter = sourceFilter;
|
filter = sourceFilter;
|
||||||
};
|
};
|
||||||
|
|
||||||
cargoHash = "sha256-8ZT5moKFxj/5vbp5rsUG7UkPLY1fvQKhYTyjRWQ58xk=";
|
cargoHash = "sha256-VzVFqOWJHfgX92Qw84995ICQu2uvQPeYm6AotU4/LR0=";
|
||||||
|
|
||||||
depsExtraArgs = {
|
depsExtraArgs = {
|
||||||
# nixpkgs 25.11's fetchCargoVendor still uses crates.io's API
|
# nixpkgs 25.11's fetchCargoVendor still uses crates.io's API
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user