diff --git a/Cargo.lock b/Cargo.lock index 8025453d..95aaf723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -531,7 +531,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf" dependencies = [ "lab", - "phf", + "phf 0.11.3", ] [[package]] @@ -876,6 +876,16 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.32" @@ -1127,6 +1137,20 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "http" version = "1.4.0" @@ -1744,6 +1768,12 @@ dependencies = [ "which", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "mac_address" version = "1.1.8" @@ -1771,6 +1801,32 @@ dependencies = [ "tracing", ] +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen 0.10.0", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "matchers" version = "0.2.0" @@ -1922,6 +1978,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "nix" version = "0.29.0" @@ -2151,6 +2213,15 @@ dependencies = [ "sha2 0.10.9", ] +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + [[package]] name = "phf" version = "0.11.3" @@ -2158,7 +2229,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_macros", - "phf_shared", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", ] [[package]] @@ -2167,8 +2248,18 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", ] [[package]] @@ -2177,7 +2268,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ - "phf_shared", + "phf_shared 0.11.3", "rand 0.8.5", ] @@ -2187,20 +2278,29 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.3", + "phf_shared 0.11.3", "proc-macro2", "quote", "syn 2.0.117", ] +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + [[package]] name = "phf_shared" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ - "siphasher", + "siphasher 1.0.2", ] [[package]] @@ -2312,6 +2412,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.37" @@ -2456,6 +2562,8 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ + "libc", + "rand_chacha 0.3.1", "rand_core 0.6.4", ] @@ -2465,10 +2573,20 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "rand_chacha", + "rand_chacha 0.9.0", "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + [[package]] name = "rand_chacha" version = "0.9.0" @@ -2484,6 +2602,9 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] [[package]] name = "rand_core" @@ -2591,6 +2712,21 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "readability-rs" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a17841ca2fc1c3e2aed7c44b29121ab099176923c0ac55d9906edea8ab025bc" +dependencies = [ + "html5ever", + "lazy_static", + "log", + "markup5ever_rcdom", + "regex", + "thiserror 2.0.18", + "url", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -3155,6 +3291,12 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "siphasher" version = "1.0.2" @@ -3195,6 +3337,31 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -3310,6 +3477,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.4.1" @@ -3327,8 +3505,8 @@ checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662" dependencies = [ "fnv", "nom", - "phf", - "phf_codegen", + "phf 0.11.3", + "phf_codegen 0.11.3", ] [[package]] @@ -3374,10 +3552,10 @@ dependencies = [ "ordered-float 4.6.0", "pest", "pest_derive", - "phf", + "phf 0.11.3", "sha2 0.10.9", "signal-hook", - "siphasher", + "siphasher 1.0.2", "terminfo", "termios", "thiserror 1.0.69", @@ -3600,6 +3778,7 @@ dependencies = [ "ignore", "llm-worker", "manifest", + "readability-rs", "reqwest", "schemars", "serde", @@ -3845,6 +4024,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -4549,6 +4734,17 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "yoke" version = "0.8.2" diff --git a/crates/tools/Cargo.toml b/crates/tools/Cargo.toml index 50e260a5..be1398d3 100644 --- a/crates/tools/Cargo.toml +++ b/crates/tools/Cargo.toml @@ -13,6 +13,7 @@ grep-searcher = "0.1.16" ignore = "0.4.25" llm-worker = { workspace = true } manifest = { workspace = true } +readability = { package = "readability-rs", version = "0.5.0" } reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] } schemars = { workspace = true } serde = { workspace = true, features = ["derive"] } diff --git a/crates/tools/src/web.rs b/crates/tools/src/web.rs index a2b7b2e1..bd972d57 100644 --- a/crates/tools/src/web.rs +++ b/crates/tools/src/web.rs @@ -1,3 +1,4 @@ +use std::io::Cursor; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use std::sync::Arc; use std::time::Duration; @@ -8,7 +9,7 @@ use manifest::{WebConfig, WebFetchConfig, WebSearchConfig, WebSearchProvider}; use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE, HeaderMap, LOCATION}; use reqwest::{Client, Url}; use schemars::JsonSchema; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::{Value, json}; use tokio::net::lookup_host; @@ -24,6 +25,8 @@ const WEB_FETCH_DEFAULT_MAX_RESPONSE_BYTES: usize = 2 * 1024 * 1024; const WEB_FETCH_DEFAULT_MAX_OUTPUT_BYTES: usize = 64 * 1024; const WEB_FETCH_MIN_MAX_RESPONSE_BYTES: usize = 1024; const WEB_FETCH_MIN_MAX_OUTPUT_BYTES: usize = 512; +const WEB_FETCH_READABILITY_MIN_TEXT_CHARS: usize = 40; +const WEB_FETCH_TRUNCATION_MARKER: &str = "\n[truncated]"; #[derive(Clone)] pub struct WebTools { @@ -429,10 +432,11 @@ async fn fetch_url( ))); } let (bytes, response_truncated) = read_limited(response, limits.max_response_bytes).await?; - let (text, transformed_as) = render_content( + let rendered = render_content( &bytes, media_kind, content_type.as_deref(), + &url, limits.max_output_bytes, )?; return Ok(json_output(json!({ @@ -440,13 +444,15 @@ async fn fetch_url( "url": url.as_str(), "status": status.as_u16(), "content_type": content_type, - "transformed_as": transformed_as, + "transformed_as": rendered.transformed_as, + "html_extraction": rendered.html_extraction, "bytes_read": bytes.len(), "truncated": response_truncated, + "output_truncated": rendered.output_truncated, "max_response_bytes": limits.max_response_bytes, "max_output_bytes": limits.max_output_bytes, "redirects": redirects, - "text": text, + "text": rendered.text, }))); } unreachable!("redirect loop exits through return or error") @@ -635,12 +641,36 @@ fn classify_content_type(content_type: Option<&str>) -> Result, + output_truncated: bool, +} + +#[derive(Debug, Serialize)] +struct HtmlExtractionMetadata { + method: &'static str, + fallback: bool, + #[serde(skip_serializing_if = "Option::is_none")] + fallback_reason: Option, + #[serde(skip_serializing_if = "Option::is_none")] + title: Option, +} + +struct HtmlDocument { + text: String, + metadata: HtmlExtractionMetadata, +} + fn render_content( bytes: &[u8], kind: MediaKind, content_type: Option<&str>, + base_url: &Url, max_output_bytes: usize, -) -> Result<(String, &'static str), ToolError> { +) -> Result { reject_binary(bytes)?; let raw = String::from_utf8(bytes.to_vec()).map_err(|err| { ToolError::ExecutionFailed(format!( @@ -648,16 +678,75 @@ fn render_content( content_type.unwrap_or("unknown") )) })?; - let rendered = match kind { - MediaKind::Html => (html_to_text(&raw), "html_to_text"), - MediaKind::Json => (json_to_text(&raw)?, "json_pretty"), - MediaKind::Xml => (xmlish_to_text(&raw), "xml_text"), - MediaKind::Text | MediaKind::Unknown => (raw, "text"), + let (text, transformed_as, html_extraction) = match kind { + MediaKind::Html => { + let document = extract_html_document(&raw, base_url); + ( + document.text, + document.metadata.method, + Some(document.metadata), + ) + } + MediaKind::Json => (json_to_text(&raw)?, "json_pretty", None), + MediaKind::Xml => (xmlish_to_text(&raw), "xml_text", None), + MediaKind::Text | MediaKind::Unknown => (raw, "text", None), }; - Ok(( - truncate_to_bytes(clean_text(rendered.0), max_output_bytes), - rendered.1, - )) + let (text, output_truncated) = truncate_to_bytes(clean_text(text), max_output_bytes); + Ok(RenderedContent { + text, + transformed_as, + html_extraction, + output_truncated, + }) +} + +fn extract_html_document(html: &str, base_url: &Url) -> HtmlDocument { + let mut input = Cursor::new(html.as_bytes()); + match readability::extract(&mut input, base_url, Default::default()) { + Ok(readable) => { + let text = clean_text(readable.text); + let title = non_empty_string(clean_text(readable.title)); + if text.chars().count() >= WEB_FETCH_READABILITY_MIN_TEXT_CHARS { + return HtmlDocument { + text, + metadata: HtmlExtractionMetadata { + method: "readability", + fallback: false, + fallback_reason: None, + title, + }, + }; + } + html_fallback_document( + html, + title, + Some(format!( + "readability text shorter than {WEB_FETCH_READABILITY_MIN_TEXT_CHARS} characters" + )), + ) + } + Err(err) => html_fallback_document( + html, + None, + Some(format!("readability extraction failed: {err}")), + ), + } +} + +fn html_fallback_document( + html: &str, + title: Option, + fallback_reason: Option, +) -> HtmlDocument { + HtmlDocument { + text: html_to_text(html), + metadata: HtmlExtractionMetadata { + method: "html_to_text", + fallback: true, + fallback_reason, + title, + }, + } } fn reject_binary(bytes: &[u8]) -> Result<(), ToolError> { @@ -772,17 +861,31 @@ fn decode_basic_entities(input: &str) -> String { .replace("'", "'") } -fn truncate_to_bytes(mut s: String, max: usize) -> String { +fn non_empty_string(input: String) -> Option { + if input.is_empty() { None } else { Some(input) } +} + +fn truncate_to_bytes(mut s: String, max: usize) -> (String, bool) { if s.len() <= max { - return s; + return (s, false); } - let mut end = max; - while !s.is_char_boundary(end) { + + if max <= WEB_FETCH_TRUNCATION_MARKER.len() { + let mut end = max; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + s.truncate(end); + return (s, true); + } + + let mut end = max - WEB_FETCH_TRUNCATION_MARKER.len(); + while end > 0 && !s.is_char_boundary(end) { end -= 1; } s.truncate(end); - s.push_str("\n[truncated]"); - s + s.push_str(WEB_FETCH_TRUNCATION_MARKER); + (s, true) } fn bounded_lossy(bytes: &[u8], max: usize) -> String { @@ -875,6 +978,16 @@ mod tests { addr } + fn html_response(body: &str) -> &'static str { + Box::leak( + format!( + "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: {}\r\n\r\n{}", + body.len(), body + ) + .into_boxed_str(), + ) + } + async fn read_request(stream: &mut TcpStream) -> String { let mut buf = vec![0; 4096]; let n = stream.read(&mut buf).await.unwrap(); @@ -882,6 +995,10 @@ mod tests { } fn enabled_web_fetch() -> WebTools { + enabled_web_fetch_with_output(2048) + } + + fn enabled_web_fetch_with_output(max_output_bytes: usize) -> WebTools { WebTools::new(Some(WebConfig { enabled: Some(true), allow_private_addresses: Some(true), @@ -891,7 +1008,7 @@ mod tests { timeout_secs: Some(5), redirect_limit: Some(2), max_response_bytes: Some(4096), - max_output_bytes: Some(2048), + max_output_bytes: Some(max_output_bytes), allow_private_addresses: None, }), })) @@ -942,10 +1059,10 @@ mod tests { } #[tokio::test] - async fn fetches_html_as_bounded_text() { - let addr = serve_once( - "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nContent-Length: 93\r\n\r\n

Hello & welcome

Readable text.

", - ) + async fn fetches_short_html_with_fallback_metadata() { + let addr = serve_once(html_response( + "

Hello & welcome

Readable text.

", + )) .await; let tools = enabled_web_fetch(); let result = tools @@ -959,6 +1076,80 @@ mod tests { assert!(text.contains("Hello & welcome")); assert!(text.contains("Readable text.")); assert!(!text.contains("ignore")); + assert_eq!(value["transformed_as"], "html_to_text"); + assert_eq!(value["html_extraction"]["method"], "html_to_text"); + assert_eq!(value["html_extraction"]["fallback"], true); + assert!( + value["html_extraction"]["fallback_reason"] + .as_str() + .unwrap() + .contains("shorter") + ); + } + + #[tokio::test] + async fn fetches_html_with_readability_main_text() { + let body = r#" + + Example Readable Article + + +
+
+

Example Readable Article

+

The useful article opens with a distinct sentence about careful Rust web fetching and reader mode extraction.

+

It continues with enough focused prose to make the main document body clearly longer than boilerplate around it.

+

A final paragraph mentions durable safety bounds and untrusted web content handling for the fetched page.

+
+
+
Copyright boilerplate and social links should not be part of the article.
+ + + "#; + let addr = serve_once(html_response(body)).await; + let tools = enabled_web_fetch(); + let result = tools + .run_fetch(WebFetchInput { + url: format!("http://{addr}/article"), + }) + .await + .unwrap(); + let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap(); + let text = value.get("text").unwrap().as_str().unwrap(); + assert!(text.contains("careful Rust web fetching")); + assert!(text.contains("durable safety bounds")); + assert!(!text.contains("Home Products Pricing")); + assert!(!text.contains("Copyright boilerplate")); + assert_eq!(value["transformed_as"], "readability"); + assert_eq!(value["html_extraction"]["method"], "readability"); + assert_eq!(value["html_extraction"]["fallback"], false); + assert_eq!( + value["html_extraction"]["title"].as_str().unwrap(), + "Example Readable Article" + ); + } + + #[tokio::test] + async fn fetches_readable_html_with_bounded_output() { + let repeated = + "Reader-mode extracted paragraph with enough content for truncation. ".repeat(30); + let body = format!( + "Long Article

Long Article

{repeated}

" + ); + let addr = serve_once(html_response(&body)).await; + let tools = enabled_web_fetch_with_output(WEB_FETCH_MIN_MAX_OUTPUT_BYTES); + let result = tools + .run_fetch(WebFetchInput { + url: format!("http://{addr}/long"), + }) + .await + .unwrap(); + let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap(); + let text = value.get("text").unwrap().as_str().unwrap(); + assert!(text.len() <= WEB_FETCH_MIN_MAX_OUTPUT_BYTES); + assert!(text.ends_with(WEB_FETCH_TRUNCATION_MARKER)); + assert_eq!(value["output_truncated"], true); + assert_eq!(value["html_extraction"]["fallback"], false); } #[tokio::test] diff --git a/package.nix b/package.nix index e960f41c..6edcf2fa 100644 --- a/package.nix +++ b/package.nix @@ -40,7 +40,7 @@ rustPlatform.buildRustPackage rec { filter = sourceFilter; }; - cargoHash = "sha256-8ZT5moKFxj/5vbp5rsUG7UkPLY1fvQKhYTyjRWQ58xk="; + cargoHash = "sha256-VzVFqOWJHfgX92Qw84995ICQu2uvQPeYm6AotU4/LR0="; depsExtraArgs = { # nixpkgs 25.11's fetchCargoVendor still uses crates.io's API