diff --git a/crates/tools/src/web.rs b/crates/tools/src/web.rs index acab0641..e4996d9f 100644 --- a/crates/tools/src/web.rs +++ b/crates/tools/src/web.rs @@ -731,12 +731,13 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) - Ok(dom) => dom, Err(err) => { return html_fallback_document( - html, + fallback_diagnostic_text(html_to_text(html)), None, Some(format!("HTML parser failed: {err}")), false, false, false, + false, ); } }; @@ -750,16 +751,21 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) - } else { (None, false) }; + let navigation_included = navigation_markdown + .as_ref() + .map(|navigation_markdown| !navigation_markdown.is_empty()) + .unwrap_or(false); let Some(candidate) = select_main_candidate(&body) else { return html_fallback_document( - html, + fallback_diagnostic_text_from_body(&body, base_url, navigation_markdown.as_deref()), title, Some(format!( "local reader found no main-content candidate with at least {WEB_FETCH_READER_MIN_TEXT_CHARS} text characters" )), navigation_detected, include_navigation, + navigation_included, navigation_truncated, ); }; @@ -767,21 +773,18 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) - let mut text = clean_text(markdown_for_node(&candidate.handle, base_url, true)); if text.chars().count() < WEB_FETCH_READER_MIN_TEXT_CHARS { return html_fallback_document( - html, + fallback_diagnostic_text_from_body(&body, base_url, navigation_markdown.as_deref()), title, Some(format!( "local reader selected content shorter than {WEB_FETCH_READER_MIN_TEXT_CHARS} characters" )), navigation_detected, include_navigation, + navigation_included, navigation_truncated, ); } - let navigation_included = navigation_markdown - .as_ref() - .map(|navigation_markdown| !navigation_markdown.is_empty()) - .unwrap_or(false); if let Some(navigation_markdown) = navigation_markdown { if !navigation_markdown.is_empty() { text.push_str("\n\n## Navigation\n\n"); @@ -807,17 +810,14 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) - } fn html_fallback_document( - html: &str, + text: String, title: Option, fallback_reason: Option, navigation_detected: bool, include_navigation: bool, + navigation_included: bool, navigation_truncated: bool, ) -> HtmlDocument { - let mut text = String::from( - "[fallback diagnostic: local reader did not find useful main content; below is stripped HTML text]\n\n", - ); - text.push_str(&html_to_text(html)); HtmlDocument { text, metadata: HtmlExtractionMetadata { @@ -827,7 +827,7 @@ fn html_fallback_document( title, readable: false, navigation_detected, - navigation_included: false, + navigation_included, navigation_omitted: navigation_detected && !include_navigation, navigation_truncated, navigation_notice: navigation_notice(navigation_detected, include_navigation), @@ -835,6 +835,29 @@ fn html_fallback_document( } } +fn fallback_diagnostic_text_from_body( + body: &Handle, + base_url: &Url, + navigation_markdown: Option<&str>, +) -> String { + let mut body_text = clean_text(markdown_for_node(body, base_url, true)); + if let Some(navigation_markdown) = navigation_markdown { + if !navigation_markdown.is_empty() { + body_text.push_str("\n\n## Navigation\n\n"); + body_text.push_str(navigation_markdown); + } + } + fallback_diagnostic_text(body_text) +} + +fn fallback_diagnostic_text(body_text: String) -> String { + let mut text = String::from( + "[fallback diagnostic: local reader did not find useful main content; below is stripped HTML body text]\n\n", + ); + text.push_str(&body_text); + text +} + #[derive(Debug)] struct MainCandidate { handle: Handle, @@ -897,7 +920,7 @@ fn candidate_score(handle: &Handle, tag: &str, stats: TextStats) -> Option return None; } let link_density = stats.link_text_chars as f64 / stats.text_chars.max(1) as f64; - if link_density > 0.60 && !matches!(tag, "body" | "main") { + if link_density > 0.60 { return None; } @@ -1364,7 +1387,6 @@ fn is_navigation_element(handle: &Handle) -> bool { || has("menu") || has("breadcrumb") || has("breadcrumbs") - || has("chapter") || has("pagination") || has("pager") || has("prevnext") @@ -1780,7 +1802,9 @@ mod tests { assert!(text.contains("[careful Rust web fetching](")); assert!(text.contains(&format!("http://{addr}/docs/reader"))); assert!(text.contains("durable safety bounds")); - assert!(!text.contains("Home Products Pricing")); + assert!(!text.contains("Home")); + assert!(!text.contains("Pricing")); + assert!(!text.contains("unrelated navigation")); assert!(!text.contains("Copyright boilerplate")); assert_eq!(value["transformed_as"], "local_reader_markdown"); assert_eq!(value["html_extraction"]["method"], "local_reader_markdown"); @@ -1800,6 +1824,92 @@ mod tests { ); } + #[tokio::test] + async fn link_heavy_main_is_not_reported_as_readable() { + let body = r#" + + +
+ +
+ + + "#; + let addr = serve_once(html_response(body)).await; + let tools = enabled_web_fetch(); + let result = tools + .run_fetch(WebFetchInput { + url: format!("http://{addr}/contents"), + include_navigation: None, + }) + .await + .unwrap(); + let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap(); + let text = value.get("text").unwrap().as_str().unwrap(); + assert!(text.contains("fallback diagnostic")); + assert_ne!(value["transformed_as"], "local_reader_markdown"); + assert_eq!(value["html_extraction"]["fallback"], true); + assert_eq!(value["html_extraction"]["readable"], false); + } + + #[tokio::test] + async fn fallback_omits_detected_navigation_when_not_requested() { + let body = r#" + + + +

Tiny body.

+ + + "#; + let addr = serve_once(html_response(body)).await; + let tools = enabled_web_fetch(); + let result = tools + .run_fetch(WebFetchInput { + url: format!("http://{addr}/short"), + include_navigation: None, + }) + .await + .unwrap(); + let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap(); + let text = value.get("text").unwrap().as_str().unwrap(); + assert!(text.contains("Tiny body.")); + assert!(!text.contains("Home")); + assert!(!text.contains("Pricing")); + assert_eq!(value["html_extraction"]["fallback"], true); + assert_eq!(value["html_extraction"]["readable"], false); + assert_eq!(value["html_extraction"]["navigation_detected"], true); + assert_eq!(value["html_extraction"]["navigation_omitted"], true); + assert_eq!(value["html_extraction"]["navigation_included"], false); + } + + #[test] + fn included_navigation_reports_truncation_metadata() { + let links = (0..600) + .map(|index| { + format!("Navigation item {index} with a verbose label") + }) + .collect::(); + let html = format!( + "

Readable Article

This useful article has enough focused prose to make the local reader choose it as main content for the truncation test.

It also mentions bounded extraction, markdown rendering, and link preservation for untrusted HTML bodies.

" + ); + let base_url = Url::parse("https://example.test/docs/index.html").unwrap(); + let document = extract_html_document(&html, &base_url, true); + assert_eq!(document.metadata.readable, true); + assert_eq!(document.metadata.navigation_detected, true); + assert_eq!(document.metadata.navigation_included, true); + assert_eq!(document.metadata.navigation_truncated, true); + assert!(document.text.contains("## Navigation")); + } + #[tokio::test] async fn fetches_html_with_included_navigation_section() { let body = r#"