web: harden reader navigation fallbacks

2026-05-31 07:19:08 +09:00 · 2026-05-31 07:19:08 +09:00 · 44262c4236
commit 44262c4236
parent 2a3208b96e
1 changed files with 126 additions and 16 deletions
--- a/crates/tools/src/web.rs
+++ b/crates/tools/src/web.rs
@ -731,12 +731,13 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) -
        Ok(dom) => dom,
        Err(err) => {
            return html_fallback_document(
-                html,
+                fallback_diagnostic_text(html_to_text(html)),
                None,
                Some(format!("HTML parser failed: {err}")),
                false,
                false,
                false,
+                false,
            );
        }
    };
@ -750,16 +751,21 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) -
    } else {
        (None, false)
    };
+    let navigation_included = navigation_markdown
+        .as_ref()
+        .map(|navigation_markdown| !navigation_markdown.is_empty())
+        .unwrap_or(false);

    let Some(candidate) = select_main_candidate(&body) else {
        return html_fallback_document(
-            html,
+            fallback_diagnostic_text_from_body(&body, base_url, navigation_markdown.as_deref()),
            title,
            Some(format!(
                "local reader found no main-content candidate with at least {WEB_FETCH_READER_MIN_TEXT_CHARS} text characters"
            )),
            navigation_detected,
            include_navigation,
+            navigation_included,
            navigation_truncated,
        );
    };
@ -767,21 +773,18 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) -
    let mut text = clean_text(markdown_for_node(&candidate.handle, base_url, true));
    if text.chars().count() < WEB_FETCH_READER_MIN_TEXT_CHARS {
        return html_fallback_document(
-            html,
+            fallback_diagnostic_text_from_body(&body, base_url, navigation_markdown.as_deref()),
            title,
            Some(format!(
                "local reader selected content shorter than {WEB_FETCH_READER_MIN_TEXT_CHARS} characters"
            )),
            navigation_detected,
            include_navigation,
+            navigation_included,
            navigation_truncated,
        );
    }

-    let navigation_included = navigation_markdown
-        .as_ref()
-        .map(|navigation_markdown| !navigation_markdown.is_empty())
-        .unwrap_or(false);
    if let Some(navigation_markdown) = navigation_markdown {
        if !navigation_markdown.is_empty() {
            text.push_str("\n\n## Navigation\n\n");
@ -807,17 +810,14 @@ fn extract_html_document(html: &str, base_url: &Url, include_navigation: bool) -
 }

 fn html_fallback_document(
-    html: &str,
+    text: String,
    title: Option<String>,
    fallback_reason: Option<String>,
    navigation_detected: bool,
    include_navigation: bool,
+    navigation_included: bool,
    navigation_truncated: bool,
 ) -> HtmlDocument {
-    let mut text = String::from(
-        "[fallback diagnostic: local reader did not find useful main content; below is stripped HTML text]\n\n",
-    );
-    text.push_str(&html_to_text(html));
    HtmlDocument {
        text,
        metadata: HtmlExtractionMetadata {
@ -827,7 +827,7 @@ fn html_fallback_document(
            title,
            readable: false,
            navigation_detected,
-            navigation_included: false,
+            navigation_included,
            navigation_omitted: navigation_detected && !include_navigation,
            navigation_truncated,
            navigation_notice: navigation_notice(navigation_detected, include_navigation),
@ -835,6 +835,29 @@ fn html_fallback_document(
    }
 }

+fn fallback_diagnostic_text_from_body(
+    body: &Handle,
+    base_url: &Url,
+    navigation_markdown: Option<&str>,
+) -> String {
+    let mut body_text = clean_text(markdown_for_node(body, base_url, true));
+    if let Some(navigation_markdown) = navigation_markdown {
+        if !navigation_markdown.is_empty() {
+            body_text.push_str("\n\n## Navigation\n\n");
+            body_text.push_str(navigation_markdown);
+        }
+    }
+    fallback_diagnostic_text(body_text)
+}
+
+fn fallback_diagnostic_text(body_text: String) -> String {
+    let mut text = String::from(
+        "[fallback diagnostic: local reader did not find useful main content; below is stripped HTML body text]\n\n",
+    );
+    text.push_str(&body_text);
+    text
+}
+
 #[derive(Debug)]
 struct MainCandidate {
    handle: Handle,
@ -897,7 +920,7 @@ fn candidate_score(handle: &Handle, tag: &str, stats: TextStats) -> Option<f64>
        return None;
    }
    let link_density = stats.link_text_chars as f64 / stats.text_chars.max(1) as f64;
-    if link_density > 0.60 && !matches!(tag, "body" | "main") {
+    if link_density > 0.60 {
        return None;
    }

@ -1364,7 +1387,6 @@ fn is_navigation_element(handle: &Handle) -> bool {
        || has("menu")
        || has("breadcrumb")
        || has("breadcrumbs")
-        || has("chapter")
        || has("pagination")
        || has("pager")
        || has("prevnext")
@ -1780,7 +1802,9 @@ mod tests {
        assert!(text.contains("[careful Rust web fetching]("));
        assert!(text.contains(&format!("http://{addr}/docs/reader")));
        assert!(text.contains("durable safety bounds"));
-        assert!(!text.contains("Home Products Pricing"));
+        assert!(!text.contains("Home"));
+        assert!(!text.contains("Pricing"));
+        assert!(!text.contains("unrelated navigation"));
        assert!(!text.contains("Copyright boilerplate"));
        assert_eq!(value["transformed_as"], "local_reader_markdown");
        assert_eq!(value["html_extraction"]["method"], "local_reader_markdown");
@ -1800,6 +1824,92 @@ mod tests {
        );
    }

+    #[tokio::test]
+    async fn link_heavy_main_is_not_reported_as_readable() {
+        let body = r#"
+            <html>
+              <body>
+                <main>
+                  <ul>
+                    <li><a href="/chapter-1">Chapter one overview and navigation entry</a></li>
+                    <li><a href="/chapter-2">Chapter two overview and navigation entry</a></li>
+                    <li><a href="/chapter-3">Chapter three overview and navigation entry</a></li>
+                    <li><a href="/chapter-4">Chapter four overview and navigation entry</a></li>
+                  </ul>
+                </main>
+              </body>
+            </html>
+        "#;
+        let addr = serve_once(html_response(body)).await;
+        let tools = enabled_web_fetch();
+        let result = tools
+            .run_fetch(WebFetchInput {
+                url: format!("http://{addr}/contents"),
+                include_navigation: None,
+            })
+            .await
+            .unwrap();
+        let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
+        let text = value.get("text").unwrap().as_str().unwrap();
+        assert!(text.contains("fallback diagnostic"));
+        assert_ne!(value["transformed_as"], "local_reader_markdown");
+        assert_eq!(value["html_extraction"]["fallback"], true);
+        assert_eq!(value["html_extraction"]["readable"], false);
+    }
+
+    #[tokio::test]
+    async fn fallback_omits_detected_navigation_when_not_requested() {
+        let body = r#"
+            <html>
+              <body>
+                <aside class="sidebar menu">
+                  <a href="/home">Home</a>
+                  <a href="/pricing">Pricing</a>
+                </aside>
+                <article><p>Tiny body.</p></article>
+              </body>
+            </html>
+        "#;
+        let addr = serve_once(html_response(body)).await;
+        let tools = enabled_web_fetch();
+        let result = tools
+            .run_fetch(WebFetchInput {
+                url: format!("http://{addr}/short"),
+                include_navigation: None,
+            })
+            .await
+            .unwrap();
+        let value: Value = serde_json::from_str(result.content.as_deref().unwrap()).unwrap();
+        let text = value.get("text").unwrap().as_str().unwrap();
+        assert!(text.contains("Tiny body."));
+        assert!(!text.contains("Home"));
+        assert!(!text.contains("Pricing"));
+        assert_eq!(value["html_extraction"]["fallback"], true);
+        assert_eq!(value["html_extraction"]["readable"], false);
+        assert_eq!(value["html_extraction"]["navigation_detected"], true);
+        assert_eq!(value["html_extraction"]["navigation_omitted"], true);
+        assert_eq!(value["html_extraction"]["navigation_included"], false);
+    }
+
+    #[test]
+    fn included_navigation_reports_truncation_metadata() {
+        let links = (0..600)
+            .map(|index| {
+                format!("<a href=\"/nav/{index}\">Navigation item {index} with a verbose label</a>")
+            })
+            .collect::<String>();
+        let html = format!(
+            "<html><body><nav>{links}</nav><article><h1>Readable Article</h1><p>This useful article has enough focused prose to make the local reader choose it as main content for the truncation test.</p><p>It also mentions bounded extraction, markdown rendering, and link preservation for untrusted HTML bodies.</p></article></body></html>"
+        );
+        let base_url = Url::parse("https://example.test/docs/index.html").unwrap();
+        let document = extract_html_document(&html, &base_url, true);
+        assert_eq!(document.metadata.readable, true);
+        assert_eq!(document.metadata.navigation_detected, true);
+        assert_eq!(document.metadata.navigation_included, true);
+        assert_eq!(document.metadata.navigation_truncated, true);
+        assert!(document.text.contains("## Navigation"));
+    }
+
    #[tokio::test]
    async fn fetches_html_with_included_navigation_section() {
        let body = r#"