Compare commits

...

11 Commits

12 changed files with 1590 additions and 40 deletions

208
Cargo.lock generated
View File

@ -531,7 +531,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb2a7d3066da2de787b7f032c736763eb7ae5d355f81a68bab2675a96008b0bf"
dependencies = [
"lab",
"phf",
"phf 0.11.3",
]
[[package]]
@ -876,6 +876,16 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.32"
@ -1127,6 +1137,20 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "1.4.0"
@ -1744,6 +1768,12 @@ dependencies = [
"which",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mac_address"
version = "1.1.8"
@ -1771,6 +1801,32 @@ dependencies = [
"tracing",
]
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen 0.10.0",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]]
name = "matchers"
version = "0.2.0"
@ -1922,6 +1978,12 @@ dependencies = [
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nix"
version = "0.29.0"
@ -2151,6 +2213,15 @@ dependencies = [
"sha2 0.10.9",
]
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.3"
@ -2158,7 +2229,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_macros",
"phf_shared",
"phf_shared 0.11.3",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
@ -2167,8 +2248,18 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
"phf_generator 0.11.3",
"phf_shared 0.11.3",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.5",
]
[[package]]
@ -2177,7 +2268,7 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"phf_shared 0.11.3",
"rand 0.8.5",
]
@ -2187,20 +2278,29 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
dependencies = [
"phf_generator",
"phf_shared",
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher 0.3.11",
]
[[package]]
name = "phf_shared"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
"siphasher 1.0.2",
]
[[package]]
@ -2312,6 +2412,12 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "prettyplease"
version = "0.2.37"
@ -2456,6 +2562,8 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
@ -2465,10 +2573,20 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
dependencies = [
"rand_chacha",
"rand_chacha 0.9.0",
"rand_core 0.9.5",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
@ -2484,6 +2602,9 @@ name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.17",
]
[[package]]
name = "rand_core"
@ -3155,6 +3276,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "siphasher"
version = "1.0.2"
@ -3195,6 +3322,31 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared 0.11.3",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"proc-macro2",
"quote",
]
[[package]]
name = "strsim"
version = "0.11.1"
@ -3310,6 +3462,17 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "termcolor"
version = "1.4.1"
@ -3327,8 +3490,8 @@ checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
dependencies = [
"fnv",
"nom",
"phf",
"phf_codegen",
"phf 0.11.3",
"phf_codegen 0.11.3",
]
[[package]]
@ -3374,10 +3537,10 @@ dependencies = [
"ordered-float 4.6.0",
"pest",
"pest_derive",
"phf",
"phf 0.11.3",
"sha2 0.10.9",
"signal-hook",
"siphasher",
"siphasher 1.0.2",
"terminfo",
"termios",
"thiserror 1.0.69",
@ -3597,9 +3760,11 @@ dependencies = [
"grep-matcher",
"grep-regex",
"grep-searcher",
"html5ever",
"ignore",
"llm-worker",
"manifest",
"markup5ever_rcdom",
"reqwest",
"schemars",
"serde",
@ -3845,6 +4010,12 @@ dependencies = [
"serde",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8_iter"
version = "1.0.4"
@ -4549,6 +4720,17 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
[[package]]
name = "xml5ever"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
dependencies = [
"log",
"mac",
"markup5ever",
]
[[package]]
name = "yoke"
version = "0.8.2"

View File

@ -11,8 +11,10 @@ grep-matcher = "0.1.8"
grep-regex = "0.1.14"
grep-searcher = "0.1.16"
ignore = "0.4.25"
html5ever = "0.26"
llm-worker = { workspace = true }
manifest = { workspace = true }
markup5ever_rcdom = "0.2"
reqwest = { version = "0.13", default-features = false, features = ["json", "native-tls"] }
schemars = { workspace = true }
serde = { workspace = true, features = ["derive"] }

File diff suppressed because it is too large Load Diff

View File

@ -40,7 +40,7 @@ rustPlatform.buildRustPackage rec {
filter = sourceFilter;
};
cargoHash = "sha256-8ZT5moKFxj/5vbp5rsUG7UkPLY1fvQKhYTyjRWQ58xk=";
cargoHash = "sha256-8TAJLV7+7Th4o5Jpsyqz+n9kiuB0FO6qxGi559otfko=";
depsExtraArgs = {
# nixpkgs 25.11's fetchCargoVendor still uses crates.io's API

View File

@ -0,0 +1,71 @@
---
id: 20260530-204045-webfetch-readable-extraction
slug: webfetch-readable-extraction
title: WebFetch: extract main HTML content with lightweight readability
status: closed
kind: task
priority: P2
labels: [web, tools, html]
created_at: 2026-05-30T20:40:45Z
updated_at: 2026-05-30T20:55:13Z
assignee: null
legacy_ticket: null
---
## Background
`WebFetch` currently returns bounded, safety-checked content but the HTML path is still close to raw page text: it strips tags with a small local formatter and does not try to isolate the article/main content. For LLM use, a reader-mode style extraction layer is more useful than raw boilerplate-heavy page text.
`readability-js` was investigated but brings QuickJS / bundled JavaScript dependency weight. The desired direction is a lightweight pure-Rust extraction backend with fallback to the current `html_to_text` behavior.
Reference implementations checked out with ghq under `.worktree/ghq-root/` for planning:
- `github.com/quambene/readability-rs` — crate `readability-rs`, MIT, small arc90-style extractor (`Readable { title, content, text }`).
- `github.com/theiskaa/readabilityrs` — crate `readabilityrs`, Apache-2.0, larger Mozilla Readability port with metadata/markdown support.
- `github.com/readable-app/readability.rs` — crate `readable-readability`, MIT, kuchiki-based extractor but sparse docs and older maintenance surface.
## Requirements
- Keep `WebSearch` and `WebFetch` as separate tools. Do not add an automatic summarization/research tool in this ticket.
- Add a reader-mode extraction path for HTML responses in `WebFetch`.
- Use a pure-Rust dependency or local extraction implementation; do not use `readability-js`, QuickJS, Node, Python, or subprocess-based extraction.
- Prefer the lightweight `readability-rs` crate if it builds cleanly and produces usable `title` + main `text`; escalate if the crate is incompatible or obviously too low-quality for the included fixtures.
- Preserve the current network safety behavior: configured provider requirement, private/local host rejection, bounded redirects, response size limits, binary rejection, output truncation, and untrusted-content warning semantics.
- Preserve fallback behavior. If readability extraction fails or returns empty/too-short text, return HTML text produced by the existing local fallback rather than failing the entire fetch.
- Structure HTML fetch output so the LLM can distinguish extraction metadata from document content. At minimum include:
  - extraction method (`readability` or fallback name)
  - fallback indicator / reason when applicable
  - title when available
  - main text
  - existing fetch metadata such as URL/final URL/status/content type/truncation
- Keep HTML returned to the LLM as text by default. Do not expose full extracted HTML unless there is a clear existing output field need.
- Add focused tests with small HTML fixtures covering:
  - article/main content is preferred over nav/footer/sidebar boilerplate
  - fallback is used when readability extraction is not useful
  - output remains bounded/truncated under the existing output limit
## Non-goals
- Provider expansion or changes to `WebSearch` provider selection.
- LLM-generated summaries inside `WebFetch`.
- Browser rendering, JavaScript execution, or dynamic page support.
- Large benchmark suite or exhaustive readability quality comparison.
- Public API/protocol changes beyond the tool result JSON shape.
## Implementation plan
1. Add the selected pure-Rust readability dependency to `crates/tools`.
2. Introduce a small internal HTML extraction helper, e.g. `extract_html_document(html, base_url, output_limit)`, wrapping readability success and fallback.
3. Update the `ContentKind::Html` branch in `WebFetch` rendering to use the helper.
4. Keep existing `html_to_text` as fallback and testable utility.
5. Update tests in `crates/tools/src/web.rs` or a focused tools test module.
6. Validate with formatting, focused tools tests, and broader checks appropriate to the dependency change.
## Acceptance criteria
- `WebFetch` HTML responses prefer extracted main content over navigation/footer/sidebar boilerplate in tests.
- `WebFetch` still returns useful bounded text when readability extraction fails or is empty.
- Tool output clearly reports extraction method and fallback status.
- No JavaScript engine/runtime dependency is introduced.
- `Cargo.lock` and Nix cargo hash implications are handled or explicitly reported.
- `cargo fmt --check`, focused tools tests, `cargo check -p tools`, and `./tickets.sh doctor` pass or any failure is clearly reported as unrelated/pre-existing.

View File

@ -0,0 +1 @@
Implemented `WebFetch` HTML reader-mode extraction with pure-Rust `readability-rs`, preserving existing safety checks and fallback to local `html_to_text`. Output now reports `html_extraction` metadata and bounded main text without exposing extracted HTML by default. Reviewed externally and approved; validation passed including focused tools tests and `nix build .#insomnia`.

View File

@ -0,0 +1,103 @@
<!-- event: create author: tickets.sh at: 2026-05-30T20:40:45Z -->
## Created
Created by tickets.sh create.
---
<!-- event: plan author: hare at: 2026-05-30T20:41:21Z -->
## Plan
Planning note:
- ghq checkouts for prior art were placed under `.worktree/ghq-root/` so they stay inside the repository write scope and under the ignored `.worktree/` area.
- `readability-js` is intentionally excluded from the implementation path because it pulls in QuickJS/rquickjs and bundled JavaScript.
- Candidate preference for this ticket is `readability-rs` first because it is small, MIT licensed, and exposes a simple `extract` API returning `title`, extracted HTML, and text. If it fails to build or extraction is unusable on the ticket fixtures, the coder should stop and report rather than silently switching to a heavier dependency.
- `readabilityrs` is the heavier pure-Rust backup candidate and useful for reference, but adopting it changes the dependency footprint more significantly.
---
<!-- event: implementation_report author: hare at: 2026-05-30T20:54:26Z -->
## Implementation report
Implementation report from coder Pod `webfetch-readable-coder-20260530`:
- Branch: `webfetch-readable-extraction`
- Commit: `7906ca532666669417c20d831a08103c2f0f80dd` (`web: extract readable html content`)
- Changed files: `Cargo.lock`, `crates/tools/Cargo.toml`, `crates/tools/src/web.rs`, `package.nix`
- Added `readability-rs = 0.5.0` to `tools` and updated Nix cargo hash.
- Added a WebFetch HTML extraction helper that uses readability for main text when useful and falls back to existing `html_to_text` when readability fails or returns too-short text.
- Added `html_extraction` metadata with method/fallback/reason/title and kept output bounded.
- Full extracted HTML is not returned.
Validation reported by coder:
- `cargo fmt --check` passed
- `cargo test -p tools web` passed (10 passed)
- `cargo check -p tools` passed, with only existing `llm-worker` dead_code warning
- `./tickets.sh doctor` passed
- `git diff --check` passed
- `nix build .#insomnia` passed
Unresolved issues: none.
---
<!-- event: review author: hare at: 2026-05-30T20:54:26Z status: approve -->
## Review: approve
External review by reviewer Pod `webfetch-readable-reviewer-20260530`: approve.
Summary:
- The change adds a pure-Rust `readability-rs` extraction path for `WebFetch` HTML responses.
- HTML responses use reader-mode text when extraction is useful and fall back to existing local `html_to_text` otherwise.
- Output JSON includes separate `html_extraction` metadata plus document `text`, while preserving fetch metadata and untrusted-content warning.
Requirements check:
- `WebSearch` / `WebFetch` separation preserved.
- Pure Rust dependency only; no QuickJS, Node, Python, browser, or subprocess path.
- Existing WebFetch safety behavior remains in place.
- Fallback behavior exists for readability errors and too-short/empty text.
- Output separates extraction metadata from text.
- Full extracted HTML is not exposed.
- Tests cover fallback metadata, article/main preference over nav/footer, truncation, and existing WebSearch/fetch safety behavior.
- Dependency and Nix hash changes are reasonable.
Blockers: none.
Non-blocking follow-up:
- Optional future direct test for a stable readability error path; current fallback coverage is sufficient for this ticket.
---
<!-- event: implementation_report author: hare at: 2026-05-30T20:55:12Z -->
## Implementation report
Main workspace validation after merge:
- `cargo fmt --check` passed
- `cargo test -p tools web` passed (10 passed)
- `cargo check -p tools` passed with existing `llm-worker` dead_code warning
- `./tickets.sh doctor` passed
- `git diff --check` passed
- `nix build .#insomnia` passed (with dirty tree warning due to unrelated `.insomnia/workflow/multi-agent-workflow.md` local modification)
---
<!-- event: close author: hare at: 2026-05-30T20:55:13Z status: closed -->
## Closed
Implemented `WebFetch` HTML reader-mode extraction with pure-Rust `readability-rs`, preserving existing safety checks and fallback to local `html_to_text`. Output now reports `html_extraction` metadata and bounded main text without exposing extracted HTML by default. Reviewed externally and approved; validation passed including focused tools tests and `nix build .#insomnia`.
---

View File

@ -0,0 +1,76 @@
---
id: 20260530-215928-webfetch-local-reader-markdown
slug: webfetch-local-reader-markdown
title: WebFetch: replace readability dependency with Markdown-preserving local reader
status: closed
kind: task
priority: P2
labels: [web, tools, html]
created_at: 2026-05-30T21:59:28Z
updated_at: 2026-05-30T22:21:39Z
assignee: null
legacy_ticket: null
---
## Background
`webfetch-readable-extraction` added `readability-rs` to improve `WebFetch` HTML output. It proved the direction, but the next design step is to own the reader behavior instead of depending on an article extractor that flattens links to plain text.
For LLM research workflows, article text without links is lossy: links inside the readable body often point to RFCs, docs, downloads, related pages, or citations that the agent must be able to follow. At the same time, navigation/sidebar content should be omitted by default, while still being discoverable when the page is documentation/book-like and navigation links are important.
## Requirements
- Replace the `readability-rs` dependency with a local, pure-Rust HTML reader extractor in `crates/tools`.
- Keep `WebSearch` and `WebFetch` separate. Do not add summarization or research orchestration in this ticket.
- `WebFetch` HTML output should be Markdown-ish text, not plain text:
  - preserve inline links as `[label](absolute-url)`;
  - preserve useful headings/lists/paragraph breaks enough for LLM readability;
  - do not expose full HTML by default.
- Add optional `include_navigation: Option<bool>` to `WebFetchInput`, defaulting to `false`.
- Detect navigation-like content (`nav`, sidebar/toc/menu/breadcrumb-ish class/id/role, previous/next chapter areas, etc.) generically.
- With `include_navigation=false` (the default), omit navigation from the main text.
- If navigation was detected and omitted, include metadata/notice in the tool result such as “navigation was detected and omitted; re-run with include_navigation=true if navigation/sidebar links are needed.”
- With `include_navigation=true`, include a bounded `## Navigation` section containing navigation links rendered as Markdown.
- Treat reader failure as a page-selection/readability signal, not as a second hidden reader mode:
  - report `readable=false` or equivalent metadata/reason when no useful main content was selected;
  - fallback text may remain as diagnostic last resort, but metadata must make clear it is fallback/raw-ish output.
- Preserve current WebFetch safety behavior:
  - configured provider requirement;
  - private/local host rejection;
  - bounded redirects, response size, and output size;
  - binary rejection;
  - untrusted-content warning semantics.
- Preserve output bounding for both main text and navigation content.
- Avoid site-specific branches for mdBook/docs.rs/rustdoc/etc.; use generic DOM/tag/class/id/role heuristics only.
## Non-goals
- Firefox/Mozilla Readability compatibility.
- JavaScript execution, browser rendering, QuickJS, Node, Python, or subprocess extraction.
- Search result ranking changes or provider expansion.
- LLM summarization inside `WebFetch`.
- Exhaustive benchmark/quality suite.
## Implementation guidance
- Prefer using a lightweight DOM parser dependency already implied by the current dependency graph if possible (`html5ever` / rcdom or similar). It is acceptable to retain such parser dependencies directly while removing `readability-rs`.
- Build a small local extractor with clear stages:
  1. parse HTML;
  2. classify nodes as navigation/skipped/main candidates;
  3. select the best main candidate using simple scoring (text length, paragraph count, link density, positive tags like `main`/`article`, negative class/id words);
  4. render selected content as bounded Markdown with absolute links;
  5. optionally render bounded navigation links under `## Navigation`.
- Keep the existing simple `html_to_text` path only as explicit diagnostic fallback when local reader extraction cannot find useful content.
- Keep result JSON compatibility where practical, but update `html_extraction` metadata to expose method, readable status, navigation status, fallback status/reason, and title when available.
## Acceptance criteria
- `readability-rs` is removed from direct dependencies and no JavaScript runtime dependency is introduced.
- HTML article fixture renders body links as Markdown `[label](absolute-url)`.
- Navigation/sidebar/footer are omitted from main text by default.
- When navigation is omitted, result metadata or notice clearly says navigation was detected and can be included via `include_navigation=true`.
- With `include_navigation=true`, bounded navigation links appear under a separate `## Navigation` section.
- Link-heavy navigation-only pages are not misreported as successfully readable main content.
- Existing safety and bounds tests continue to pass.
- Focused tests cover link preservation, navigation omission notice, navigation inclusion, reader failure/fallback metadata, and truncation/bounds.
- `cargo fmt --check`, focused tools tests, `cargo check -p tools`, `./tickets.sh doctor`, `git diff --check`, and Nix build/hash handling pass or failures are clearly reported.

View File

@ -0,0 +1 @@
Replaced the `readability-rs` WebFetch HTML extraction path with a local pure-Rust DOM reader that renders Markdown-ish main content and preserves inline links as absolute Markdown links. Added optional `include_navigation`, default navigation omission notices, bounded navigation inclusion, readable/fallback metadata, and regression coverage. External review approved after blocker fixes; validation passed including focused tools tests and Nix build.

View File

@ -0,0 +1,103 @@
<!-- event: create author: tickets.sh at: 2026-05-30T21:59:28Z -->
## Created
Created by tickets.sh create.
---
<!-- event: plan author: hare at: 2026-05-30T22:00:33Z -->
## Plan
Implementation plan:
1. Replace the current `readability-rs` adapter with a local DOM-based reader extractor scoped to `crates/tools`.
2. Add `include_navigation` to `WebFetchInput`, default false, and thread it only through the HTML render path.
3. Render readable content as Markdown-ish text so inline links remain followable.
4. Detect navigation generically and omit it by default while reporting a notice; include bounded navigation links only when requested.
5. Remove the direct `readability-rs` dependency and update Cargo/Nix lock data.
6. Validate with focused web tests, tools check, doctor, diff check, and Nix build/hash handling.
---
<!-- event: implementation_report author: hare at: 2026-05-30T22:20:54Z -->
## Implementation report
Implementation report from coder Pod `webfetch-local-reader-coder-20260530`:
- Branch: `webfetch-local-reader-markdown`
- Commits:
  - `2a3208b96e151d748fd80d5a1c4236ba616b3c6e` `web: render readable html as markdown`
  - `44262c42365cbba1dad87629c06ce539cbd25105` `web: harden reader navigation fallbacks`
- Changed files: `crates/tools/src/web.rs`, `crates/tools/Cargo.toml`, `Cargo.lock`, `package.nix`
- Removed direct `readability-rs` dependency and replaced it with a local `html5ever`/rcdom reader implementation.
- Added `include_navigation: Option<bool>` to `WebFetchInput`.
- Renders readable HTML as Markdown-ish text, preserving inline body links as `[label](absolute-url)`.
- Detects generic navigation/sidebar/toc/menu/breadcrumb/pagination/prev-next content, omits it by default with metadata/notice, and includes bounded `## Navigation` when requested.
- Reader failure/fallback is reported with `readable=false`/fallback metadata; fallback output omits detected navigation by default so metadata and text agree.
- Added/strengthened regression tests for link preservation, navigation omission/inclusion, link-heavy main failure, fallback navigation consistency, and navigation truncation.
Validation reported by coder:
- `cargo fmt --check` passed
- `cargo test -p tools web` passed (14 passed)
- `cargo check -p tools` passed
- `./tickets.sh doctor` passed
- `git diff --check` passed
- `nix build .#insomnia` passed
Unresolved issues: none.
---
<!-- event: review author: hare at: 2026-05-30T22:20:54Z status: approve -->
## Review: approve
External review by reviewer Pod `webfetch-local-reader-reviewer-20260530`: approve.
First review requested changes for two blockers:
1. link-heavy `body` / `main` could be accepted as readable main content;
2. fallback could claim navigation omission while returning detected navigation text.
Follow-up commit `44262c42365cbba1dad87629c06ce539cbd25105` resolved both:
- `candidate_score` rejects high link density for all candidate tags, including `body` and `main`;
- fallback text is generated through the DOM reader path so detected navigation is omitted by default when `include_navigation=false`;
- metadata aligns with included/omitted navigation state;
- tests cover link-heavy main, fallback nav omission consistency, strengthened omitted nav labels, and navigation truncation metadata.
Reviewer found no new blocker. Reported validation is adequate.
---
<!-- event: implementation_report author: hare at: 2026-05-30T22:21:39Z -->
## Implementation report
Main workspace validation after merge:
- `cargo fmt --check` passed
- `cargo test -p tools web` passed (14 passed)
- `cargo check -p tools` passed with existing `llm-worker` dead_code warning
- `./tickets.sh doctor` passed
- `git diff --check` passed
- `nix build .#insomnia` passed (with dirty tree warning due to existing `.insomnia/workflow/multi-agent-workflow.md` local modification and open ticket lifecycle files)
---
<!-- event: close author: hare at: 2026-05-30T22:21:39Z status: closed -->
## Closed
Replaced the `readability-rs` WebFetch HTML extraction path with a local pure-Rust DOM reader that renders Markdown-ish main content and preserves inline links as absolute Markdown links. Added optional `include_navigation`, default navigation omission notices, bounded navigation inclusion, readable/fallback metadata, and regression coverage. External review approved after blocker fixes; validation passed including focused tools tests and Nix build.
---