diff --git a/crates/provider/src/catalog.rs b/crates/provider/src/catalog.rs index 26e4092d..049c4477 100644 --- a/crates/provider/src/catalog.rs +++ b/crates/provider/src/catalog.rs @@ -241,9 +241,9 @@ pub fn load_models_from(path: &Path) -> Result, CatalogError> { // --- ref 解決 / マニフェスト → ModelConfig --------------------------------- /// `<provider>/<model_id>` の最初の `/` で 1 回だけ split する。 -/// OpenRouter の `openrouter/anthropic/claude-sonnet-4` のように +/// OpenRouter の `openrouter/anthropic/claude-sonnet-4.6` のように /// model_id に `/` を含むケースは、provider=`openrouter`、 -/// model_id=`anthropic/claude-sonnet-4` として通る。 +/// model_id=`anthropic/claude-sonnet-4.6` として通る。 fn split_ref(s: &str) -> Option<(&str, &str)> { let (provider, rest) = s.split_once('/')?; if provider.is_empty() || rest.is_empty() { @@ -403,7 +403,7 @@ mod tests { } #[test] - fn resolve_ref_pulls_provider_defaults() { + fn resolve_ref_merges_provider_and_model_catalog() { let providers = load_builtin_providers().unwrap(); let models = load_builtin_models().unwrap(); let manifest = ModelManifest { @@ -423,9 +423,9 @@ mod tests { } assert!( cfg.capability.is_some(), - "should fall back to provider.default_capability" + "model catalog should provide capability" ); - assert_eq!(cfg.context_window, 200_000); + assert_eq!(cfg.context_window, 1_000_000); } #[test] @@ -515,12 +515,12 @@ mod tests { let providers = load_builtin_providers().unwrap(); let models = load_builtin_models().unwrap(); let manifest = ModelManifest { - ref_: Some("openrouter/anthropic/claude-sonnet-4".into()), + ref_: Some("openrouter/anthropic/claude-sonnet-4.6".into()), ..Default::default() }; let cfg = resolve_with_catalogs(&manifest, &providers, &models).unwrap(); assert_eq!(cfg.scheme, SchemeKind::OpenaiChat); - assert_eq!(cfg.model_id, "anthropic/claude-sonnet-4"); + assert_eq!(cfg.model_id, "anthropic/claude-sonnet-4.6"); } #[test] diff --git a/resources/models/builtin.toml b/resources/models/builtin.toml index ae56682f..a4fcf58b 100644 --- 
a/resources/models/builtin.toml +++ b/resources/models/builtin.toml @@ -1,59 +1,68 @@ # Anthropic direct +[[model]] +id = "claude-opus-4-8" +provider = "anthropic" +context_window = 1000000 +capability = { tool_calling = "parallel", structured_output = "json_schema", vision = true, prompt_caching = { kind = "explicit", max_breakpoints = 4 } } + [[model]] id = "claude-sonnet-4-6" provider = "anthropic" -context_window = 200000 +context_window = 1000000 +capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "budget_tokens", vision = true, prompt_caching = { kind = "explicit", max_breakpoints = 4 } } [[model]] -id = "claude-sonnet-4-5" -provider = "anthropic" -context_window = 200000 - -[[model]] -id = "claude-opus-4-1" +id = "claude-haiku-4-5" provider = "anthropic" context_window = 200000 +capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "budget_tokens", vision = true, prompt_caching = { kind = "explicit", max_breakpoints = 4 } } # Ollama local (capability is router-ish / ollama handles its own models) [[model]] -id = "llama3.1" +id = "llama3.3" provider = "ollama-local" context_window = 128000 [[model]] -id = "qwen2.5-coder" +id = "qwen3-coder" provider = "ollama-local" -context_window = 128000 +context_window = 256000 # Codex OAuth (ChatGPT backend via Responses API) +[[model]] +id = "gpt-5.5" +provider = "codex-oauth" +context_window = 1050000 +max_context_window = 272000 +capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } + +[[model]] +id = "gpt-5.4" +provider = "codex-oauth" +context_window = 1050000 +capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } + [[model]] id = "gpt-5-codex" provider = "codex-oauth" context_window = 400000 capability = { tool_calling = "parallel", structured_output = 
"json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } -[[model]] -id = "gpt-5" -provider = "codex-oauth" -context_window = 400000 -capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } - -[[model]] -id = "gpt-5.5" -provider = "codex-oauth" -context_window = 1000000 -max_context_window = 272000 -capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } - # OpenRouter [[model]] -id = "anthropic/claude-sonnet-4" +id = "anthropic/claude-opus-4.8" provider = "openrouter" -context_window = 200000 +context_window = 1000000 capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "budget_tokens", vision = true, prompt_caching = { kind = "auto" } } [[model]] -id = "openai/gpt-5" +id = "anthropic/claude-sonnet-4.6" provider = "openrouter" -context_window = 400000 +context_window = 1000000 +capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "budget_tokens", vision = true, prompt_caching = { kind = "auto" } } + +[[model]] +id = "openai/gpt-5.5" +provider = "openrouter" +context_window = 1050000 capability = { tool_calling = "parallel", structured_output = "json_schema", reasoning = "effort", vision = true, prompt_caching = { kind = "auto" } } diff --git a/work-items/open/20260530-054927-refresh-builtin-model-catalog/item.md b/work-items/open/20260530-054927-refresh-builtin-model-catalog/item.md index d66040e3..099ec2d1 100644 --- a/work-items/open/20260530-054927-refresh-builtin-model-catalog/item.md +++ b/work-items/open/20260530-054927-refresh-builtin-model-catalog/item.md @@ -7,7 +7,7 @@ kind: task priority: P2 labels: [models, providers, catalog, research] created_at: 2026-05-30T05:49:27Z -updated_at: 2026-05-30T05:50:04Z +updated_at: 2026-05-30T23:17:25Z assignee: null 
legacy_ticket: null --- diff --git a/work-items/open/20260530-054927-refresh-builtin-model-catalog/thread.md b/work-items/open/20260530-054927-refresh-builtin-model-catalog/thread.md index f7f005ae..9678d5d8 100644 --- a/work-items/open/20260530-054927-refresh-builtin-model-catalog/thread.md +++ b/work-items/open/20260530-054927-refresh-builtin-model-catalog/thread.md @@ -23,4 +23,92 @@ Critical risks: - If changing the default profile model, explain the product reason and verify compaction/effective window metadata. +--- + + + +## Decision + +Research note for builtin catalog refresh: + +Sources checked: + +- Anthropic Models overview (`https://docs.anthropic.com/en/docs/about-claude/models/overview`, redirected to `https://platform.claude.com/docs/en/about-claude/models/overview`): current comparison lists Claude Opus 4.8, Claude Sonnet 4.6, and Claude Haiku 4.5. API IDs: `claude-opus-4-8`, `claude-sonnet-4-6`, `claude-haiku-4-5-20251001`; aliases include `claude-haiku-4-5`. Context windows: Opus 4.8 1M, Sonnet 4.6 1M, Haiku 4.5 200k. Opus 4.8 is described as the starting point for most complex tasks, but the table says Extended thinking: No, so the catalog gives it an explicit capability without `reasoning = "budget_tokens"`. +- OpenAI Models overview (`https://platform.openai.com/docs/models`, redirected to `https://developers.openai.com/api/docs/models`): recommends `gpt-5.5` for complex reasoning/coding, with `gpt-5.4` and `gpt-5.4-mini` as lower latency/cost variants. `gpt-5.5` and `gpt-5.4` have 1.05M context windows and 128k max output. +- OpenAI model detail pages: + - `https://developers.openai.com/api/docs/models/gpt-5.5`: model ID `gpt-5.5`, 1,050,000 context window, xhigh reasoning support, notes prompts over 272K input tokens are charged differently; local catalog retains `max_context_window = 272000` for the existing backend/effective-window clamp decision. 
+ - `https://developers.openai.com/api/docs/models/gpt-5.4`: model ID `gpt-5.4`, 1,050,000 context window. + - `https://developers.openai.com/api/docs/models/gpt-5-codex`: model ID `gpt-5-codex`, 400,000 context window, Responses API only, optimized for agentic coding in Codex/similar environments. +- OpenRouter model list endpoint (`https://openrouter.ai/api/v1/models`): confirmed `anthropic/claude-opus-4.8` (1M), `anthropic/claude-sonnet-4.6` (1M), and `openai/gpt-5.5` (1.05M) with tools/structured output/reasoning parameters. Dynamic `~...latest` router aliases exist, but the builtin catalog uses concrete IDs to avoid unstable default behavior. +- Ollama Library: + - `https://ollama.com/library/llama3.3`: `llama3.3` latest/70b has 128K context. + - `https://ollama.com/library/qwen3-coder`: `qwen3-coder` latest/30b has 256K context and is positioned for agentic/coding tasks. + +Selected changes: + +- Anthropic direct: replace stale `claude-sonnet-4-5` / `claude-opus-4-1` with `claude-opus-4-8`, `claude-sonnet-4-6`, and `claude-haiku-4-5`; update Sonnet context to 1M. +- Codex OAuth/OpenAI: keep default `codex-oauth/gpt-5.5`, update advertised context to 1.05M while retaining the existing 272K effective clamp; replace older plain `gpt-5` entry with `gpt-5.4`; keep `gpt-5-codex` because OpenAI documents it as a Codex/similar-environment Responses model. +- OpenRouter: replace stale `anthropic/claude-sonnet-4` / `openai/gpt-5` with concrete current IDs `anthropic/claude-opus-4.8`, `anthropic/claude-sonnet-4.6`, and `openai/gpt-5.5`. +- Ollama: replace `llama3.1` / `qwen2.5-coder` with current generic local placeholders `llama3.3` and `qwen3-coder`. +- Provider definitions unchanged; no provider-level source indicated that `resources/providers/builtin.toml` is stale. +- Default profile remains `codex-oauth/gpt-5.5`; this remains aligned with OpenAI’s current model recommendation and the existing effective-context clamp used by compaction safety. 
+ + +--- + + + +## Review: approve + +External review by reviewer Pod `builtin-catalog-reviewer-20260530`: approve. + +Reviewer summary: + +- Sources were recorded in the ticket thread before/with the catalog changes. +- Catalog changes align with the recorded sources: + - Anthropic entries now use Opus 4.8 / Sonnet 4.6 / Haiku 4.5, with Sonnet and Opus 1M context and Haiku 200k. + - Codex OAuth/OpenAI entries use `gpt-5.5`, `gpt-5.4`, and `gpt-5-codex`; `gpt-5.5` retains the existing `max_context_window = 272000` effective clamp while advertising 1.05M. + - OpenRouter uses concrete current IDs, avoiding unstable `~...latest` aliases. + - Ollama entries use `llama3.3` and `qwen3-coder`. +- Provider definitions and default profile were reasonably left unchanged. +- Reported validation was adequate. + +Blockers: none. + +Non-blocking note addressed after review: +- The provider catalog test name/assertion still implied provider-default capability fallback for a model that now has explicit capability. It was renamed/updated to describe provider+model catalog merge semantics. + + +--- + + + +## Implementation report + +Implementation report: + +- Refreshed `resources/models/builtin.toml` from recorded official/semiofficial sources. +- Anthropic direct entries now cover `claude-opus-4-8`, `claude-sonnet-4-6`, and `claude-haiku-4-5`; Sonnet/Opus context windows are 1M and Haiku is 200k. Opus has explicit capability without `reasoning = "budget_tokens"` because the source table says Extended thinking is not supported. +- Ollama local placeholders are now `llama3.3` (128K) and `qwen3-coder` (256K). +- Codex OAuth/OpenAI entries now cover `gpt-5.5`, `gpt-5.4`, and `gpt-5-codex`; `gpt-5.5` advertises 1.05M while retaining the existing `max_context_window = 272000` effective clamp. +- OpenRouter entries now use concrete current IDs `anthropic/claude-opus-4.8`, `anthropic/claude-sonnet-4.6`, and `openai/gpt-5.5`; dynamic latest aliases were intentionally not added. 
+- `resources/providers/builtin.toml` and `resources/profiles/default.lua` were left unchanged. +- Updated provider catalog test expectations and renamed the affected test to avoid implying provider-default capability fallback for an explicitly cataloged model. + +External review: + +- Reviewer Pod `builtin-catalog-reviewer-20260530` approved with no blockers. +- Reviewer non-blocking note about the stale test name/assert message was addressed. + +Validation: + +- `cargo fmt --check` passed +- `cargo test -p provider` passed +- `cargo test -p manifest model` passed +- `cargo test -p manifest profile -- --nocapture` passed +- `cargo check -p provider -p manifest` passed +- `./tickets.sh doctor` passed +- `git diff --check` passed + + ---