max_tokensのスキーマ不整合に関する修正

2026-04-28 17:58:24 +09:00 · 2026-04-28 17:58:24 +09:00 · ce4c0930c3
commit ce4c0930c3
parent 3d1b8a4761
10 changed files with 266 additions and 13 deletions
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/mod.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/mod.rs
@ -16,11 +16,12 @@ pub use scheme_impl::OpenAIResponsesState;
 /// OpenAI Responses scheme 本体。
 ///
-/// `store` / `include_encrypted_content` は scheme 固定の wire 設定で、
+/// `store` / `include_encrypted_content` / `send_max_output_tokens` は
-/// デフォルトは stateless + ZDR 相当 (`store=false`, `include=[...]`)。
+/// scheme 固定の wire 設定で、デフォルトは公式 OpenAI Responses API
-/// 将来 ZDR 非対応環境で `store=true` にしたくなった場合に限り override
+/// 向け (stateless + ZDR + `max_output_tokens` 送出可)。ChatGPT backend
-/// する。`ModelCapability` には入れない（これはモデルの能力ではなく、
+/// (codex-oauth) のように受理パラメータが subset の経路では provider 層で
-/// クライアントの運用方針）。
+/// `send_max_output_tokens=false` 等に上書きする。`ModelCapability` には
 /// 入れない（モデル能力ではなく wire policy）。
 #[derive(Debug, Clone)]
 pub struct OpenAIResponsesScheme {
    /// サーバ側に response を保存するか。ZDR/stateless 運用では `false`。
@ -28,6 +29,10 @@ pub struct OpenAIResponsesScheme {
    /// `include: ["reasoning.encrypted_content"]` を付けるか。
    /// `store=false` で reasoning を使うなら必須。
    pub include_encrypted_content: bool,
    /// `max_output_tokens` を body に載せるか。公式 OpenAI Responses API は
    /// 受理するが、ChatGPT backend (codex-oauth) は `Unsupported parameter`
    /// で 400 を返すため、その経路では `false` にする。
    pub send_max_output_tokens: bool,
 }
 impl Default for OpenAIResponsesScheme {
@ -35,12 +40,14 @@ impl Default for OpenAIResponsesScheme {
        Self {
            store: false,
            include_encrypted_content: true,
            send_max_output_tokens: true,
        }
    }
 }
 impl OpenAIResponsesScheme {
-    /// デフォルト設定 (`store=false`, `include=["reasoning.encrypted_content"]`)。
+    /// デフォルト設定 (`store=false`, `include=["reasoning.encrypted_content"]`,
    /// `send_max_output_tokens=true`)。
    pub fn new() -> Self {
        // Delegate to `Default` so both construction paths always agree.
        Self::default()
    }
@ -56,4 +63,10 @@ impl OpenAIResponsesScheme {
        self.include_encrypted_content = include;
        self
    }
    /// `max_output_tokens` を body に載せるかを上書き。
    pub fn with_send_max_output_tokens(mut self, send: bool) -> Self {
        self.send_max_output_tokens = send;
        self
    }
 }
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/request.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/request.rs
@ -38,6 +38,9 @@ pub(crate) struct ResponsesRequest {
    /// `["reasoning.encrypted_content"]` 等。
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub include: Vec<&'static str>,
    /// 公式 OpenAI Responses API では受理されるが、ChatGPT backend
    /// (codex-oauth) は 400 で弾く。scheme の `send_max_output_tokens`
    /// が `false` のときは `None` のまま送る (skip_serializing_if で除外)。
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
@ -195,7 +198,11 @@ impl OpenAIResponsesScheme {
            store: self.store,
            stream: true,
            include,
-            max_output_tokens: request.config.max_tokens,
+            max_output_tokens: if self.send_max_output_tokens {
                request.config.max_tokens
            } else {
                None
            },
            temperature: request.config.temperature,
            top_p: request.config.top_p,
        }
@ -444,13 +451,26 @@ mod tests {
    }
    #[test]
-    fn max_output_tokens_passed_through() {
+    fn max_output_tokens_passed_through_by_default() {
        let scheme = OpenAIResponsesScheme::new();
        let req = Request::new().user("hi").max_tokens(100);
        let body = scheme.build_request("gpt-5", &req, &cap_with_reasoning());
        assert_eq!(body.max_output_tokens, Some(100));
    }
    /// With `send_max_output_tokens` disabled, a caller-supplied `max_tokens`
    /// must neither populate the request struct nor show up as a key in the
    /// serialised JSON body.
    #[test]
    fn max_output_tokens_dropped_when_send_disabled() {
        let req = Request::new().user("hi").max_tokens(100);
        let scheme = OpenAIResponsesScheme::new().with_send_max_output_tokens(false);

        // Struct level: the field stays unset.
        let body = scheme.build_request("gpt-5", &req, &cap_with_reasoning());
        assert_eq!(body.max_output_tokens, None);

        // Wire level: the key itself must be absent, not merely `null`.
        let json = serde_json::to_value(&body).unwrap();
        let key_absent = json.get("max_output_tokens").is_none();
        assert!(
            key_absent,
            "max_output_tokens key must not appear in serialised body, got: {json}"
        );
    }
    #[test]
    fn tool_schema_without_properties_is_normalized() {
        // schemars は引数なし struct から `type:"object"` だけのスキーマを
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/scheme_impl.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/scheme_impl.rs
@ -3,8 +3,9 @@
 use serde_json::Value;
 use crate::llm_client::{
-    ClientError, auth::AuthRequirement, capability::ModelCapability, event::Event, scheme::Scheme,
+    ClientError, auth::AuthRequirement, capability::ModelCapability,
-    types::Request,
+    client::ConfigWarning, event::Event, scheme::Scheme,
    types::{Request, RequestConfig},
 };
 use super::OpenAIResponsesScheme;
@ -51,4 +52,18 @@ impl Scheme for OpenAIResponsesScheme {
    /// Returns this scheme's default [`ModelCapability`] by delegating to
    /// `default_capability()` in the parent module's `capability` submodule.
    fn default_capability(&self) -> ModelCapability {
        super::capability::default_capability()
    }
    /// Reports request settings that this scheme instance will not forward.
    ///
    /// The only check is `max_tokens`: if the scheme was built with
    /// `send_max_output_tokens == false` and the caller set `max_tokens`,
    /// a single "unsupported" warning is returned. Otherwise the list is empty.
    fn validate_config(&self, config: &RequestConfig) -> Vec<ConfigWarning> {
        // The ChatGPT backend (codex-oauth) rejects `max_output_tokens` with a
        // 400. When the scheme is configured not to send it, the value is
        // already kept out of the body, so all that is left to do here is tell
        // the user that their setting is being dropped.
        let max_tokens_ignored = config.max_tokens.is_some() && !self.send_max_output_tokens;
        if max_tokens_ignored {
            vec![ConfigWarning::unsupported(
                "max_tokens",
                "OpenAI Responses (ChatGPT backend)",
            )]
        } else {
            Vec::new()
        }
    }
 }
--- a/crates/pod/src/pod.rs
+++ b/crates/pod/src/pod.rs
@ -1064,7 +1064,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
        let mut summary_worker = Worker::new(summary_client)
            .system_prompt(summary_system_prompt)
            .temperature(0.0);
        summary_worker.set_max_tokens(4096);
        // Cumulative input-token meter + interceptor. The meter is bumped
        // from the on_usage callback and read on every pre_llm_request.
@ -1413,7 +1412,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
        let mut extract_worker = Worker::new(client)
            .system_prompt(extract::EXTRACT_SYSTEM_PROMPT)
            .temperature(0.0);
        extract_worker.set_max_tokens(4096);
        // Cumulative input-token meter + interceptor (mirror of
        // CompactWorkerInterceptor). Aborts the extract worker if its
--- a/crates/provider/src/lib.rs
+++ b/crates/provider/src/lib.rs
@ -142,7 +142,12 @@ fn build_from_config(config: &ModelConfig) -> Result<Box<dyn LlmClient>, Provide
        SchemeKind::OpenaiChat => build_transport(OpenAIScheme::new(), config, resolved),
        SchemeKind::Gemini => build_transport(GeminiScheme::new(), config, resolved),
        SchemeKind::OpenaiResponses => {
-            build_transport(OpenAIResponsesScheme::new(), config, resolved)
+            // ChatGPT backend (codex-oauth) は `max_output_tokens` を
            // 400 で弾くため、その経路では送出を止める。
            let scheme = OpenAIResponsesScheme::new().with_send_max_output_tokens(
                !matches!(config.auth, AuthRef::CodexOAuth),
            );
            build_transport(scheme, config, resolved)
        }
    }
 }
--- a/docs/research/anthropic_max_tokens.md
+++ b/docs/research/anthropic_max_tokens.md
@ -0,0 +1,44 @@
 # Anthropic Messages API: `max_tokens` パラメータ仕様
 Source: https://platform.claude.com/docs/en/api/messages  
 Retrieved: 2026-04-28
 ---
 ## 1. パラメータ名
 `max_tokens`
 `max_output_tokens` ではない。
 ## 2. 必須か任意か
 **必須 (required)**。
 `POST /v1/messages` のボディパラメータとして必須指定。  
 insomnia の現在の実装（`max_tokens: u32`、未指定時 4096 にフォールバック）は仕様と合致している。
 ## 3. 型・範囲
 - 型: integer (number)
 - 意味: 生成を停止する前に出力できるトークンの上限。モデルはこの値に達する前に停止することもある（上限の指定であり、保証値ではない）
 - モデルごとに最大値が異なる:
  - Claude Opus 4.6 / 4.7: 最大 128k トークン
  - Claude Sonnet 4.6 / Haiku 4.5: 最大 64k トークン
  - Message Batches API + beta ヘッダ `output-300k-2026-03-24`: 最大 300k トークン（Opus 4.7, 4.6, Sonnet 4.6）
 ## 4. Extended Thinking との組み合わせ制約
 Source: https://platform.claude.com/docs/en/build-with-claude/extended-thinking
 - `thinking.budget_tokens` は必ず `max_tokens` **未満** でなければならない
 - thinking トークンは `max_tokens` の上限に含まれてカウントされる
 - `budget_tokens` の最小値は **1,024 トークン**
 - 例外: ツールを伴う interleaved thinking では `budget_tokens` が `max_tokens` を超えることが許容される（予算がコンテキストウィンドウ全体に対して適用されるため）
 - `max_tokens` が 21,333 を超える場合はストリーミングが必須
 - Claude Opus 4.6 / Sonnet 4.6 以降では `budget_tokens` は非推奨になり、代わりに `effort` パラメータによる adaptive thinking が推奨されている
 ## 5. ドキュメント URL
 - Messages API リファレンス: https://platform.claude.com/docs/en/api/messages
 - Extended Thinking ガイド: https://platform.claude.com/docs/en/build-with-claude/extended-thinking
--- a/docs/research/gemini_max_output_tokens.md
+++ b/docs/research/gemini_max_output_tokens.md
@ -0,0 +1,45 @@
 # Google Gemini API: `maxOutputTokens` パラメータ仕様
 Source: https://ai.google.dev/api/generate-content  
 Source (thinking): https://ai.google.dev/gemini-api/docs/thinking  
 Source (Gemini 2.5 Flash): https://ai.google.dev/gemini-api/docs/models/gemini-2.5-flash  
 Source (Gemini 2.5 Pro): https://ai.google.dev/gemini-api/docs/models/gemini-2.5-pro  
 Retrieved: 2026-04-28
 ---
 ## 1. パラメータ名と位置
 `generationConfig.maxOutputTokens`
 リクエストボディのトップレベルではなく、`generationConfig` オブジェクト内に配置する。
 SDK では `GenerateContentConfig(max_output_tokens=...)` として渡す。
 ## 2. 必須 / 任意
 **任意 (optional)**。省略時はモデルのデフォルト上限が適用される。
 ## 3. 型と範囲
 - 型: `integer`
 - モデル別の最大値:
  - `gemini-2.5-flash`: 最大 **65,536** トークン
  - `gemini-2.5-pro`: 最大 **65,536** トークン
 - 最小値の公式明記はないが、正の整数を指定する。
 ## 4. thinking トークンとの関係
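 - 本調査時点の理解では、`maxOutputTokens` が制限するのは**最終レスポンスの出力トークン数のみ**。thinking トークンは `usageMetadata.thoughtsTokenCount` として別途計上され、`maxOutputTokens` のカウントには含まれない。ただし Gemini 2.5 系では thinking トークンが `maxOutputTokens` の枠を消費し、本文が空のまま `finishReason: MAX_TOKENS` で打ち切られるという報告もあるため、この点は実測で確認すること。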
 - thinking トークンの制御には `generationConfig.thinkingConfig.thinkingBudget` を用いる。
  - Gemini 2.5 Pro: `128`〜`32768` トークン（thinking の無効化は不可）。Gemini 2.5 Flash: `0`〜`24576` トークン（`0` で thinking 無効化）。いずれも `-1` で動的 (dynamic thinking)。
 - 課金は「output tokens + thinking tokens」の合算。
 - `maxOutputTokens` と `thinkingBudget` は独立したパラメータであり、両方を同時に指定できる。
 > **注意**: 2025年10月時点で `gemini-2.5-flash` において `max_output_tokens` が無視されるバグが報告されており、Google 側が修正をロールアウトした経緯がある。最新モデルで想定通りに機能するか実測で確認することを推奨。
 ## 5. ドキュメント URL
 - API リファレンス (GenerationConfig): https://ai.google.dev/api/generate-content#v1beta.GenerationConfig
 - Thinking ガイド: https://ai.google.dev/gemini-api/docs/thinking
 - Gemini 2.5 Flash モデル仕様: https://ai.google.dev/gemini-api/docs/models/gemini-2.5-flash
 - Gemini 2.5 Pro モデル仕様: https://ai.google.dev/gemini-api/docs/models/gemini-2.5-pro
--- a/docs/research/openai_chat_max_tokens.md
+++ b/docs/research/openai_chat_max_tokens.md
@ -0,0 +1,51 @@
 # OpenAI Chat Completions API — 出力トークン数制御パラメータ仕様
 - **Source**: https://platform.openai.com/docs/api-reference/chat/create
 - **Supplementary**: https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/reasoning
 - **Retrieved**: 2026-04-28
 ---
 ## 1. `max_tokens` と `max_completion_tokens` の関係
 | パラメータ | 状態 | 対応モデル |
 |---|---|---|
 | `max_tokens` | **Deprecated** | GPT-3.5, GPT-4 系など旧モデルでは動作する |
 | `max_completion_tokens` | 現行・推奨 | 全モデル（旧モデルにも後方互換あり） |
 - `max_tokens` は o1 系以降では **受け付けられない**（エラーまたは無視）。
 - 変更の背景: 旧来の `max_tokens` は「返却トークン = 生成トークン = 課金トークン」を前提にしていた。o1 系で推論トークン（reasoning tokens）が導入されたことでこの前提が崩れ、新パラメータが設計された。
 - `max_completion_tokens` は旧モデルでも機能するため、**新規実装では `max_completion_tokens` を使うべき**。
 ## 2. 必須か任意か
 - **任意（optional）**。
 - 指定しない場合はモデルのコンテキスト上限まで生成する（デフォルト: `null`）。
 ## 3. 型と範囲
 - **型**: `integer | null`
 - **範囲**: `1` 以上、モデルのコンテキストウィンドウの残りトークン数以下。上限値はモデルごとに異なり、ドキュメント上に固定の最大値は明示されていない。
 - `null` を渡すと制限なし（モデル上限に従う）。
 ## 4. Reasoning モデルでの reasoning tokens のカウント
 - `max_completion_tokens` の上限には **reasoning tokens（推論トークン）を含む**。
  - reasoning tokens: モデルが内部で生成するが API レスポンスには含まれない隠しトークン。
  - 課金対象は reasoning tokens + visible output tokens の合計。
  - レスポンスの `usage.completion_tokens_details.reasoning_tokens` で内訳を確認できる。
 - したがって、`max_completion_tokens = 5000` と設定しても、推論に多くのトークンを使った場合、目に見える出力は 5000 より少なくなる。
 ## 5. Ollama の OpenAI compat API での扱い（補助情報）
 - Ollama の `/v1/chat/completions` は現時点で **`max_tokens` のみを公式サポート**している（内部的に `num_predict` にマッピング）。
 - `max_completion_tokens` サポートは Issue #7125 / PR #14464 で議論中だが、2026-04-28 時点では公式ドキュメント上に記載なし。
 - **Ollama に対しては `max_tokens` を使う**のが安全な選択。ただし将来的に `max_completion_tokens` に移行される見込み。
 ## 6. ドキュメント URL
 - [OpenAI Chat Completions API Reference](https://platform.openai.com/docs/api-reference/chat/create)
 - [Azure OpenAI Reasoning Models (GPT-5, o3, o1)](https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/reasoning)
 - [Ollama OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
 - [Ollama Issue #7125 — max_completion_tokens support](https://github.com/ollama/ollama/issues/7125)
 - [OpenAI Community — Why max_tokens changed to max_completion_tokens](https://community.openai.com/t/why-was-max-tokens-changed-to-max-completion-tokens/938077)
--- a/docs/research/openai_responses_max_output_tokens.md
+++ b/docs/research/openai_responses_max_output_tokens.md
@ -0,0 +1,60 @@
 # OpenAI Responses API — `max_output_tokens` Parameter
 - **Source**: https://platform.openai.com/docs/api-reference/responses/create
 - **Retrieved**: 2026-04-28
 ---
 ## 1. パラメータ名
 `max_output_tokens` — 正しい。Chat Completions API の `max_tokens` / `max_completion_tokens` とは別物。
 ## 2. 必須 / 任意
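 **任意 (optional)**。省略時 (`null`) は明示的な上限なしとなり、モデルが許容する最大出力トークン数まで生成されうる。
 ## 3. 型と範囲
 | 項目 | 値 |
 |---|---|
 | 型 | `integer` または `null`（文字列 `"inf"` を取るのは Realtime API の `max_response_output_tokens` であり、Responses API の本パラメータではない） |
 | 最小値 | `1` |
 | 最大値 | モデルごとの最大出力トークン数（例: gpt-4.1 系は 32,768） |
 | デフォルト | `null`（上限指定なし） |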
 上限に達した場合、レスポンスの `status` が `"incomplete"` になり、`incomplete_details.reason` が `"max_output_tokens"` にセットされる。
 ## 4. Reasoning tokens との関係 / Reasoning モデルとの組合せ制約
 `max_output_tokens` は **reasoning tokens を含む** 合計生成トークン数の上限として機能する。
 公式ガイド (https://platform.openai.com/docs/guides/reasoning) には以下の記述がある:
 > "You can limit the total number of tokens the model generates (including both reasoning and final output tokens) by using the max_output_tokens parameter."
 **実用上の注意点:**
 - モデルが内部思考に多数の reasoning tokens を消費した後に上限に達すると、visible output が一切返らずに打ち切られる場合がある。
 - コスト制御目的には `reasoning.effort` (`"low"` など) の使用が推奨される。`max_output_tokens` はあくまで暴走抑止のガードとして位置づける。
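 - 公式 Responses API の `reasoning` オブジェクトには、reasoning 専用のトークン上限を指定するパラメータは無い。`reasoning.max_tokens` は OpenRouter など一部の互換 API 独自の拡張であり、OpenAI 本家では受理されない点に注意。o シリーズなど reasoning モデルで思考量を抑えたい場合は `reasoning.effort` を使う。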
 ## 5. ChatGPT backend (`https://chatgpt.com/backend-api/codex/responses`) における取り扱い
 このエンドポイントは、公式 Responses API のサブセットのみをサポートする Codex CLI 互換 backend であり、`max_output_tokens` を **サポートしないパラメータとして 400 エラーで拒否する**。
 LiteLLM の調査 (https://github.com/BerriAI/litellm/issues/21193) によれば、ChatGPT Codex backend が受け付けるパラメータは以下に限られる:
 ```
 model, input, instructions, stream, store, include,
 tools, tool_choice, reasoning, previous_response_id, truncation
 ```
 `max_output_tokens`, `max_tokens`, `max_completion_tokens`, `temperature`, `user`, `metadata`, `context_management` はすべて拒否される。
 Codex CLI 自身も `config.toml` の `model_max_output_tokens` を API リクエストに載せない実装になっており (https://github.com/openai/codex/issues/4138)、これはバグではなく ChatGPT backend の制約に対する回避策と解釈できる。
 ## 6. ドキュメント URL
 - 公式 API リファレンス: https://platform.openai.com/docs/api-reference/responses/create
 - Reasoning ガイド: https://platform.openai.com/docs/guides/reasoning
 - Codex CLI issue (max_output_tokens 未送信): https://github.com/openai/codex/issues/4138
 - LiteLLM issue (ChatGPT backend 拒否パラメータ一覧): https://github.com/BerriAI/litellm/issues/21193
 - OpenAI Community (reasoning tokens 上限): https://community.openai.com/t/limiting-maximum-number-of-reasoning-tokens/1285430
--- a/resources/prompts/default.md
+++ b/resources/prompts/default.md
@ -1,5 +1,7 @@
 You are here as an agent of the "insomnia system".
 Stay precise, edit code directly when asked, and avoid speculative refactoring. 
 {% include "common/workspace" %}
 {% include "common/tool-usage" %}