max_tokensのスキーマ不整合に関する修正

2026-04-28 17:58:24 +09:00 · 2026-04-28 17:58:24 +09:00 · f1ba5b5686
commit f1ba5b5686
parent ce7153f6e8
10 changed files with 266 additions and 13 deletions
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/mod.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/mod.rs
@ -16,11 +16,12 @@ pub use scheme_impl::OpenAIResponsesState;

 /// OpenAI Responses scheme 本体。
 ///
-/// `store` / `include_encrypted_content` は scheme 固定の wire 設定で、
-/// デフォルトは stateless + ZDR 相当 (`store=false`, `include=[...]`)。
-/// 将来 ZDR 非対応環境で `store=true` にしたくなった場合に限り override
-/// する。`ModelCapability` には入れない（これはモデルの能力ではなく、
-/// クライアントの運用方針）。
+/// `store` / `include_encrypted_content` / `send_max_output_tokens` は
+/// scheme 固定の wire 設定で、デフォルトは公式 OpenAI Responses API
+/// 向け (stateless + ZDR + `max_output_tokens` 送出可)。ChatGPT backend
+/// (codex-oauth) のように受理パラメータが subset の経路では provider 層で
+/// `send_max_output_tokens=false` 等に上書きする。`ModelCapability` には
+/// 入れない（モデル能力ではなく wire policy）。
 #[derive(Debug, Clone)]
 pub struct OpenAIResponsesScheme {
    /// サーバ側に response を保存するか。ZDR/stateless 運用では `false`。
@ -28,6 +29,10 @@ pub struct OpenAIResponsesScheme {
    /// `include: ["reasoning.encrypted_content"]` を付けるか。
    /// `store=false` で reasoning を使うなら必須。
    pub include_encrypted_content: bool,
+    /// `max_output_tokens` を body に載せるか。公式 OpenAI Responses API は
+    /// 受理するが、ChatGPT backend (codex-oauth) は `Unsupported parameter`
+    /// で 400 を返すため、その経路では `false` にする。
+    pub send_max_output_tokens: bool,
 }

 impl Default for OpenAIResponsesScheme {
@ -35,12 +40,14 @@ impl Default for OpenAIResponsesScheme {
        Self {
            store: false,
            include_encrypted_content: true,
+            send_max_output_tokens: true,
        }
    }
 }

 impl OpenAIResponsesScheme {
-    /// デフォルト設定 (`store=false`, `include=["reasoning.encrypted_content"]`)。
+    /// デフォルト設定 (`store=false`, `include=["reasoning.encrypted_content"]`,
+    /// `send_max_output_tokens=true`)。
    pub fn new() -> Self {
        Self::default()
    }
@ -56,4 +63,10 @@ impl OpenAIResponsesScheme {
        self.include_encrypted_content = include;
        self
    }
+
+    /// `max_output_tokens` を body に載せるかを上書き。
+    pub fn with_send_max_output_tokens(mut self, send: bool) -> Self {
+        self.send_max_output_tokens = send;
+        self
+    }
 }
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/request.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/request.rs
@ -38,6 +38,9 @@ pub(crate) struct ResponsesRequest {
    /// `["reasoning.encrypted_content"]` 等。
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub include: Vec<&'static str>,
+    /// 公式 OpenAI Responses API では受理されるが、ChatGPT backend
+    /// (codex-oauth) は 400 で弾く。scheme の `send_max_output_tokens`
+    /// が `false` のときは `None` のまま送る (skip_serializing_if で除外)。
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
@ -195,7 +198,11 @@ impl OpenAIResponsesScheme {
            store: self.store,
            stream: true,
            include,
-            max_output_tokens: request.config.max_tokens,
+            max_output_tokens: if self.send_max_output_tokens {
+                request.config.max_tokens
+            } else {
+                None
+            },
            temperature: request.config.temperature,
            top_p: request.config.top_p,
        }
@ -444,13 +451,26 @@ mod tests {
    }

    #[test]
-    fn max_output_tokens_passed_through() {
+    fn max_output_tokens_passed_through_by_default() {
        let scheme = OpenAIResponsesScheme::new();
        let req = Request::new().user("hi").max_tokens(100);
        let body = scheme.build_request("gpt-5", &req, &cap_with_reasoning());
        assert_eq!(body.max_output_tokens, Some(100));
    }

+    #[test]
+    fn max_output_tokens_dropped_when_send_disabled() {
+        let scheme = OpenAIResponsesScheme::new().with_send_max_output_tokens(false);
+        let req = Request::new().user("hi").max_tokens(100);
+        let body = scheme.build_request("gpt-5", &req, &cap_with_reasoning());
+        assert_eq!(body.max_output_tokens, None);
+        let json = serde_json::to_value(&body).unwrap();
+        assert!(
+            json.get("max_output_tokens").is_none(),
+            "max_output_tokens key must not appear in serialised body, got: {json}"
+        );
+    }
+
    #[test]
    fn tool_schema_without_properties_is_normalized() {
        // schemars は引数なし struct から `type:"object"` だけのスキーマを
--- a/crates/llm-worker/src/llm_client/scheme/openai_responses/scheme_impl.rs
+++ b/crates/llm-worker/src/llm_client/scheme/openai_responses/scheme_impl.rs
@ -3,8 +3,9 @@
 use serde_json::Value;

 use crate::llm_client::{
-    ClientError, auth::AuthRequirement, capability::ModelCapability, event::Event, scheme::Scheme,
-    types::Request,
+    ClientError, auth::AuthRequirement, capability::ModelCapability,
+    client::ConfigWarning, event::Event, scheme::Scheme,
+    types::{Request, RequestConfig},
 };

 use super::OpenAIResponsesScheme;
@ -51,4 +52,18 @@ impl Scheme for OpenAIResponsesScheme {
    fn default_capability(&self) -> ModelCapability {
        super::capability::default_capability()
    }
+
+    fn validate_config(&self, config: &RequestConfig) -> Vec<ConfigWarning> {
+        let mut warnings = Vec::new();
+        // ChatGPT backend (codex-oauth) は `max_output_tokens` を 400 で弾く。
+        // scheme 構築時に `send_max_output_tokens=false` で組まれていれば
+        // body 投影は止まっているので、ユーザの意図が落ちることだけを通知する。
+        if !self.send_max_output_tokens && config.max_tokens.is_some() {
+            warnings.push(ConfigWarning::unsupported(
+                "max_tokens",
+                "OpenAI Responses (ChatGPT backend)",
+            ));
+        }
+        warnings
+    }
 }
--- a/crates/pod/src/pod.rs
+++ b/crates/pod/src/pod.rs
@ -1064,7 +1064,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
        let mut summary_worker = Worker::new(summary_client)
            .system_prompt(summary_system_prompt)
            .temperature(0.0);
-        summary_worker.set_max_tokens(4096);

        // Cumulative input-token meter + interceptor. The meter is bumped
        // from the on_usage callback and read on every pre_llm_request.
@ -1413,7 +1412,6 @@ impl<C: LlmClient, St: Store> Pod<C, St> {
        let mut extract_worker = Worker::new(client)
            .system_prompt(extract::EXTRACT_SYSTEM_PROMPT)
            .temperature(0.0);
-        extract_worker.set_max_tokens(4096);

        // Cumulative input-token meter + interceptor (mirror of
        // CompactWorkerInterceptor). Aborts the extract worker if its
--- a/crates/provider/src/lib.rs
+++ b/crates/provider/src/lib.rs
@ -142,7 +142,12 @@ fn build_from_config(config: &ModelConfig) -> Result<Box<dyn LlmClient>, Provide
        SchemeKind::OpenaiChat => build_transport(OpenAIScheme::new(), config, resolved),
        SchemeKind::Gemini => build_transport(GeminiScheme::new(), config, resolved),
        SchemeKind::OpenaiResponses => {
-            build_transport(OpenAIResponsesScheme::new(), config, resolved)
+            // ChatGPT backend (codex-oauth) は `max_output_tokens` を
+            // 400 で弾くため、その経路では送出を止める。
+            let scheme = OpenAIResponsesScheme::new().with_send_max_output_tokens(
+                !matches!(config.auth, AuthRef::CodexOAuth),
+            );
+            build_transport(scheme, config, resolved)
        }
    }
 }
--- a/docs/research/anthropic_max_tokens.md
+++ b/docs/research/anthropic_max_tokens.md
@ -0,0 +1,44 @@
+# Anthropic Messages API: `max_tokens` パラメータ仕様
+
+Source: https://platform.claude.com/docs/en/api/messages  
+Retrieved: 2026-04-28
+
+---
+
+## 1. パラメータ名
+
+`max_tokens`
+
+`max_output_tokens` ではない。
+
+## 2. 必須か任意か
+
+**必須 (required)**。
+
+`POST /v1/messages` のボディパラメータとして必須指定。  
+insomnia の現在の実装（`max_tokens: u32`、未指定時 4096 にフォールバック）は仕様と合致している。
+
+## 3. 型・範囲
+
+- 型: integer (number)
+- 意味: 生成を停止する前に出力できるトークンの上限。モデルはこの値に達する前に停止することもある（上限の指定であり、保証値ではない）
+- モデルごとに最大値が異なる:
+  - Claude Opus 4.6 / 4.7: 最大 128k トークン
+  - Claude Sonnet 4.6 / Haiku 4.5: 最大 64k トークン
+  - Message Batches API + beta ヘッダ `output-300k-2026-03-24`: 最大 300k トークン（Opus 4.7, 4.6, Sonnet 4.6）
+
+## 4. Extended Thinking との組み合わせ制約
+
+Source: https://platform.claude.com/docs/en/build-with-claude/extended-thinking
+
+- `thinking.budget_tokens` は必ず `max_tokens` **未満** でなければならない
+- thinking トークンは `max_tokens` の上限に含まれてカウントされる
+- `budget_tokens` の最小値は **1,024 トークン**
+- 例外: ツールを伴う interleaved thinking では `budget_tokens` が `max_tokens` を超えることが許容される（予算がコンテキストウィンドウ全体に対して適用されるため）
+- `max_tokens` が 21,333 を超える場合はストリーミングが必須
+- Claude Opus 4.6 / Sonnet 4.6 以降では `budget_tokens` は非推奨になり、代わりに `effort` パラメータによる adaptive thinking が推奨されている
+
+## 5. ドキュメント URL
+
+- Messages API リファレンス: https://platform.claude.com/docs/en/api/messages
+- Extended Thinking ガイド: https://platform.claude.com/docs/en/build-with-claude/extended-thinking
--- a/docs/research/gemini_max_output_tokens.md
+++ b/docs/research/gemini_max_output_tokens.md
@ -0,0 +1,45 @@
+# Google Gemini API: `maxOutputTokens` パラメータ仕様
+
+Source: https://ai.google.dev/api/generate-content  
+Source (thinking): https://ai.google.dev/gemini-api/docs/thinking  
+Source (Gemini 2.5 Flash): https://ai.google.dev/gemini-api/docs/models/gemini-2.5-flash  
+Source (Gemini 2.5 Pro): https://ai.google.dev/gemini-api/docs/models/gemini-2.5-pro  
+Retrieved: 2026-04-28
+
+---
+
+## 1. パラメータ名と位置
+
+`generationConfig.maxOutputTokens`
+
+リクエストボディのトップレベルではなく、`generationConfig` オブジェクト内に配置する。
+SDK では `GenerateContentConfig(max_output_tokens=...)` として渡す。
+
+## 2. 必須 / 任意
+
+**任意 (optional)**。省略時はモデルのデフォルト上限が適用される。
+
+## 3. 型と範囲
+
+- 型: `integer`
+- モデル別の最大値:
+  - `gemini-2.5-flash`: 最大 **65,536** トークン
+  - `gemini-2.5-pro`: 最大 **65,536** トークン
+- 最小値の公式明記はないが、正の整数を指定する。
+
+## 4. thinking トークンとの関係
+
+- `maxOutputTokens` が制限するのは**最終レスポンスの出力トークン数のみ**。thinking トークンは `usageMetadata.thoughtsTokenCount` として別途計上され、`maxOutputTokens` のカウントには含まれない。
+- thinking トークンの制御には `generationConfig.thinkingConfig.thinkingBudget` を用いる。
+  - Gemini 2.5 Pro: `128`〜`32768` トークン（thinking の無効化は不可）、Gemini 2.5 Flash: `0`〜`24576` トークン（`0` で thinking 無効化）。いずれも `-1` で動的 (dynamic thinking)
+- 課金は「output tokens + thinking tokens」の合算。
+- `maxOutputTokens` と `thinkingBudget` は独立したパラメータであり、両方を同時に指定できる。
+
+> **注意**: 2025年10月時点で `gemini-2.5-flash` において `max_output_tokens` が無視されるバグが報告されており、Google 側が修正をロールアウトした経緯がある。最新モデルで想定通りに機能するか実測で確認することを推奨。
+
+## 5. ドキュメント URL
+
+- API リファレンス (GenerationConfig): https://ai.google.dev/api/generate-content#v1beta.GenerationConfig
+- Thinking ガイド: https://ai.google.dev/gemini-api/docs/thinking
+- Gemini 2.5 Flash モデル仕様: https://ai.google.dev/gemini-api/docs/models/gemini-2.5-flash
+- Gemini 2.5 Pro モデル仕様: https://ai.google.dev/gemini-api/docs/models/gemini-2.5-pro
--- a/docs/research/openai_chat_max_tokens.md
+++ b/docs/research/openai_chat_max_tokens.md
@ -0,0 +1,51 @@
+# OpenAI Chat Completions API — 出力トークン数制御パラメータ仕様
+
+- **Source**: https://platform.openai.com/docs/api-reference/chat/create
+- **Supplementary**: https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/reasoning
+- **Retrieved**: 2026-04-28
+
+---
+
+## 1. `max_tokens` と `max_completion_tokens` の関係
+
+| パラメータ | 状態 | 対応モデル |
+|---|---|---|
+| `max_tokens` | **Deprecated** | GPT-3.5, GPT-4 系など旧モデルでは動作する |
+| `max_completion_tokens` | 現行・推奨 | 全モデル（旧モデルにも後方互換あり） |
+
+- `max_tokens` は o1 系以降では **受け付けられない**（エラーまたは無視）。
+- 変更の背景: 旧来の `max_tokens` は「返却トークン = 生成トークン = 課金トークン」を前提にしていた。o1 系で推論トークン（reasoning tokens）が導入されたことでこの前提が崩れ、新パラメータが設計された。
+- `max_completion_tokens` は旧モデルでも機能するため、**新規実装では `max_completion_tokens` を使うべき**。
+
+## 2. 必須か任意か
+
+- **任意（optional）**。
+- 指定しない場合はモデルのコンテキスト上限まで生成する（デフォルト: `null`）。
+
+## 3. 型と範囲
+
+- **型**: `integer | null`
+- **範囲**: `1` 以上、モデルのコンテキストウィンドウの残りトークン数以下。上限値はモデルごとに異なり、ドキュメント上に固定の最大値は明示されていない。
+- `null` を渡すと制限なし（モデル上限に従う）。
+
+## 4. Reasoning モデルでの reasoning tokens のカウント
+
+- `max_completion_tokens` の上限には **reasoning tokens（推論トークン）を含む**。
+  - reasoning tokens: モデルが内部で生成するが API レスポンスには含まれない隠しトークン。
+  - 課金対象は reasoning tokens + visible output tokens の合計。
+  - レスポンスの `usage.completion_tokens_details.reasoning_tokens` で内訳を確認できる。
+- したがって、`max_completion_tokens = 5000` と設定しても、推論に多くのトークンを使った場合、目に見える出力は 5000 より少なくなる。
+
+## 5. Ollama の OpenAI compat API での扱い（補助情報）
+
+- Ollama の `/v1/chat/completions` は現時点で **`max_tokens` のみを公式サポート**している（内部的に `num_predict` にマッピング）。
+- `max_completion_tokens` サポートは Issue #7125 / PR #14464 で議論中だが、2026-04-28 時点では公式ドキュメント上に記載なし。
+- **Ollama に対しては `max_tokens` を使う**のが安全な選択。ただし将来的に `max_completion_tokens` に移行される見込み。
+
+## 6. ドキュメント URL
+
+- [OpenAI Chat Completions API Reference](https://platform.openai.com/docs/api-reference/chat/create)
+- [Azure OpenAI Reasoning Models (GPT-5, o3, o1)](https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/reasoning)
+- [Ollama OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
+- [Ollama Issue #7125 — max_completion_tokens support](https://github.com/ollama/ollama/issues/7125)
+- [OpenAI Community — Why max_tokens changed to max_completion_tokens](https://community.openai.com/t/why-was-max-tokens-changed-to-max-completion-tokens/938077)
--- a/docs/research/openai_responses_max_output_tokens.md
+++ b/docs/research/openai_responses_max_output_tokens.md
@ -0,0 +1,60 @@
+# OpenAI Responses API — `max_output_tokens` Parameter
+
+- **Source**: https://platform.openai.com/docs/api-reference/responses/create
+- **Retrieved**: 2026-04-28
+
+---
+
+## 1. パラメータ名
+
+`max_output_tokens` — 正しい。Chat Completions API の `max_tokens` / `max_completion_tokens` とは別物。
+
+## 2. 必須 / 任意
+
+**任意 (optional)**。省略時（`null`）は明示的な上限が掛からず、モデルが許容する最大出力トークン数まで生成される。
+
+## 3. 型と範囲
+
+| 項目 | 値 |
+|---|---|
+| 型 | `integer`（または `null`） |
+| 最小値 | `1` |
+| 最大値 | モデルごとの最大出力トークン数（例: gpt-4.1 系は 32,768） |
+| デフォルト | `null`（上限なし = モデルの最大出力トークン数まで） |
+
+上限に達した場合、レスポンスの `status` が `"incomplete"` になり、`incomplete_details.reason` が `"max_output_tokens"` にセットされる。
+
+## 4. Reasoning tokens との関係 / Reasoning モデルとの組合せ制約
+
+`max_output_tokens` は **reasoning tokens を含む** 合計生成トークン数の上限として機能する。
+公式ガイド (https://platform.openai.com/docs/guides/reasoning) には以下の記述がある:
+
+> "You can limit the total number of tokens the model generates (including both reasoning and final output tokens) by using the max_output_tokens parameter."
+
+**実用上の注意点:**
+- モデルが内部思考に多数の reasoning tokens を消費した後に上限に達すると、visible output が一切返らずに打ち切られる場合がある。
+- コスト制御目的には `reasoning.effort` (`"low"` など) の使用が推奨される。`max_output_tokens` はあくまで暴走抑止のガードとして位置づける。
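+- 公式 OpenAI Responses API の `reasoning` オブジェクトには reasoning 専用のトークン上限は用意されていない（指定できるのは `effort` / `summary` など）。`reasoning.max_tokens` は OpenRouter など一部の互換 API が提供する別パラメータであり、公式 API にそのまま送れるとは限らない点に注意。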
+
+## 5. ChatGPT backend (`https://chatgpt.com/backend-api/codex/responses`) における取り扱い
+
+このエンドポイントは公式 Responses API のサブセットのみをサポートする Codex CLI 互換 backend であり、`max_output_tokens` を **サポートしないパラメータとして 400 エラーで拒否する**。
+
+LiteLLM の調査 (https://github.com/BerriAI/litellm/issues/21193) によれば、ChatGPT Codex backend が受け付けるパラメータは以下に限られる:
+
+```
+model, input, instructions, stream, store, include,
+tools, tool_choice, reasoning, previous_response_id, truncation
+```
+
+`max_output_tokens`, `max_tokens`, `max_completion_tokens`, `temperature`, `user`, `metadata`, `context_management` はすべて拒否される。
+
+Codex CLI 自身も `config.toml` の `model_max_output_tokens` を API リクエストに載せない実装になっており (https://github.com/openai/codex/issues/4138)、これはバグではなく ChatGPT backend の制約に対する回避策と解釈できる。
+
+## 6. ドキュメント URL
+
+- 公式 API リファレンス: https://platform.openai.com/docs/api-reference/responses/create
+- Reasoning ガイド: https://platform.openai.com/docs/guides/reasoning
+- Codex CLI issue (max_output_tokens 未送信): https://github.com/openai/codex/issues/4138
+- LiteLLM issue (ChatGPT backend 拒否パラメータ一覧): https://github.com/BerriAI/litellm/issues/21193
+- OpenAI Community (reasoning tokens 上限): https://community.openai.com/t/limiting-maximum-number-of-reasoning-tokens/1285430
--- a/resources/prompts/default.md
+++ b/resources/prompts/default.md
@ -1,5 +1,7 @@
 You are here as an agent of the "insomnia system".

+Stay precise, edit code directly when asked, and avoid speculative refactoring.
+
 {% include "common/workspace" %}

 {% include "common/tool-usage" %}