Rebase PR #784 and fix OpenAI model/chat regressions (#1384)

* Wire conversation history through OpenAI responses API

* Fix RuboCop hash brace spacing in assistant tests

* Pipelock ignores

* Batch fixes

---------

Co-authored-by: sokiee <sokysrm@gmail.com>
This commit is contained in:
Juan José Mata
2026-04-15 18:45:24 +02:00
committed by GitHub
parent 53ea0375db
commit 7b2b1dd367
24 changed files with 937 additions and 90 deletions

View File

@@ -235,6 +235,61 @@ class Settings::HostingsControllerTest < ActionDispatch::IntegrationTest
end
end
# Valid integer overrides persist to Setting; submitting blanks clears each
# budget back to nil.
test "accepts valid llm budget overrides and blanks clear them" do
  with_self_hosting do
    patch settings_hosting_url, params: {
      setting: {
        llm_context_window: "4096",
        llm_max_response_tokens: "1024",
        llm_max_items_per_call: "40"
      }
    }
    assert_redirected_to settings_hosting_url
    assert_equal 4096, Setting.llm_context_window
    assert_equal 1024, Setting.llm_max_response_tokens
    assert_equal 40, Setting.llm_max_items_per_call

    # Blank strings act as "clear this override".
    patch settings_hosting_url, params: {
      setting: {
        llm_context_window: "",
        llm_max_response_tokens: "",
        llm_max_items_per_call: ""
      }
    }
    assert_nil Setting.llm_context_window
    assert_nil Setting.llm_max_response_tokens
    assert_nil Setting.llm_max_items_per_call
  end
ensure
  # Setting is process-global state; always reset so other tests see defaults.
  Setting.llm_context_window = nil
  Setting.llm_max_response_tokens = nil
  Setting.llm_max_items_per_call = nil
end
# Zero, negative, and non-numeric values are all rejected with the same
# validation message, and the corresponding Setting stays untouched.
test "rejects llm budget below field minimum" do
  with_self_hosting do
    {
      llm_context_window: "0",
      llm_max_response_tokens: "-5",
      llm_max_items_per_call: "not-a-number"
    }.each do |field, invalid_value|
      patch settings_hosting_url, params: { setting: { field => invalid_value } }
      assert_response :unprocessable_entity
      assert_match(/must be a whole number/, flash[:alert])
      assert_nil Setting.public_send(field)
    end
  end
ensure
  # Reset global settings even if an assertion above fails.
  Setting.llm_context_window = nil
  Setting.llm_max_response_tokens = nil
  Setting.llm_max_items_per_call = nil
end
test "can clear data only when admin" do
with_self_hosting do
sign_in users(:family_member)

View File

@@ -0,0 +1,93 @@
require "test_helper"
# Tests for Assistant::HistoryTrimmer: trims conversation history to a token
# budget by dropping the oldest turns first, while never separating an
# assistant tool_call message from its tool result(s).
class Assistant::HistoryTrimmerTest < ActiveSupport::TestCase
  test "returns empty array for empty input" do
    [ [], nil ].each do |blank_history|
      assert_equal [], Assistant::HistoryTrimmer.new(blank_history, max_tokens: 1000).call
    end
  end

  test "returns empty array when max_tokens is zero or negative" do
    history = [ { role: "user", content: "hi" } ]

    [ 0, -10 ].each do |budget|
      assert_equal [], Assistant::HistoryTrimmer.new(history, max_tokens: budget).call
    end
  end

  test "keeps full history when budget is generous" do
    history = [
      { role: "user", content: "a" * 40 },
      { role: "assistant", content: "b" * 40 },
      { role: "user", content: "c" * 40 }
    ]

    assert_equal history, Assistant::HistoryTrimmer.new(history, max_tokens: 10_000).call
  end

  test "drops oldest messages when budget is tight" do
    history = [
      { role: "user", content: "a" * 100 },
      { role: "assistant", content: "b" * 100 },
      { role: "user", content: "c" * 100 }
    ]

    # ~40 tokens per message; a 60-token budget has room for only the newest.
    trimmed = Assistant::HistoryTrimmer.new(history, max_tokens: 60).call

    assert_equal 1, trimmed.size
    assert_equal "c" * 100, trimmed.first[:content]
  end

  test "keeps tool_call and tool_result as an atomic pair" do
    history = [
      { role: "user", content: "a" * 40 },
      { role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] },
      { role: "tool", tool_call_id: "c1", name: "f", content: "result" },
      { role: "assistant", content: "follow up" }
    ]

    # Small budget: should keep the newest assistant follow-up. Pair is either
    # kept together or dropped together — never split.
    trimmed = Assistant::HistoryTrimmer.new(history, max_tokens: 20).call

    call_position = trimmed.index { |m| m[:role] == "assistant" && m[:tool_calls] }
    result_position = trimmed.index { |m| m[:role] == "tool" }

    if call_position
      assert_not_nil result_position, "tool_call without paired tool_result"
      assert result_position > call_position, "tool_result must follow its tool_call"
    else
      assert_nil result_position, "tool_result without paired tool_call leaked through"
    end
  end

  test "never splits tool_call + tool_result pair when dropping" do
    history = [
      { role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] },
      { role: "tool", tool_call_id: "c1", name: "f", content: "x" * 200 },
      { role: "user", content: "y" * 20 }
    ]

    # Budget big enough for the user turn, not for the pair
    trimmed = Assistant::HistoryTrimmer.new(history, max_tokens: 15).call

    assert_equal 1, trimmed.size
    assert_equal "user", trimmed.first[:role]
  end

  test "handles multiple tool results following a single assistant tool_calls message" do
    history = [
      { role: "user", content: "hi" },
      { role: "assistant", content: "", tool_calls: [
        { id: "c1", type: "function", function: { name: "f1", arguments: "{}" } },
        { id: "c2", type: "function", function: { name: "f2", arguments: "{}" } }
      ] },
      { role: "tool", tool_call_id: "c1", name: "f1", content: "r1" },
      { role: "tool", tool_call_id: "c2", name: "f2", content: "r2" }
    ]

    # Generous budget: everything survives, including both tool results.
    assert_equal history, Assistant::HistoryTrimmer.new(history, max_tokens: 10_000).call
  end
end

View File

@@ -0,0 +1,40 @@
require "test_helper"
# Tests for Assistant::TokenEstimator — a chars/4 heuristic with a 1.25x
# safety factor; hashes are measured via their JSON serialization.
class Assistant::TokenEstimatorTest < ActiveSupport::TestCase
  test "estimate is zero for nil or empty" do
    [ nil, "", [] ].each do |blank|
      assert_equal 0, Assistant::TokenEstimator.estimate(blank)
    end
  end

  test "estimate applies chars/4 with safety factor for strings" do
    # 100 chars / 4 = 25 tokens, × 1.25 safety factor, ceil = 32
    assert_equal 32, Assistant::TokenEstimator.estimate("a" * 100)
  end

  test "estimate sums across arrays" do
    # Use lengths that avoid ceil rounding drift: 80 chars → 20 tokens → 25.0 ceil → 25.
    per_string = Assistant::TokenEstimator.estimate("a" * 80)
    combined = Assistant::TokenEstimator.estimate([ "a" * 80, "a" * 80 ])

    assert_equal per_string * 2, combined
  end

  test "estimate serializes hashes via JSON" do
    # {"role":"assistant","content":"hi"} => 35 chars / 4 × 1.25 ceil = 11
    assert_equal 11, Assistant::TokenEstimator.estimate({ role: "assistant", content: "hi" })
  end

  test "estimate handles nested structures" do
    nested = [ { role: "user", content: "hello" }, { role: "assistant", content: "world" } ]

    # Each hash gets JSON-serialized and summed
    per_entry_total = nested.sum { |entry| Assistant::TokenEstimator.estimate(entry) }
    assert_equal per_entry_total, Assistant::TokenEstimator.estimate(nested)
  end

  test "estimate coerces unknown types via to_s" do
    assert_operator Assistant::TokenEstimator.estimate(12345), :>, 0
    assert_operator Assistant::TokenEstimator.estimate(:symbol), :>, 0
  end
end

View File

@@ -14,6 +14,10 @@ class AssistantTest < ActiveSupport::TestCase
@provider = mock
@expected_session_id = @chat.id.to_s
@expected_user_identifier = ::Digest::SHA256.hexdigest(@chat.user_id.to_s)
@expected_conversation_history = [
{ role: "user", content: "Can you help me understand my spending habits?" },
{ role: "user", content: "What is my net worth?" }
]
end
test "errors get added to chat" do
@@ -100,6 +104,7 @@ class AssistantTest < ActiveSupport::TestCase
@provider.expects(:chat_response).with do |message, **options|
assert_equal @expected_session_id, options[:session_id]
assert_equal @expected_user_identifier, options[:user_identifier]
assert_equal @expected_conversation_history, options[:messages]
text_chunks.each do |text_chunk|
options[:streamer].call(text_chunk)
end
@@ -154,6 +159,7 @@ class AssistantTest < ActiveSupport::TestCase
@provider.expects(:chat_response).with do |message, **options|
assert_equal @expected_session_id, options[:session_id]
assert_equal @expected_user_identifier, options[:user_identifier]
assert_equal @expected_conversation_history, options[:messages]
call2_text_chunks.each do |text_chunk|
options[:streamer].call(text_chunk)
end
@@ -165,6 +171,7 @@ class AssistantTest < ActiveSupport::TestCase
@provider.expects(:chat_response).with do |message, **options|
assert_equal @expected_session_id, options[:session_id]
assert_equal @expected_user_identifier, options[:user_identifier]
assert_equal @expected_conversation_history, options[:messages]
options[:streamer].call(call1_response_chunk)
true
end.returns(call1_response).once.in_sequence(sequence)
@@ -418,6 +425,95 @@ class AssistantTest < ActiveSupport::TestCase
assert_raises(Assistant::Error) { Assistant.for_chat(nil) }
end
# A provider failure after some text has already streamed must not strand the
# half-written assistant message: it is persisted with status "failed" and
# whatever content arrived before the error.
test "builtin demotes a partially-streamed assistant message to failed on error" do
  @assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)

  mid_stream_error = StandardError.new("boom mid-stream")
  @provider.expects(:chat_response).with do |_prompt, **options|
    # Simulate a partial text chunk landing before the error propagates.
    options[:streamer].call(provider_text_chunk("partial tokens "))
    true
  end.returns(provider_error_response(mid_stream_error))

  @assistant.respond_to(@message)

  partial_message = @chat.messages.where(type: "AssistantMessage").order(:created_at).last
  assert partial_message.present?, "partial assistant message should be persisted"
  assert_equal "failed", partial_message.status
  assert_equal "partial tokens ", partial_message.content
end
# Only completed turns belong in the history sent to the provider; a failed
# assistant message must never be replayed as context.
test "conversation_history excludes failed and pending messages" do
# Add a failed assistant turn; it must NOT leak into history.
AssistantMessage.create!(
chat: @chat,
content: "partial error response",
ai_model: "gpt-4.1",
status: "failed"
)
@assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)
# Capture the messages option the assistant hands to the provider so it can
# be inspected after the respond cycle finishes.
captured_history = nil
@provider.expects(:chat_response).with do |_prompt, **options|
captured_history = options[:messages]
# Stream one successful text chunk so the response completes normally.
options[:streamer].call(
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: [])
)
true
end.returns(provider_success_response(
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data
))
@assistant.respond_to(@message)
# The failed turn's content must not appear anywhere in the captured history.
contents = captured_history.map { |m| m[:content] }
assert_not_includes contents, "partial error response"
end
# A completed assistant turn with a ToolCall::Function must serialize into
# history as an assistant message carrying tool_calls, immediately paired with
# a role:"tool" result message keyed by the same call id.
test "conversation_history serializes assistant tool_calls with paired tool result" do
assistant_msg = AssistantMessage.create!(
chat: @chat,
content: "Looking that up",
ai_model: "gpt-4.1",
status: "complete"
)
ToolCall::Function.create!(
message: assistant_msg,
provider_id: "call_abc",
provider_call_id: "call_abc",
function_name: "get_net_worth",
function_arguments: { foo: "bar" },
function_result: { amount: 1000, currency: "USD" }
)
@assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)
# Capture the history passed to the provider for inspection below.
captured_history = nil
@provider.expects(:chat_response).with do |_prompt, **options|
captured_history = options[:messages]
# Stream one successful text chunk so the respond cycle completes normally.
options[:streamer].call(
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: [])
)
true
end.returns(provider_success_response(
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data
))
@assistant.respond_to(@message)
# Locate the serialized pair: the assistant entry with tool_calls, and the
# tool entry carrying the function result.
tool_call_entry = captured_history.find { |m| m[:role] == "assistant" && m[:tool_calls].present? }
tool_result_entry = captured_history.find { |m| m[:role] == "tool" }
assert_not_nil tool_call_entry, "tool_call message missing from history"
assert_not_nil tool_result_entry, "tool_result message missing from history"
# The result must reference the same provider call id and function name.
assert_equal "call_abc", tool_call_entry[:tool_calls].first[:id]
assert_equal "call_abc", tool_result_entry[:tool_call_id]
assert_equal "get_net_worth", tool_result_entry[:name]
end
private
def mock_external_sse_response(sse_body)

View File

@@ -0,0 +1,74 @@
require "test_helper"
# Tests for Provider::Openai::BatchSlicer, which partitions a list of items so
# every batch respects both a per-call item cap and an estimated token budget
# (optionally reduced by a fixed-token overhead).
class Provider::Openai::BatchSlicerTest < ActiveSupport::TestCase
  test "returns empty array for empty input" do
    assert_equal [], Provider::Openai::BatchSlicer.call([], max_items: 25, max_tokens: 2000)
  end

  test "single batch when within item cap and token budget" do
    items = Array.new(10) { |i| { id: i, name: "Item#{i}" } }

    batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 10_000)

    # Everything fits, so a single batch with the original items comes back.
    assert_equal 1, batches.size
    assert_equal items, batches.first
  end

  test "splits on item cap" do
    items = Array.new(30) { |i| { id: i } }

    batches = Provider::Openai::BatchSlicer.call(items, max_items: 10, max_tokens: 100_000)

    assert_equal 3, batches.size
    assert_equal 10, batches.first.size
    assert_equal 10, batches.last.size
    # Nothing lost or reordered across the split.
    assert_equal items, batches.flatten
  end

  test "splits on token budget" do
    heavy_item = { payload: "x" * 200 } # ~63 tokens each
    items = Array.new(10) { heavy_item }

    # ~125-token budget fits 2 items per batch
    batches = Provider::Openai::BatchSlicer.call(items, max_items: 100, max_tokens: 125)

    assert_operator batches.size, :>, 1
    assert_equal items, batches.flatten
    batches.each do |batch|
      estimated = Assistant::TokenEstimator.estimate(batch)
      assert estimated <= 125, "Batch exceeded token budget: #{estimated}"
    end
  end

  test "raises ContextOverflowError when single item exceeds budget" do
    oversized = { payload: "x" * 10_000 }

    assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do
      Provider::Openai::BatchSlicer.call([ oversized ], max_items: 25, max_tokens: 500)
    end
  end

  test "raises ContextOverflowError when fixed_tokens exceed budget" do
    assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do
      Provider::Openai::BatchSlicer.call([ { a: 1 } ], max_items: 25, max_tokens: 500, fixed_tokens: 600)
    end
  end

  test "respects fixed_tokens when computing available budget" do
    items = Array.new(5) { { payload: "x" * 80 } } # ~25 tokens each

    # Budget of 200 minus fixed 100 = 100 available; fits ~4 items per batch
    batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 200, fixed_tokens: 100)

    assert_operator batches.size, :>=, 2
    assert_equal items, batches.flatten
  end

  test "never produces empty batches" do
    items = Array.new(7) { { id: SecureRandom.hex(4) } }

    Provider::Openai::BatchSlicer.call(items, max_items: 2, max_tokens: 10_000).each do |batch|
      assert batch.any?, "Empty batch produced"
    end
  end
end

View File

@@ -330,4 +330,130 @@ class Provider::OpenaiTest < ActiveSupport::TestCase
@subject.send(:create_langfuse_trace, name: "openai.test", input: { foo: "bar" })
end
# Model metadata must live in frozen Ruby constants so it is available without
# loading or parsing any external YAML at runtime.
test "SUPPORTED_MODELS and VISION_CAPABLE_MODEL_PREFIXES are Ruby constants, not YAML-derived" do
  supported = Provider::Openai::SUPPORTED_MODELS
  assert_kind_of Array, supported
  assert supported.all? { |model| model.is_a?(String) }
  assert supported.frozen?

  vision_prefixes = Provider::Openai::VISION_CAPABLE_MODEL_PREFIXES
  assert_kind_of Array, vision_prefixes
  assert vision_prefixes.frozen?

  assert_equal "gpt-4.1", Provider::Openai::DEFAULT_MODEL
end
# With every LLM_* variable unset, the provider falls back to small,
# conservative token budgets.
test "budget readers default to conservative values" do
  unset_env = {
    "LLM_CONTEXT_WINDOW" => nil,
    "LLM_MAX_RESPONSE_TOKENS" => nil,
    "LLM_SYSTEM_PROMPT_RESERVE" => nil,
    "LLM_MAX_HISTORY_TOKENS" => nil,
    "LLM_MAX_ITEMS_PER_CALL" => nil
  }

  with_env_overrides(unset_env) do
    provider = Provider::Openai.new("test-token")

    assert_equal 2048, provider.context_window
    assert_equal 512, provider.max_response_tokens
    assert_equal 256, provider.system_prompt_reserve
    # History/input budgets derive from window minus response + reserve:
    # 2048 - 512 - 256 = 1280.
    assert_equal 1280, provider.max_history_tokens
    assert_equal 1280, provider.max_input_tokens
    assert_equal 25, provider.max_items_per_call
  end
end
# Each budget reader honors its matching LLM_* environment variable.
test "budget readers respect explicit env overrides" do
  env = {
    "LLM_CONTEXT_WINDOW" => "8192",
    "LLM_MAX_RESPONSE_TOKENS" => "1024",
    "LLM_SYSTEM_PROMPT_RESERVE" => "512",
    "LLM_MAX_HISTORY_TOKENS" => "4096",
    "LLM_MAX_ITEMS_PER_CALL" => "50"
  }

  with_env_overrides(env) do
    provider = Provider::Openai.new("test-token")

    assert_equal 8192, provider.context_window
    assert_equal 1024, provider.max_response_tokens
    assert_equal 512, provider.system_prompt_reserve
    assert_equal 4096, provider.max_history_tokens # explicit overrides derived
    assert_equal 8192 - 1024 - 512, provider.max_input_tokens
    assert_equal 50, provider.max_items_per_call
  end
end
# When the ENV variables are absent, budgets are sourced from the persisted
# Setting values instead.
test "budget readers fall back to Setting when ENV unset" do
  with_env_overrides(
    "LLM_CONTEXT_WINDOW" => nil,
    "LLM_MAX_RESPONSE_TOKENS" => nil,
    "LLM_MAX_ITEMS_PER_CALL" => nil
  ) do
    Setting.llm_context_window = 8192
    Setting.llm_max_response_tokens = 1024
    Setting.llm_max_items_per_call = 40

    provider = Provider::Openai.new("test-token")

    assert_equal 8192, provider.context_window
    assert_equal 1024, provider.max_response_tokens
    assert_equal 40, provider.max_items_per_call
  end
ensure
  # Reset global settings so subsequent tests see the defaults.
  Setting.llm_context_window = nil
  Setting.llm_max_response_tokens = nil
  Setting.llm_max_items_per_call = nil
end
# Precedence check: an explicit ENV value wins over a persisted Setting.
test "budget readers: ENV beats Setting when both present" do
  with_env_overrides("LLM_CONTEXT_WINDOW" => "16384") do
    Setting.llm_context_window = 4096

    provider = Provider::Openai.new("test-token")
    assert_equal 16384, provider.context_window
  end
ensure
  Setting.llm_context_window = nil
end
# Non-positive configured values are treated as unset, so the conservative
# default applies rather than a zero-sized context window.
test "budget readers: zero or negative values fall through to default" do
  env = {
    "LLM_CONTEXT_WINDOW" => "0",
    "LLM_MAX_RESPONSE_TOKENS" => nil,
    "LLM_MAX_ITEMS_PER_CALL" => nil
  }

  with_env_overrides(env) do
    Setting.llm_context_window = 0

    provider = Provider::Openai.new("test-token")
    assert_equal 2048, provider.context_window
  end
ensure
  Setting.llm_context_window = nil
end
# auto_categorize must slice its input to the configured per-call item cap and
# issue one AutoCategorizer call per slice, in order.
test "auto_categorize fans out oversized batches into sequential sub-calls" do
# Cap sub-calls at 10 items, so 25 transactions require three batches.
with_env_overrides("LLM_MAX_ITEMS_PER_CALL" => "10") do
subject = Provider::Openai.new("test-token")
transactions = Array.new(25) { |i| { id: i.to_s, name: "txn#{i}", amount: 10, classification: "expense" } }
user_categories = [ { id: "cat1", name: "Groceries", is_subcategory: false, parent_id: nil, classification: "expense" } ]
# Capture the batch size passed to each AutoCategorizer. `.new` is called
# once per sub-batch; we record each invocation's transactions count.
seen_sizes = []
fake_instance = mock
fake_instance.stubs(:auto_categorize).returns([])
Provider::Openai::AutoCategorizer.stubs(:new).with do |*_args, **kwargs|
seen_sizes << kwargs[:transactions].size
true
end.returns(fake_instance)
response = subject.auto_categorize(transactions: transactions, user_categories: user_categories)
assert response.success?
# 25 items at a 10-item cap → batches of 10, 10, and 5, in submission order.
assert_equal [ 10, 10, 5 ], seen_sizes
end
end
# The inline `messages:` history parameter was removed from ChatConfig's
# build_input; passing it must now raise rather than be silently merged.
test "build_input no longer accepts inline messages history" do
  config = Provider::Openai::ChatConfig.new(functions: [], function_results: [])

  # Positive control: a bare prompt still produces a single-turn input.
  assert_equal [ { role: "user", content: "hi" } ], config.build_input(prompt: "hi")

  # `messages:` kwarg is no longer part of the signature — calling with it must raise
  assert_raises(ArgumentError) do
    config.build_input(prompt: "hi", messages: [ { role: "user", content: "old" } ])
  end
end
end