Mirror of https://github.com/we-promise/sure, synced 2026-04-25 17:15:07 +02:00
* Wire conversation history through OpenAI responses API
* Fix RuboCop hash brace spacing in assistant tests
* Pipelock ignores
* Batch fixes

Co-authored-by: sokiee <sokysrm@gmail.com>
23 .env.example
@@ -25,19 +25,34 @@ OPENAI_ACCESS_TOKEN=
|
||||
OPENAI_MODEL=
|
||||
OPENAI_URI_BASE=
|
||||
|
||||
# Optional: LLM token budget (applies to chat, auto-categorize, merchant detection, PDF processing).
|
||||
# Lower these for small-context local models (Ollama, LM Studio, LocalAI).
|
||||
# Defaults work for modern cloud OpenAI models without configuration.
|
||||
# LLM_CONTEXT_WINDOW=2048
|
||||
# LLM_MAX_RESPONSE_TOKENS=512
|
||||
# LLM_MAX_HISTORY_TOKENS=
|
||||
# LLM_SYSTEM_PROMPT_RESERVE=256
|
||||
# LLM_MAX_ITEMS_PER_CALL=25
|
||||
|
||||
# Optional: OpenAI-compatible capability flags
|
||||
# OPENAI_REQUEST_TIMEOUT=60 # HTTP timeout in seconds; raise for slow local models
|
||||
# OPENAI_SUPPORTS_PDF_PROCESSING=true # Set to false for endpoints without vision support
|
||||
# OPENAI_SUPPORTS_RESPONSES_ENDPOINT= # Override Responses-API vs chat.completions routing
|
||||
# LLM_JSON_MODE= # auto | strict | json_object | none
|
||||
|
||||
# Optional: External AI Assistant — delegates chat to a remote AI agent
|
||||
# instead of calling LLMs directly. The agent calls back to Sure's /mcp endpoint.
|
||||
# See docs/hosting/ai.md for full details.
|
||||
# ASSISTANT_TYPE=external
|
||||
# EXTERNAL_ASSISTANT_URL=https://your-agent-host/v1/chat/completions
|
||||
# EXTERNAL_ASSISTANT_TOKEN=your-api-token
|
||||
# EXTERNAL_ASSISTANT_TOKEN=your-api-token # pipelock:ignore
|
||||
# EXTERNAL_ASSISTANT_AGENT_ID=main
|
||||
# EXTERNAL_ASSISTANT_SESSION_KEY=agent:main:main
|
||||
# EXTERNAL_ASSISTANT_ALLOWED_EMAILS=user@example.com
|
||||
|
||||
# Optional: MCP server endpoint — enables /mcp for external AI assistants.
|
||||
# Both values are required. MCP_USER_EMAIL must match an existing user's email.
|
||||
# MCP_API_TOKEN=your-random-bearer-token
|
||||
# MCP_API_TOKEN=your-random-bearer-token # pipelock:ignore
|
||||
# MCP_USER_EMAIL=user@example.com
|
||||
|
||||
# Optional: Langfuse config
|
||||
@@ -82,7 +97,7 @@ EMAIL_SENDER=
|
||||
# Database Configuration
|
||||
DB_HOST=localhost # May need to be changed to `DB_HOST=db` if using devcontainer
|
||||
DB_PORT=5432
|
||||
POSTGRES_PASSWORD=postgres
|
||||
POSTGRES_PASSWORD=postgres # pipelock:ignore
|
||||
POSTGRES_USER=postgres
|
||||
|
||||
# Redis configuration
|
||||
@@ -94,7 +109,7 @@ REDIS_URL=redis://localhost:6379/1
|
||||
# REDIS_SENTINEL_HOSTS=sentinel1:26379,sentinel2:26379,sentinel3:26379
|
||||
# REDIS_SENTINEL_MASTER=mymaster
|
||||
# REDIS_SENTINEL_USERNAME=default
|
||||
# REDIS_PASSWORD=your-redis-password
|
||||
# REDIS_PASSWORD=your-redis-password # pipelock:ignore
|
||||
|
||||
# App Domain
|
||||
# This is the domain that your Sure instance will be hosted at. It is used to generate links in emails and other places.
|
||||
|
||||
@@ -28,8 +28,22 @@ TWELVE_DATA_API_KEY =
|
||||
OPENAI_ACCESS_TOKEN =
|
||||
OPENAI_URI_BASE =
|
||||
OPENAI_MODEL =
|
||||
# OPENAI_REQUEST_TIMEOUT: Request timeout in seconds (default: 60)
|
||||
# OPENAI_SUPPORTS_PDF_PROCESSING: Set to false for endpoints without vision support (default: true)
|
||||
|
||||
# LLM token budget. Applies to ALL outbound LLM calls: chat history,
|
||||
# auto-categorize, merchant detection, provider enhancer, PDF processing.
|
||||
# Defaults to Ollama's historical 2048-token baseline so small local models
|
||||
# work out of the box — raise explicitly for cloud or larger-context models.
|
||||
# LLM_CONTEXT_WINDOW = 2048 # Total tokens the model will accept
|
||||
# LLM_MAX_RESPONSE_TOKENS = 512 # Reserved for the model's reply
|
||||
# LLM_MAX_HISTORY_TOKENS = # Derived if unset (context - response - system_reserve)
|
||||
# LLM_SYSTEM_PROMPT_RESERVE = 256 # Tokens reserved for the system prompt
|
||||
# LLM_MAX_ITEMS_PER_CALL = 25 # Upper bound on auto-categorize / merchant batches
|
||||
|
||||
# OpenAI-compatible capability flags (custom/self-hosted providers)
|
||||
# OPENAI_REQUEST_TIMEOUT = 60 # HTTP timeout in seconds; raise for slow local models
|
||||
# OPENAI_SUPPORTS_PDF_PROCESSING = true # Set to false for endpoints without vision support
|
||||
# OPENAI_SUPPORTS_RESPONSES_ENDPOINT = # true to force Responses API on custom providers
|
||||
# LLM_JSON_MODE = # auto | strict | json_object | none
|
||||
|
||||
# (example: LM Studio/Docker config) OpenAI-compatible API endpoint config
|
||||
# OPENAI_URI_BASE = http://host.docker.internal:1234/
|
||||
|
||||
@@ -1,6 +1,15 @@
|
||||
class Settings::HostingsController < ApplicationController
|
||||
layout "settings"
|
||||
|
||||
# Minimum accepted value for each configurable LLM budget field. Mirrors the
|
||||
# `min:` attribute on the form inputs in `_openai_settings.html.erb` so the
|
||||
# controller rejects what the browser-side validator would reject.
|
||||
LLM_BUDGET_MINIMUMS = {
|
||||
llm_context_window: 256,
|
||||
llm_max_response_tokens: 64,
|
||||
llm_max_items_per_call: 1
|
||||
}.freeze
|
||||
|
||||
guard_feature unless: -> { self_hosted? }
|
||||
|
||||
before_action :ensure_admin, only: [ :update, :clear_cache, :disconnect_external_assistant ]
|
||||
@@ -157,6 +166,21 @@ class Settings::HostingsController < ApplicationController
|
||||
Setting.openai_json_mode = hosting_params[:openai_json_mode].presence
|
||||
end
|
||||
|
||||
LLM_BUDGET_MINIMUMS.each do |key, minimum|
|
||||
next unless hosting_params.key?(key)
|
||||
raw = hosting_params[key].to_s.strip
|
||||
if raw.blank?
|
||||
Setting.public_send("#{key}=", nil)
|
||||
next
|
||||
end
|
||||
parsed = Integer(raw, 10) rescue nil
|
||||
if parsed.nil? || parsed < minimum
|
||||
label = t("settings.hostings.openai_settings.#{key}_label")
|
||||
raise Setting::ValidationError, t(".invalid_llm_budget", field: label, minimum: minimum)
|
||||
end
|
||||
Setting.public_send("#{key}=", parsed)
|
||||
end
|
||||
|
||||
if hosting_params.key?(:external_assistant_url)
|
||||
Setting.external_assistant_url = hosting_params[:external_assistant_url]
|
||||
end
|
||||
@@ -199,7 +223,7 @@ class Settings::HostingsController < ApplicationController
|
||||
private
|
||||
def hosting_params
|
||||
return ActionController::Parameters.new unless params.key?(:setting)
|
||||
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: [])
|
||||
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :invite_only_default_family_id, :brand_fetch_client_id, :brand_fetch_high_res_logos, :twelve_data_api_key, :tiingo_api_key, :eodhd_api_key, :alpha_vantage_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :llm_context_window, :llm_max_response_tokens, :llm_max_items_per_call, :exchange_rate_provider, :securities_provider, :syncs_include_pending, :auto_sync_enabled, :auto_sync_time, :external_assistant_url, :external_assistant_token, :external_assistant_agent_id, securities_providers: [])
|
||||
end
|
||||
|
||||
def update_assistant_type
|
||||
|
||||
@@ -63,6 +63,13 @@ class Assistant::Builtin < Assistant::Base
|
||||
responder.respond(previous_response_id: latest_response_id)
|
||||
rescue => e
|
||||
stop_thinking
|
||||
# If we streamed any partial content before the error, the message was
|
||||
# persisted with the default `complete` status. Demote it to `failed` so
|
||||
# `Assistant::Responder#conversation_history` won't feed a broken turn
|
||||
# back into future prompts.
|
||||
if assistant_message&.persisted?
|
||||
assistant_message.update_columns(status: "failed")
|
||||
end
|
||||
chat.add_error(e)
|
||||
end
|
||||
|
||||
|
||||
53 app/models/assistant/history_trimmer.rb Normal file
@@ -0,0 +1,53 @@
|
||||
class Assistant::HistoryTrimmer
|
||||
def initialize(messages, max_tokens:)
|
||||
@messages = messages || []
|
||||
@max_tokens = max_tokens.to_i
|
||||
end
|
||||
|
||||
def call
|
||||
return [] if @messages.empty? || @max_tokens <= 0
|
||||
|
||||
kept = []
|
||||
tokens = 0
|
||||
|
||||
group_tool_pairs(@messages).reverse_each do |group|
|
||||
group_tokens = Assistant::TokenEstimator.estimate(group)
|
||||
break if tokens + group_tokens > @max_tokens
|
||||
|
||||
kept.unshift(*group)
|
||||
tokens += group_tokens
|
||||
end
|
||||
|
||||
kept
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Bundles each assistant message that has `tool_calls` with the
|
||||
# consecutive `role: "tool"` results that follow it, so the trimmer
|
||||
# never splits a call/result pair when dropping from the oldest end.
|
||||
def group_tool_pairs(messages)
|
||||
groups = []
|
||||
current_group = nil
|
||||
|
||||
messages.each do |msg|
|
||||
if assistant_with_tool_calls?(msg)
|
||||
groups << current_group if current_group
|
||||
current_group = [ msg ]
|
||||
elsif msg[:role].to_s == "tool" && current_group
|
||||
current_group << msg
|
||||
else
|
||||
groups << current_group if current_group
|
||||
current_group = nil
|
||||
groups << [ msg ]
|
||||
end
|
||||
end
|
||||
|
||||
groups << current_group if current_group
|
||||
groups
|
||||
end
|
||||
|
||||
def assistant_with_tool_calls?(msg)
|
||||
msg[:role].to_s == "assistant" && msg[:tool_calls].is_a?(Array) && msg[:tool_calls].any?
|
||||
end
|
||||
end
|
||||
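For orientation, here is a minimal sketch of the trimmer under a tight budget, assuming the class above and Assistant::TokenEstimator are loaded (e.g. in a Rails console); the messages and the token budget are made up for illustration:

history = [
  { role: "user", content: "a" * 400 },  # oldest turn, the first candidate to drop
  { role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "get_net_worth", arguments: "{}" } } ] },
  { role: "tool", tool_call_id: "c1", name: "get_net_worth", content: '{"amount":1000}' },
  { role: "assistant", content: "Your net worth is $1,000." }  # newest turn, kept first
]

Assistant::HistoryTrimmer.new(history, max_tokens: 150).call
# Keeps the newest groups that fit the budget; the c1 tool_call and its tool
# result are kept together or dropped together, never split.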
@@ -79,6 +79,7 @@ class Assistant::Responder
|
||||
instructions: instructions,
|
||||
functions: function_tool_caller.function_definitions,
|
||||
function_results: function_results,
|
||||
messages: conversation_history,
|
||||
streamer: streamer,
|
||||
previous_response_id: previous_response_id,
|
||||
session_id: chat_session_id,
|
||||
@@ -114,4 +115,46 @@ class Assistant::Responder
|
||||
def chat
|
||||
@chat ||= message.chat
|
||||
end
|
||||
|
||||
def conversation_history
|
||||
messages = []
|
||||
return messages unless chat&.messages
|
||||
|
||||
chat.messages
|
||||
.where(type: [ "UserMessage", "AssistantMessage" ], status: "complete")
|
||||
.includes(:tool_calls)
|
||||
.ordered
|
||||
.each do |chat_message|
|
||||
if chat_message.tool_calls.any?
|
||||
messages << {
|
||||
role: chat_message.role,
|
||||
content: chat_message.content || "",
|
||||
tool_calls: chat_message.tool_calls.map(&:to_tool_call)
|
||||
}
|
||||
|
||||
chat_message.tool_calls.map(&:to_result).each do |fn_result|
|
||||
# Handle nil explicitly to avoid serializing to "null"
|
||||
output = fn_result[:output]
|
||||
content = if output.nil?
|
||||
""
|
||||
elsif output.is_a?(String)
|
||||
output
|
||||
else
|
||||
output.to_json
|
||||
end
|
||||
|
||||
messages << {
|
||||
role: "tool",
|
||||
tool_call_id: fn_result[:call_id],
|
||||
name: fn_result[:name],
|
||||
content: content
|
||||
}
|
||||
end
|
||||
|
||||
elsif !chat_message.content.blank?
|
||||
messages << { role: chat_message.role, content: chat_message.content || "" }
|
||||
end
|
||||
end
|
||||
messages
|
||||
end
|
||||
end
|
||||
|
||||
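For reference, the history assembled here for one assistant turn that invoked a single tool comes out in roughly this shape (the ids and values below are illustrative, not taken from real data):

[
  { role: "user", content: "What is my net worth?" },
  { role: "assistant", content: "Looking that up", tool_calls: [ { id: "call_abc", type: "function", function: { name: "get_net_worth", arguments: "{}" } } ] },
  { role: "tool", tool_call_id: "call_abc", name: "get_net_worth", content: '{"amount":1000,"currency":"USD"}' }
]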
19 app/models/assistant/token_estimator.rb Normal file
@@ -0,0 +1,19 @@
|
||||
module Assistant::TokenEstimator
|
||||
CHARS_PER_TOKEN = 4
|
||||
SAFETY_FACTOR = 1.25
|
||||
|
||||
def self.estimate(value)
|
||||
chars = char_length(value)
|
||||
((chars / CHARS_PER_TOKEN.to_f) * SAFETY_FACTOR).ceil
|
||||
end
|
||||
|
||||
def self.char_length(value)
|
||||
case value
|
||||
when nil then 0
|
||||
when String then value.length
|
||||
when Array then value.sum { |v| char_length(v) }
|
||||
when Hash then value.to_json.length
|
||||
else value.to_s.length
|
||||
end
|
||||
end
|
||||
end
|
||||
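Worked out with the constants above, the estimate is ceil(chars / 4 * 1.25), with hashes JSON-serialized before counting. Two reference values (these match the estimator tests further down):

Assistant::TokenEstimator.estimate("a" * 100)
# 100 / 4 = 25, * 1.25 = 31.25, ceil => 32

Assistant::TokenEstimator.estimate({ role: "assistant", content: "hi" })
# serializes to {"role":"assistant","content":"hi"} (35 chars) => 11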
@@ -26,9 +26,9 @@ class Chat < ApplicationRecord
|
||||
end
|
||||
|
||||
# Returns the default AI model to use for chats
|
||||
# Priority: ENV variable > Setting > OpenAI default
|
||||
# Priority: AI Config > Setting
|
||||
def default_model
|
||||
ENV["OPENAI_MODEL"].presence || Setting.openai_model.presence || Provider::Openai::DEFAULT_MODEL
|
||||
Provider::Openai.effective_model.presence || Setting.openai_model
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -40,6 +40,7 @@ module Provider::LlmConcept
|
||||
instructions: nil,
|
||||
functions: [],
|
||||
function_results: [],
|
||||
messages: nil,
|
||||
streamer: nil,
|
||||
previous_response_id: nil,
|
||||
session_id: nil,
|
||||
|
||||
@@ -4,31 +4,29 @@ class Provider::Openai < Provider
|
||||
# Subclass so errors caught in this provider are raised as Provider::Openai::Error
|
||||
Error = Class.new(Provider::Error)
|
||||
|
||||
# Supported OpenAI model prefixes (e.g., "gpt-4" matches "gpt-4", "gpt-4.1", "gpt-4-turbo", etc.)
|
||||
DEFAULT_OPENAI_MODEL_PREFIXES = %w[gpt-4 gpt-5 o1 o3]
|
||||
DEFAULT_MODEL = "gpt-4.1"
|
||||
|
||||
# Models that support PDF/vision input (not all OpenAI models have vision capabilities)
|
||||
DEFAULT_MODEL = "gpt-4.1".freeze
|
||||
SUPPORTED_MODELS = %w[gpt-4 gpt-5 o1 o3].freeze
|
||||
VISION_CAPABLE_MODEL_PREFIXES = %w[gpt-4o gpt-4-turbo gpt-4.1 gpt-5 o1 o3].freeze
|
||||
|
||||
# Returns the effective model that would be used by the provider
|
||||
# Uses the same logic as Provider::Registry and the initializer
|
||||
# Returns the effective model that would be used by the provider.
|
||||
# Priority: explicit ENV > Setting > DEFAULT_MODEL.
|
||||
def self.effective_model
|
||||
configured_model = ENV.fetch("OPENAI_MODEL", Setting.openai_model)
|
||||
configured_model.presence || DEFAULT_MODEL
|
||||
ENV.fetch("OPENAI_MODEL") { Setting.openai_model }.presence || DEFAULT_MODEL
|
||||
end
|
||||
|
||||
def initialize(access_token, uri_base: nil, model: nil)
|
||||
client_options = { access_token: access_token }
|
||||
client_options[:uri_base] = uri_base if uri_base.present?
|
||||
llm_uri_base = uri_base.presence
|
||||
llm_model = model.presence
|
||||
client_options[:uri_base] = llm_uri_base if llm_uri_base.present?
|
||||
client_options[:request_timeout] = ENV.fetch("OPENAI_REQUEST_TIMEOUT", 60).to_i
|
||||
|
||||
@client = ::OpenAI::Client.new(**client_options)
|
||||
@uri_base = uri_base
|
||||
if custom_provider? && model.blank?
|
||||
@uri_base = llm_uri_base
|
||||
if custom_provider? && llm_model.blank?
|
||||
raise Error, "Model is required when using a custom OpenAI-compatible provider"
|
||||
end
|
||||
@default_model = model.presence || DEFAULT_MODEL
|
||||
@default_model = llm_model.presence || self.class.effective_model
|
||||
end
|
||||
|
||||
def supports_model?(model)
|
||||
@@ -36,7 +34,18 @@ class Provider::Openai < Provider
|
||||
return true if custom_provider?
|
||||
|
||||
# Otherwise, check if model starts with any supported OpenAI prefix
|
||||
DEFAULT_OPENAI_MODEL_PREFIXES.any? { |prefix| model.start_with?(prefix) }
|
||||
SUPPORTED_MODELS.any? { |prefix| model.start_with?(prefix) }
|
||||
end
|
||||
|
||||
def supports_responses_endpoint?
|
||||
return @supports_responses_endpoint if defined?(@supports_responses_endpoint)
|
||||
|
||||
env_override = ENV["OPENAI_SUPPORTS_RESPONSES_ENDPOINT"]
|
||||
if env_override.to_s.present?
|
||||
return @supports_responses_endpoint = ActiveModel::Type::Boolean.new.cast(env_override)
|
||||
end
|
||||
|
||||
@supports_responses_endpoint = !custom_provider?
|
||||
end
|
||||
|
||||
def provider_name
|
||||
@@ -47,7 +56,7 @@ class Provider::Openai < Provider
|
||||
if custom_provider?
|
||||
@default_model.present? ? "configured model: #{@default_model}" : "any model"
|
||||
else
|
||||
"models starting with: #{DEFAULT_OPENAI_MODEL_PREFIXES.join(', ')}"
|
||||
"models starting with: #{SUPPORTED_MODELS.join(", ")}"
|
||||
end
|
||||
end
|
||||
|
||||
@@ -55,9 +64,43 @@ class Provider::Openai < Provider
|
||||
@uri_base.present?
|
||||
end
|
||||
|
||||
# Token-budget knobs. Precedence: ENV > Setting > default. Defaults match
|
||||
# Ollama's historical 2048-token baseline so local small-context models work
|
||||
# out of the box. Users on larger-context cloud models can raise via ENV or
|
||||
# via the Self-Hosting settings page.
|
||||
def context_window
|
||||
positive_budget(ENV["LLM_CONTEXT_WINDOW"], Setting.llm_context_window, 2048)
|
||||
end
|
||||
|
||||
def max_response_tokens
|
||||
positive_budget(ENV["LLM_MAX_RESPONSE_TOKENS"], Setting.llm_max_response_tokens, 512)
|
||||
end
|
||||
|
||||
def system_prompt_reserve
|
||||
positive_budget(ENV["LLM_SYSTEM_PROMPT_RESERVE"], nil, 256)
|
||||
end
|
||||
|
||||
def max_history_tokens
|
||||
explicit = ENV["LLM_MAX_HISTORY_TOKENS"].presence&.to_i
|
||||
return explicit if explicit&.positive?
|
||||
[ context_window - max_response_tokens - system_prompt_reserve, 256 ].max
|
||||
end
|
||||
|
||||
# Budget available for a one-shot (non-chat) request's full input,
|
||||
# excluding reserved response tokens AND the system/instructions prompt.
|
||||
# Drives the batch slicer for the auto_categorize / auto_detect_merchants /
|
||||
# enhance_provider_merchants calls — each ships ~200–400 tokens of
|
||||
# instructions + JSON schema that aren't counted in `fixed_tokens`.
|
||||
def max_input_tokens
|
||||
[ context_window - max_response_tokens - system_prompt_reserve, 256 ].max
|
||||
end
|
||||
|
||||
def max_items_per_call
|
||||
positive_budget(ENV["LLM_MAX_ITEMS_PER_CALL"], Setting.llm_max_items_per_call, 25)
|
||||
end
|
||||
|
||||
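With nothing overridden via ENV or Setting, the budget readers above resolve to the following values (a worked example mirroring the provider tests below; the access token is a placeholder):

provider = Provider::Openai.new("test-token")
provider.context_window        # => 2048
provider.max_response_tokens   # => 512
provider.system_prompt_reserve # => 256
provider.max_history_tokens    # => 2048 - 512 - 256 = 1280 (derived)
provider.max_input_tokens      # => 1280
provider.max_items_per_call    # => 25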
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil)
|
||||
with_provider_response do
|
||||
raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25
|
||||
if user_categories.blank?
|
||||
family_id = family&.id || "unknown"
|
||||
Rails.logger.error("Cannot auto-categorize transactions for family #{family_id}: no categories available")
|
||||
@@ -71,16 +114,20 @@ class Provider::Openai < Provider
|
||||
input: { transactions: transactions, user_categories: user_categories }
|
||||
)
|
||||
|
||||
result = AutoCategorizer.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
transactions: transactions,
|
||||
user_categories: user_categories,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_categorize
|
||||
batches = slice_for_context(transactions, fixed: user_categories)
|
||||
|
||||
result = batches.flat_map do |batch|
|
||||
AutoCategorizer.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
transactions: batch,
|
||||
user_categories: user_categories,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_categorize
|
||||
end
|
||||
|
||||
upsert_langfuse_trace(trace: trace, output: result.map(&:to_h))
|
||||
|
||||
@@ -90,8 +137,6 @@ class Provider::Openai < Provider
|
||||
|
||||
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil)
|
||||
with_provider_response do
|
||||
raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25
|
||||
|
||||
effective_model = model.presence || @default_model
|
||||
|
||||
trace = create_langfuse_trace(
|
||||
@@ -99,16 +144,20 @@ class Provider::Openai < Provider
|
||||
input: { transactions: transactions, user_merchants: user_merchants }
|
||||
)
|
||||
|
||||
result = AutoMerchantDetector.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
transactions: transactions,
|
||||
user_merchants: user_merchants,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_detect_merchants
|
||||
batches = slice_for_context(transactions, fixed: user_merchants)
|
||||
|
||||
result = batches.flat_map do |batch|
|
||||
AutoMerchantDetector.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
transactions: batch,
|
||||
user_merchants: user_merchants,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_detect_merchants
|
||||
end
|
||||
|
||||
upsert_langfuse_trace(trace: trace, output: result.map(&:to_h))
|
||||
|
||||
@@ -118,8 +167,6 @@ class Provider::Openai < Provider
|
||||
|
||||
def enhance_provider_merchants(merchants: [], model: "", family: nil, json_mode: nil)
|
||||
with_provider_response do
|
||||
raise Error, "Too many merchants to enhance. Max is 25 per request." if merchants.size > 25
|
||||
|
||||
effective_model = model.presence || @default_model
|
||||
|
||||
trace = create_langfuse_trace(
|
||||
@@ -127,15 +174,19 @@ class Provider::Openai < Provider
|
||||
input: { merchants: merchants }
|
||||
)
|
||||
|
||||
result = ProviderMerchantEnhancer.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
merchants: merchants,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).enhance_merchants
|
||||
batches = slice_for_context(merchants)
|
||||
|
||||
result = batches.flat_map do |batch|
|
||||
ProviderMerchantEnhancer.new(
|
||||
client,
|
||||
model: effective_model,
|
||||
merchants: batch,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).enhance_merchants
|
||||
end
|
||||
|
||||
upsert_langfuse_trace(trace: trace, output: result.map(&:to_h))
|
||||
|
||||
@@ -171,7 +222,8 @@ class Provider::Openai < Provider
|
||||
pdf_content: pdf_content,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family
|
||||
family: family,
|
||||
max_response_tokens: max_response_tokens
|
||||
).process
|
||||
|
||||
upsert_langfuse_trace(trace: trace, output: result.to_h)
|
||||
@@ -207,25 +259,17 @@ class Provider::Openai < Provider
|
||||
instructions: nil,
|
||||
functions: [],
|
||||
function_results: [],
|
||||
messages: nil,
|
||||
streamer: nil,
|
||||
previous_response_id: nil,
|
||||
session_id: nil,
|
||||
user_identifier: nil,
|
||||
family: nil
|
||||
)
|
||||
if custom_provider?
|
||||
generic_chat_response(
|
||||
prompt: prompt,
|
||||
model: model,
|
||||
instructions: instructions,
|
||||
functions: functions,
|
||||
function_results: function_results,
|
||||
streamer: streamer,
|
||||
session_id: session_id,
|
||||
user_identifier: user_identifier,
|
||||
family: family
|
||||
)
|
||||
else
|
||||
if supports_responses_endpoint?
|
||||
# Native path uses the Responses API which chains history via
|
||||
# `previous_response_id`; it does NOT need (and must not receive)
|
||||
# inline message history in the input payload.
|
||||
native_chat_response(
|
||||
prompt: prompt,
|
||||
model: model,
|
||||
@@ -238,12 +282,48 @@ class Provider::Openai < Provider
|
||||
user_identifier: user_identifier,
|
||||
family: family
|
||||
)
|
||||
else
|
||||
generic_chat_response(
|
||||
prompt: prompt,
|
||||
model: model,
|
||||
instructions: instructions,
|
||||
functions: functions,
|
||||
function_results: function_results,
|
||||
messages: messages,
|
||||
streamer: streamer,
|
||||
session_id: session_id,
|
||||
user_identifier: user_identifier,
|
||||
family: family
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
attr_reader :client
|
||||
|
||||
# Returns the first positive integer among env, setting, default. Treats
|
||||
# zero or negative values as "unset" and falls through — a 0-token budget
|
||||
# is never what the user meant.
|
||||
def positive_budget(env_value, setting_value, default)
|
||||
from_env = env_value.to_s.strip.to_i
|
||||
return from_env if from_env.positive?
|
||||
return setting_value.to_i if setting_value.to_i.positive?
|
||||
default
|
||||
end
|
||||
|
||||
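Conceptually, the precedence the helper implements looks like this (illustrative calls; in practice it is a private instance method, so these values arrive via ENV and Setting):

positive_budget("16384", 4096, 2048)  # => 16384  ENV wins when present and positive
positive_budget(nil,     4096, 2048)  # => 4096   Setting wins when ENV is unset
positive_budget("0",        0, 2048)  # => 2048   zero or blank values fall through to the default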
# Routes one-shot (non-chat) inputs through the BatchSlicer so large
|
||||
# caller batches are split to fit the model's context window. `fixed` is
|
||||
# the portion of the prompt that stays constant across every sub-batch
|
||||
# (e.g. user_categories, user_merchants), used for fixed-tokens accounting.
|
||||
def slice_for_context(items, fixed: nil)
|
||||
BatchSlicer.call(
|
||||
Array(items),
|
||||
max_items: max_items_per_call,
|
||||
max_tokens: max_input_tokens,
|
||||
fixed_tokens: fixed ? Assistant::TokenEstimator.estimate(fixed) : 0
|
||||
)
|
||||
end
|
||||
|
||||
def native_chat_response(
|
||||
prompt:,
|
||||
model:,
|
||||
@@ -278,7 +358,7 @@ class Provider::Openai < Provider
|
||||
nil
|
||||
end
|
||||
|
||||
input_payload = chat_config.build_input(prompt)
|
||||
input_payload = chat_config.build_input(prompt: prompt)
|
||||
|
||||
begin
|
||||
raw_response = client.responses.create(parameters: {
|
||||
@@ -344,6 +424,7 @@ class Provider::Openai < Provider
|
||||
instructions: nil,
|
||||
functions: [],
|
||||
function_results: [],
|
||||
messages: nil,
|
||||
streamer: nil,
|
||||
session_id: nil,
|
||||
user_identifier: nil,
|
||||
@@ -353,7 +434,8 @@ class Provider::Openai < Provider
|
||||
messages = build_generic_messages(
|
||||
prompt: prompt,
|
||||
instructions: instructions,
|
||||
function_results: function_results
|
||||
function_results: function_results,
|
||||
messages: messages
|
||||
)
|
||||
|
||||
tools = build_generic_tools(functions)
|
||||
@@ -412,16 +494,24 @@ class Provider::Openai < Provider
|
||||
end
|
||||
end
|
||||
|
||||
def build_generic_messages(prompt:, instructions: nil, function_results: [])
|
||||
messages = []
|
||||
def build_generic_messages(prompt:, instructions: nil, function_results: [], messages: nil)
|
||||
payload = []
|
||||
|
||||
# Add system message if instructions present
|
||||
if instructions.present?
|
||||
messages << { role: "system", content: instructions }
|
||||
payload << { role: "system", content: instructions }
|
||||
end
|
||||
|
||||
# Add user prompt
|
||||
messages << { role: "user", content: prompt }
|
||||
# Add conversation history or user prompt. History is trimmed to fit the
|
||||
# configured token budget so small-context local models (Ollama, LM Studio,
|
||||
# LocalAI) don't silently truncate. tool_call/tool_result pairs are
|
||||
# preserved atomically by HistoryTrimmer.
|
||||
if messages.present?
|
||||
trimmed = Assistant::HistoryTrimmer.new(messages, max_tokens: max_history_tokens).call
|
||||
payload.concat(trimmed)
|
||||
elsif prompt.present?
|
||||
payload << { role: "user", content: prompt }
|
||||
end
|
||||
|
||||
# If there are function results, we need to add the assistant message that made the tool calls
|
||||
# followed by the tool messages with the results
|
||||
@@ -442,7 +532,7 @@ class Provider::Openai < Provider
|
||||
}
|
||||
end
|
||||
|
||||
messages << {
|
||||
payload << {
|
||||
role: "assistant",
|
||||
content: "", # Some OpenAI-compatible APIs require string, not null
|
||||
tool_calls: tool_calls
|
||||
@@ -462,7 +552,7 @@ class Provider::Openai < Provider
|
||||
output.to_json
|
||||
end
|
||||
|
||||
messages << {
|
||||
payload << {
|
||||
role: "tool",
|
||||
tool_call_id: fn_result[:call_id],
|
||||
name: fn_result[:name],
|
||||
@@ -471,7 +561,7 @@ class Provider::Openai < Provider
|
||||
end
|
||||
end
|
||||
|
||||
messages
|
||||
payload
|
||||
end
|
||||
|
||||
def build_generic_tools(functions)
|
||||
|
||||
45 app/models/provider/openai/batch_slicer.rb Normal file
@@ -0,0 +1,45 @@
|
||||
class Provider::Openai::BatchSlicer
|
||||
class ContextOverflowError < StandardError; end
|
||||
|
||||
# Splits `items` into sub-batches that respect both a hard item cap and a
|
||||
# token-budget cap. Used by auto_categorize / auto_detect_merchants /
|
||||
# enhance_provider_merchants so callers can pass larger batches and have the
|
||||
# provider fan them out to fit small-context models.
|
||||
def self.call(items, max_items:, max_tokens:, fixed_tokens: 0)
|
||||
items = Array(items)
|
||||
return [] if items.empty?
|
||||
|
||||
available = max_tokens.to_i - fixed_tokens.to_i
|
||||
if available <= 0
|
||||
raise ContextOverflowError,
|
||||
"Fixed prompt tokens (#{fixed_tokens}) exceed context budget (#{max_tokens})"
|
||||
end
|
||||
|
||||
batches = []
|
||||
current = []
|
||||
current_tokens = 0
|
||||
|
||||
items.each do |item|
|
||||
item_tokens = Assistant::TokenEstimator.estimate(item)
|
||||
if item_tokens > available
|
||||
raise ContextOverflowError,
|
||||
"Single item requires ~#{item_tokens} tokens, which exceeds available budget (#{available})"
|
||||
end
|
||||
|
||||
would_exceed_items = current.size >= max_items.to_i
|
||||
would_exceed_tokens = current_tokens + item_tokens > available
|
||||
|
||||
if would_exceed_items || would_exceed_tokens
|
||||
batches << current unless current.empty?
|
||||
current = []
|
||||
current_tokens = 0
|
||||
end
|
||||
|
||||
current << item
|
||||
current_tokens += item_tokens
|
||||
end
|
||||
|
||||
batches << current unless current.empty?
|
||||
batches
|
||||
end
|
||||
end
|
||||
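A short usage sketch, assuming the class above and Assistant::TokenEstimator are loaded; exact batch boundaries depend on the token estimates:

items = Array.new(30) { |i| { id: i, name: "Item#{i}" } }

Provider::Openai::BatchSlicer.call(items, max_items: 10, max_tokens: 100_000).map(&:size)
# => [10, 10, 10]  split on the item cap; the token budget is not binding here

Provider::Openai::BatchSlicer.call([ { payload: "x" * 10_000 } ], max_items: 25, max_tokens: 500)
# raises Provider::Openai::BatchSlicer::ContextOverflowError because a single item exceeds the budget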
@@ -16,7 +16,9 @@ class Provider::Openai::ChatConfig
|
||||
end
|
||||
end
|
||||
|
||||
def build_input(prompt)
|
||||
def build_input(prompt: nil)
|
||||
input_messages = prompt.present? ? [ { role: "user", content: prompt } ] : []
|
||||
|
||||
results = function_results.map do |fn_result|
|
||||
# Handle nil explicitly to avoid serializing to "null"
|
||||
output = fn_result[:output]
|
||||
@@ -36,7 +38,7 @@ class Provider::Openai::ChatConfig
|
||||
end
|
||||
|
||||
[
|
||||
{ role: "user", content: prompt },
|
||||
*input_messages,
|
||||
*results
|
||||
]
|
||||
end
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
class Provider::Openai::PdfProcessor
|
||||
include Provider::Openai::Concerns::UsageRecorder
|
||||
|
||||
attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family
|
||||
attr_reader :client, :model, :pdf_content, :custom_provider, :langfuse_trace, :family, :max_response_tokens
|
||||
|
||||
def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil)
|
||||
def initialize(client, model: "", pdf_content: nil, custom_provider: false, langfuse_trace: nil, family: nil, max_response_tokens:)
|
||||
@client = client
|
||||
@model = model
|
||||
@pdf_content = pdf_content
|
||||
@custom_provider = custom_provider
|
||||
@langfuse_trace = langfuse_trace
|
||||
@family = family
|
||||
@max_response_tokens = max_response_tokens
|
||||
end
|
||||
|
||||
def process
|
||||
@@ -175,7 +176,7 @@ class Provider::Openai::PdfProcessor
|
||||
{ role: "system", content: instructions + "\n\nIMPORTANT: Respond with valid JSON only, no markdown or other formatting." },
|
||||
{ role: "user", content: content }
|
||||
],
|
||||
max_tokens: 4096
|
||||
max_tokens: max_response_tokens
|
||||
}
|
||||
|
||||
response = client.chat(parameters: params)
|
||||
|
||||
@@ -10,6 +10,14 @@ class Setting < RailsSettings::Base
|
||||
field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"]
|
||||
field :openai_model, type: :string, default: ENV["OPENAI_MODEL"]
|
||||
field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"]
|
||||
|
||||
# LLM token budget (applies to every outbound LLM call: chat, auto-categorize,
|
||||
# merchant detection, enhance-merchants, PDF processing). Defaults track
|
||||
# Ollama's historical 2048-token baseline so local small-context models work
|
||||
# out of the box. ENV overrides Setting at read time in Provider::Openai.
|
||||
field :llm_context_window, type: :integer, default: ENV["LLM_CONTEXT_WINDOW"]&.to_i
|
||||
field :llm_max_response_tokens, type: :integer, default: ENV["LLM_MAX_RESPONSE_TOKENS"]&.to_i
|
||||
field :llm_max_items_per_call, type: :integer, default: ENV["LLM_MAX_ITEMS_PER_CALL"]&.to_i
|
||||
field :external_assistant_url, type: :string
|
||||
field :external_assistant_token, type: :string
|
||||
field :external_assistant_agent_id, type: :string
|
||||
|
||||
@@ -63,5 +63,37 @@
|
||||
{ disabled: ENV["LLM_JSON_MODE"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } } %>
|
||||
<p class="text-xs text-secondary mt-1"><%= t(".json_mode_help") %></p>
|
||||
|
||||
<div class="pt-4 border-t border-secondary">
|
||||
<h3 class="font-medium mb-1"><%= t(".budget_heading") %></h3>
|
||||
<p class="text-xs text-secondary mb-3"><%= t(".budget_description") %></p>
|
||||
|
||||
<%= form.number_field :llm_context_window,
|
||||
label: t(".context_window_label"),
|
||||
placeholder: "2048",
|
||||
value: Setting.llm_context_window,
|
||||
min: 256,
|
||||
disabled: ENV["LLM_CONTEXT_WINDOW"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } %>
|
||||
<p class="text-xs text-secondary mt-1 mb-3"><%= t(".context_window_help") %></p>
|
||||
|
||||
<%= form.number_field :llm_max_response_tokens,
|
||||
label: t(".max_response_tokens_label"),
|
||||
placeholder: "512",
|
||||
value: Setting.llm_max_response_tokens,
|
||||
min: 64,
|
||||
disabled: ENV["LLM_MAX_RESPONSE_TOKENS"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } %>
|
||||
<p class="text-xs text-secondary mt-1 mb-3"><%= t(".max_response_tokens_help") %></p>
|
||||
|
||||
<%= form.number_field :llm_max_items_per_call,
|
||||
label: t(".max_items_per_call_label"),
|
||||
placeholder: "25",
|
||||
value: Setting.llm_max_items_per_call,
|
||||
min: 1,
|
||||
disabled: ENV["LLM_MAX_ITEMS_PER_CALL"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } %>
|
||||
<p class="text-xs text-secondary mt-1"><%= t(".max_items_per_call_help") %></p>
|
||||
</div>
|
||||
<% end %>
|
||||
</div>
|
||||
|
||||
@@ -187,7 +187,7 @@ services:
|
||||
- WEBUI_AUTH=False
|
||||
- WEBUI_NAME=AI
|
||||
- WEBUI_URL=http://localhost:8080
|
||||
- WEBUI_SECRET_KEY=t0p-s3cr3t
|
||||
- WEBUI_SECRET_KEY=t0p-s3cr3t # pipelock:ignore
|
||||
- NO_PROXY=host.docker.internal
|
||||
extra_hosts:
|
||||
- host.docker.internal:host-gateway
|
||||
@@ -325,7 +325,7 @@ services:
|
||||
- POSTGRES_HOST=db
|
||||
- POSTGRES_DB=${POSTGRES_DB:-sure_production}
|
||||
- POSTGRES_USER=${POSTGRES_USER:-sure_user}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} # pipelock:ignore
|
||||
- SCHEDULE=@daily # Runs once a day at midnight
|
||||
- BACKUP_KEEP_DAYS=7 # Keeps the last 7 days of backups
|
||||
- BACKUP_KEEP_WEEKS=4 # Keeps 4 weekly backups
|
||||
|
||||
@@ -130,7 +130,7 @@ services:
|
||||
- POSTGRES_HOST=db
|
||||
- POSTGRES_DB=${POSTGRES_DB:-sure_production}
|
||||
- POSTGRES_USER=${POSTGRES_USER:-sure_user}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-sure_password} # pipelock:ignore
|
||||
- SCHEDULE=@daily # Runs once a day at midnight
|
||||
- BACKUP_KEEP_DAYS=7 # Keeps the last 7 days of backups
|
||||
- BACKUP_KEEP_WEEKS=4 # Keeps 4 weekly backups
|
||||
|
||||
@@ -100,6 +100,14 @@ en:
|
||||
json_mode_none: None (best for standard models)
|
||||
json_mode_json_object: JSON Object
|
||||
json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)."
|
||||
budget_heading: Token Budget
|
||||
budget_description: Applies to every LLM call — chat history, auto-categorization, merchant detection, and PDF processing. Defaults are conservative for small-context local models. Raise for cloud models with larger context windows.
|
||||
context_window_label: Context Window (Optional)
|
||||
context_window_help: "Total tokens the model will accept. Default: 2048 — raise to 8192+ for cloud OpenAI or large-context local models."
|
||||
max_response_tokens_label: Max Response Tokens (Optional)
|
||||
max_response_tokens_help: "Tokens reserved for the model's reply. Default: 512. Lower to free up room for longer history."
|
||||
max_items_per_call_label: Max Items Per Batch (Optional)
|
||||
max_items_per_call_help: "Upper bound for auto-categorize / merchant detection batches. Default: 25. Larger batches are auto-sliced to fit the context window."
|
||||
title: OpenAI
|
||||
yahoo_finance_settings:
|
||||
title: Yahoo Finance
|
||||
@@ -161,6 +169,7 @@ en:
|
||||
success: Settings updated
|
||||
invalid_onboarding_state: Invalid onboarding state
|
||||
invalid_sync_time: Invalid sync time format. Please use HH:MM format (e.g., 02:30).
|
||||
invalid_llm_budget: "%{field} must be a whole number ≥ %{minimum}."
|
||||
scheduler_sync_failed: Settings saved, but failed to update the sync schedule. Please try again or check the server logs.
|
||||
disconnect_external_assistant:
|
||||
external_assistant_disconnected: External assistant disconnected
|
||||
|
||||
@@ -235,6 +235,61 @@ class Settings::HostingsControllerTest < ActionDispatch::IntegrationTest
|
||||
end
|
||||
end
|
||||
|
||||
test "accepts valid llm budget overrides and blanks clear them" do
|
||||
with_self_hosting do
|
||||
patch settings_hosting_url, params: { setting: {
|
||||
llm_context_window: "4096",
|
||||
llm_max_response_tokens: "1024",
|
||||
llm_max_items_per_call: "40"
|
||||
} }
|
||||
|
||||
assert_redirected_to settings_hosting_url
|
||||
assert_equal 4096, Setting.llm_context_window
|
||||
assert_equal 1024, Setting.llm_max_response_tokens
|
||||
assert_equal 40, Setting.llm_max_items_per_call
|
||||
|
||||
patch settings_hosting_url, params: { setting: {
|
||||
llm_context_window: "",
|
||||
llm_max_response_tokens: "",
|
||||
llm_max_items_per_call: ""
|
||||
} }
|
||||
|
||||
assert_nil Setting.llm_context_window
|
||||
assert_nil Setting.llm_max_response_tokens
|
||||
assert_nil Setting.llm_max_items_per_call
|
||||
end
|
||||
ensure
|
||||
Setting.llm_context_window = nil
|
||||
Setting.llm_max_response_tokens = nil
|
||||
Setting.llm_max_items_per_call = nil
|
||||
end
|
||||
|
||||
test "rejects llm budget below field minimum" do
|
||||
with_self_hosting do
|
||||
patch settings_hosting_url, params: { setting: { llm_context_window: "0" } }
|
||||
|
||||
assert_response :unprocessable_entity
|
||||
assert_match(/must be a whole number/, flash[:alert])
|
||||
assert_nil Setting.llm_context_window
|
||||
|
||||
patch settings_hosting_url, params: { setting: { llm_max_response_tokens: "-5" } }
|
||||
|
||||
assert_response :unprocessable_entity
|
||||
assert_match(/must be a whole number/, flash[:alert])
|
||||
assert_nil Setting.llm_max_response_tokens
|
||||
|
||||
patch settings_hosting_url, params: { setting: { llm_max_items_per_call: "not-a-number" } }
|
||||
|
||||
assert_response :unprocessable_entity
|
||||
assert_match(/must be a whole number/, flash[:alert])
|
||||
assert_nil Setting.llm_max_items_per_call
|
||||
end
|
||||
ensure
|
||||
Setting.llm_context_window = nil
|
||||
Setting.llm_max_response_tokens = nil
|
||||
Setting.llm_max_items_per_call = nil
|
||||
end
|
||||
|
||||
test "can clear data only when admin" do
|
||||
with_self_hosting do
|
||||
sign_in users(:family_member)
|
||||
|
||||
93 test/models/assistant/history_trimmer_test.rb Normal file
@@ -0,0 +1,93 @@
|
||||
require "test_helper"
|
||||
|
||||
class Assistant::HistoryTrimmerTest < ActiveSupport::TestCase
|
||||
test "returns empty array for empty input" do
|
||||
assert_equal [], Assistant::HistoryTrimmer.new([], max_tokens: 1000).call
|
||||
assert_equal [], Assistant::HistoryTrimmer.new(nil, max_tokens: 1000).call
|
||||
end
|
||||
|
||||
test "returns empty array when max_tokens is zero or negative" do
|
||||
messages = [ { role: "user", content: "hi" } ]
|
||||
assert_equal [], Assistant::HistoryTrimmer.new(messages, max_tokens: 0).call
|
||||
assert_equal [], Assistant::HistoryTrimmer.new(messages, max_tokens: -10).call
|
||||
end
|
||||
|
||||
test "keeps full history when budget is generous" do
|
||||
messages = [
|
||||
{ role: "user", content: "a" * 40 },
|
||||
{ role: "assistant", content: "b" * 40 },
|
||||
{ role: "user", content: "c" * 40 }
|
||||
]
|
||||
|
||||
result = Assistant::HistoryTrimmer.new(messages, max_tokens: 10_000).call
|
||||
|
||||
assert_equal messages, result
|
||||
end
|
||||
|
||||
test "drops oldest messages when budget is tight" do
|
||||
messages = [
|
||||
{ role: "user", content: "a" * 100 },
|
||||
{ role: "assistant", content: "b" * 100 },
|
||||
{ role: "user", content: "c" * 100 }
|
||||
]
|
||||
|
||||
# ~40 tokens per message; budget of 60 should keep only the last one
|
||||
result = Assistant::HistoryTrimmer.new(messages, max_tokens: 60).call
|
||||
|
||||
assert_equal 1, result.size
|
||||
assert_equal "c" * 100, result.first[:content]
|
||||
end
|
||||
|
||||
test "keeps tool_call and tool_result as an atomic pair" do
|
||||
messages = [
|
||||
{ role: "user", content: "a" * 40 },
|
||||
{ role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] },
|
||||
{ role: "tool", tool_call_id: "c1", name: "f", content: "result" },
|
||||
{ role: "assistant", content: "follow up" }
|
||||
]
|
||||
|
||||
# Small budget: should keep the newest assistant follow-up. Pair is either
|
||||
# kept together or dropped together — never split.
|
||||
result = Assistant::HistoryTrimmer.new(messages, max_tokens: 20).call
|
||||
|
||||
tool_call_idx = result.index { |m| m[:role] == "assistant" && m[:tool_calls] }
|
||||
tool_result_idx = result.index { |m| m[:role] == "tool" }
|
||||
|
||||
if tool_call_idx
|
||||
assert_not_nil tool_result_idx, "tool_call without paired tool_result"
|
||||
assert tool_result_idx > tool_call_idx, "tool_result must follow its tool_call"
|
||||
else
|
||||
assert_nil tool_result_idx, "tool_result without paired tool_call leaked through"
|
||||
end
|
||||
end
|
||||
|
||||
test "never splits tool_call + tool_result pair when dropping" do
|
||||
messages = [
|
||||
{ role: "assistant", content: "", tool_calls: [ { id: "c1", type: "function", function: { name: "f", arguments: "{}" } } ] },
|
||||
{ role: "tool", tool_call_id: "c1", name: "f", content: "x" * 200 },
|
||||
{ role: "user", content: "y" * 20 }
|
||||
]
|
||||
|
||||
# Budget big enough for the user turn, not for the pair
|
||||
result = Assistant::HistoryTrimmer.new(messages, max_tokens: 15).call
|
||||
|
||||
assert_equal 1, result.size
|
||||
assert_equal "user", result.first[:role]
|
||||
end
|
||||
|
||||
test "handles multiple tool results following a single assistant tool_calls message" do
|
||||
messages = [
|
||||
{ role: "user", content: "hi" },
|
||||
{ role: "assistant", content: "", tool_calls: [
|
||||
{ id: "c1", type: "function", function: { name: "f1", arguments: "{}" } },
|
||||
{ id: "c2", type: "function", function: { name: "f2", arguments: "{}" } }
|
||||
] },
|
||||
{ role: "tool", tool_call_id: "c1", name: "f1", content: "r1" },
|
||||
{ role: "tool", tool_call_id: "c2", name: "f2", content: "r2" }
|
||||
]
|
||||
|
||||
result = Assistant::HistoryTrimmer.new(messages, max_tokens: 10_000).call
|
||||
|
||||
assert_equal messages, result
|
||||
end
|
||||
end
|
||||
40 test/models/assistant/token_estimator_test.rb Normal file
@@ -0,0 +1,40 @@
|
||||
require "test_helper"
|
||||
|
||||
class Assistant::TokenEstimatorTest < ActiveSupport::TestCase
|
||||
test "estimate is zero for nil or empty" do
|
||||
assert_equal 0, Assistant::TokenEstimator.estimate(nil)
|
||||
assert_equal 0, Assistant::TokenEstimator.estimate("")
|
||||
assert_equal 0, Assistant::TokenEstimator.estimate([])
|
||||
end
|
||||
|
||||
test "estimate applies chars/4 with safety factor for strings" do
|
||||
# 100 chars / 4 = 25 tokens, × 1.25 safety factor, ceil = 32
|
||||
assert_equal 32, Assistant::TokenEstimator.estimate("a" * 100)
|
||||
end
|
||||
|
||||
test "estimate sums across arrays" do
|
||||
# Use lengths that avoid ceil rounding drift: 80 chars → 20 tokens → 25.0 ceil → 25.
|
||||
string_estimate = Assistant::TokenEstimator.estimate("a" * 80)
|
||||
array_estimate = Assistant::TokenEstimator.estimate([ "a" * 80, "a" * 80 ])
|
||||
|
||||
assert_equal string_estimate * 2, array_estimate
|
||||
end
|
||||
|
||||
test "estimate serializes hashes via JSON" do
|
||||
hash = { role: "assistant", content: "hi" }
|
||||
# {"role":"assistant","content":"hi"} => 35 chars / 4 × 1.25 ceil = 11
|
||||
assert_equal 11, Assistant::TokenEstimator.estimate(hash)
|
||||
end
|
||||
|
||||
test "estimate handles nested structures" do
|
||||
nested = [ { role: "user", content: "hello" }, { role: "assistant", content: "world" } ]
|
||||
# Each hash gets JSON-serialized and summed
|
||||
expected = nested.sum { |h| Assistant::TokenEstimator.estimate(h) }
|
||||
assert_equal expected, Assistant::TokenEstimator.estimate(nested)
|
||||
end
|
||||
|
||||
test "estimate coerces unknown types via to_s" do
|
||||
assert Assistant::TokenEstimator.estimate(12345) > 0
|
||||
assert Assistant::TokenEstimator.estimate(:symbol) > 0
|
||||
end
|
||||
end
|
||||
@@ -14,6 +14,10 @@ class AssistantTest < ActiveSupport::TestCase
|
||||
@provider = mock
|
||||
@expected_session_id = @chat.id.to_s
|
||||
@expected_user_identifier = ::Digest::SHA256.hexdigest(@chat.user_id.to_s)
|
||||
@expected_conversation_history = [
|
||||
{ role: "user", content: "Can you help me understand my spending habits?" },
|
||||
{ role: "user", content: "What is my net worth?" }
|
||||
]
|
||||
end
|
||||
|
||||
test "errors get added to chat" do
|
||||
@@ -100,6 +104,7 @@ class AssistantTest < ActiveSupport::TestCase
|
||||
@provider.expects(:chat_response).with do |message, **options|
|
||||
assert_equal @expected_session_id, options[:session_id]
|
||||
assert_equal @expected_user_identifier, options[:user_identifier]
|
||||
assert_equal @expected_conversation_history, options[:messages]
|
||||
text_chunks.each do |text_chunk|
|
||||
options[:streamer].call(text_chunk)
|
||||
end
|
||||
@@ -154,6 +159,7 @@ class AssistantTest < ActiveSupport::TestCase
|
||||
@provider.expects(:chat_response).with do |message, **options|
|
||||
assert_equal @expected_session_id, options[:session_id]
|
||||
assert_equal @expected_user_identifier, options[:user_identifier]
|
||||
assert_equal @expected_conversation_history, options[:messages]
|
||||
call2_text_chunks.each do |text_chunk|
|
||||
options[:streamer].call(text_chunk)
|
||||
end
|
||||
@@ -165,6 +171,7 @@ class AssistantTest < ActiveSupport::TestCase
|
||||
@provider.expects(:chat_response).with do |message, **options|
|
||||
assert_equal @expected_session_id, options[:session_id]
|
||||
assert_equal @expected_user_identifier, options[:user_identifier]
|
||||
assert_equal @expected_conversation_history, options[:messages]
|
||||
options[:streamer].call(call1_response_chunk)
|
||||
true
|
||||
end.returns(call1_response).once.in_sequence(sequence)
|
||||
@@ -418,6 +425,95 @@ class AssistantTest < ActiveSupport::TestCase
|
||||
assert_raises(Assistant::Error) { Assistant.for_chat(nil) }
|
||||
end
|
||||
|
||||
test "builtin demotes a partially-streamed assistant message to failed on error" do
|
||||
@assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)
|
||||
|
||||
boom = StandardError.new("boom mid-stream")
|
||||
|
||||
@provider.expects(:chat_response).with do |_prompt, **options|
|
||||
# Simulate a partial text chunk landing before the error propagates.
|
||||
options[:streamer].call(provider_text_chunk("partial tokens "))
|
||||
true
|
||||
end.returns(provider_error_response(boom))
|
||||
|
||||
@assistant.respond_to(@message)
|
||||
|
||||
partial = @chat.messages.where(type: "AssistantMessage").order(:created_at).last
|
||||
assert partial.present?, "partial assistant message should be persisted"
|
||||
assert_equal "failed", partial.status
|
||||
assert_equal "partial tokens ", partial.content
|
||||
end
|
||||
|
||||
test "conversation_history excludes failed and pending messages" do
|
||||
# Add a failed assistant turn; it must NOT leak into history.
|
||||
AssistantMessage.create!(
|
||||
chat: @chat,
|
||||
content: "partial error response",
|
||||
ai_model: "gpt-4.1",
|
||||
status: "failed"
|
||||
)
|
||||
|
||||
@assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)
|
||||
|
||||
captured_history = nil
|
||||
@provider.expects(:chat_response).with do |_prompt, **options|
|
||||
captured_history = options[:messages]
|
||||
options[:streamer].call(
|
||||
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: [])
|
||||
)
|
||||
true
|
||||
end.returns(provider_success_response(
|
||||
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data
|
||||
))
|
||||
|
||||
@assistant.respond_to(@message)
|
||||
|
||||
contents = captured_history.map { |m| m[:content] }
|
||||
assert_not_includes contents, "partial error response"
|
||||
end
|
||||
|
||||
test "conversation_history serializes assistant tool_calls with paired tool result" do
|
||||
assistant_msg = AssistantMessage.create!(
|
||||
chat: @chat,
|
||||
content: "Looking that up",
|
||||
ai_model: "gpt-4.1",
|
||||
status: "complete"
|
||||
)
|
||||
|
||||
ToolCall::Function.create!(
|
||||
message: assistant_msg,
|
||||
provider_id: "call_abc",
|
||||
provider_call_id: "call_abc",
|
||||
function_name: "get_net_worth",
|
||||
function_arguments: { foo: "bar" },
|
||||
function_result: { amount: 1000, currency: "USD" }
|
||||
)
|
||||
|
||||
@assistant.expects(:get_model_provider).with("gpt-4.1").returns(@provider)
|
||||
|
||||
captured_history = nil
|
||||
@provider.expects(:chat_response).with do |_prompt, **options|
|
||||
captured_history = options[:messages]
|
||||
options[:streamer].call(
|
||||
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: [])
|
||||
)
|
||||
true
|
||||
end.returns(provider_success_response(
|
||||
provider_response_chunk(id: "1", model: "gpt-4.1", messages: [ provider_message(id: "1", text: "ok") ], function_requests: []).data
|
||||
))
|
||||
|
||||
@assistant.respond_to(@message)
|
||||
|
||||
tool_call_entry = captured_history.find { |m| m[:role] == "assistant" && m[:tool_calls].present? }
|
||||
tool_result_entry = captured_history.find { |m| m[:role] == "tool" }
|
||||
|
||||
assert_not_nil tool_call_entry, "tool_call message missing from history"
|
||||
assert_not_nil tool_result_entry, "tool_result message missing from history"
|
||||
assert_equal "call_abc", tool_call_entry[:tool_calls].first[:id]
|
||||
assert_equal "call_abc", tool_result_entry[:tool_call_id]
|
||||
assert_equal "get_net_worth", tool_result_entry[:name]
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def mock_external_sse_response(sse_body)
|
||||
|
||||
74 test/models/provider/openai/batch_slicer_test.rb Normal file
@@ -0,0 +1,74 @@
|
||||
require "test_helper"
|
||||
|
||||
class Provider::Openai::BatchSlicerTest < ActiveSupport::TestCase
|
||||
test "returns empty array for empty input" do
|
||||
assert_equal [], Provider::Openai::BatchSlicer.call([], max_items: 25, max_tokens: 2000)
|
||||
end
|
||||
|
||||
test "single batch when within item cap and token budget" do
|
||||
items = Array.new(10) { |i| { id: i, name: "Item#{i}" } }
|
||||
|
||||
batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 10_000)
|
||||
|
||||
assert_equal 1, batches.size
|
||||
assert_equal items, batches.first
|
||||
end
|
||||
|
||||
test "splits on item cap" do
|
||||
items = Array.new(30) { |i| { id: i } }
|
||||
|
||||
batches = Provider::Openai::BatchSlicer.call(items, max_items: 10, max_tokens: 100_000)
|
||||
|
||||
assert_equal 3, batches.size
|
||||
assert_equal 10, batches.first.size
|
||||
assert_equal 10, batches.last.size
|
||||
assert_equal items, batches.flatten
|
||||
end
|
||||
|
||||
test "splits on token budget" do
|
||||
big_item = { payload: "x" * 200 } # ~63 tokens each
|
||||
items = Array.new(10) { big_item }
|
||||
|
||||
# The ~125-token budget can't hold two items at once, so the slicer splits into multiple batches
|
||||
batches = Provider::Openai::BatchSlicer.call(items, max_items: 100, max_tokens: 125)
|
||||
|
||||
assert batches.size > 1
|
||||
assert_equal items, batches.flatten
|
||||
batches.each do |batch|
|
||||
tokens = Assistant::TokenEstimator.estimate(batch)
|
||||
assert tokens <= 125, "Batch exceeded token budget: #{tokens}"
|
||||
end
|
||||
end
|
||||
|
||||
test "raises ContextOverflowError when single item exceeds budget" do
|
||||
huge = { payload: "x" * 10_000 }
|
||||
|
||||
assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do
|
||||
Provider::Openai::BatchSlicer.call([ huge ], max_items: 25, max_tokens: 500)
|
||||
end
|
||||
end
|
||||
|
||||
test "raises ContextOverflowError when fixed_tokens exceed budget" do
|
||||
assert_raises Provider::Openai::BatchSlicer::ContextOverflowError do
|
||||
Provider::Openai::BatchSlicer.call([ { a: 1 } ], max_items: 25, max_tokens: 500, fixed_tokens: 600)
|
||||
end
|
||||
end
|
||||
|
||||
test "respects fixed_tokens when computing available budget" do
|
||||
items = Array.new(5) { { payload: "x" * 80 } } # ~25 tokens each
|
||||
|
||||
# Budget of 200 minus fixed 100 = 100 available; fits ~4 items per batch
|
||||
batches = Provider::Openai::BatchSlicer.call(items, max_items: 25, max_tokens: 200, fixed_tokens: 100)
|
||||
|
||||
assert batches.size >= 2
|
||||
assert_equal items, batches.flatten
|
||||
end
|
||||
|
||||
test "never produces empty batches" do
|
||||
items = Array.new(7) { { id: SecureRandom.hex(4) } }
|
||||
|
||||
batches = Provider::Openai::BatchSlicer.call(items, max_items: 2, max_tokens: 10_000)
|
||||
|
||||
batches.each { |b| assert b.any?, "Empty batch produced" }
|
||||
end
|
||||
end
|
||||
@@ -330,4 +330,130 @@ class Provider::OpenaiTest < ActiveSupport::TestCase
|
||||
|
||||
@subject.send(:create_langfuse_trace, name: "openai.test", input: { foo: "bar" })
|
||||
end
|
||||
|
||||
test "SUPPORTED_MODELS and VISION_CAPABLE_MODEL_PREFIXES are Ruby constants, not YAML-derived" do
|
||||
assert_kind_of Array, Provider::Openai::SUPPORTED_MODELS
|
||||
assert Provider::Openai::SUPPORTED_MODELS.all? { |s| s.is_a?(String) }
|
||||
assert Provider::Openai::SUPPORTED_MODELS.frozen?
|
||||
|
||||
assert_kind_of Array, Provider::Openai::VISION_CAPABLE_MODEL_PREFIXES
|
||||
assert Provider::Openai::VISION_CAPABLE_MODEL_PREFIXES.frozen?
|
||||
assert_equal "gpt-4.1", Provider::Openai::DEFAULT_MODEL
|
||||
end
|
||||
|
||||
test "budget readers default to conservative values" do
|
||||
with_env_overrides(
|
||||
"LLM_CONTEXT_WINDOW" => nil,
|
||||
"LLM_MAX_RESPONSE_TOKENS" => nil,
|
||||
"LLM_SYSTEM_PROMPT_RESERVE" => nil,
|
||||
"LLM_MAX_HISTORY_TOKENS" => nil,
|
||||
"LLM_MAX_ITEMS_PER_CALL" => nil
|
||||
) do
|
||||
subject = Provider::Openai.new("test-token")
|
||||
assert_equal 2048, subject.context_window
|
||||
assert_equal 512, subject.max_response_tokens
|
||||
assert_equal 256, subject.system_prompt_reserve
|
||||
assert_equal 2048 - 512 - 256, subject.max_history_tokens
|
||||
assert_equal 2048 - 512 - 256, subject.max_input_tokens
|
||||
assert_equal 25, subject.max_items_per_call
|
||||
end
|
||||
end
|
||||
|
||||
test "budget readers respect explicit env overrides" do
|
||||
with_env_overrides(
|
||||
"LLM_CONTEXT_WINDOW" => "8192",
|
||||
"LLM_MAX_RESPONSE_TOKENS" => "1024",
|
||||
"LLM_SYSTEM_PROMPT_RESERVE" => "512",
|
||||
"LLM_MAX_HISTORY_TOKENS" => "4096",
|
||||
"LLM_MAX_ITEMS_PER_CALL" => "50"
|
||||
) do
|
||||
subject = Provider::Openai.new("test-token")
|
||||
assert_equal 8192, subject.context_window
|
||||
assert_equal 1024, subject.max_response_tokens
|
||||
assert_equal 512, subject.system_prompt_reserve
|
||||
assert_equal 4096, subject.max_history_tokens # explicit overrides derived
|
||||
assert_equal 8192 - 1024 - 512, subject.max_input_tokens
|
||||
assert_equal 50, subject.max_items_per_call
|
||||
end
|
||||
end
|
||||
|
||||
test "budget readers fall back to Setting when ENV unset" do
|
||||
with_env_overrides(
|
||||
"LLM_CONTEXT_WINDOW" => nil,
|
||||
"LLM_MAX_RESPONSE_TOKENS" => nil,
|
||||
"LLM_MAX_ITEMS_PER_CALL" => nil
|
||||
) do
|
||||
Setting.llm_context_window = 8192
|
||||
Setting.llm_max_response_tokens = 1024
|
||||
Setting.llm_max_items_per_call = 40
|
||||
|
||||
subject = Provider::Openai.new("test-token")
|
||||
assert_equal 8192, subject.context_window
|
||||
assert_equal 1024, subject.max_response_tokens
|
||||
assert_equal 40, subject.max_items_per_call
|
||||
end
|
||||
ensure
|
||||
Setting.llm_context_window = nil
|
||||
Setting.llm_max_response_tokens = nil
|
||||
Setting.llm_max_items_per_call = nil
|
||||
end
|
||||
|
||||
test "budget readers: ENV beats Setting when both present" do
|
||||
with_env_overrides("LLM_CONTEXT_WINDOW" => "16384") do
|
||||
Setting.llm_context_window = 4096
|
||||
subject = Provider::Openai.new("test-token")
|
||||
assert_equal 16384, subject.context_window
|
||||
end
|
||||
ensure
|
||||
Setting.llm_context_window = nil
|
||||
end
|
||||
|
||||
test "budget readers: zero or negative values fall through to default" do
|
||||
with_env_overrides(
|
||||
"LLM_CONTEXT_WINDOW" => "0",
|
||||
"LLM_MAX_RESPONSE_TOKENS" => nil,
|
||||
"LLM_MAX_ITEMS_PER_CALL" => nil
|
||||
) do
|
||||
Setting.llm_context_window = 0
|
||||
subject = Provider::Openai.new("test-token")
|
||||
assert_equal 2048, subject.context_window
|
||||
end
|
||||
ensure
|
||||
Setting.llm_context_window = nil
|
||||
end
|
||||
|
||||
test "auto_categorize fans out oversized batches into sequential sub-calls" do
|
||||
with_env_overrides("LLM_MAX_ITEMS_PER_CALL" => "10") do
|
||||
subject = Provider::Openai.new("test-token")
|
||||
transactions = Array.new(25) { |i| { id: i.to_s, name: "txn#{i}", amount: 10, classification: "expense" } }
|
||||
user_categories = [ { id: "cat1", name: "Groceries", is_subcategory: false, parent_id: nil, classification: "expense" } ]
|
||||
|
||||
# Capture the batch size passed to each AutoCategorizer. `.new` is called
|
||||
# once per sub-batch; we record each invocation's transactions count.
|
||||
seen_sizes = []
|
||||
fake_instance = mock
|
||||
fake_instance.stubs(:auto_categorize).returns([])
|
||||
Provider::Openai::AutoCategorizer.stubs(:new).with do |*_args, **kwargs|
|
||||
seen_sizes << kwargs[:transactions].size
|
||||
true
|
||||
end.returns(fake_instance)
|
||||
|
||||
response = subject.auto_categorize(transactions: transactions, user_categories: user_categories)
|
||||
|
||||
assert response.success?
|
||||
assert_equal [ 10, 10, 5 ], seen_sizes
|
||||
end
|
||||
end
|
||||
|
||||
test "build_input no longer accepts inline messages history" do
|
||||
config = Provider::Openai::ChatConfig.new(functions: [], function_results: [])
|
||||
# Positive control: prompt works
|
||||
result = config.build_input(prompt: "hi")
|
||||
assert_equal [ { role: "user", content: "hi" } ], result
|
||||
|
||||
# `messages:` kwarg is no longer part of the signature — calling with it must raise
|
||||
assert_raises(ArgumentError) do
|
||||
config.build_input(prompt: "hi", messages: [ { role: "user", content: "old" } ])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||