Small llms improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
soky srm
2025-12-07 18:11:34 +01:00
committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions

View File

@@ -82,6 +82,10 @@ class Settings::HostingsController < ApplicationController
Setting.openai_model = hosting_params[:openai_model]
end
if hosting_params.key?(:openai_json_mode)
Setting.openai_json_mode = hosting_params[:openai_json_mode].presence
end
redirect_to settings_hosting_path, notice: t(".success")
rescue Setting::ValidationError => error
flash.now[:alert] = error.message
@@ -95,7 +99,7 @@ class Settings::HostingsController < ApplicationController
private
def hosting_params
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :exchange_rate_provider, :securities_provider)
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider)
end
def ensure_admin

113
app/models/eval/dataset.rb Normal file
View File

@@ -0,0 +1,113 @@
# An evaluation dataset: a named, versioned collection of Eval::Sample records
# plus the Eval::Run records executed against it. Supports round-tripping
# to/from YAML and dispatching to the eval_type-specific runner/metrics classes.
class Eval::Dataset < ApplicationRecord
  self.table_name = "eval_datasets"

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: %w[categorization merchant_detection chat] }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  # Creates the dataset if missing, otherwise updates it in place; existing
  # samples are destroyed and re-created, so a re-import fully replaces content.
  # Wrapped in a transaction so a partial import never persists.
  # Returns the imported dataset.
  def self.import_from_yaml(file_path)
    data = YAML.load_file(file_path, permitted_classes: [ Symbol, Date, Time ])
    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!
      # Clear existing samples if reimporting
      dataset.samples.destroy_all
      # Shared context for all samples (a per-sample "context" key overrides it)
      shared_context = data["context"] || {}
      # Import samples
      samples_data = data["samples"] || []
      samples_data.each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end
      # Cache the sample count on the dataset row for cheap display
      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format (the inverse of .import_from_yaml).
  # NOTE(review): "context" is taken from the first sample only — this assumes
  # all samples share the same context; verify for mixed-context datasets.
  def export_to_yaml
    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => samples.first&.context_data || {},
      "samples" => samples.map do |sample|
        {
          "id" => sample.id,
          "difficulty" => sample.difficulty,
          "tags" => sample.tags,
          "input" => sample.input_data,
          "expected" => sample.expected_output,
          "metadata" => sample.metadata
        }.compact # drop keys whose value is nil
      end
    }.to_yaml
  end

  # Generate summary statistics: totals plus per-difficulty and per-tag counts
  # (tags sorted by descending frequency).
  def statistics
    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      by_tags: samples.flat_map(&:tags).tally.sort_by { |_, v| -v }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end
end

View File

@@ -0,0 +1,226 @@
# Thin HTTP client for the Langfuse public API.
#
# Covers the subset of endpoints the eval framework needs: datasets, dataset
# items, dataset-run items, trace ingestion and score ingestion. All requests
# authenticate via HTTP basic auth with the project's public/secret key pair.
class Eval::Langfuse::Client
  BASE_URLS = {
    us: "https://us.cloud.langfuse.com/api/public",
    eu: "https://cloud.langfuse.com/api/public"
  }.freeze

  class Error < StandardError; end
  class ConfigurationError < Error; end

  # Raised for unrecoverable API responses; exposes HTTP status and raw body.
  class ApiError < Error
    attr_reader :status, :body

    def initialize(message, status: nil, body: nil)
      super(message)
      @status = status
      @body = body
    end
  end

  # Keys fall back to LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY env vars.
  # Base URL resolution priority is described in #determine_base_url.
  # Raises ConfigurationError when credentials are missing.
  def initialize(public_key: nil, secret_key: nil, region: nil, host: nil)
    @public_key = public_key || ENV["LANGFUSE_PUBLIC_KEY"]
    @secret_key = secret_key || ENV["LANGFUSE_SECRET_KEY"]
    @base_url = determine_base_url(region, host)
    validate_configuration!
  end

  # Dataset operations

  def create_dataset(name:, description: nil, metadata: {})
    post("/v2/datasets", {
      name: name,
      description: description,
      metadata: metadata
    }.compact)
  end

  def get_dataset(name:)
    get("/v2/datasets/#{encode(name)}")
  end

  def list_datasets(page: 1, limit: 50)
    get("/v2/datasets", page: page, limit: limit)
  end

  # Dataset item operations

  # Pass an explicit +id+ to make repeated exports idempotent.
  def create_dataset_item(dataset_name:, input:, expected_output: nil, metadata: {}, id: nil)
    post("/dataset-items", {
      datasetName: dataset_name,
      id: id,
      input: input,
      expectedOutput: expected_output,
      metadata: metadata
    }.compact)
  end

  def get_dataset_items(dataset_name:, page: 1, limit: 50)
    get("/dataset-items", datasetName: dataset_name, page: page, limit: limit)
  end

  # Dataset run operations (for experiments)

  def create_dataset_run_item(run_name:, dataset_item_id:, trace_id: nil, observation_id: nil, metadata: {})
    post("/dataset-run-items", {
      runName: run_name,
      datasetItemId: dataset_item_id,
      traceId: trace_id,
      observationId: observation_id,
      metadata: metadata
    }.compact)
  end

  # Trace operations

  # Creates a trace via the batched ingestion endpoint and returns the
  # client-generated trace ID (the ingestion response does not echo it back).
  def create_trace(name:, input: nil, output: nil, metadata: {}, session_id: nil, user_id: nil)
    # Generate trace ID upfront so we can return it
    trace_id = SecureRandom.uuid
    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "trace-create",
          timestamp: Time.current.iso8601,
          body: {
            id: trace_id,
            name: name,
            input: input,
            output: output,
            metadata: metadata,
            sessionId: session_id,
            userId: user_id
          }.compact
        }
      ]
    })
    # Return the trace ID we generated
    trace_id
  end

  # Score operations

  def create_score(trace_id:, name:, value:, comment: nil, data_type: "NUMERIC")
    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "score-create",
          timestamp: Time.current.iso8601,
          body: {
            id: SecureRandom.uuid,
            traceId: trace_id,
            name: name,
            value: value,
            comment: comment,
            dataType: data_type
          }.compact
        }
      ]
    })
  end

  def configured?
    @public_key.present? && @secret_key.present?
  end

  private

  # Priority: explicit host > LANGFUSE_HOST env > region > LANGFUSE_REGION env > default (eu)
  def determine_base_url(region, host)
    if host.present?
      host.chomp("/") + "/api/public"
    elsif ENV["LANGFUSE_HOST"].present?
      ENV["LANGFUSE_HOST"].chomp("/") + "/api/public"
    elsif region.present?
      BASE_URLS[region.to_sym] || BASE_URLS[:eu]
    elsif ENV["LANGFUSE_REGION"].present?
      BASE_URLS[ENV["LANGFUSE_REGION"].to_sym] || BASE_URLS[:eu]
    else
      # Default to EU as it's more common
      BASE_URLS[:eu]
    end
  end

  def validate_configuration!
    return if configured?
    raise ConfigurationError, <<~MSG
      Langfuse credentials not configured.
      Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY environment variables,
      or pass public_key and secret_key to the client.
    MSG
  end

  def get(path, params = {})
    uri = build_uri(path, params)
    request = Net::HTTP::Get.new(uri)
    execute_request(uri, request)
  end

  def post(path, body)
    uri = build_uri(path)
    request = Net::HTTP::Post.new(uri)
    request.body = body.to_json
    request["Content-Type"] = "application/json"
    execute_request(uri, request)
  end

  def build_uri(path, params = {})
    uri = URI("#{@base_url}#{path}")
    uri.query = URI.encode_www_form(params) if params.any?
    uri
  end

  # Sends the request, retrying up to +retries+ times on HTTP 429.
  # Returns the parsed JSON body ({} if unparseable); raises ApiError otherwise.
  def execute_request(uri, request, retries: 3)
    request.basic_auth(@public_key, @secret_key)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30
    http.open_timeout = 10
    http.verify_mode = OpenSSL::SSL::VERIFY_PEER
    # Workaround for OpenSSL 3.x CRL checking issues
    # See: https://github.com/ruby/openssl/issues/619
    # BUGFIX: the previous callback returned true unconditionally, which
    # silently disabled certificate verification altogether (any cert,
    # including self-signed MITM certs, would be accepted). Only tolerate
    # the missing-CRL error; every other verification failure stays fatal.
    if OpenSSL::OPENSSL_VERSION_NUMBER >= 0x30000000
      http.verify_callback = lambda do |preverify_ok, store_ctx|
        preverify_ok || store_ctx.error == OpenSSL::X509::V_ERR_UNABLE_TO_GET_CRL
      end
    end
    response = http.request(request)
    case response.code.to_i
    when 200..299
      JSON.parse(response.body) rescue {}
    when 401
      raise ApiError.new("Unauthorized - check your Langfuse API keys", status: 401, body: response.body)
    when 404
      raise ApiError.new("Resource not found", status: 404, body: response.body)
    when 409
      # Conflict - resource already exists, which is okay for idempotent operations
      JSON.parse(response.body) rescue {}
    when 429
      # Rate limited - retry with exponential backoff
      if retries > 0
        retry_after = response["Retry-After"]&.to_i || (2 ** (3 - retries))
        Rails.logger.info("[Langfuse] Rate limited, waiting #{retry_after}s before retry...")
        sleep(retry_after)
        execute_request(uri, rebuild_request(request), retries: retries - 1)
      else
        raise ApiError.new("Rate limit exceeded after retries", status: 429, body: response.body)
      end
    else
      raise ApiError.new("API error: #{response.code} - #{response.body}", status: response.code.to_i, body: response.body)
    end
  end

  # Create a new request with the same properties (needed for retry since request body may be consumed)
  def rebuild_request(original_request)
    uri = URI(original_request.uri.to_s)
    new_request = original_request.class.new(uri)
    original_request.each_header { |key, value| new_request[key] = value }
    new_request.body = original_request.body
    new_request
  end

  def encode(value)
    ERB::Util.url_encode(value)
  end
end

View File

@@ -0,0 +1,115 @@
# Uploads a local Eval::Dataset and all of its samples to Langfuse, where the
# samples become dataset items usable in Langfuse experiments.
class Eval::Langfuse::DatasetExporter
  attr_reader :dataset, :client

  def initialize(dataset, client: nil)
    @dataset = dataset
    @client = client || Eval::Langfuse::Client.new
  end

  # Creates (or reuses) the remote dataset, then uploads every sample.
  # Returns a summary hash: { dataset_name:, items_exported: }.
  def export
    Rails.logger.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")
    create_langfuse_dataset
    uploaded = export_samples
    Rails.logger.info("[Langfuse] Exported #{uploaded} items to dataset '#{langfuse_dataset_name}'")
    {
      dataset_name: langfuse_dataset_name,
      items_exported: uploaded
    }
  end

  private

  # Remote datasets follow a consistent "eval_" naming convention.
  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def create_langfuse_dataset
    client.create_dataset(
      name: langfuse_dataset_name,
      description: dataset.description || "Evaluation dataset: #{dataset.name}",
      metadata: {
        eval_type: dataset.eval_type,
        version: dataset.version,
        source: "sure_eval_framework",
        exported_at: Time.current.iso8601
      }
    )
  rescue Eval::Langfuse::Client::ApiError => e
    # A 409 conflict just means the dataset already exists; anything else is fatal.
    raise unless e.status == 409
    Rails.logger.info("[Langfuse] Dataset '#{langfuse_dataset_name}' already exists, updating items...")
  end

  # Uploads every sample, throttled to respect rate limits.
  # Returns the number of items uploaded.
  def export_samples
    uploaded = 0
    dataset.samples.find_each do |sample|
      export_sample(sample)
      uploaded += 1
      report_progress(uploaded) if (uploaded % 25).zero?
      # Small delay to avoid rate limiting (Langfuse free tier has limits)
      sleep(0.1)
    end
    uploaded
  end

  # Emits a progress line to both the Rails log and stdout.
  def report_progress(count)
    Rails.logger.info("[Langfuse] Exported #{count}/#{dataset.sample_count} items...")
    print " Exported #{count}/#{dataset.sample_count} items...\r"
  end

  def export_sample(sample)
    client.create_dataset_item(
      dataset_name: langfuse_dataset_name,
      id: sample.id, # Reusing the sample ID keeps repeated exports idempotent
      input: build_input(sample),
      expected_output: build_expected_output(sample),
      metadata: build_metadata(sample)
    )
  end

  # Shapes the Langfuse item input according to the dataset's eval type.
  def build_input(sample)
    type = dataset.eval_type
    return { transaction: sample.input_data, categories: sample.categories_context } if type == "categorization"
    return { transaction: sample.input_data, merchants: sample.merchants_context } if type == "merchant_detection"
    return { prompt: sample.chat_prompt, mock_data: sample.mock_data } if type == "chat"
    sample.input_data
  end

  def build_expected_output(sample)
    sample.expected_output
  end

  def build_metadata(sample)
    base = {
      difficulty: sample.difficulty,
      tags: sample.tags,
      eval_type: dataset.eval_type,
      sample_id: sample.id
    }
    base.merge(sample.metadata || {})
  end
end

View File

@@ -0,0 +1,468 @@
# Runs an evaluation experiment against a Langfuse dataset: exports the local
# dataset if needed, processes items in batches through an LLM provider, and
# records traces, scores and dataset-run links back in Langfuse.
class Eval::Langfuse::ExperimentRunner
  attr_reader :dataset, :model, :provider, :client, :provider_config

  # Items are sent to the LLM provider in slices of this size.
  BATCH_SIZE = 25

  def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
    @dataset = dataset
    @model = model
    @provider = provider
    @client = client || Eval::Langfuse::Client.new
    @provider_config = provider_config
  end

  # Executes the full experiment end-to-end.
  # Returns a summary hash: run name, dataset name, model, count and metrics.
  def run(run_name: nil)
    @run_name = run_name || generate_run_name
    Rails.logger.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
    Rails.logger.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
    Rails.logger.info("[Langfuse Experiment] Model: #{model}")
    # Ensure dataset exists in Langfuse
    ensure_dataset_exported
    # Get dataset items from Langfuse
    items = fetch_langfuse_items
    # Run the experiment
    results = process_items(items)
    # Calculate and report metrics
    metrics = calculate_metrics(results)
    Rails.logger.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
    Rails.logger.info("[Langfuse Experiment] Accuracy: #{metrics[:accuracy]}%")
    {
      run_name: @run_name,
      dataset_name: langfuse_dataset_name,
      model: model,
      samples_processed: results.size,
      metrics: metrics
    }
  end

  private

  # e.g. "my_dataset_gpt-4o_20250101_120000" (slashes in model names flattened)
  def generate_run_name
    "#{dataset.name}_#{model.gsub('/', '_')}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  end

  # Must match the naming convention used by Eval::Langfuse::DatasetExporter.
  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def ensure_dataset_exported
    exporter = Eval::Langfuse::DatasetExporter.new(dataset, client: client)
    exporter.export
  end

  # Pages through the Langfuse dataset-items endpoint until a short page
  # signals the end.
  def fetch_langfuse_items
    items = []
    page = 1
    loop do
      response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: 50)
      batch = response["data"] || []
      items.concat(batch)
      break if batch.size < 50
      page += 1
    end
    Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
    items
  end

  def process_items(items)
    results = []
    items.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
      Rails.logger.info("[Langfuse Experiment] Processing batch #{batch_idx + 1}/#{(items.size.to_f / BATCH_SIZE).ceil}")
      batch_results = process_batch(batch)
      results.concat(batch_results)
    end
    results
  end

  # Dispatches a batch to the eval-type-specific processor.
  def process_batch(items)
    case dataset.eval_type
    when "categorization"
      process_categorization_batch(items)
    when "merchant_detection"
      process_merchant_detection_batch(items)
    when "chat"
      process_chat_batch(items)
    else
      raise "Unsupported eval type: #{dataset.eval_type}"
    end
  end

  # Scores one batch of categorization items with a single LLM call.
  # Returns one result hash per item; falls back to handle_batch_error on failure.
  def process_categorization_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end
    # Categories are shared batch context — taken from the first item.
    categories = items.first.dig("input", "categories") || []
    categories = categories.map(&:deep_symbolize_keys)
    # Determine effective JSON mode for this batch
    # If the batch has many expected nulls, force strict mode to prevent false retries
    effective_json_mode = json_mode_for_batch(items)
    start_time = Time.current
    response = llm_provider.auto_categorize(
      transactions: transactions,
      user_categories: categories,
      model: model,
      json_mode: effective_json_mode
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    if response.success?
      items.map do |item|
        categorization = response.data.find { |c| c.transaction_id.to_s == item["id"].to_s }
        actual_category = normalize_null(categorization&.category_name)
        expected_category = item.dig("expectedOutput", "category_name")
        correct = actual_category == expected_category
        score_value = correct ? 1.0 : 0.0
        # Create trace and score in Langfuse
        trace_id = create_trace_for_item(item, actual_category, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)
        {
          item_id: item["id"],
          expected: expected_category,
          actual: actual_category,
          correct: correct,
          # Batch latency split evenly per item (integer division)
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  # Scores one batch of merchant-detection items; correctness requires both
  # the business name and the (normalized) URL to match.
  def process_merchant_detection_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end
    # Known merchants are shared batch context — taken from the first item.
    merchants = items.first.dig("input", "merchants") || []
    merchants = merchants.map(&:deep_symbolize_keys)
    start_time = Time.current
    response = llm_provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    if response.success?
      items.map do |item|
        detection = response.data.find { |m| m.transaction_id.to_s == item["id"].to_s }
        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)
        expected_name = item.dig("expectedOutput", "business_name")
        expected_url = item.dig("expectedOutput", "business_url")
        name_match = actual_name == expected_name
        url_match = normalize_url(actual_url) == normalize_url(expected_url)
        correct = name_match && url_match
        score_value = correct ? 1.0 : 0.0
        # Create trace and score in Langfuse
        actual_output = { business_name: actual_name, business_url: actual_url }
        trace_id = create_trace_for_item(item, actual_output, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])
        {
          item_id: item["id"],
          expected: { name: expected_name, url: expected_url },
          actual: { name: actual_name, url: actual_url },
          # Batch latency split evenly per item (integer division)
          correct: correct,
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  def process_chat_batch(items)
    # Chat is processed one at a time due to function calling complexity
    items.map do |item|
      process_chat_item(item)
    end
  end

  # Runs a single chat prompt and checks the model called the expected functions.
  def process_chat_item(item)
    prompt = item.dig("input", "prompt")
    expected_functions = item.dig("expectedOutput", "functions") || []
    start_time = Time.current
    response = llm_provider.chat_response(
      prompt,
      model: model,
      instructions: "You are a helpful personal finance assistant.",
      functions: build_available_functions
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    actual_functions = extract_function_calls(response)
    correct = evaluate_function_match(actual_functions, expected_functions)
    score_value = correct ? 1.0 : 0.0
    # Create trace and score in Langfuse
    trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)
    {
      item_id: item["id"],
      expected: expected_functions,
      actual: actual_functions,
      correct: correct,
      latency_ms: latency_ms
    }
  rescue => e
    handle_item_error(item, e)
  end

  # Records a trace in Langfuse for one processed item; returns the trace ID.
  def create_trace_for_item(item, output, latency_ms)
    trace_id = client.create_trace(
      name: "#{dataset.eval_type}_eval",
      input: item["input"],
      output: output,
      metadata: {
        run_name: @run_name,
        model: model,
        latency_ms: latency_ms,
        dataset_item_id: item["id"]
      }
    )
    Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
    trace_id
  end

  # Attaches an accuracy score to the trace and links it to the dataset run.
  # Scoring failures are logged but never abort the experiment.
  def score_result(trace_id, item_id, score_value, correct, actual, expected)
    return unless trace_id
    # Score the accuracy
    client.create_score(
      trace_id: trace_id,
      name: "accuracy",
      value: score_value,
      comment: correct ? "Correct" : "Expected: #{expected.inspect}, Got: #{actual.inspect}"
    )
    # Link to dataset run
    client.create_dataset_run_item(
      run_name: @run_name,
      dataset_item_id: item_id,
      trace_id: trace_id,
      metadata: {
        correct: correct,
        actual: actual,
        expected: expected
      }
    )
  rescue => e
    Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
  end

  # Marks every item in a failed batch as incorrect with the error attached.
  def handle_batch_error(items, error)
    error_message = error.is_a?(Exception) ? error.message : error.to_s
    Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")
    items.map do |item|
      {
        item_id: item["id"],
        expected: item["expectedOutput"],
        actual: { error: error_message },
        correct: false,
        latency_ms: 0
      }
    end
  end

  # Marks a single failed item as incorrect with the error attached.
  def handle_item_error(item, error)
    Rails.logger.error("[Langfuse Experiment] Item #{item['id']} error: #{error.message}")
    {
      item_id: item["id"],
      expected: item["expectedOutput"],
      actual: { error: error.message },
      correct: false,
      latency_ms: 0
    }
  end

  # Aggregates per-item results into accuracy/latency summary metrics.
  def calculate_metrics(results)
    total = results.size
    # Guard against empty results to avoid division by zero
    if total.zero?
      return {
        accuracy: 0.0,
        total: 0,
        correct: 0,
        incorrect: 0,
        avg_latency_ms: 0
      }
    end
    correct = results.count { |r| r[:correct] }
    avg_latency = results.sum { |r| r[:latency_ms] } / total.to_f
    {
      accuracy: (correct.to_f / total * 100).round(2),
      total: total,
      correct: correct,
      incorrect: total - correct,
      avg_latency_ms: avg_latency.round(0)
    }
  end

  def llm_provider
    @llm_provider ||= build_provider
  end

  # Builds the LLM provider from provider_config, ENV, or app Settings
  # (in that priority order). Only "openai" is currently supported.
  def build_provider
    case provider
    when "openai"
      access_token = provider_config[:access_token] ||
        ENV["OPENAI_ACCESS_TOKEN"] ||
        Setting.openai_access_token
      raise "OpenAI access token not configured" unless access_token.present?
      uri_base = provider_config[:uri_base] ||
        ENV["OPENAI_URI_BASE"] ||
        Setting.openai_uri_base
      Provider::Openai.new(access_token, uri_base: uri_base, model: model)
    else
      raise "Unsupported provider: #{provider}"
    end
  end

  # Determine the effective JSON mode for a batch based on expected null ratio
  # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
  def json_mode_for_batch(items)
    # If a specific mode is configured (not "auto"), always use it
    configured_mode = provider_config[:json_mode]
    return configured_mode if configured_mode.present? && configured_mode != "auto"
    # Calculate expected null ratio for this batch
    expected_null_count = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
    expected_null_ratio = expected_null_count.to_f / items.size
    # If >50% of the batch is expected to return null, force strict mode
    # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
    # and prevents unnecessary retries when nulls are legitimate
    if expected_null_ratio > 0.5
      Rails.logger.info("[Langfuse Experiment] Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode")
      "strict"
    else
      # Use auto mode - let the auto-categorizer decide
      "auto"
    end
  end

  # Treats nil, the literal string "null" and blank strings as nil.
  def normalize_null(value)
    return nil if value.nil?
    return nil if value == "null"
    return nil if value.to_s.strip.empty?
    value
  end

  # Canonicalizes URLs for comparison: lowercase, no scheme/www, no trailing slash.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.downcase
      .gsub(/^(https?:\/\/)?(www\.)?/, "")
      .chomp("/")
      .strip
  end

  def build_available_functions
    # Simplified function definitions for chat eval
    [
      {
        name: "get_accounts",
        description: "Get user's financial accounts",
        params_schema: { type: "object", properties: {}, required: [] }
      },
      {
        name: "get_transactions",
        description: "Get transactions with optional filters",
        params_schema: {
          type: "object",
          properties: {
            account_id: { type: "string" },
            start_date: { type: "string" },
            end_date: { type: "string" },
            category: { type: "string" }
          }
        }
      },
      {
        name: "get_balance_summary",
        description: "Get balance summary across accounts",
        params_schema: { type: "object", properties: {} }
      },
      {
        name: "get_spending_by_category",
        description: "Get spending breakdown by category",
        params_schema: {
          type: "object",
          properties: {
            start_date: { type: "string" },
            end_date: { type: "string" }
          }
        }
      }
    ]
  end

  # Collects { name:, arguments: } pairs from every message in the response.
  def extract_function_calls(response)
    return [] unless response.respond_to?(:messages)
    response.messages.flat_map do |msg|
      next [] unless msg.respond_to?(:function_calls)
      msg.function_calls.map do |fc|
        { name: fc.name, arguments: fc.arguments }
      end
    end.compact
  end

  # True when the sets of called function names match the expected set
  # (order-insensitive; arguments are not compared).
  def evaluate_function_match(actual, expected)
    return true if expected.empty? && actual.empty?
    return false if expected.empty? != actual.empty?
    expected_names = expected.map { |f| f["name"] || f[:name] }.sort
    actual_names = actual.map { |f| f["name"] || f[:name] }.sort
    expected_names == actual_names
  end
end

View File

@@ -0,0 +1,68 @@
# Common plumbing shared by all eval metric calculators. Subclasses implement
# #calculate and build on the protected query helpers below.
class Eval::Metrics::Base
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Subclasses must return a hash of metric name => value.
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # All results for the run, with samples eager-loaded (memoized).
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  def samples
    @samples ||= eval_run.dataset.samples
  end

  def total_count
    results.count
  end

  def correct_count
    results.where(correct: true).count
  end

  def incorrect_count
    results.where(correct: false).count
  end

  # Overall accuracy as a rounded percentage; 0.0 when there are no results.
  def accuracy
    total_count.zero? ? 0.0 : (correct_count.to_f / total_count * 100).round(2)
  end

  # Mean latency in whole milliseconds; nil when there are no results.
  def avg_latency_ms
    total_count.zero? ? nil : results.average(:latency_ms)&.round(0)
  end

  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  def cost_per_sample
    total_count.zero? ? nil : (total_cost / total_count).round(6)
  end

  # Per-difficulty accuracy breakdown; difficulties with no results are omitted.
  def metrics_by_difficulty
    %w[easy medium hard edge_case].each_with_object({}) do |level, breakdown|
      scoped = results.joins(:sample).where(eval_samples: { difficulty: level })
      next if scoped.empty?
      hits = scoped.where(correct: true).count
      size = scoped.count
      breakdown[level] = {
        count: size,
        correct: hits,
        accuracy: (hits.to_f / size * 100).round(2)
      }
    end
  end
end

View File

@@ -0,0 +1,101 @@
# Categorization-specific metrics layered on Eval::Metrics::Base.
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Share of results that matched the primary expected category exactly.
  def exact_match_accuracy
    percentage(results.where(exact_match: true).count, total_count)
  end

  # How many results matched an alternative (but not the primary) category.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where null was the expected answer;
  # vacuously 100% when no sample expected null.
  def null_accuracy
    expected_null = results.where(null_expected: true)
    return 100.0 if expected_null.empty?
    matched = expected_null.where(null_returned: true).count
    (matched.to_f / expected_null.count * 100).round(2)
  end

  # Share of results matching at the hierarchical level (includes exact matches).
  def hierarchical_accuracy
    percentage(results.where(hierarchical_match: true).count, total_count)
  end

  # TP / (TP + FP): correct non-null predictions over all non-null predictions.
  def precision
    tp = results.where(correct: true, null_returned: false).count
    fp = results.where(correct: false, null_returned: false).count
    percentage(tp, tp + fp)
  end

  # TP / (TP + FN): FN are results that returned null when a category was expected.
  def recall
    tp = results.where(correct: true, null_returned: false).count
    fn = results.where(null_expected: false, null_returned: true).count
    percentage(tp, tp + fn)
  end

  # Harmonic mean of precision and recall (both already percentages).
  def f1_score
    p = precision
    r = recall
    return 0.0 if p.zero? || r.zero?
    (2 * p * r / (p + r)).round(2)
  end

  # Accuracy broken down by expected category name ("null" for none expected).
  def metrics_by_category
    tallies = results.includes(:sample).each_with_object({}) do |result, acc|
      key = result.sample.expected_category_name || "null"
      bucket = (acc[key] ||= { correct: 0, total: 0 })
      bucket[:total] += 1
      bucket[:correct] += 1 if result.correct
    end
    tallies.transform_values do |bucket|
      bucket.merge(accuracy: (bucket[:correct].to_f / bucket[:total] * 100).round(2))
    end
  end

  # Rounded percentage helper; 0.0 when the denominator is zero.
  def percentage(numerator, denominator)
    return 0.0 if denominator.zero?
    (numerator.to_f / denominator * 100).round(2)
  end
end

View File

@@ -0,0 +1,125 @@
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
def calculate
{
accuracy: accuracy,
function_selection_accuracy: function_selection_accuracy,
parameter_accuracy: parameter_accuracy,
response_relevance: response_relevance,
exact_match_rate: exact_match_rate,
error_rate: error_rate,
avg_functions_per_response: avg_functions_per_response,
samples_processed: total_count,
samples_correct: correct_count,
avg_latency_ms: avg_latency_ms,
total_cost: total_cost,
cost_per_sample: cost_per_sample,
by_difficulty: metrics_by_difficulty,
by_function: metrics_by_function
}
end
private
def function_selection_accuracy
# Percentage of samples where correct functions were called
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
correct = valid_results.count do |r|
r.metadata.dig("function_selection_correct") == true
end
(correct.to_f / valid_results.count * 100).round(2)
end
def parameter_accuracy
# Average parameter accuracy across all samples
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
scores = valid_results.map do |r|
r.metadata.dig("parameter_accuracy") || 0.0
end
(scores.sum / scores.size * 100).round(2)
end
def response_relevance
# Percentage of samples where response contained expected keywords
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
correct = valid_results.count do |r|
# If no keywords expected, consider it relevant
expected_keywords = r.metadata.dig("expected_keywords") || []
expected_keywords.empty? || r.metadata.dig("response_keywords_found") == true
end
(correct.to_f / valid_results.count * 100).round(2)
end
def exact_match_rate
return 0.0 if total_count.zero?
(results.where(exact_match: true).count.to_f / total_count * 100).round(2)
end
def error_rate
return 0.0 if total_count.zero?
errors = results.count do |r|
r.metadata.dig("error").present? || r.actual_output.dig("error").present?
end
(errors.to_f / total_count * 100).round(2)
end
def avg_functions_per_response
valid_results = results.where.not("actual_output->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
total_functions = valid_results.sum do |r|
functions = r.actual_output.dig("functions") || []
functions.size
end
(total_functions.to_f / valid_results.count).round(2)
end
# Per-function accuracy breakdown: for every function a sample expected,
# tallies how often the model actually called it (name-normalized match)
# and accumulates the sample-level parameter accuracy for those hits.
# Returns a hash of function name => { total:, correct:, accuracy:, avg_param_accuracy: }.
def metrics_by_function
  # Group results by expected function and calculate accuracy
  function_metrics = {}
  results.includes(:sample).each do |result|
    expected_functions = result.sample.expected_functions
    expected_functions.each do |func|
      name = func["name"]
      next if name.nil?
      function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 }
      function_metrics[name][:total] += 1
      # Check if this specific function was called correctly
      actual_functions = result.actual_output.dig("functions") || []
      if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) }
        function_metrics[name][:correct] += 1
        # NOTE(review): parameter_accuracy is sample-wide, not per-function, so
        # a multi-function sample contributes the same score to each matched function.
        function_metrics[name][:param_accuracy_sum] += (result.metadata.dig("parameter_accuracy") || 0.0)
      end
    end
  end
  function_metrics.transform_values do |metrics|
    {
      total: metrics[:total],
      correct: metrics[:correct],
      accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
      # Average parameter accuracy only over samples where the function was called.
      avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0
    }
  end
end
# Canonicalizes a function name for comparison: snake_case, lowercased.
# Returns nil for nil input.
def normalize_name(name)
  name&.to_s&.underscore&.downcase
end
end

View File

@@ -0,0 +1,107 @@
# Metrics for merchant-detection eval runs: exact and fuzzy business-name
# accuracy, URL accuracy, and false positive/negative rates for null handling.
# Shared aggregates (accuracy, counts, latency, cost, by_difficulty) come from
# Eval::Metrics::Base.
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Minimum similarity score (0.0..1.0) for a fuzzy name match to count as correct.
  FUZZY_MATCH_THRESHOLD = 0.8

  # Returns the full metrics hash persisted onto the eval run.
  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private
    # Exact name match accuracy over samples that expect a business name.
    # Returns 100.0 when no sample expects a name (vacuously correct).
    def name_accuracy
      name_results = results.includes(:sample).select do |r|
        r.sample.expected_business_name.present?
      end
      return 100.0 if name_results.empty?
      correct = name_results.count do |r|
        # dig with a single key is equivalent to []; no fallback needed.
        actual = r.actual_output.dig("business_name")
        expected = r.sample.expected_business_name
        actual == expected
      end
      (correct.to_f / name_results.size * 100).round(2)
    end

    # Fuzzy name accuracy: counts results whose stored fuzzy_score meets the threshold.
    def fuzzy_name_accuracy
      name_results = results.includes(:sample).select do |r|
        r.sample.expected_business_name.present?
      end
      return 100.0 if name_results.empty?
      correct = name_results.count do |r|
        (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD
      end
      (correct.to_f / name_results.size * 100).round(2)
    end

    # URL accuracy over samples expecting a URL, after normalization
    # (scheme/www stripped, lowercased, trailing slash removed).
    def url_accuracy
      url_results = results.includes(:sample).select do |r|
        r.sample.expected_business_url.present?
      end
      return 100.0 if url_results.empty?
      correct = url_results.count do |r|
        actual = r.actual_output.dig("business_url")
        expected = r.sample.expected_business_url
        normalize_url(actual) == normalize_url(expected)
      end
      (correct.to_f / url_results.size * 100).round(2)
    end

    # Rate of returning a merchant when null was expected (hallucination rate).
    def false_positive_rate
      null_expected_results = results.where(null_expected: true)
      return 0.0 if null_expected_results.empty?
      false_positives = null_expected_results.where(null_returned: false).count
      (false_positives.to_f / null_expected_results.count * 100).round(2)
    end

    # Rate of returning null when a merchant was expected (missed-detection rate).
    def false_negative_rate
      merchant_expected_results = results.where(null_expected: false)
      return 0.0 if merchant_expected_results.empty?
      false_negatives = merchant_expected_results.where(null_returned: true).count
      (false_negatives.to_f / merchant_expected_results.count * 100).round(2)
    end

    # Mean fuzzy score across results that have one; nil when none do.
    def avg_fuzzy_score
      scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
      return nil if scores.empty?
      (scores.sum / scores.size).round(4)
    end

    # Strips scheme and leading "www.", lowercases, and drops a trailing slash
    # so equivalent URLs compare equal.
    def normalize_url(url)
      return nil if url.nil?
      url.to_s.downcase
        .gsub(/^(https?:\/\/)?(www\.)?/, "")
        .chomp("/")
        .strip
    end
end

View File

@@ -0,0 +1,205 @@
# Compares multiple Eval::Run records side by side: renders a terminal table,
# exports CSV, and produces a best-model summary with a plain-English
# recommendation.
class Eval::Reporters::ComparisonReporter
  attr_reader :runs

  # @param runs [Eval::Run, Array<Eval::Run>] one or more runs; stored sorted by model name
  def initialize(runs)
    @runs = Array(runs).sort_by(&:model)
  end

  # Generate a text table for terminal display
  def to_table
    return "No runs to compare" if runs.empty?
    headers = build_headers
    rows = runs.map { |run| build_row(run) }
    # Column width = widest cell in that column, header included.
    all_rows = [ headers ] + rows
    widths = headers.each_index.map do |i|
      all_rows.map { |row| row[i].to_s.length }.max
    end
    # Build table
    separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+"
    lines = []
    lines << separator
    lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |"
    lines << separator
    rows.each do |row|
      lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |"
    end
    lines << separator
    lines.join("\n")
  end

  # Export all runs to a CSV file; returns the file path.
  def to_csv(file_path)
    require "csv"
    CSV.open(file_path, "wb") do |csv|
      csv << csv_headers
      runs.each { |run| csv << csv_row(run) }
    end
    file_path
  end

  # Generate summary with best model recommendations.
  # Only completed runs with metrics participate; returns {} otherwise.
  def summary
    return {} if runs.empty?
    completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? }
    return {} if completed_runs.empty?
    best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 }
    lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY }
    fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY }
    {
      best_accuracy: {
        model: best_accuracy.model,
        value: best_accuracy.metrics["accuracy"],
        run_id: best_accuracy.id
      },
      lowest_cost: {
        model: lowest_cost.model,
        value: lowest_cost.total_cost&.to_f,
        run_id: lowest_cost.id
      },
      fastest: {
        model: fastest.model,
        value: fastest.metrics["avg_latency_ms"],
        run_id: fastest.id
      },
      recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest)
    }
  end

  # Generate detailed comparison between runs
  def detailed_comparison
    return {} if runs.empty?
    {
      runs: runs.map(&:summary),
      comparison: pairwise_comparisons,
      summary: summary
    }
  end

  private
    def build_headers
      [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ]
    end

    # One table row per run; metrics may be absent for pending/failed runs.
    def build_row(run)
      metrics = run.metrics || {}
      [
        run.model,
        run.status,
        format_percentage(metrics["accuracy"]),
        format_percentage(metrics["precision"]),
        format_percentage(metrics["recall"]),
        format_percentage(metrics["f1_score"]),
        metrics["avg_latency_ms"]&.round(0) || "-",
        format_cost(run.total_cost),
        run.results.count
      ]
    end

    def csv_headers
      [
        "Run ID", "Model", "Provider", "Dataset", "Status",
        "Accuracy", "Precision", "Recall", "F1 Score",
        "Null Accuracy", "Hierarchical Accuracy",
        "Avg Latency (ms)", "Total Cost", "Cost Per Sample",
        "Samples Processed", "Samples Correct",
        "Duration (s)", "Run Date"
      ]
    end

    def csv_row(run)
      metrics = run.metrics || {}
      [
        run.id,
        run.model,
        run.provider,
        run.dataset.name,
        run.status,
        metrics["accuracy"],
        metrics["precision"],
        metrics["recall"],
        metrics["f1_score"],
        metrics["null_accuracy"],
        metrics["hierarchical_accuracy"],
        metrics["avg_latency_ms"],
        run.total_cost&.to_f,
        metrics["cost_per_sample"],
        metrics["samples_processed"],
        metrics["samples_correct"],
        run.duration_seconds,
        run.completed_at&.iso8601
      ]
    end

    def format_percentage(value)
      return "-" if value.nil?
      "#{value}%"
    end

    def format_cost(value)
      return "-" if value.nil?
      "$#{value.to_f.round(4)}"
    end

    # Pairwise accuracy/cost/latency deltas between every pair of runs.
    # FIX: guard nil metrics (pending/failed runs) like build_row does —
    # previously this raised NoMethodError on `run.metrics["accuracy"]`.
    def pairwise_comparisons
      return [] if runs.size < 2
      comparisons = []
      runs.combination(2).each do |run1, run2|
        m1 = run1.metrics || {}
        m2 = run2.metrics || {}
        comparisons << {
          models: [ run1.model, run2.model ],
          accuracy_diff: ((m1["accuracy"] || 0) - (m2["accuracy"] || 0)).round(2),
          cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6),
          latency_diff: ((m1["avg_latency_ms"] || 0) - (m2["avg_latency_ms"] || 0)).round(0)
        }
      end
      comparisons
    end

    # Builds a short English recommendation from the per-dimension winners.
    # Callers (summary) only pass completed runs with metrics present.
    def generate_recommendation(best_accuracy, lowest_cost, fastest)
      parts = []
      # If one model wins all categories
      if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id
        return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)."
      end
      # Accuracy recommendation
      if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90
        parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)"
      end
      # Cost recommendation if significantly cheaper
      if lowest_cost.total_cost && lowest_cost.total_cost > 0
        cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost
        if cost_ratio > 1.5
          parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})"
        end
      end
      # Speed recommendation
      if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id
        latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1)
        if latency_ratio > 1.5
          parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)"
        end
      end
      parts.empty? ? "All models perform similarly." : parts.join(". ")
    end
end

70
app/models/eval/result.rb Normal file
View File

@@ -0,0 +1,70 @@
# One evaluated sample's outcome within an Eval::Run: the model's actual
# output (JSON) plus correctness flags, latency and cost.
class Eval::Result < ApplicationRecord
  self.table_name = "eval_results"
  belongs_to :run, class_name: "Eval::Run", foreign_key: :eval_run_id
  belongs_to :sample, class_name: "Eval::Sample", foreign_key: :eval_sample_id
  validates :actual_output, presence: true
  validates :correct, inclusion: { in: [ true, false ] }
  scope :correct, -> { where(correct: true) }
  scope :incorrect, -> { where(correct: false) }
  scope :with_nulls_returned, -> { where(null_returned: true) }
  scope :with_nulls_expected, -> { where(null_expected: true) }
  scope :exact_matches, -> { where(exact_match: true) }
  scope :hierarchical_matches, -> { where(hierarchical_match: true) }

  # NOTE: `dig("key")` with a single key behaves exactly like `["key"]`, so the
  # previous `dig(...) || [...]` fallbacks were redundant and are simplified below.

  # Get actual category (for categorization results)
  def actual_category_name
    actual_output["category_name"]
  end

  # Get actual merchant info (for merchant detection results)
  def actual_business_name
    actual_output["business_name"]
  end

  def actual_business_url
    actual_output["business_url"]
  end

  # Get actual functions called (for chat results); always an array.
  def actual_functions
    actual_output["functions"] || []
  end

  # Get actual response text (for chat results)
  def actual_response_text
    actual_output["response_text"]
  end

  # Compact summary for display/listing.
  def summary
    {
      sample_id: sample_id,
      correct: correct,
      exact_match: exact_match,
      expected: sample.expected_output,
      actual: actual_output,
      latency_ms: latency_ms,
      cost: cost&.to_f
    }
  end

  # Full expected-vs-actual comparison, including null-handling flags.
  def detailed_comparison
    {
      sample_difficulty: sample.difficulty,
      sample_tags: sample.tags,
      input: sample.input_data,
      expected: sample.expected_output,
      actual: actual_output,
      correct: correct,
      exact_match: exact_match,
      hierarchical_match: hierarchical_match,
      null_expected: null_expected,
      null_returned: null_returned,
      fuzzy_score: fuzzy_score
    }
  end
end

88
app/models/eval/run.rb Normal file
View File

@@ -0,0 +1,88 @@
# A single evaluation run of one model/provider over a dataset, with a
# pending -> running -> completed/failed lifecycle and aggregated metrics.
class Eval::Run < ApplicationRecord
  self.table_name = "eval_runs"
  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy
  validates :provider, :model, :status, presence: true
  validates :status, inclusion: { in: %w[pending running completed failed] }
  scope :pending, -> { where(status: "pending") }
  scope :running, -> { where(status: "running") }
  scope :completed, -> { where(status: "completed") }
  scope :failed, -> { where(status: "failed") }
  scope :for_model, ->(model) { where(model: model) }
  scope :for_provider, ->(provider) { where(provider: provider) }

  # Wall-clock duration in whole seconds; nil until the run has both timestamps.
  def duration_seconds
    return nil unless started_at && completed_at
    (completed_at - started_at).to_i
  end

  # Accuracy from stored metrics, falling back to computing it from results.
  # FIX: safe-navigate metrics — a pending run may have nil metrics, which
  # previously raised NoMethodError on `metrics.dig`.
  def accuracy
    metrics&.dig("accuracy") || calculate_accuracy
  end

  # Transition pending -> running and stamp the start time.
  def start!
    update!(status: "running", started_at: Time.current)
  end

  # Transition to completed, persisting metrics and token/cost totals
  # aggregated from the individual results.
  def complete!(calculated_metrics)
    update!(
      status: "completed",
      completed_at: Time.current,
      metrics: calculated_metrics,
      total_prompt_tokens: results.sum(:prompt_tokens),
      total_completion_tokens: results.sum(:completion_tokens),
      total_cost: results.sum(:cost)
    )
  end

  # Transition to failed, recording the error (exception or message string).
  def fail!(error)
    update!(
      status: "failed",
      completed_at: Time.current,
      error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s
    )
  end

  # Compact summary for display/reporting.
  def summary
    {
      id: id,
      name: name,
      dataset: dataset.name,
      model: model,
      provider: provider,
      status: status,
      accuracy: accuracy,
      total_cost: total_cost&.to_f,
      duration: duration_seconds,
      samples_processed: results.count,
      samples_correct: results.where(correct: true).count,
      created_at: created_at
    }
  end

  # Accuracy/cost deltas versus another run.
  def compare_to(other_run)
    {
      accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0),
      cost_diff: (total_cost || 0) - (other_run.total_cost || 0),
      this_model: model,
      other_model: other_run.model
    }
  end

  private
    # Percentage of correct results; 0.0 when there are none yet.
    def calculate_accuracy
      return 0.0 if results.empty?
      (results.where(correct: true).count.to_f / results.count * 100).round(2)
    end
end

View File

@@ -0,0 +1,82 @@
# Abstract base for eval runners. Owns the run lifecycle (start!, complete!,
# fail!) and provider construction; subclasses implement #process_samples and
# #calculate_metrics.
class Eval::Runners::Base
attr_reader :eval_run
# @param eval_run [Eval::Run] the run to execute
def initialize(eval_run)
@eval_run = eval_run
end
# Executes the run: marks it running, processes all samples, then persists
# metrics. On any error the run is marked failed and the exception is
# re-raised so callers/jobs still see it.
def run
eval_run.start!
begin
process_samples
metrics = calculate_metrics
eval_run.complete!(metrics)
rescue => e
eval_run.fail!(e)
raise
end
eval_run
end
protected
# Subclass hook: iterate dataset samples and record results.
def process_samples
raise NotImplementedError, "Subclasses must implement #process_samples"
end
# Subclass hook: return the metrics hash passed to Eval::Run#complete!.
def calculate_metrics
raise NotImplementedError, "Subclasses must implement #calculate_metrics"
end
def samples
eval_run.dataset.samples
end
# Memoized provider client built from the run's provider config.
def provider
@provider ||= build_provider
end
def model
eval_run.model
end
private
def build_provider
case eval_run.provider
when "openai"
build_openai_provider
else
raise "Unsupported provider: #{eval_run.provider}"
end
end
# Credential precedence (intentional order): per-run provider_config,
# then environment variable, then global Setting.
def build_openai_provider
access_token = eval_run.provider_config["access_token"].presence ||
ENV["OPENAI_ACCESS_TOKEN"].presence ||
Setting.openai_access_token
raise "OpenAI access token not configured" unless access_token.present?
uri_base = eval_run.provider_config["uri_base"].presence ||
ENV["OPENAI_URI_BASE"].presence ||
Setting.openai_uri_base
Provider::Openai.new(access_token, uri_base: uri_base, model: model)
end
# Persists one Eval::Result row for this run.
def record_result(sample:, actual_output:, correct:, **attributes)
eval_run.results.create!(
sample: sample,
actual_output: actual_output,
correct: correct,
**attributes
)
end
def log_progress(message)
Rails.logger.info("[Eval::Runner] #{message}")
end
end

View File

@@ -0,0 +1,199 @@
# Runs transaction-categorization evals in batches via the provider's
# auto_categorize endpoint, scoring exact, alternative, and hierarchical
# category matches, with special JSON-mode handling for null-heavy batches.
class Eval::Runners::CategorizationRunner < Eval::Runners::Base
DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit
protected
# Slices samples into batches and categorizes each batch in one provider call.
def process_samples
all_samples = samples.to_a
batch_size = effective_batch_size
log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}")
all_samples.each_slice(batch_size).with_index do |batch, batch_idx|
log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / batch_size).ceil}")
process_batch(batch)
end
end
# Use smaller batches for custom providers (local LLMs) to reduce context length
def effective_batch_size
eval_run.provider_config["batch_size"]&.to_i || DEFAULT_BATCH_SIZE
end
# Get JSON mode from provider config (optional override)
# Valid values: "strict", "json_object", "none"
def json_mode
eval_run.provider_config["json_mode"]
end
def calculate_metrics
Eval::Metrics::CategorizationMetrics.new(eval_run).calculate
end
private
# Categorizes one batch and records per-sample results; a batch-level error
# (failed response or raised exception) is recorded against every sample.
def process_batch(batch_samples)
return if batch_samples.empty?
# Build inputs for the provider
transactions = batch_samples.map do |sample|
sample.to_transaction_input.merge(id: sample.id)
end
# Get categories from first sample's context (should be shared)
# Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys
categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys)
# Determine effective JSON mode for this batch
# If the batch has many expected nulls and we're using auto mode, force strict mode
# to prevent the auto-categorizer from incorrectly retrying (it would see many nulls
# and think strict mode is broken, when actually the nulls are expected)
effective_json_mode = json_mode_for_batch(batch_samples)
start_time = Time.current
begin
response = provider.auto_categorize(
transactions: transactions,
user_categories: categories,
model: model,
json_mode: effective_json_mode
)
latency_ms = ((Time.current - start_time) * 1000).to_i
# Integer division: per-sample latency is an approximation of the batch latency split evenly.
per_sample_latency = latency_ms / batch_samples.size
if response.success?
record_batch_results(batch_samples, response.data, per_sample_latency)
else
record_batch_errors(batch_samples, response.error, per_sample_latency)
end
rescue => e
latency_ms = ((Time.current - start_time) * 1000).to_i
per_sample_latency = latency_ms / batch_samples.size
record_batch_errors(batch_samples, e, per_sample_latency)
end
end
# Matches each categorization back to its sample by transaction id and scores
# exact / alternative / hierarchical correctness plus null handling.
def record_batch_results(batch_samples, categorizations, per_sample_latency)
batch_samples.each do |sample|
# Find the categorization result for this sample
categorization = categorizations.find { |c| c.transaction_id.to_s == sample.id.to_s }
actual_category = categorization&.category_name
# Normalize "null" string to nil
actual_category = nil if actual_category == "null"
expected_category = sample.expected_category_name
acceptable_categories = sample.all_acceptable_categories
# Evaluate correctness - check primary expected and alternatives
correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories)
exact_match = actual_category == expected_category
alternative_match = acceptable_categories.include?(actual_category) && !exact_match
hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample)
record_result(
sample: sample,
actual_output: { "category_name" => actual_category },
correct: correct,
exact_match: exact_match,
alternative_match: alternative_match,
hierarchical_match: hierarchical,
null_expected: expected_category.nil?,
null_returned: actual_category.nil?,
latency_ms: per_sample_latency
)
end
end
# Records an identical error result for every sample in a failed batch.
def record_batch_errors(batch_samples, error, per_sample_latency)
error_message = error.is_a?(Exception) ? error.message : error.to_s
batch_samples.each do |sample|
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
hierarchical_match: false,
null_expected: sample.expected_category_name.nil?,
null_returned: true,
latency_ms: per_sample_latency,
metadata: { "error" => error_message }
)
end
end
# Determine the effective JSON mode for a batch based on expected null ratio
# This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
def json_mode_for_batch(batch_samples)
# If a specific mode is configured (not "auto"), always use it
return json_mode if json_mode.present? && json_mode != "auto"
# Calculate expected null ratio for this batch
expected_null_count = batch_samples.count { |s| s.expected_category_name.nil? }
expected_null_ratio = expected_null_count.to_f / batch_samples.size
# If >50% of the batch is expected to return null, force strict mode
# This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
# and prevents unnecessary retries when nulls are legitimate
if expected_null_ratio > 0.5
log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry")
"strict"
else
# Use auto mode - let the auto-categorizer decide
"auto"
end
end
# NOTE(review): appears unused in this file — superseded by
# #evaluate_correctness_with_alternatives; confirm before removing.
def evaluate_correctness(actual, expected)
# Both null = correct
return true if actual.nil? && expected.nil?
# Expected null but got value = incorrect
return false if expected.nil? && actual.present?
# Expected value but got null = incorrect
return false if actual.nil? && expected.present?
# Compare values
actual == expected
end
# Correct when actual matches the primary expected category OR any listed
# acceptable alternative; null handling mirrors #evaluate_correctness.
def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories)
# Both null = correct
return true if actual.nil? && expected.nil?
# Expected null but got value = incorrect
return false if expected.nil? && actual.present?
# Expected value but got null = incorrect
return false if actual.nil? && expected.present?
# Check if actual matches any acceptable category (primary or alternatives)
acceptable_categories.include?(actual)
end
# True when actual equals expected, is expected's parent, or is a child of
# expected (checked in both directions via parent_id links in the sample's
# categories_context).
def evaluate_hierarchical_match(actual, expected, sample)
return false if actual.nil? || expected.nil?
return true if actual == expected
# Check if actual matches parent of expected category
categories = sample.categories_context
# Find the expected category
expected_cat = categories.find { |c| c["name"] == expected }
return false unless expected_cat
# If expected has a parent, check if actual matches the parent
if expected_cat["parent_id"]
parent = categories.find { |c| c["id"].to_s == expected_cat["parent_id"].to_s }
return parent && parent["name"] == actual
end
# Also check if actual is a subcategory of expected (reverse direction)
actual_cat = categories.find { |c| c["name"] == actual }
return false unless actual_cat
if actual_cat["parent_id"]
parent = categories.find { |c| c["id"].to_s == actual_cat["parent_id"].to_s }
return parent && parent["name"] == expected
end
false
end
end

View File

@@ -0,0 +1,255 @@
# Runs chat/function-calling evals one sample at a time, scoring function
# selection, parameter accuracy, and response keyword relevance.
class Eval::Runners::ChatRunner < Eval::Runners::Base
# Chat samples are processed one at a time (not batched)
# because each has unique context and function calling requirements
protected
def process_samples
all_samples = samples.to_a
log_progress("Processing #{all_samples.size} chat samples")
all_samples.each_with_index do |sample, idx|
log_progress("Processing sample #{idx + 1}/#{all_samples.size}")
process_sample(sample)
end
end
def calculate_metrics
Eval::Metrics::ChatMetrics.new(eval_run).calculate
end
private
# Sends one sample's prompt to the provider and records the scored result;
# failures (error responses or exceptions) are recorded as error results.
def process_sample(sample)
prompt = sample.chat_prompt
start_time = Time.current
begin
response = provider.chat_response(
prompt,
model: model,
instructions: build_instructions,
functions: build_function_definitions
)
latency_ms = ((Time.current - start_time) * 1000).to_i
if response.success?
record_chat_result(sample, response.data, latency_ms)
else
record_error_result(sample, response.error, latency_ms)
end
rescue => e
latency_ms = ((Time.current - start_time) * 1000).to_i
record_error_result(sample, e, latency_ms)
end
end
# Scores a successful chat response against the sample's expectations and
# persists the result with scoring details in metadata.
def record_chat_result(sample, chat_response, latency_ms)
# Extract function calls from response
actual_functions = extract_functions_from_response(chat_response)
# Extract response text
response_text = extract_response_text(chat_response)
# Evaluate function calling accuracy
expected_functions = sample.expected_functions
function_match = evaluate_function_match(actual_functions, expected_functions)
# Evaluate response content
expected_keywords = sample.expected_response_contains
response_match = evaluate_response_contains(response_text, expected_keywords)
# Overall correctness: functions are correct AND response contains expected keywords
correct = function_match[:correct] && response_match
record_result(
sample: sample,
actual_output: {
"functions" => actual_functions,
"response_text" => response_text,
"function_match_details" => function_match
},
correct: correct,
exact_match: function_match[:exact_match],
latency_ms: latency_ms,
metadata: {
"function_selection_correct" => function_match[:selection_correct],
"parameter_accuracy" => function_match[:parameter_accuracy],
"response_keywords_found" => response_match,
"expected_functions" => expected_functions,
"expected_keywords" => expected_keywords
}
)
end
def record_error_result(sample, error, latency_ms)
error_message = error.is_a?(Exception) ? error.message : error.to_s
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
latency_ms: latency_ms,
metadata: { "error" => error_message }
)
end
# Maps the response's function requests to { "name", "params" } hashes.
def extract_functions_from_response(chat_response)
# ChatResponse has function_requests array
function_requests = chat_response.function_requests || []
function_requests.map do |req|
{
"name" => req.function_name,
"params" => parse_function_args(req.function_args)
}
end
end
# Accepts a Hash or a JSON string; unparsable input yields {}.
def parse_function_args(args)
return {} if args.nil?
return args if args.is_a?(Hash)
JSON.parse(args)
rescue JSON::ParserError
{}
end
def extract_response_text(chat_response)
# ChatResponse has messages array with output_text
messages = chat_response.messages || []
messages.map(&:output_text).compact.join("\n")
end
# Scores function calling: selection is correct when every expected function
# was called (extras beyond an empty expectation fail); parameter accuracy is
# averaged across expected functions (0.0 for any expected-but-missing call).
def evaluate_function_match(actual_functions, expected_functions)
return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty?
return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any?
# Check function selection accuracy
expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact
selection_correct = expected_names.all? { |name| actual_names.include?(name) }
# Check parameter accuracy for matched functions
param_scores = []
expected_functions.each do |expected_func|
expected_name = normalize_function_name(expected_func["name"])
actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }
if actual_func
param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {})
param_scores << param_score
else
param_scores << 0.0
end
end
parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4)
# Exact match requires same functions with same parameters
exact_match = selection_correct && parameter_accuracy == 1.0
# Correct if all expected functions were called (parameters don't have to be exact)
correct = selection_correct
{
correct: correct,
exact_match: exact_match,
selection_correct: selection_correct,
parameter_accuracy: parameter_accuracy
}
end
def normalize_function_name(name)
return nil if name.nil?
# Convert to snake_case and downcase
name.to_s.underscore.downcase
end
# Fraction (0.0..1.0) of expected parameters whose values match the actual call.
def evaluate_parameters(actual_params, expected_params)
return 1.0 if expected_params.empty?
return 0.0 if actual_params.nil?
actual_params = actual_params.stringify_keys
expected_params = expected_params.stringify_keys
matches = 0
total = expected_params.size
expected_params.each do |key, expected_value|
actual_value = actual_params[key]
if values_match?(actual_value, expected_value)
matches += 1
end
end
(matches.to_f / total).round(4)
end
# Lenient value comparison: case-insensitive strings, subset match for
# arrays, and date-aware comparison for YYYY-MM-DD expected values.
def values_match?(actual, expected)
return true if actual == expected
return true if actual.to_s.downcase == expected.to_s.downcase
# For arrays, check if all expected values are present
if expected.is_a?(Array) && actual.is_a?(Array)
expected_normalized = expected.map { |v| v.to_s.downcase }
actual_normalized = actual.map { |v| v.to_s.downcase }
return expected_normalized.all? { |v| actual_normalized.include?(v) }
end
# For dates, try to parse and compare
if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/
begin
expected_date = Date.parse(expected.to_s)
actual_date = Date.parse(actual.to_s)
return expected_date == actual_date
rescue
# Not valid dates, fall through
end
end
false
end
# True when the response contains every expected keyword (case-insensitive);
# vacuously true with no expected keywords.
def evaluate_response_contains(response_text, expected_keywords)
return true if expected_keywords.empty?
return false if response_text.nil? || response_text.empty?
normalized_response = response_text.downcase
expected_keywords.all? do |keyword|
normalized_response.include?(keyword.to_s.downcase)
end
end
def build_instructions
# Simple instructions for evaluation - we don't have a real user/family context
<<~PROMPT
You are a financial assistant helping users understand their financial data.
Use the functions available to answer questions about accounts, transactions, and financial statements.
Today's date is #{Date.current}.
PROMPT
end
def build_function_definitions
# Return the function definitions that the chat would normally have
[
build_function_definition("get_transactions", "Get paginated transactions with optional filters"),
build_function_definition("get_accounts", "Get all accounts with balances and historical data"),
build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"),
build_function_definition("get_income_statement", "Get income and expenses by category for a period")
]
end
# Open schema: parameters are not validated during evals, only compared post-hoc.
def build_function_definition(name, description)
{
name: name,
description: description,
params_schema: { type: "object", properties: {}, additionalProperties: true },
strict: false
}
end
end

View File

@@ -0,0 +1,199 @@
class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
BATCH_SIZE = 25 # Matches Provider::Openai limit
FUZZY_MATCH_THRESHOLD = 0.8
protected
# Slices all dataset samples into fixed-size batches and detects merchants
# for each batch with a single provider call.
def process_samples
all_samples = samples.to_a
log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")
all_samples.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / BATCH_SIZE).ceil}")
process_batch(batch)
end
end
# Aggregates this run's results into merchant-detection metrics.
def calculate_metrics
Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
end
private
# Detects merchants for one batch via the provider and records per-sample
# results; a batch-level failure (error response or exception) is recorded
# against every sample in the batch.
def process_batch(batch_samples)
  # Guard empty batches (consistent with CategorizationRunner#process_batch):
  # avoids `nil.merchants_context` and division by zero below.
  return if batch_samples.empty?
  # Build inputs for the provider
  transactions = batch_samples.map do |sample|
    sample.to_transaction_input.merge(id: sample.id)
  end
  # Get merchants from first sample's context (should be shared)
  # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
  merchants = batch_samples.first.merchants_context.map(&:deep_symbolize_keys)
  start_time = Time.current
  begin
    response = provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    # Integer division: approximate per-sample share of the batch latency.
    per_sample_latency = latency_ms / batch_samples.size
    if response.success?
      record_batch_results(batch_samples, response.data, per_sample_latency)
    else
      record_batch_errors(batch_samples, response.error, per_sample_latency)
    end
  rescue => e
    latency_ms = ((Time.current - start_time) * 1000).to_i
    per_sample_latency = latency_ms / batch_samples.size
    record_batch_errors(batch_samples, e, per_sample_latency)
  end
end
# Matches each detection back to its sample by transaction id and records
# name/URL correctness, fuzzy score, and null-handling flags.
def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
batch_samples.each do |sample|
# Find the merchant detection result for this sample
detection = merchants_detected.find { |m| m.transaction_id.to_s == sample.id.to_s }
# A missing detection yields nil name/url, treated as a returned null below.
actual_name = normalize_null(detection&.business_name)
actual_url = normalize_null(detection&.business_url)
expected_name = sample.expected_business_name
expected_url = sample.expected_business_url
# Evaluate correctness
name_match = evaluate_name_match(actual_name, expected_name)
url_match = evaluate_url_match(actual_url, expected_url)
fuzzy_score = calculate_fuzzy_score(actual_name, expected_name)
# Overall correct if both name and URL match expectations
correct = name_match && url_match
# Exact match requires both to be exactly equal
exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url)
record_result(
sample: sample,
actual_output: { "business_name" => actual_name, "business_url" => actual_url },
correct: correct,
exact_match: exact_match,
fuzzy_score: fuzzy_score,
null_expected: expected_name.nil? && expected_url.nil?,
null_returned: actual_name.nil? && actual_url.nil?,
latency_ms: per_sample_latency
)
end
end
# Records an identical error result for every sample in a failed batch.
def record_batch_errors(batch_samples, error, per_sample_latency)
error_message = error.is_a?(Exception) ? error.message : error.to_s
batch_samples.each do |sample|
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
fuzzy_score: 0.0,
null_expected: sample.expected_business_name.nil?,
null_returned: true,
latency_ms: per_sample_latency,
metadata: { "error" => error_message }
)
end
end
# Normalizes an LLM-returned value to nil when it represents "no answer".
#
# Treats nil, blank/whitespace-only strings, and the literal string "null"
# in ANY casing as nil. The case-insensitive check brings this eval helper
# in line with the production merchant normalizer, which already accepts
# "NULL"/"Null" (some models capitalize the sentinel).
# Any other value is returned unchanged.
def normalize_null(value)
  return nil if value.nil?

  text = value.to_s.strip
  return nil if text.empty?
  # Case-insensitive sentinel check (previously only exact "null" matched)
  return nil if text.casecmp?("null")

  value
end
# Compares a detected business name against the expected one.
# Both nil => correct (model rightly declined). A value where none was
# expected (false positive) or nothing where one was expected (false
# negative) => incorrect. Otherwise defer to fuzzy string matching.
def evaluate_name_match(actual, expected)
  return true if actual.nil? && expected.nil?

  false_positive = expected.nil? && actual.present?
  false_negative = actual.nil? && expected.present?
  return false if false_positive || false_negative

  # Use fuzzy matching for name comparison
  fuzzy_match?(actual, expected)
end
# Compares a detected business URL against the expected one.
# Both nil => correct. One-sided answers (false positive / false negative)
# => incorrect. Otherwise compare after canonicalization, which strips
# scheme, "www." and trailing slash.
def evaluate_url_match(actual, expected)
  return true if actual.nil? && expected.nil?

  false_positive = expected.nil? && actual.present?
  false_negative = actual.nil? && expected.present?
  return false if false_positive || false_negative

  # Normalize and compare URLs
  normalize_url(actual) == normalize_url(expected)
end
# Canonicalizes a URL for comparison: trims surrounding whitespace,
# lowercases, strips an optional http(s) scheme and "www." prefix, and
# drops a single trailing slash. Returns nil for nil input.
#
# Fix: whitespace is now trimmed BEFORE prefix/suffix removal. Previously
# `.strip` ran last, so " https://example.com/ " kept both its scheme
# (the ^-anchored gsub saw a leading space) and its trailing slash (chomp
# saw a trailing space). Also anchors with \A (whole string, not per-line)
# and uses sub since the prefix can occur at most once.
def normalize_url(url)
  return nil if url.nil?

  url.to_s.strip.downcase
     .sub(/\A(https?:\/\/)?(www\.)?/, "")
     .chomp("/")
end
# True when the Levenshtein-based similarity of the two names reaches
# FUZZY_MATCH_THRESHOLD. A nil on either side never matches.
def fuzzy_match?(actual, expected)
  return false if actual.nil?
  return false if expected.nil?

  similarity = calculate_fuzzy_score(actual, expected)
  similarity >= FUZZY_MATCH_THRESHOLD
end
# Similarity score in [0.0, 1.0] between two strings:
# 1.0 for equal values (raw or after downcase+strip), 0.0 when either side
# is nil, otherwise 1 - (Levenshtein distance / longer length), rounded to
# 4 decimal places.
def calculate_fuzzy_score(actual, expected)
  return 1.0 if actual == expected
  return 0.0 if actual.nil? || expected.nil?

  # Compare case-insensitively, ignoring surrounding whitespace
  left = actual.to_s.downcase.strip
  right = expected.to_s.downcase.strip
  return 1.0 if left == right

  longest = [ left.length, right.length ].max
  return 0.0 if longest.zero?

  # Convert edit distance to a similarity ratio
  similarity = 1.0 - levenshtein_distance(left, right).fdiv(longest)
  similarity.round(4)
end
# Computes the Levenshtein (edit) distance between two strings: the minimum
# number of single-character insertions, deletions and substitutions needed
# to transform s1 into s2.
#
# Improvement: uses a rolling pair of rows instead of the full (m+1)x(n+1)
# matrix, cutting memory from O(m*n) to O(n) while producing identical
# results — merchant names are short, but eval runs call this many times.
def levenshtein_distance(s1, s2)
  m = s1.length
  n = s2.length
  return m if n == 0
  return n if m == 0

  # prev_row[j] = distance between s1[0, i-1] and s2[0, j]
  prev_row = (0..n).to_a

  (1..m).each do |i|
    curr_row = [ i ] # distance from s1[0, i] to the empty prefix of s2
    (1..n).each do |j|
      cost = s1[i - 1] == s2[j - 1] ? 0 : 1
      curr_row << [
        prev_row[j] + 1,        # deletion
        curr_row[j - 1] + 1,    # insertion
        prev_row[j - 1] + cost  # substitution
      ].min
    end
    prev_row = curr_row
  end

  prev_row[n]
end
end

88
app/models/eval/sample.rb Normal file
View File

@@ -0,0 +1,88 @@
# A single evaluation case: the input handed to the system under test, the
# expected output, and optional context (categories, merchants, mock data)
# consumed by the different eval types.
class Eval::Sample < ApplicationRecord
  self.table_name = "eval_samples"

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_sample_id, dependent: :destroy

  validates :input_data, :expected_output, presence: true
  validates :difficulty, inclusion: { in: %w[easy medium hard manual edge_case] }

  scope :easy, -> { where(difficulty: "easy") }
  scope :medium, -> { where(difficulty: "medium") }
  scope :hard, -> { where(difficulty: "hard") }
  scope :edge_cases, -> { where(difficulty: "edge_case") }
  scope :with_tag, ->(tag) { where("? = ANY(tags)", tag) }
  scope :with_any_tags, ->(tags) { where("tags && ARRAY[?]::varchar[]", tags) }

  # Convert to format expected by AutoCategorizer
  def to_transaction_input
    input_data.deep_symbolize_keys
  end

  # Get categories from context (for categorization evals).
  # Safe-navigated so samples with no context_data return [] instead of raising.
  def categories_context
    context_data&.dig("categories") || []
  end

  # Get merchants from context (for merchant detection evals)
  def merchants_context
    context_data&.dig("merchants") || []
  end

  # Get mock data from context (for chat evals), falling back to input_data
  def mock_data
    context_data&.dig("mock_data") || input_data.dig("mock_data") || {}
  end

  # Get the chat prompt (for chat evals).
  # NOTE: the previous `dig("prompt") || ["prompt"]` fallback was redundant —
  # both calls are identical for a top-level hash key.
  def chat_prompt
    input_data["prompt"]
  end

  # Get expected functions (for chat evals)
  def expected_functions
    expected_output["functions"] || []
  end

  # Get expected response keywords (for chat evals)
  def expected_response_contains
    expected_output["response_contains"] || []
  end

  # Get expected category name (for categorization evals)
  def expected_category_name
    expected_output["category_name"]
  end

  # Get acceptable alternative category names (for categorization evals)
  # These are categories that are also considered correct answers
  def acceptable_alternatives
    expected_output["acceptable_alternatives"] || []
  end

  # Get all acceptable category names (primary + alternatives)
  def all_acceptable_categories
    [ expected_category_name, *acceptable_alternatives ].compact
  end

  # Get expected merchant info (for merchant detection evals)
  def expected_business_name
    expected_output["business_name"]
  end

  def expected_business_url
    expected_output["business_url"]
  end

  # Check if a null (no-answer) result is the expected outcome for this sample
  def expects_null?
    case dataset.eval_type
    when "categorization"
      expected_category_name.nil?
    when "merchant_detection"
      expected_business_name.nil? && expected_business_url.nil?
    else
      false
    end
  end
end

View File

@@ -51,7 +51,7 @@ class Provider::Openai < Provider
@uri_base.present?
end
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil)
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil)
with_provider_response do
raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25
if user_categories.blank?
@@ -74,7 +74,8 @@ class Provider::Openai < Provider
user_categories: user_categories,
custom_provider: custom_provider?,
langfuse_trace: trace,
family: family
family: family,
json_mode: json_mode
).auto_categorize
trace&.update(output: result.map(&:to_h))
@@ -83,7 +84,7 @@ class Provider::Openai < Provider
end
end
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil)
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil)
with_provider_response do
raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25
@@ -101,7 +102,8 @@ class Provider::Openai < Provider
user_merchants: user_merchants,
custom_provider: custom_provider?,
langfuse_trace: trace,
family: family
family: family,
json_mode: json_mode
).auto_detect_merchants
trace&.update(output: result.map(&:to_h))

View File

@@ -1,9 +1,22 @@
class Provider::Openai::AutoCategorizer
include Provider::Openai::Concerns::UsageRecorder
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family
# JSON response format modes for custom providers
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
# - "json_object": Use json_object response format (broader compatibility)
# - "none": No response format constraint (maximum compatibility with local LLMs)
JSON_MODE_STRICT = "strict"
JSON_MODE_OBJECT = "json_object"
JSON_MODE_NONE = "none"
JSON_MODE_AUTO = "auto"
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil)
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
# This is a heuristic to detect when strict JSON mode is breaking the model's ability to reason
AUTO_MODE_NULL_THRESHOLD = 0.5
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family, :json_mode
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoCategorizer
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_categorize
@@ -22,6 +61,40 @@ class Provider::Openai::AutoCategorizer
end
# Selects the system prompt variant: custom (typically local/small-model)
# providers get the compact instructions, the default path gets the
# detailed ones.
def instructions
  custom_provider ? simple_instructions : detailed_instructions
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Categorize transactions into the given categories. Return JSON only. Do not explain your reasoning.
CRITICAL RULES:
1. Match transaction_id exactly from input
2. Use EXACT category_name from the provided list, or "null" if unsure
3. Match expense transactions to expense categories only
4. Match income transactions to income categories only
5. Return "null" if the description is generic/ambiguous (e.g., "POS DEBIT", "ACH WITHDRAWAL", "CHECK #1234")
6. Prefer MORE SPECIFIC subcategories over general parent categories when available
CATEGORY HIERARCHY NOTES:
- Use "Restaurants" for sit-down restaurants, "Fast Food" for quick service chains
- Use "Coffee Shops" for coffee places, "Food & Drink" only when type is unclear
- Use "Shopping" for general retail, big-box stores, and online marketplaces
- Use "Groceries" for dedicated grocery stores ONLY
- For income: use "Salary" for payroll/employer deposits, "Income" for generic income sources
Output JSON format only (no markdown, no explanation):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app. You will be provided a list
of the user's transactions and a list of the user's categories. Your job is to auto-categorize
@@ -87,19 +160,68 @@ class Provider::Openai::AutoCategorizer
end
def auto_categorize_openai_generic
if json_mode == JSON_MODE_AUTO
auto_categorize_with_auto_mode
else
auto_categorize_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_categorize_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
#
# This uses pure heuristics to detect when strict JSON mode is breaking the model's
# ability to reason. Models that can't reason well in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't categorize (returning fewer results than input)
#
# The heuristic is simple: if >50% of results are null or missing, the model likely
# needs the freedom to reason in its output (which strict mode prevents).
def auto_categorize_with_auto_mode
result = auto_categorize_with_mode(JSON_MODE_STRICT)
null_count = result.count { |r| r.category_name.nil? || r.category_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_categorize_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_categorize_with_mode(mode)
span = langfuse_trace&.span(name: "auto_categorize_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
user_categories: user_categories
user_categories: user_categories,
json_mode: mode
})
response = client.chat(parameters: {
# Build parameters with configurable JSON response format
params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
{ role: "user", content: developer_message }
],
response_format: {
{ role: "user", content: developer_message_for_generic }
]
}
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_categorize_personal_finance_transactions",
@@ -107,9 +229,14 @@ class Provider::Openai::AutoCategorizer
schema: json_schema
}
}
})
when JSON_MODE_OBJECT
params[:response_format] = { type: "json_object" }
# JSON_MODE_NONE: no response_format constraint
end
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")}")
response = client.chat(parameters: params)
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
categorizations = extract_categorizations_generic(response)
result = build_response(categorizations)
@@ -120,7 +247,8 @@ class Provider::Openai::AutoCategorizer
operation: "auto_categorize",
metadata: {
transaction_count: transactions.size,
category_count: user_categories.size
category_count: user_categories.size,
json_mode: mode
}
)
@@ -143,9 +271,72 @@ class Provider::Openai::AutoCategorizer
end
def normalize_category_name(category_name)
return nil if category_name == "null"
# Convert to string to handle non-string LLM outputs (numbers, booleans, etc.)
normalized = category_name.to_s.strip
return nil if normalized.empty? || normalized == "null" || normalized.downcase == "null"
category_name
# Try exact match first
exact_match = user_categories.find { |c| c[:name] == normalized }
return exact_match[:name] if exact_match
# Try case-insensitive match
case_insensitive_match = user_categories.find { |c| c[:name].to_s.downcase == normalized.downcase }
return case_insensitive_match[:name] if case_insensitive_match
# Try partial/fuzzy match (for common variations)
fuzzy_match = find_fuzzy_category_match(normalized)
return fuzzy_match if fuzzy_match
# Return normalized string if no match found (will be treated as uncategorized)
normalized
end
# Find a fuzzy match for category names with common variations
def find_fuzzy_category_match(category_name)
# Ensure string input for string operations
input_str = category_name.to_s
normalized_input = input_str.downcase.gsub(/[^a-z0-9]/, "")
user_categories.each do |cat|
cat_name_str = cat[:name].to_s
normalized_cat = cat_name_str.downcase.gsub(/[^a-z0-9]/, "")
# Check if one contains the other
return cat[:name] if normalized_input.include?(normalized_cat) || normalized_cat.include?(normalized_input)
# Check common abbreviations/variations
return cat[:name] if fuzzy_name_match?(input_str, cat_name_str)
end
nil
end
# Handle common naming variations
def fuzzy_name_match?(input, category)
variations = {
"gas" => [ "gas & fuel", "gas and fuel", "fuel", "gasoline" ],
"restaurants" => [ "restaurant", "dining", "food" ],
"groceries" => [ "grocery", "supermarket", "food store" ],
"streaming" => [ "streaming services", "streaming service" ],
"rideshare" => [ "ride share", "ride-share", "uber", "lyft" ],
"coffee" => [ "coffee shops", "coffee shop", "cafe" ],
"fast food" => [ "fastfood", "quick service" ],
"gym" => [ "gym & fitness", "fitness", "gym and fitness" ],
"flights" => [ "flight", "airline", "airlines", "airfare" ],
"hotels" => [ "hotel", "lodging", "accommodation" ]
}
# Ensure string inputs for string operations
input_lower = input.to_s.downcase
category_lower = category.to_s.downcase
variations.each do |_key, synonyms|
if synonyms.include?(input_lower) && synonyms.include?(category_lower)
return true
end
end
false
end
def extract_categorizations_native(response)
@@ -162,9 +353,107 @@ class Provider::Openai::AutoCategorizer
def extract_categorizations_generic(response)
raw = response.dig("choices", 0, "message", "content")
JSON.parse(raw).dig("categorizations")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in generic categorization: #{e.message}"
parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
categorizations = parsed.dig("categorizations") ||
parsed.dig("results") ||
(parsed.is_a?(Array) ? parsed : nil)
raise Provider::Openai::Error, "Could not find categorizations in response" if categorizations.nil?
# Normalize field names (some LLMs use different naming)
categorizations.map do |cat|
{
"transaction_id" => cat["transaction_id"] || cat["id"] || cat["txn_id"],
"category_name" => cat["category_name"] || cat["category"] || cat["name"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues.
#
# Takes the raw assistant message text and returns the parsed object.
# Attempt order: strip <think> blocks, direct JSON.parse, then four fallback
# extraction strategies (closed code fences, unclosed fences, an object keyed
# by "categorizations", any brace-delimited object).
# Raises Provider::Openai::Error when nothing parseable is found.
def parse_json_flexibly(raw)
  return {} if raw.blank?

  # Strip thinking model tags if present (e.g., <think>...</think>)
  # The actual JSON output comes after the thinking block
  cleaned = strip_thinking_tags(raw)

  # Try direct parse first
  JSON.parse(cleaned)
rescue JSON::ParserError
  # Try multiple extraction strategies in order of preference

  # Strategy 1: Closed markdown code blocks (```json...```).
  # All fenced objects are scanned and tried last-first — a later fence is
  # more likely to be the final answer than earlier scratch output.
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
    matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end
  end

  # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
  # Pattern: ```json followed by JSON that goes to end of string
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 3: Find JSON object with "categorizations" key
  if cleaned =~ /(\{"categorizations"\s*:\s*\[[\s\S]*\]\s*\})/m
    matches = cleaned.scan(/(\{"categorizations"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end

    # Try greedy match if non-greedy failed
    # NOTE(review): by this point $1 holds the LAST match of the scan above
    # (scan resets the regexp globals), not the greedy =~ capture from the
    # enclosing `if` — so this retry likely re-parses a non-greedy match.
    # Verify and, if intended, re-run the greedy match before using $1.
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 4: Find any JSON object (last resort)
  if cleaned =~ /(\{[\s\S]*\})/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Fall through to error
    end
  end

  raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response.
# Some models like Qwen-thinking output reasoning in these tags before the actual response.
#
# Returns the text after the closing </think> when it is non-empty; if the
# closing tag is missing (model likely truncated) or nothing follows it,
# returns everything after <think> so JSON emitted inside the block can
# still be extracted by the caller. Input without <think> passes through.
def strip_thinking_tags(raw)
  # Remove <think>...</think> blocks but keep content after them
  # If no closing tag, the model may have been cut off - try to extract JSON from inside
  if raw.include?("<think>")
    # Check if there's content after the thinking block
    if raw =~ /<\/think>\s*([\s\S]*)/m
      after_thinking = $1.strip
      return after_thinking if after_thinking.present?
    end

    # If no content after </think> or no closing tag, look inside the thinking block
    # The JSON might be the last thing in the thinking block
    if raw =~ /<think>([\s\S]*)/m
      return $1
    end
  end

  raw
end
def json_schema
@@ -213,4 +502,39 @@ class Provider::Openai::AutoCategorizer
```
MESSAGE
end
# Concise developer message optimized for smaller/local LLMs
# Uses pattern-based guidance instead of exhaustive examples
def developer_message_for_generic
<<~MESSAGE.strip_heredoc
AVAILABLE CATEGORIES: #{user_categories.map { |c| c[:name] }.join(", ")}
TRANSACTIONS TO CATEGORIZE:
#{format_transactions_simply}
CATEGORIZATION GUIDELINES:
- Prefer specific subcategories over general parent categories when confident
- Food delivery services should be categorized based on the underlying merchant type
- Square payments (SQ *) should be inferred from the merchant name after the prefix
- Warehouse/club stores should be categorized based on their primary purpose
- Return "null" for generic transactions (e.g., POS terminals, wire transfers, checks, ATM withdrawals)
IMPORTANT:
- Use EXACT category names from the list above
- Return "null" (as a string) if you cannot confidently match a category
- Match expense transactions only to expense categories
- Match income transactions only to income categories
- Do NOT include any explanation or reasoning - only output JSON
Respond with ONLY this JSON (no markdown code blocks, no other text):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
MESSAGE
end
# Renders each transaction as one compact bullet line — small/local models
# follow short plain-text listings more reliably than nested JSON.
def format_transactions_simply
  lines = transactions.map do |txn|
    format('- ID: %s, Amount: %s, Type: %s, Description: "%s"',
           txn[:id], txn[:amount], txn[:classification], txn[:description])
  end
  lines.join("\n")
end
end

View File

@@ -1,9 +1,22 @@
class Provider::Openai::AutoMerchantDetector
include Provider::Openai::Concerns::UsageRecorder
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family
# JSON response format modes for custom providers
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
# - "json_object": Use json_object response format (broader compatibility)
# - "none": No response format constraint (maximum compatibility with local LLMs)
# - "auto": Try strict first, fall back to none if poor results
JSON_MODE_STRICT = "strict"
JSON_MODE_OBJECT = "json_object"
JSON_MODE_NONE = "none"
JSON_MODE_AUTO = "auto"
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil)
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
AUTO_MODE_NULL_THRESHOLD = 0.5
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family, :json_mode
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoMerchantDetector
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_detect_merchants
@@ -22,6 +61,32 @@ class Provider::Openai::AutoMerchantDetector
end
def instructions
if custom_provider
simple_instructions
else
detailed_instructions
end
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Detect business names and websites from transaction descriptions. Return JSON only.
Rules:
1. Match transaction_id exactly from input
2. Return business_name and business_url for known businesses
3. Return "null" for both if uncertain or generic (e.g. "Paycheck", "Local diner")
4. Don't include "www." in URLs (use "amazon.com" not "www.amazon.com")
5. Favor "null" over guessing - only return values if 80%+ confident
Example output format:
{"merchants": [{"transaction_id": "txn_001", "business_name": "Amazon", "business_url": "amazon.com"}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app.
@@ -108,19 +173,64 @@ class Provider::Openai::AutoMerchantDetector
end
def auto_detect_merchants_openai_generic
if json_mode == JSON_MODE_AUTO
auto_detect_merchants_with_auto_mode
else
auto_detect_merchants_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
def auto_detect_merchants_with_auto_mode
result = auto_detect_merchants_with_mode(JSON_MODE_STRICT)
# Check if too many nulls OR missing results were returned
# Models that can't reason in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't detect (returning fewer results than input)
null_count = result.count { |r| r.business_name.nil? || r.business_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_detect_merchants_with_mode(mode)
span = langfuse_trace&.span(name: "auto_detect_merchants_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
user_merchants: user_merchants
user_merchants: user_merchants,
json_mode: mode
})
response = client.chat(parameters: {
# Build parameters with configurable JSON response format
params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
{ role: "user", content: developer_message }
],
response_format: {
{ role: "user", content: developer_message_for_generic }
]
}
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_detect_personal_finance_merchants",
@@ -128,9 +238,14 @@ class Provider::Openai::AutoMerchantDetector
schema: json_schema
}
}
})
when JSON_MODE_OBJECT
params[:response_format] = { type: "json_object" }
# JSON_MODE_NONE: no response_format constraint
end
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")}")
response = client.chat(parameters: params)
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
merchants = extract_merchants_generic(response)
result = build_response(merchants)
@@ -141,7 +256,8 @@ class Provider::Openai::AutoMerchantDetector
operation: "auto_detect_merchants",
metadata: {
transaction_count: transactions.size,
merchant_count: user_merchants.size
merchant_count: user_merchants.size,
json_mode: mode
}
)
@@ -154,24 +270,40 @@ class Provider::Openai::AutoMerchantDetector
AutoDetectedMerchant = Provider::LlmConcept::AutoDetectedMerchant
def build_response(categorizations)
categorizations.map do |categorization|
def build_response(merchants)
merchants.map do |merchant|
AutoDetectedMerchant.new(
transaction_id: categorization.dig("transaction_id"),
business_name: normalize_ai_value(categorization.dig("business_name")),
business_url: normalize_ai_value(categorization.dig("business_url")),
transaction_id: merchant.dig("transaction_id"),
business_name: normalize_merchant_value(merchant.dig("business_name")),
business_url: normalize_merchant_value(merchant.dig("business_url")),
)
end
end
def normalize_ai_value(ai_value)
return nil if ai_value == "null"
def normalize_merchant_value(value)
return nil if value.nil? || value == "null" || value.to_s.downcase == "null"
ai_value
# Try to match against user merchants for name normalization
if user_merchants.present?
# Try exact match first
exact_match = user_merchants.find { |m| m[:name] == value }
return exact_match[:name] if exact_match
# Try case-insensitive match
case_match = user_merchants.find { |m| m[:name].to_s.downcase == value.to_s.downcase }
return case_match[:name] if case_match
end
value
end
def extract_merchants_native(response)
raw = response.dig("output", 0, "content", 0, "text")
# Find the message output (not reasoning output)
message_output = response["output"]&.find { |o| o["type"] == "message" }
raw = message_output&.dig("content", 0, "text")
raise Provider::Openai::Error, "No message content found in response" if raw.nil?
JSON.parse(raw).dig("merchants")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in native merchant detection: #{e.message}"
@@ -179,9 +311,100 @@ class Provider::Openai::AutoMerchantDetector
def extract_merchants_generic(response)
raw = response.dig("choices", 0, "message", "content")
JSON.parse(raw).dig("merchants")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in generic merchant detection: #{e.message}"
parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
merchants = parsed.dig("merchants") ||
parsed.dig("results") ||
(parsed.is_a?(Array) ? parsed : nil)
raise Provider::Openai::Error, "Could not find merchants in response" if merchants.nil?
# Normalize field names (some LLMs use different naming)
merchants.map do |m|
{
"transaction_id" => m["transaction_id"] || m["id"] || m["txn_id"],
"business_name" => m["business_name"] || m["name"] || m["merchant_name"] || m["merchant"],
"business_url" => m["business_url"] || m["url"] || m["website"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues.
#
# Mirrors the categorizer's parser but targets the "merchants" key.
# Attempt order: strip <think> blocks, direct JSON.parse, then fallback
# extraction (closed fences, unclosed fences, object keyed by "merchants",
# any brace-delimited object). Raises Provider::Openai::Error on failure.
def parse_json_flexibly(raw)
  return {} if raw.blank?

  # Strip thinking model tags if present (e.g., <think>...</think>)
  cleaned = strip_thinking_tags(raw)

  # Try direct parse first
  JSON.parse(cleaned)
rescue JSON::ParserError
  # Try multiple extraction strategies in order of preference

  # Strategy 1: Closed markdown code blocks (```json...```), tried last-first
  # since a later fence is more likely to be the final answer.
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
    matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end
  end

  # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 3: Find JSON object with "merchants" key
  if cleaned =~ /(\{"merchants"\s*:\s*\[[\s\S]*\]\s*\})/m
    matches = cleaned.scan(/(\{"merchants"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end

    # Try greedy match if non-greedy failed
    # NOTE(review): $1 here comes from the scan's last match (scan resets the
    # regexp globals), not from the greedy =~ above — the greedy retry likely
    # never fires as intended; verify.
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 4: Find any JSON object (last resort)
  if cleaned =~ /(\{[\s\S]*\})/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Fall through to error
    end
  end

  raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response.
# Mirrors the categorizer's implementation: prefer the text after the closing
# </think>; when the tag is unclosed (model truncated) or nothing follows it,
# fall back to the tag body so embedded JSON can still be extracted.
def strip_thinking_tags(raw)
  if raw.include?("<think>")
    # Prefer whatever follows the closing tag — that is the real answer
    if raw =~ /<\/think>\s*([\s\S]*)/m
      after_thinking = $1.strip
      return after_thinking if after_thinking.present?
    end
    # No closing tag or empty tail: the JSON may be the last thing inside
    # the thinking block itself
    if raw =~ /<think>([\s\S]*)/m
      return $1
    end
  end
  raw
end
def json_schema
@@ -235,4 +458,40 @@ class Provider::Openai::AutoMerchantDetector
Return "null" if you are not 80%+ confident in your answer.
MESSAGE
end
# Enhanced developer message with few-shot examples for smaller/local LLMs.
#
# Small models follow instructions more reliably when given explicit
# examples and a rigid output contract, so this prompt spells out both,
# along with the user's known merchants and the transactions to analyze.
#
# @return [String] the developer/system prompt for generic (non-OpenAI) models
def developer_message_for_generic
  merchant_names = user_merchants.present? ? user_merchants.map { |m| m[:name] }.join(", ") : "(none provided)"

  # <<~ already removes the common leading indentation, so an extra
  # ActiveSupport `.strip_heredoc` call here would be a redundant no-op.
  <<~MESSAGE
    USER'S KNOWN MERCHANTS: #{merchant_names}
    TRANSACTIONS TO ANALYZE:
    #{format_transactions_simply}
    EXAMPLES of correct merchant detection:
    - "AMAZON.COM*1A2B3C" business_name: "Amazon", business_url: "amazon.com"
    - "STARBUCKS STORE #9876" business_name: "Starbucks", business_url: "starbucks.com"
    - "NETFLIX.COM" business_name: "Netflix", business_url: "netflix.com"
    - "UBER *TRIP" business_name: "Uber", business_url: "uber.com"
    - "ACH WITHDRAWAL" business_name: "null", business_url: "null" (generic)
    - "LOCAL DINER" business_name: "null", business_url: "null" (generic/unknown)
    - "POS DEBIT 12345" business_name: "null", business_url: "null" (generic)
    IMPORTANT:
    - Return "null" (as a string) for BOTH name and URL if you cannot confidently identify the business
    - Don't include "www." in URLs
    - Generic descriptions like "Paycheck", "Transfer", "ATM" should return "null"
    Respond with ONLY this JSON format (no other text):
    {"merchants": [{"transaction_id": "...", "business_name": "...", "business_url": "..."}]}
  MESSAGE
end
# Format transactions in a simpler, more readable way for smaller LLMs.
#
# Produces one bullet line per transaction in the form
# `- ID: <id>, Description: "<text>"`, falling back to :description when
# :name is missing.
#
# @return [String] newline-joined bullet list of all transactions
def format_transactions_simply
  bullet_lines = transactions.map { |txn|
    label = txn[:name] || txn[:description]
    %(- ID: #{txn[:id]}, Description: "#{label}")
  }
  bullet_lines.join("\n")
end
end

View File

@@ -9,6 +9,7 @@ class Setting < RailsSettings::Base
field :openai_access_token, type: :string, default: ENV["OPENAI_ACCESS_TOKEN"]
field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"]
field :openai_model, type: :string, default: ENV["OPENAI_MODEL"]
field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"]
field :brand_fetch_client_id, type: :string, default: ENV["BRAND_FETCH_CLIENT_ID"]
# Provider selection

View File

@@ -47,5 +47,20 @@
inputmode: "text",
disabled: ENV["OPENAI_MODEL"].present?,
data: { "auto-submit-form-target": "auto" } %>
<%= form.select :openai_json_mode,
options_for_select(
[
[t(".json_mode_auto"), ""],
[t(".json_mode_strict"), "strict"],
[t(".json_mode_none"), "none"],
[t(".json_mode_json_object"), "json_object"]
],
Setting.openai_json_mode
),
{ label: t(".json_mode_label") },
{ disabled: ENV["LLM_JSON_MODE"].present?,
data: { "auto-submit-form-target": "auto" } } %>
<p class="text-xs text-secondary mt-1"><%= t(".json_mode_help") %></p>
<% end %>
</div>

View File

@@ -48,6 +48,12 @@ en:
uri_base_placeholder: "https://api.openai.com/v1 (default)"
model_label: Model (Optional)
model_placeholder: "gpt-4.1 (default)"
json_mode_label: JSON Mode
json_mode_auto: Auto (recommended)
json_mode_strict: Strict (best for thinking models)
json_mode_none: None (best for standard models)
json_mode_json_object: JSON Object
json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)."
title: OpenAI
yahoo_finance_settings:
title: Yahoo Finance

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,769 @@
---
name: categorization_golden_v1_light
description: Lightweight golden dataset for quick transaction categorization evaluation
eval_type: categorization
version: "1.0"
metadata:
created_at: "2025-12-04"
updated_at: "2025-12-04"
source: manual_curation
notes: |
A compact 50-sample dataset designed for quick evaluation runs.
Includes a balanced mix across:
- All difficulty levels (easy, medium, hard, edge_case)
- All major category types
- Both US and European merchants
- Representative edge cases
Difficulty distribution:
- easy: 20 samples
- medium: 15 samples
- hard: 10 samples
- edge_case: 5 samples
context:
categories:
- id: "income"
name: "Income"
classification: "income"
is_subcategory: false
- id: "salary"
name: "Salary"
classification: "income"
is_subcategory: true
parent_id: "income"
- id: "food_and_drink"
name: "Food & Drink"
classification: "expense"
is_subcategory: false
- id: "restaurants"
name: "Restaurants"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "fast_food"
name: "Fast Food"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "groceries"
name: "Groceries"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "coffee_shops"
name: "Coffee Shops"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "shopping"
name: "Shopping"
classification: "expense"
is_subcategory: false
- id: "clothing"
name: "Clothing"
classification: "expense"
is_subcategory: true
parent_id: "shopping"
- id: "electronics"
name: "Electronics"
classification: "expense"
is_subcategory: true
parent_id: "shopping"
- id: "transportation"
name: "Transportation"
classification: "expense"
is_subcategory: false
- id: "gas"
name: "Gas & Fuel"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "rideshare"
name: "Rideshare"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "public_transit"
name: "Public Transit"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "entertainment"
name: "Entertainment"
classification: "expense"
is_subcategory: false
- id: "streaming"
name: "Streaming Services"
classification: "expense"
is_subcategory: true
parent_id: "entertainment"
- id: "utilities"
name: "Utilities"
classification: "expense"
is_subcategory: false
- id: "housing"
name: "Housing"
classification: "expense"
is_subcategory: false
- id: "rent"
name: "Rent"
classification: "expense"
is_subcategory: true
parent_id: "housing"
- id: "health"
name: "Health & Wellness"
classification: "expense"
is_subcategory: false
- id: "pharmacy"
name: "Pharmacy"
classification: "expense"
is_subcategory: true
parent_id: "health"
- id: "gym"
name: "Gym & Fitness"
classification: "expense"
is_subcategory: true
parent_id: "health"
- id: "travel"
name: "Travel"
classification: "expense"
is_subcategory: false
- id: "flights"
name: "Flights"
classification: "expense"
is_subcategory: true
parent_id: "travel"
- id: "hotels"
name: "Hotels"
classification: "expense"
is_subcategory: true
parent_id: "travel"
- id: "subscriptions"
name: "Subscriptions"
classification: "expense"
is_subcategory: false
- id: "personal_care"
name: "Personal Care"
classification: "expense"
is_subcategory: false
- id: "gifts"
name: "Gifts & Donations"
classification: "expense"
is_subcategory: false
samples:
# =============================================================================
# EASY SAMPLES (20 samples) - Clear, unambiguous merchants
# =============================================================================
# Fast Food
- id: cat_light_easy_001
difficulty: easy
tags: [fast_food, us]
input:
id: txn_light_001
amount: 12.99
classification: expense
description: "MCDONALD'S #12345"
expected:
category_name: "Fast Food"
- id: cat_light_easy_002
difficulty: easy
tags: [fast_food, us]
input:
id: txn_light_002
amount: 14.50
classification: expense
description: "CHIPOTLE MEXICAN GRILL"
expected:
category_name: "Fast Food"
# Coffee Shops
- id: cat_light_easy_003
difficulty: easy
tags: [coffee_shops, us]
input:
id: txn_light_003
amount: 5.75
classification: expense
description: "STARBUCKS STORE #9876"
expected:
category_name: "Coffee Shops"
- id: cat_light_easy_004
difficulty: easy
tags: [coffee_shops, europe, uk]
input:
id: txn_light_004
amount: 4.50
classification: expense
description: "COSTA COFFEE LTD"
expected:
category_name: "Coffee Shops"
# Groceries
- id: cat_light_easy_005
difficulty: easy
tags: [groceries, us]
input:
id: txn_light_005
amount: 156.32
classification: expense
description: "WHOLE FOODS MKT #10234"
expected:
category_name: "Groceries"
- id: cat_light_easy_006
difficulty: easy
tags: [groceries, europe, uk]
input:
id: txn_light_006
amount: 87.50
classification: expense
description: "TESCO STORES LTD"
expected:
category_name: "Groceries"
- id: cat_light_easy_007
difficulty: easy
tags: [groceries, europe, germany]
input:
id: txn_light_007
amount: 78.90
classification: expense
description: "LIDL DIENSTLEISTUNG"
expected:
category_name: "Groceries"
# Gas & Fuel
- id: cat_light_easy_008
difficulty: easy
tags: [gas, us]
input:
id: txn_light_008
amount: 45.00
classification: expense
description: "SHELL OIL 573849234"
expected:
category_name: "Gas & Fuel"
- id: cat_light_easy_009
difficulty: easy
tags: [gas, europe, uk]
input:
id: txn_light_009
amount: 75.00
classification: expense
description: "BP OIL UK LTD"
expected:
category_name: "Gas & Fuel"
# Rideshare
- id: cat_light_easy_010
difficulty: easy
tags: [rideshare, us]
input:
id: txn_light_010
amount: 23.50
classification: expense
description: "UBER *TRIP HELP.UBER.COM"
expected:
category_name: "Rideshare"
# Streaming
- id: cat_light_easy_011
difficulty: easy
tags: [streaming, us]
input:
id: txn_light_011
amount: 15.99
classification: expense
description: "NETFLIX.COM"
expected:
category_name: "Streaming Services"
- id: cat_light_easy_012
difficulty: easy
tags: [streaming, us]
input:
id: txn_light_012
amount: 10.99
classification: expense
description: "SPOTIFY USA"
expected:
category_name: "Streaming Services"
# Electronics
- id: cat_light_easy_013
difficulty: easy
tags: [electronics, us]
input:
id: txn_light_013
amount: 299.99
classification: expense
description: "BEST BUY 00000456"
expected:
category_name: "Electronics"
acceptable_alternatives: ["Shopping"]
# Clothing
- id: cat_light_easy_014
difficulty: easy
tags: [clothing, europe, spain]
input:
id: txn_light_014
amount: 79.99
classification: expense
description: "ZARA ESPANA SA"
expected:
category_name: "Clothing"
acceptable_alternatives: ["Shopping"]
# Pharmacy
- id: cat_light_easy_015
difficulty: easy
tags: [pharmacy, us]
input:
id: txn_light_015
amount: 24.99
classification: expense
description: "CVS/PHARMACY #4567"
expected:
category_name: "Pharmacy"
# Flights
- id: cat_light_easy_016
difficulty: easy
tags: [flights, us]
input:
id: txn_light_016
amount: 345.00
classification: expense
description: "UNITED AIRLINES 0162345678"
expected:
category_name: "Flights"
- id: cat_light_easy_017
difficulty: easy
tags: [flights, europe, ireland]
input:
id: txn_light_017
amount: 89.99
classification: expense
description: "RYANAIR DAC"
expected:
category_name: "Flights"
# Hotels
- id: cat_light_easy_018
difficulty: easy
tags: [hotels, us]
input:
id: txn_light_018
amount: 189.00
classification: expense
description: "MARRIOTT HOTELS NYC"
expected:
category_name: "Hotels"
# Gym
- id: cat_light_easy_019
difficulty: easy
tags: [gym, us]
input:
id: txn_light_019
amount: 39.99
classification: expense
description: "PLANET FITNESS MONTHLY"
expected:
category_name: "Gym & Fitness"
# Income
- id: cat_light_easy_020
difficulty: easy
tags: [income, salary, us]
input:
id: txn_light_020
amount: 3500.00
classification: income
description: "ACME CORP PAYROLL"
expected:
category_name: "Salary"
# =============================================================================
# MEDIUM SAMPLES (15 samples) - Requires domain knowledge
# =============================================================================
# Restaurants
- id: cat_light_med_001
difficulty: medium
tags: [restaurants, us]
input:
id: txn_light_med_001
amount: 67.50
classification: expense
description: "OLIVE GARDEN #456"
expected:
category_name: "Restaurants"
- id: cat_light_med_002
difficulty: medium
tags: [restaurants, europe, uk]
input:
id: txn_light_med_002
amount: 78.50
classification: expense
description: "WAGAMAMA LTD LONDON"
expected:
category_name: "Restaurants"
# Warehouse stores
- id: cat_light_med_003
difficulty: medium
tags: [groceries, us, warehouse]
input:
id: txn_light_med_003
amount: 234.56
classification: expense
description: "COSTCO WHSE #1234"
expected:
category_name: "Groceries"
acceptable_alternatives: ["Shopping"]
# Utilities
- id: cat_light_med_004
difficulty: medium
tags: [utilities, us]
input:
id: txn_light_med_004
amount: 125.00
classification: expense
description: "CON EDISON PAYMENT"
expected:
category_name: "Utilities"
- id: cat_light_med_005
difficulty: medium
tags: [utilities, europe, uk]
input:
id: txn_light_med_005
amount: 156.00
classification: expense
description: "BRITISH GAS SERVICES"
expected:
category_name: "Utilities"
- id: cat_light_med_006
difficulty: medium
tags: [utilities, us]
input:
id: txn_light_med_006
amount: 89.00
classification: expense
description: "AT&T WIRELESS"
expected:
category_name: "Utilities"
# Public Transit
- id: cat_light_med_007
difficulty: medium
tags: [public_transit, us]
input:
id: txn_light_med_007
amount: 127.00
classification: expense
description: "MTA *METROCARD"
expected:
category_name: "Public Transit"
- id: cat_light_med_008
difficulty: medium
tags: [public_transit, europe, uk]
input:
id: txn_light_med_008
amount: 156.50
classification: expense
description: "TFL TRAVEL LONDON"
expected:
category_name: "Public Transit"
# Housing
- id: cat_light_med_009
difficulty: medium
tags: [rent, us]
input:
id: txn_light_med_009
amount: 2100.00
classification: expense
description: "AVALON APARTMENTS RENT"
expected:
category_name: "Rent"
acceptable_alternatives: ["Housing"]
# Subscriptions
- id: cat_light_med_010
difficulty: medium
tags: [subscriptions, us]
input:
id: txn_light_med_010
amount: 9.99
classification: expense
description: "APPLE.COM/BILL"
expected:
category_name: "Subscriptions"
# Gifts & Donations
- id: cat_light_med_011
difficulty: medium
tags: [gifts, us, donation]
input:
id: txn_light_med_011
amount: 50.00
classification: expense
description: "RED CROSS DONATION"
expected:
category_name: "Gifts & Donations"
# Entertainment
- id: cat_light_med_012
difficulty: medium
tags: [entertainment, us]
input:
id: txn_light_med_012
amount: 89.00
classification: expense
description: "TICKETMASTER *EVENT"
expected:
category_name: "Entertainment"
# Travel
- id: cat_light_med_013
difficulty: medium
tags: [hotels, us]
input:
id: txn_light_med_013
amount: 234.00
classification: expense
description: "AIRBNB *HMQT5J6QQJ"
expected:
category_name: "Hotels"
acceptable_alternatives: ["Travel"]
# Personal Care
- id: cat_light_med_014
difficulty: medium
tags: [personal_care, us]
input:
id: txn_light_med_014
amount: 45.00
classification: expense
description: "SUPERCUTS #1234"
expected:
category_name: "Personal Care"
# Income
- id: cat_light_med_015
difficulty: medium
tags: [income, us]
input:
id: txn_light_med_015
amount: 500.00
classification: income
description: "VENMO CASHOUT"
expected:
category_name: "Income"
# =============================================================================
# HARD SAMPLES (10 samples) - Ambiguous, multiple interpretations
# =============================================================================
# Big-box stores
- id: cat_light_hard_001
difficulty: hard
tags: [ambiguous, us, multi_purpose_retailer]
input:
id: txn_light_hard_001
amount: 156.78
classification: expense
description: "TARGET #1234"
expected:
category_name: "Shopping"
acceptable_alternatives: ["Groceries"]
- id: cat_light_hard_002
difficulty: hard
tags: [ambiguous, europe, uk, multi_purpose_retailer]
input:
id: txn_light_hard_002
amount: 156.00
classification: expense
description: "MARKS & SPENCER PLC"
expected:
category_name: "Shopping"
acceptable_alternatives: ["Groceries", "Clothing"]
# Online marketplaces
- id: cat_light_hard_003
difficulty: hard
tags: [ambiguous, us, online_marketplace]
input:
id: txn_light_hard_003
amount: 89.99
classification: expense
description: "AMAZON.COM*1A2B3C4D"
expected:
category_name: "Shopping"
# Payment processors (should be null)
- id: cat_light_hard_004
difficulty: hard
tags: [ambiguous, us, payment_processor]
input:
id: txn_light_hard_004
amount: 78.00
classification: expense
description: "PAYPAL *JOHNSMITH"
expected:
category_name: null
# Fast-casual
- id: cat_light_hard_005
difficulty: hard
tags: [ambiguous, us, fast_casual]
input:
id: txn_light_hard_005
amount: 34.50
classification: expense
description: "PANERA BREAD #567"
expected:
category_name: "Restaurants"
acceptable_alternatives: ["Fast Food"]
# Delivery services
- id: cat_light_hard_006
difficulty: hard
tags: [ambiguous, us, delivery_service]
input:
id: txn_light_hard_006
amount: 45.00
classification: expense
description: "DOORDASH*CHIPOTLE"
expected:
category_name: "Fast Food"
acceptable_alternatives: ["Restaurants"]
- id: cat_light_hard_007
difficulty: hard
tags: [ambiguous, europe, uk, delivery_service]
input:
id: txn_light_hard_007
amount: 32.50
classification: expense
description: "DELIVEROO UK LTD"
expected:
category_name: "Restaurants"
acceptable_alternatives: ["Fast Food"]
# Amazon Prime
- id: cat_light_hard_008
difficulty: hard
tags: [ambiguous, us, amazon]
input:
id: txn_light_hard_008
amount: 14.99
classification: expense
description: "AMAZON PRIME*1A2B3C"
expected:
category_name: "Subscriptions"
# Convenience store
- id: cat_light_hard_009
difficulty: hard
tags: [ambiguous, us, convenience_store]
input:
id: txn_light_hard_009
amount: 12.50
classification: expense
description: "7-ELEVEN #34567"
expected:
category_name: "Groceries"
acceptable_alternatives: ["Fast Food"]
# Streaming vs Subscription
- id: cat_light_hard_010
difficulty: hard
tags: [ambiguous, us, streaming_subscription]
input:
id: txn_light_hard_010
amount: 15.99
classification: expense
description: "HBO MAX"
expected:
category_name: "Streaming Services"
acceptable_alternatives: ["Subscriptions"]
# =============================================================================
# EDGE CASES (5 samples) - Should return null
# =============================================================================
# Generic POS
- id: cat_light_edge_001
difficulty: edge_case
tags: [should_be_null, generic_pos]
input:
id: txn_light_edge_001
amount: 15.00
classification: expense
description: "POS DEBIT 12345"
expected:
category_name: null
# ACH transfer
- id: cat_light_edge_002
difficulty: edge_case
tags: [should_be_null, transfer]
input:
id: txn_light_edge_002
amount: 100.00
classification: expense
description: "ACH WITHDRAWAL"
expected:
category_name: null
# ATM
- id: cat_light_edge_003
difficulty: edge_case
tags: [should_be_null, atm]
input:
id: txn_light_edge_003
amount: 200.00
classification: expense
description: "ATM WITHDRAWAL 12345"
expected:
category_name: null
# Check
- id: cat_light_edge_004
difficulty: edge_case
tags: [should_be_null, check]
input:
id: txn_light_edge_004
amount: 350.00
classification: expense
description: "CHECK #1234"
expected:
category_name: null
# Cryptic
- id: cat_light_edge_005
difficulty: edge_case
tags: [should_be_null, cryptic]
input:
id: txn_light_edge_005
amount: 45.67
classification: expense
description: "TXN*89234*AUTH"
expected:
category_name: null

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,825 @@
---
name: chat_golden_v1
description: Golden dataset for chat/assistant function calling evaluation
eval_type: chat
version: "1.0"
metadata:
created_at: "2024-12-01"
source: manual_curation
samples:
# ===== EASY - Simple single function calls =====
- id: chat_easy_001
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "What accounts do I have?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_002
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "Show me my accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_003
difficulty: easy
tags: [get_accounts, balance]
input:
prompt: "What's my account balance?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_004
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "Show me my recent transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_005
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "What are my latest transactions?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_006
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "What's my net worth?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_007
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "Show me my assets and liabilities"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_008
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "What were my expenses last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_009
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "How much income did I make this month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_010
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "How many accounts do I have?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_011
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "List my transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_012
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "How much do I owe?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_013
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "What are my total assets?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_014
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "Show my spending"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_015
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "How much did I spend?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
# ===== MEDIUM - With filtering or specific parameters =====
- id: chat_medium_001
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "Show me my restaurant spending"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_002
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "What did I spend on groceries?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_003
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "Show transactions over $100"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_004
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "What did I spend at Amazon?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_005
difficulty: medium
tags: [get_transactions, date_range]
input:
prompt: "Show me last week's transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_006
difficulty: medium
tags: [get_income_statement, date_range]
input:
prompt: "What was my income in January?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_007
difficulty: medium
tags: [get_income_statement, comparison]
input:
prompt: "How much did I save last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_008
difficulty: medium
tags: [get_accounts, specific]
input:
prompt: "What's the balance in my checking account?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_009
difficulty: medium
tags: [get_accounts, specific]
input:
prompt: "How much do I have in savings?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_010
difficulty: medium
tags: [get_transactions, category]
input:
prompt: "Show me all my subscription payments"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_011
difficulty: medium
tags: [get_transactions, search]
input:
prompt: "Find transactions from Uber"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_012
difficulty: medium
tags: [get_income_statement, category]
input:
prompt: "How much do I spend on entertainment?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_013
difficulty: medium
tags: [get_balance_sheet, trend]
input:
prompt: "How has my net worth changed over time?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_medium_014
difficulty: medium
tags: [get_transactions, amount]
input:
prompt: "What's my largest expense this month?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_015
difficulty: medium
tags: [get_income_statement, breakdown]
input:
prompt: "Break down my expenses by category"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_016
difficulty: medium
tags: [get_transactions, recurring]
input:
prompt: "Show me my recurring payments"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_017
difficulty: medium
tags: [get_accounts, credit]
input:
prompt: "What's my credit card balance?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_018
difficulty: medium
tags: [get_income_statement, specific]
input:
prompt: "How much did I spend on food last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_019
difficulty: medium
tags: [get_transactions, date]
input:
prompt: "Show transactions from December"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_020
difficulty: medium
tags: [get_balance_sheet, liability]
input:
prompt: "What are my debts?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
# ===== HARD - Analysis, comparisons, insights =====
- id: chat_hard_001
difficulty: hard
tags: [analysis, spending_trend]
input:
prompt: "Am I spending more than I make?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_002
difficulty: hard
tags: [comparison, month_over_month]
input:
prompt: "How does my spending this month compare to last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_003
difficulty: hard
tags: [analysis, budget]
input:
prompt: "Where can I cut expenses?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_004
difficulty: hard
tags: [analysis, savings]
input:
prompt: "What's my savings rate?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_005
difficulty: hard
tags: [analysis, trend]
input:
prompt: "Are my expenses trending up or down?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_006
difficulty: hard
tags: [analysis, category]
input:
prompt: "What category do I spend the most on?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_007
difficulty: hard
tags: [analysis, unusual]
input:
prompt: "Are there any unusual transactions this month?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_008
difficulty: hard
tags: [analysis, debt]
input:
prompt: "How long will it take to pay off my credit card?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_hard_009
difficulty: hard
tags: [analysis, financial_health]
input:
prompt: "What's my debt-to-income ratio?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_hard_010
difficulty: hard
tags: [analysis, goals]
input:
prompt: "Can I afford to save $500 more per month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_011
difficulty: hard
tags: [comparison, year_over_year]
input:
prompt: "How does this year compare to last year?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_012
difficulty: hard
tags: [analysis, pattern]
input:
prompt: "Do I have any spending patterns I should know about?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_013
difficulty: hard
tags: [advice, budget]
input:
prompt: "How should I allocate my income?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_014
difficulty: hard
tags: [analysis, efficiency]
input:
prompt: "Am I overspending on subscriptions?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_015
difficulty: hard
tags: [forecast, projection]
input:
prompt: "At this rate, how much will I have saved by year end?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
# ===== EDGE CASES - Unclear intent, no function needed =====
- id: chat_edge_001
difficulty: edge_case
tags: [no_function, greeting]
input:
prompt: "Hello"
expected:
functions: []
response_contains: []
- id: chat_edge_002
difficulty: edge_case
tags: [no_function, thanks]
input:
prompt: "Thank you!"
expected:
functions: []
response_contains: []
- id: chat_edge_003
difficulty: edge_case
tags: [no_function, general]
input:
prompt: "What can you help me with?"
expected:
functions: []
response_contains: []
- id: chat_edge_004
difficulty: edge_case
tags: [no_function, advice]
input:
prompt: "Should I invest in stocks?"
expected:
functions: []
response_contains: []
- id: chat_edge_005
difficulty: edge_case
tags: [no_function, external]
input:
prompt: "What's the weather like?"
expected:
functions: []
response_contains: []
- id: chat_edge_006
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "Tell me about my money"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_007
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "How am I doing financially?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_008
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "Give me a summary"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_009
difficulty: edge_case
tags: [no_function, off_topic]
input:
prompt: "What's 2 + 2?"
expected:
functions: []
response_contains: []
- id: chat_edge_010
difficulty: edge_case
tags: [no_function, general]
input:
prompt: "Who are you?"
expected:
functions: []
response_contains: []
# Additional samples
- id: chat_easy_016
difficulty: easy
tags: [get_transactions]
input:
prompt: "Pull up my transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_017
difficulty: easy
tags: [get_accounts]
input:
prompt: "Show all my bank accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_018
difficulty: easy
tags: [get_balance_sheet]
input:
prompt: "What do I own?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_019
difficulty: easy
tags: [get_income_statement]
input:
prompt: "What's my income?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_020
difficulty: easy
tags: [get_transactions]
input:
prompt: "Recent purchases"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_021
difficulty: medium
tags: [get_transactions, merchant]
input:
prompt: "How much have I spent at Starbucks?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_022
difficulty: medium
tags: [get_transactions, category]
input:
prompt: "Show transportation expenses"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_023
difficulty: medium
tags: [get_income_statement, period]
input:
prompt: "Quarterly expense report"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_024
difficulty: medium
tags: [get_accounts, type]
input:
prompt: "Show my investment accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_025
difficulty: medium
tags: [get_transactions, amount]
input:
prompt: "Transactions under $50"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_016
difficulty: hard
tags: [analysis, discretionary]
input:
prompt: "How much discretionary spending do I have?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_017
difficulty: hard
tags: [analysis, fixed_vs_variable]
input:
prompt: "What are my fixed vs variable expenses?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_018
difficulty: hard
tags: [analysis, emergency_fund]
input:
prompt: "Do I have enough for an emergency fund?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_hard_019
difficulty: hard
tags: [analysis, liquidity]
input:
prompt: "How liquid are my assets?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_hard_020
difficulty: hard
tags: [comparison, benchmark]
input:
prompt: "Am I spending too much on housing?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,81 @@
# Creates the four tables backing the LLM evaluation framework:
# eval_datasets (golden dataset containers), eval_samples (individual test
# cases), eval_runs (one evaluation execution per provider/model), and
# eval_results (per-sample outcomes within a run).
class CreateEvalTables < ActiveRecord::Migration[7.2]
  def change
    # Eval Datasets - Golden dataset containers
    create_table :eval_datasets, id: :uuid do |t|
      t.string :name, null: false
      t.string :description
      # Discriminator used to pick the runner class (e.g. "categorization", "chat").
      t.string :eval_type, null: false
      t.string :version, null: false, default: "1.0"
      # Denormalized count of associated samples, maintained by the application.
      t.integer :sample_count, default: 0
      t.jsonb :metadata, default: {}
      t.boolean :active, default: true
      t.timestamps
    end
    add_index :eval_datasets, :name, unique: true
    add_index :eval_datasets, [ :eval_type, :active ]
    # Eval Samples - Individual test cases
    create_table :eval_samples, id: :uuid do |t|
      t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
      t.jsonb :input_data, null: false
      t.jsonb :expected_output, null: false
      # Extra context (e.g. available categories) supplied alongside the input.
      t.jsonb :context_data, default: {}
      t.string :difficulty, default: "medium"
      t.string :tags, array: true, default: []
      t.jsonb :metadata, default: {}
      t.timestamps
    end
    add_index :eval_samples, [ :eval_dataset_id, :difficulty ]
    # GIN index keeps array-containment queries on tags fast.
    add_index :eval_samples, :tags, using: :gin
    # Eval Runs - Evaluation execution records
    create_table :eval_runs, id: :uuid do |t|
      t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
      t.string :name
      t.string :status, null: false, default: "pending"
      t.string :provider, null: false
      t.string :model, null: false
      t.jsonb :provider_config, default: {}
      t.jsonb :metrics, default: {}
      t.integer :total_prompt_tokens, default: 0
      t.integer :total_completion_tokens, default: 0
      # scale: 6 accommodates sub-cent per-token pricing.
      t.decimal :total_cost, precision: 10, scale: 6, default: 0.0
      t.datetime :started_at
      t.datetime :completed_at
      t.text :error_message
      t.timestamps
    end
    add_index :eval_runs, [ :eval_dataset_id, :model ]
    add_index :eval_runs, [ :provider, :model ]
    add_index :eval_runs, :status
    # Eval Results - Individual sample results
    create_table :eval_results, id: :uuid do |t|
      t.references :eval_run, null: false, foreign_key: true, type: :uuid
      t.references :eval_sample, null: false, foreign_key: true, type: :uuid
      t.jsonb :actual_output, null: false
      t.boolean :correct, null: false
      # Match-type breakdown flags: exact vs hierarchical (parent-category)
      # match, plus whether a null category was expected / returned.
      t.boolean :exact_match, default: false
      t.boolean :hierarchical_match, default: false
      t.boolean :null_expected, default: false
      t.boolean :null_returned, default: false
      t.float :fuzzy_score
      t.integer :latency_ms
      t.integer :prompt_tokens
      t.integer :completion_tokens
      t.decimal :cost, precision: 10, scale: 6
      t.jsonb :metadata, default: {}
      t.timestamps
    end
    add_index :eval_results, [ :eval_run_id, :correct ]
    # eval_sample_id index is automatically created by t.references
  end
end

View File

@@ -0,0 +1,5 @@
# Adds a flag recording that the model's answer matched an accepted
# alternative category rather than the primary expected one.
class AddAlternativeMatchToEvalResults < ActiveRecord::Migration[7.2]
  def change
    add_column :eval_results, :alternative_match, :boolean, default: false
  end
end

96
db/schema.rb generated
View File

@@ -307,6 +307,80 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.index ["import_id"], name: "index_entries_on_import_id"
end
create_table "eval_datasets", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.string "name", null: false
t.string "description"
t.string "eval_type", null: false
t.string "version", default: "1.0", null: false
t.integer "sample_count", default: 0
t.jsonb "metadata", default: {}
t.boolean "active", default: true
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_type", "active"], name: "index_eval_datasets_on_eval_type_and_active"
t.index ["name"], name: "index_eval_datasets_on_name", unique: true
end
create_table "eval_results", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_run_id", null: false
t.uuid "eval_sample_id", null: false
t.jsonb "actual_output", null: false
t.boolean "correct", null: false
t.boolean "exact_match", default: false
t.boolean "hierarchical_match", default: false
t.boolean "null_expected", default: false
t.boolean "null_returned", default: false
t.float "fuzzy_score"
t.integer "latency_ms"
t.integer "prompt_tokens"
t.integer "completion_tokens"
t.decimal "cost", precision: 10, scale: 6
t.jsonb "metadata", default: {}
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.boolean "alternative_match", default: false
t.index ["eval_run_id", "correct"], name: "index_eval_results_on_eval_run_id_and_correct"
t.index ["eval_run_id"], name: "index_eval_results_on_eval_run_id"
t.index ["eval_sample_id"], name: "index_eval_results_on_eval_sample_id"
end
create_table "eval_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_dataset_id", null: false
t.string "name"
t.string "status", default: "pending", null: false
t.string "provider", null: false
t.string "model", null: false
t.jsonb "provider_config", default: {}
t.jsonb "metrics", default: {}
t.integer "total_prompt_tokens", default: 0
t.integer "total_completion_tokens", default: 0
t.decimal "total_cost", precision: 10, scale: 6, default: "0.0"
t.datetime "started_at"
t.datetime "completed_at"
t.text "error_message"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_dataset_id", "model"], name: "index_eval_runs_on_eval_dataset_id_and_model"
t.index ["eval_dataset_id"], name: "index_eval_runs_on_eval_dataset_id"
t.index ["provider", "model"], name: "index_eval_runs_on_provider_and_model"
t.index ["status"], name: "index_eval_runs_on_status"
end
create_table "eval_samples", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_dataset_id", null: false
t.jsonb "input_data", null: false
t.jsonb "expected_output", null: false
t.jsonb "context_data", default: {}
t.string "difficulty", default: "medium"
t.string "tags", default: [], array: true
t.jsonb "metadata", default: {}
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_dataset_id", "difficulty"], name: "index_eval_samples_on_eval_dataset_id_and_difficulty"
t.index ["eval_dataset_id"], name: "index_eval_samples_on_eval_dataset_id"
t.index ["tags"], name: "index_eval_samples_on_tags", using: :gin
end
create_table "exchange_rates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.string "from_currency", null: false
t.string "to_currency", null: false
@@ -789,6 +863,21 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.index ["rule_id"], name: "index_rule_conditions_on_rule_id"
end
create_table "rule_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "rule_id", null: false
t.string "execution_type", null: false
t.string "status", null: false
t.integer "transactions_processed", default: 0, null: false
t.integer "transactions_modified", default: 0, null: false
t.datetime "executed_at", null: false
t.text "error_message"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["executed_at"], name: "index_rule_runs_on_executed_at"
t.index ["rule_id", "executed_at"], name: "index_rule_runs_on_rule_id_and_executed_at"
t.index ["rule_id"], name: "index_rule_runs_on_rule_id"
end
create_table "rules", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "family_id", null: false
t.string "resource_type", null: false
@@ -991,6 +1080,8 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.datetime "updated_at", null: false
t.string "currency"
t.jsonb "locked_attributes", default: {}
t.uuid "category_id"
t.index ["category_id"], name: "index_trades_on_category_id"
t.index ["security_id"], name: "index_trades_on_security_id"
end
@@ -1095,6 +1186,10 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
add_foreign_key "enable_banking_items", "families"
add_foreign_key "entries", "accounts", on_delete: :cascade
add_foreign_key "entries", "imports"
add_foreign_key "eval_results", "eval_runs"
add_foreign_key "eval_results", "eval_samples"
add_foreign_key "eval_runs", "eval_datasets"
add_foreign_key "eval_samples", "eval_datasets"
add_foreign_key "family_exports", "families"
add_foreign_key "holdings", "account_providers"
add_foreign_key "holdings", "accounts", on_delete: :cascade
@@ -1136,6 +1231,7 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
add_foreign_key "taggings", "tags"
add_foreign_key "tags", "families"
add_foreign_key "tool_calls", "messages"
add_foreign_key "trades", "categories"
add_foreign_key "trades", "securities"
add_foreign_key "transactions", "categories", on_delete: :nullify
add_foreign_key "transactions", "merchants"

739
lib/tasks/evals.rake Normal file
View File

@@ -0,0 +1,739 @@
namespace :evals do
# Prints every Eval::Dataset grouped by eval_type, with version,
# sample count and active/inactive status.
desc "List all evaluation datasets"
task list_datasets: :environment do
  datasets = Eval::Dataset.order(:eval_type, :name)
  if datasets.empty?
    puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]"
    next
  end
  puts "=" * 80
  puts "Available Evaluation Datasets"
  puts "=" * 80
  puts
  datasets.group_by(&:eval_type).each do |eval_type, type_datasets|
    puts "#{eval_type.titleize}:"
    puts "-" * 40
    type_datasets.each do |dataset|
      status = dataset.active ? "active" : "inactive"
      puts " #{dataset.name} (v#{dataset.version}) - #{dataset.sample_count} samples [#{status}]"
      puts " #{dataset.description}" if dataset.description.present?
    end
    puts
  end
end
# Imports a golden dataset (and its samples) from a YAML definition file.
# The path comes from the task argument or the FILE env var.
desc "Import dataset from YAML file"
task :import_dataset, [ :file_path ] => :environment do |_t, args|
  file_path = args[:file_path] || ENV["FILE"]
  if file_path.blank?
    puts "Usage: rake evals:import_dataset[path/to/file.yml]"
    puts " or: FILE=path/to/file.yml rake evals:import_dataset"
    exit 1
  end
  unless File.exist?(file_path)
    puts "Error: File not found: #{file_path}"
    exit 1
  end
  puts "Importing dataset from #{file_path}..."
  dataset = Eval::Dataset.import_from_yaml(file_path)
  puts "Successfully imported dataset:"
  puts " Name: #{dataset.name}"
  puts " Type: #{dataset.eval_type}"
  puts " Version: #{dataset.version}"
  puts " Samples: #{dataset.sample_count}"
  stats = dataset.statistics
  puts " By difficulty: #{stats[:by_difficulty].map { |k, v| "#{k}=#{v}" }.join(', ')}"
end
# Runs a full evaluation of one model against one dataset and prints the
# resulting metrics. Arguments may also be supplied via the DATASET /
# MODEL / PROVIDER environment variables.
desc "Run evaluation against a model"
task :run, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV["MODEL"] || "gpt-4.1"
  provider = ENV["PROVIDER"] || "openai"
  if dataset_name.blank?
    puts "Usage: rake evals:run[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4 rake evals:run"
    exit 1
  end
  dataset = Eval::Dataset.find_by(name: dataset_name)
  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end
  run_name = "#{dataset_name}_#{model}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  puts "=" * 80
  puts "Starting Evaluation Run"
  puts "=" * 80
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts " Run Name: #{run_name}"
  puts
  # Created as "pending"; the runner is responsible for advancing status.
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: provider,
    model: model,
    name: run_name,
    status: "pending"
  )
  # Each eval_type maps to its own runner class (Eval::Dataset#runner_class).
  runner = dataset.runner_class.new(eval_run)
  puts "Running evaluation..."
  start_time = Time.current
  begin
    result = runner.run
    duration = (Time.current - start_time).round(1)
    puts
    puts "=" * 80
    puts "Evaluation Complete"
    puts "=" * 80
    puts " Status: #{result.status}"
    puts " Duration: #{duration}s"
    puts " Run ID: #{result.id}"
    puts
    puts "Metrics:"
    result.metrics.each do |key, value|
      next if value.is_a?(Hash) # Skip nested metrics for summary
      puts " #{key}: #{format_metric_value(value)}"
    end
    # Show difficulty breakdown if available
    if result.metrics["by_difficulty"].present?
      puts
      puts "By Difficulty:"
      result.metrics["by_difficulty"].each do |difficulty, stats|
        puts " #{difficulty}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})"
      end
    end
  rescue => e
    puts
    puts "Evaluation FAILED: #{e.message}"
    puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
# Evaluates several models (MODELS=a,b,c) against the same dataset
# sequentially, then prints a side-by-side comparison table plus
# best-accuracy / lowest-cost / fastest recommendations.
desc "Compare multiple models on a dataset"
task :compare, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip)
  provider = ENV["PROVIDER"] || "openai"
  if dataset_name.blank?
    puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]"
    exit 1
  end
  dataset = Eval::Dataset.find_by!(name: dataset_name)
  puts "=" * 80
  puts "Model Comparison"
  puts "=" * 80
  puts " Dataset: #{dataset.name}"
  puts " Models: #{models.join(', ')}"
  puts
  # One completed Eval::Run per model, in the order given.
  runs = models.map do |model|
    puts "Running evaluation for #{model}..."
    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: provider,
      model: model,
      name: "compare_#{model}_#{Time.current.to_i}",
      status: "pending"
    )
    runner = dataset.runner_class.new(eval_run)
    runner.run
  end
  puts
  puts "=" * 80
  puts "Comparison Results"
  puts "=" * 80
  puts
  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table
  summary = reporter.summary
  if summary.present?
    puts
    puts "Recommendations:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
    puts
    puts " #{summary[:recommendation]}"
  end
  # Export to CSV if requested
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Prints a comparison table for specific run IDs (comma-separated), or
# for the five most recent completed runs when none are given.
desc "Generate report for specific runs"
task :report, [ :run_ids ] => :environment do |_t, args|
  run_ids = (args[:run_ids] || ENV["RUN_IDS"])&.split(",")
  runs = if run_ids.present?
    Eval::Run.where(id: run_ids)
  else
    Eval::Run.completed.order(created_at: :desc).limit(5)
  end
  if runs.empty?
    puts "No runs found."
    exit 1
  end
  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table
  summary = reporter.summary
  if summary.present?
    puts
    puts "Summary:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
  end
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Sends one tiny categorization request through the configured OpenAI
# provider to verify credentials and connectivity end-to-end.
desc "Quick smoke test to verify provider configuration"
task smoke_test: :environment do
  puts "Running smoke test..."
  provider = Provider::Registry.get_provider(:openai)
  unless provider
    puts "FAIL: OpenAI provider not configured"
    puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings"
    exit 1
  end
  puts " Provider: #{provider.provider_name}"
  # NOTE(review): reaches into provider internals; a public accessor for the
  # default model would be cleaner — confirm none exists before changing.
  puts " Model: #{provider.instance_variable_get(:@default_model)}"
  # Test with a single categorization sample
  result = provider.auto_categorize(
    transactions: [
      { id: "test", amount: 10, classification: "expense", description: "McDonalds" }
    ],
    user_categories: [
      { id: "1", name: "Food & Drink", classification: "expense" }
    ]
  )
  if result.success?
    category = result.data.first&.category_name
    puts " Test result: #{category || 'null'}"
    puts
    puts "PASS: Provider is working correctly"
  else
    puts "FAIL: #{result.error.message}"
    exit 1
  end
end
# CI gate: runs the eval and fails (exit 1) when accuracy drops more than
# 5 points below the latest completed baseline run for the same model, or
# falls under EVAL_THRESHOLD (default 80%). A missing dataset exits 0 so
# CI environments without seeded eval data can skip the check.
desc "Run CI regression test"
task ci_regression: :environment do
  dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1"
  model = ENV["EVAL_MODEL"] || "gpt-4.1-mini"
  threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f
  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Dataset '#{dataset_name}' not found. Skipping CI regression test."
    exit 0
  end
  # Get baseline from last successful run
  baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first
  # Run new evaluation
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: "openai",
    model: model,
    name: "ci_regression_#{Time.current.to_i}",
    status: "pending"
  )
  runner = dataset.runner_class.new(eval_run)
  result = runner.run
  current_accuracy = result.metrics["accuracy"] || 0
  puts "CI Regression Test Results:"
  puts " Model: #{model}"
  puts " Current Accuracy: #{current_accuracy}%"
  if baseline_run
    baseline_accuracy = baseline_run.metrics["accuracy"] || 0
    puts " Baseline Accuracy: #{baseline_accuracy}%"
    accuracy_diff = current_accuracy - baseline_accuracy
    # Hard regression gate: more than 5 points below baseline fails the build.
    if accuracy_diff < -5
      puts
      puts "REGRESSION DETECTED!"
      puts "Accuracy dropped by #{accuracy_diff.abs}% (threshold: 5%)"
      exit 1
    end
    puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%"
  end
  if current_accuracy < threshold
    puts
    puts "BELOW THRESHOLD!"
    puts "Accuracy #{current_accuracy}% is below required #{threshold}%"
    exit 1
  end
  puts
  puts "CI Regression Test PASSED"
end
# Lists the 20 most recent runs, one line each: status icon, short run id,
# model, dataset, accuracy and timestamp, aligned with ljust/rjust.
desc "List recent evaluation runs"
task list_runs: :environment do
  runs = Eval::Run.order(created_at: :desc).limit(20)
  if runs.empty?
    puts "No runs found."
    next
  end
  puts "=" * 100
  puts "Recent Evaluation Runs"
  puts "=" * 100
  runs.each do |run|
    status_icon = case run.status
    when "completed" then "[OK]"
    when "failed" then "[FAIL]"
    when "running" then "[...]"
    else "[?]"
    end
    accuracy = run.metrics["accuracy"] ? "#{run.metrics['accuracy']}%" : "-"
    puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}"
  end
end
# Shows full details for one run: metadata, metrics, and a sample of up to
# five incorrect results. The run can be referenced by full UUID or prefix.
desc "Show details for a specific run"
task :show_run, [ :run_id ] => :environment do |_t, args|
  run_id = args[:run_id] || ENV["RUN_ID"]
  if run_id.blank?
    puts "Usage: rake evals:show_run[run_id]"
    exit 1
  end
  # Falls back to a UUID-prefix match (LIKE on id::text) for convenience.
  run = Eval::Run.find_by(id: run_id) || Eval::Run.find_by("id::text LIKE ?", "#{run_id}%")
  unless run
    puts "Run not found: #{run_id}"
    exit 1
  end
  puts "=" * 80
  puts "Evaluation Run Details"
  puts "=" * 80
  puts
  puts "Run ID: #{run.id}"
  puts "Name: #{run.name}"
  puts "Dataset: #{run.dataset.name}"
  puts "Model: #{run.model}"
  puts "Provider: #{run.provider}"
  puts "Status: #{run.status}"
  puts "Created: #{run.created_at}"
  puts "Duration: #{run.duration_seconds}s" if run.duration_seconds
  if run.error_message.present?
    puts
    puts "Error: #{run.error_message}"
  end
  if run.metrics.present?
    puts
    puts "Metrics:"
    run.metrics.each do |key, value|
      if value.is_a?(Hash)
        puts " #{key}:"
        value.each { |k, v| puts " #{k}: #{v}" }
      else
        puts " #{key}: #{format_metric_value(value)}"
      end
    end
  end
  # Show sample of incorrect results
  incorrect = run.results.incorrect.limit(5)
  if incorrect.any?
    puts
    puts "Sample Incorrect Results (#{run.results.incorrect.count} total):"
    incorrect.each do |result|
      puts " Sample: #{result.sample_id[0..7]}"
      puts " Expected: #{result.sample.expected_output}"
      puts " Actual: #{result.actual_output}"
      puts
    end
  end
end
# =============================================================================
# Langfuse Integration
# =============================================================================
namespace :langfuse do
  # Verifies credentials and connectivity by issuing a one-item dataset list.
  desc "Check Langfuse configuration"
  task check: :environment do
    begin
      client = Eval::Langfuse::Client.new
      puts "✓ Langfuse credentials configured"
      # Try to list datasets to verify connection
      # (the response itself is unused — the call exists for its side effect
      # of exercising the authenticated API).
      response = client.list_datasets(limit: 1)
      puts "✓ Successfully connected to Langfuse"
      puts " Region: #{ENV['LANGFUSE_REGION'] || 'us (default)'}"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Failed to connect to Langfuse: #{e.message}"
      exit 1
    end
  end
  # Pushes a local golden dataset (and its samples) to Langfuse.
  desc "Upload dataset to Langfuse"
  task :upload_dataset, [ :dataset_name ] => :environment do |_t, args|
    dataset_name = args[:dataset_name] || ENV["DATASET"]
    if dataset_name.blank?
      puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]"
      puts " or: DATASET=name rake evals:langfuse:upload_dataset"
      exit 1
    end
    dataset = Eval::Dataset.find_by(name: dataset_name)
    if dataset.nil?
      puts "Error: Dataset '#{dataset_name}' not found"
      puts "Available datasets:"
      Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
      exit 1
    end
    puts "=" * 80
    puts "Uploading Dataset to Langfuse"
    puts "=" * 80
    puts " Dataset: #{dataset.name}"
    puts " Type: #{dataset.eval_type}"
    puts " Samples: #{dataset.sample_count}"
    puts
    begin
      exporter = Eval::Langfuse::DatasetExporter.new(dataset)
      result = exporter.export
      puts
      puts "✓ Successfully uploaded dataset to Langfuse"
      puts " Langfuse dataset name: #{result[:dataset_name]}"
      puts " Items exported: #{result[:items_exported]}"
      puts
      puts "View in Langfuse: https://cloud.langfuse.com/project/datasets"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    end
  end
  # Runs a model against a dataset with results recorded as a Langfuse
  # experiment run (traces + scores), rather than locally.
  desc "Run experiment in Langfuse"
  task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args|
    dataset_name = args[:dataset_name] || ENV["DATASET"]
    model = args[:model] || ENV["MODEL"] || "gpt-4.1"
    provider = ENV["PROVIDER"] || "openai"
    run_name = ENV["RUN_NAME"]
    if dataset_name.blank?
      puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]"
      puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment"
      puts
      puts "Optional environment variables:"
      puts " PROVIDER=openai (default)"
      puts " RUN_NAME=custom_run_name"
      exit 1
    end
    dataset = Eval::Dataset.find_by(name: dataset_name)
    if dataset.nil?
      puts "Error: Dataset '#{dataset_name}' not found"
      puts "Available datasets:"
      Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
      exit 1
    end
    puts "=" * 80
    puts "Running Langfuse Experiment"
    puts "=" * 80
    puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
    puts " Type: #{dataset.eval_type}"
    puts " Model: #{model}"
    puts " Provider: #{provider}"
    puts
    begin
      runner = Eval::Langfuse::ExperimentRunner.new(
        dataset,
        model: model,
        provider: provider
      )
      start_time = Time.current
      result = runner.run(run_name: run_name)
      duration = (Time.current - start_time).round(1)
      puts
      puts "=" * 80
      puts "Experiment Complete"
      puts "=" * 80
      puts " Run Name: #{result[:run_name]}"
      puts " Duration: #{duration}s"
      puts
      puts "Results:"
      puts " Accuracy: #{result[:metrics][:accuracy]}%"
      puts " Correct: #{result[:metrics][:correct]}/#{result[:metrics][:total]}"
      puts " Avg Latency: #{result[:metrics][:avg_latency_ms]}ms"
      puts
      puts "View in Langfuse:"
      puts " Dataset: https://cloud.langfuse.com/project/datasets"
      puts " Traces: https://cloud.langfuse.com/project/traces"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    rescue => e
      puts "✗ Error: #{e.message}"
      puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
      exit 1
    end
  end
  # Lists up to 100 datasets that already exist on the Langfuse side.
  desc "List datasets in Langfuse"
  task list_datasets: :environment do
    begin
      client = Eval::Langfuse::Client.new
      response = client.list_datasets(limit: 100)
      datasets = response["data"] || []
      if datasets.empty?
        puts "No datasets found in Langfuse."
        puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]"
        next
      end
      puts "=" * 80
      puts "Langfuse Datasets"
      puts "=" * 80
      puts
      datasets.each do |ds|
        puts " #{ds['name']}"
        puts " Description: #{ds['description']}" if ds["description"].present?
        puts " Created: #{ds['createdAt']}"
        puts " Metadata: #{ds['metadata']}" if ds["metadata"].present?
        puts
      end
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    end
  end
end
# Exports transactions whose category was set by hand as a golden YAML
# dataset re-importable via evals:import_dataset. "Manually categorized"
# means locked_attributes contains "category_id" AND no DataEnrichment
# record exists for that attribute (i.e. not set by AI/rules/providers).
#
# Fix: the relation is now materialized once with to_a. Previously
# `.count` on the eager-loaded, limited relation issued an extra
# (DISTINCT-wrapped) COUNT query and the subsequent `.map` re-ran the
# full query a second time.
desc "Export manually categorized transactions as golden data"
task :export_manual_categories, [ :family_id ] => :environment do |_t, args|
  family_id = args[:family_id] || ENV["FAMILY_ID"]
  output_path = ENV["OUTPUT"] || "db/eval_data/categorization_manual_export.yml"
  limit = (ENV["LIMIT"] || 500).to_i
  if family_id.blank?
    puts "Usage: rake evals:export_manual_categories[family_id]"
    puts " or: FAMILY_ID=uuid rake evals:export_manual_categories"
    puts
    puts "Optional environment variables:"
    puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)"
    puts " LIMIT=500 (default: 500)"
    exit 1
  end
  family = Family.find_by(id: family_id)
  if family.nil?
    puts "Error: Family '#{family_id}' not found"
    exit 1
  end
  puts "=" * 80
  puts "Exporting Manually Categorized Transactions"
  puts "=" * 80
  puts " Family: #{family.name}"
  puts " Output: #{output_path}"
  puts " Limit: #{limit}"
  puts
  # Find transactions that have:
  # 1. A category assigned
  # 2. locked_attributes contains "category_id" (meaning user manually set it)
  # 3. No DataEnrichment record for category_id (meaning it wasn't set by AI/rules/etc)
  manually_categorized = Transaction
    .joins(:entry)
    .joins("INNER JOIN accounts ON accounts.id = entries.account_id")
    .where(accounts: { family_id: family_id })
    .where.not(category_id: nil)
    .where("transactions.locked_attributes ? 'category_id'")
    .where.not(
      id: DataEnrichment
        .where(enrichable_type: "Transaction", attribute_name: "category_id")
        .select(:enrichable_id)
    )
    .includes(:category, entry: :account)
    .limit(limit)
  # Materialize once so the count and the sample-building loop below share
  # a single query execution.
  records = manually_categorized.to_a
  count = records.size
  if count == 0
    puts "No manually categorized transactions found."
    puts
    puts "Manually categorized transactions are those where:"
    puts " - User set a category manually (locked_attributes contains 'category_id')"
    puts " - Category was NOT set by AI, rules, or data enrichment sources"
    exit 0
  end
  puts "Found #{count} manually categorized transactions"
  puts
  # Build category context from family's categories
  categories = family.categories.includes(:parent).map do |cat|
    {
      "id" => cat.id.to_s,
      "name" => cat.name,
      "classification" => cat.classification,
      "is_subcategory" => cat.subcategory?,
      # compact drops the nil parent_id for top-level categories.
      "parent_id" => cat.parent_id&.to_s
    }.compact
  end
  # Build samples: one YAML sample per transaction, keyed manual_1..manual_N.
  samples = records.map.with_index do |txn, idx|
    entry = txn.entry
    sample_id = "manual_#{idx + 1}"
    {
      "id" => sample_id,
      "difficulty" => "manual",
      "tags" => [ txn.category.name.parameterize.underscore, "manual_export" ],
      "input" => {
        "id" => txn.id.to_s,
        "amount" => entry.amount.to_f.abs,
        "classification" => entry.classification,
        "description" => entry.name
      },
      "expected" => {
        "category_name" => txn.category.name
      }
    }
  end
  # Build output structure
  output = {
    "name" => "categorization_manual_export",
    "description" => "Golden dataset exported from manually categorized user transactions",
    "eval_type" => "categorization",
    "version" => "1.0",
    "metadata" => {
      "created_at" => Time.current.strftime("%Y-%m-%d"),
      "source" => "manual_export",
      "family_id" => family_id,
      "exported_count" => samples.size
    },
    "context" => {
      "categories" => categories
    },
    "samples" => samples
  }
  # Write to file
  FileUtils.mkdir_p(File.dirname(output_path))
  File.write(output_path, output.to_yaml)
  puts "✓ Successfully exported #{samples.size} samples"
  puts " Difficulty: manual"
  puts
  puts "Output written to: #{output_path}"
  puts
  puts "To import this dataset, run:"
  puts " rake evals:import_dataset[#{output_path}]"
end
private

# Formats a metric value for console display: floating-point quantities
# (Float or BigDecimal) are rounded to four decimal places; every other
# value is passed through unchanged.
def format_metric_value(value)
  if value.is_a?(Float) || value.is_a?(BigDecimal)
    value.to_f.round(4)
  else
    value
  end
end
end

View File

@@ -0,0 +1,118 @@
require "test_helper"
# Unit tests for Eval::Dataset: validations, eval_type scopes, YAML import,
# statistics rollups, and the eval_type -> runner class mapping.
class Eval::DatasetTest < ActiveSupport::TestCase
  test "validates presence of name and eval_type" do
    dataset = Eval::Dataset.new
    assert_not dataset.valid?
    assert_includes dataset.errors[:name], "can't be blank"
    assert_includes dataset.errors[:eval_type], "can't be blank"
  end
  test "validates eval_type is one of allowed values" do
    dataset = Eval::Dataset.new(name: "test", eval_type: "invalid")
    assert_not dataset.valid?
    assert_includes dataset.errors[:eval_type], "is not included in the list"
    # A recognised eval_type clears the inclusion error.
    dataset.eval_type = "categorization"
    dataset.valid?
    assert_empty dataset.errors[:eval_type]
  end
  test "validates name uniqueness" do
    Eval::Dataset.create!(name: "unique_test", eval_type: "categorization")
    duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization")
    assert_not duplicate.valid?
    assert_includes duplicate.errors[:name], "has already been taken"
  end
  test "scopes filter by eval_type" do
    cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization")
    merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat")
    assert_includes Eval::Dataset.for_categorization, cat_dataset
    assert_not_includes Eval::Dataset.for_categorization, merch_dataset
    assert_includes Eval::Dataset.for_merchant_detection, merch_dataset
    assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset
    assert_includes Eval::Dataset.for_chat, chat_dataset
    assert_not_includes Eval::Dataset.for_chat, cat_dataset
  end
  test "import_from_yaml creates dataset with samples" do
    # Minimal but complete dataset definition: one category, one sample.
    yaml_content = <<~YAML
      name: test_import
      description: Test dataset
      eval_type: categorization
      version: "1.0"
      context:
        categories:
          - id: "food"
            name: "Food"
            classification: "expense"
      samples:
        - id: sample_1
          difficulty: easy
          tags: [test]
          input:
            id: txn_1
            amount: 10
            classification: expense
            description: "Test transaction"
          expected:
            category_name: "Food"
    YAML
    file_path = Rails.root.join("tmp", "test_import.yml")
    File.write(file_path, yaml_content)
    dataset = Eval::Dataset.import_from_yaml(file_path)
    assert_equal "test_import", dataset.name
    assert_equal "categorization", dataset.eval_type
    assert_equal 1, dataset.samples.count
    assert_equal "easy", dataset.samples.first.difficulty
    assert_equal "Food", dataset.samples.first.expected_output["category_name"]
  ensure
    # Always clean up the temp fixture, even when an assertion fails.
    File.delete(file_path) if File.exist?(file_path)
  end
  test "statistics returns sample breakdown" do
    dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization")
    dataset.samples.create!(
      input_data: { id: "1" },
      expected_output: { category_name: "Food" },
      difficulty: "easy",
      tags: [ "food" ]
    )
    dataset.samples.create!(
      input_data: { id: "2" },
      expected_output: { category_name: "Travel" },
      difficulty: "medium",
      tags: [ "travel" ]
    )
    stats = dataset.statistics
    assert_equal 2, stats[:total_samples]
    assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty])
    assert_includes stats[:by_tags], "food"
    assert_includes stats[:by_tags], "travel"
  end
  test "runner_class returns correct class for each eval_type" do
    cat_dataset = Eval::Dataset.new(eval_type: "categorization")
    merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.new(eval_type: "chat")
    assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class
    assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class
    assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class
  end
end

View File

@@ -0,0 +1,212 @@
require "test_helper"
class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase
include ProviderTestHelper
setup do
  # Two-level category tree: "Fast Food" is a child of "Food & Drink",
  # exercising hierarchical matching in the runner.
  @categories = [
    { "id" => "food", "name" => "Food & Drink", "classification" => "expense" },
    { "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" }
  ]
end
test "run processes all samples and calculates metrics" do
dataset = Eval::Dataset.create!(
name: "test_cat_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample1 = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
sample2 = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"),
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
assert_equal "completed", result.status
assert_equal 2, result.results.count
assert result.metrics["accuracy"].present?
end
test "records correct result when category matches" do
dataset = Eval::Dataset.create!(
name: "test_cat_match_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.exact_match
assert_equal "Fast Food", result.actual_output["category_name"]
end
test "records hierarchical match when parent category returned" do
dataset = Eval::Dataset.create!(
name: "test_cat_hier_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_3", "amount" => 50, "classification" => "expense", "description" => "Olive Garden" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "medium"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_hierarchical",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
# Model returns parent category instead of subcategory
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert_not result.exact_match
assert result.hierarchical_match
end
test "handles null correctly when expected" do
dataset = Eval::Dataset.create!(
name: "test_cat_null_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.null_expected
assert result.null_returned
end
test "records error results on provider error but completes run" do
dataset = Eval::Dataset.create!(
name: "test_cat_err_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error"))
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
# Run completes but with error results
assert_equal "completed", result.status
assert_equal 1, result.results.count
error_result = result.results.find_by(eval_sample_id: sample.id)
assert_not error_result.correct
assert_includes error_result.actual_output["error"], "API Error"
end
end