mirror of
https://github.com/we-promise/sure
synced 2026-04-25 17:15:07 +02:00
Small llms improvements (#400)
* Initial implementation * FIX keys * Add langfuse evals support * FIX trace upload * Delete .claude/settings.local.json Signed-off-by: soky srm <sokysrm@gmail.com> * Update client.rb * Small LLMs improvements * Keep batch size normal * Update categorizer * FIX json mode * Add reasonable alternative to matching * FIX thinking blocks for llms * Implement json mode support with AUTO mode * Make auto default for everyone * FIX linter * Address review * Allow export manual categories * FIX user export * FIX oneshot example pollution * Update categorization_golden_v1.yml * Update categorization_golden_v1.yml * Trim to 100 items * Update auto_categorizer.rb * FIX for auto retry in auto mode * Separate the Eval Logic from the Auto-Categorizer The expected_null_count parameter conflates eval-specific logic with production categorization logic. * Force json mode on evals * Introduce a more mixed dataset 150 items, performance from a local model: By Difficulty: easy: 93.22% accuracy (55/59) medium: 93.33% accuracy (42/45) hard: 92.86% accuracy (26/28) edge_case: 100.0% accuracy (18/18) * Improve datasets Remove Data leakage from prompts * Create eval runs as "pending" --------- Signed-off-by: soky srm <sokysrm@gmail.com> Signed-off-by: Juan José Mata <juanjo.mata@gmail.com> Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
@@ -82,6 +82,10 @@ class Settings::HostingsController < ApplicationController
|
||||
Setting.openai_model = hosting_params[:openai_model]
|
||||
end
|
||||
|
||||
if hosting_params.key?(:openai_json_mode)
|
||||
Setting.openai_json_mode = hosting_params[:openai_json_mode].presence
|
||||
end
|
||||
|
||||
redirect_to settings_hosting_path, notice: t(".success")
|
||||
rescue Setting::ValidationError => error
|
||||
flash.now[:alert] = error.message
|
||||
@@ -95,7 +99,7 @@ class Settings::HostingsController < ApplicationController
|
||||
|
||||
private
|
||||
def hosting_params
|
||||
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :exchange_rate_provider, :securities_provider)
|
||||
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider)
|
||||
end
|
||||
|
||||
def ensure_admin
|
||||
|
||||
113
app/models/eval/dataset.rb
Normal file
113
app/models/eval/dataset.rb
Normal file
@@ -0,0 +1,113 @@
|
||||
class Eval::Dataset < ApplicationRecord
  # A named collection of evaluation samples (plus the runs executed against
  # them). Datasets are imported from / exported to YAML fixtures and each one
  # knows which runner/metrics classes evaluate it, based on eval_type.
  self.table_name = "eval_datasets"

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: %w[categorization merchant_detection chat] }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  #
  # Finds the dataset by name (or builds a new one), replaces all of its
  # samples with the file's contents, and refreshes the denormalized
  # sample_count. Runs inside a single transaction so a bad file cannot leave
  # the dataset half-imported.
  #
  # @param file_path [String] path to the YAML fixture
  # @return [Eval::Dataset] the saved dataset
  def self.import_from_yaml(file_path)
    data = YAML.load_file(file_path, permitted_classes: [ Symbol, Date, Time ])

    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!

      # Clear existing samples if reimporting
      dataset.samples.destroy_all

      # Shared context for all samples (a sample may override it below)
      shared_context = data["context"] || {}

      # Import samples
      samples_data = data["samples"] || []
      samples_data.each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end

      # Keep the denormalized counter in sync with the freshly imported rows
      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format (the inverse of import_from_yaml).
  # NOTE(review): the shared "context" is taken from the first sample's
  # context_data — assumes all samples share one context; verify for datasets
  # with per-sample overrides.
  # @return [String] YAML document
  def export_to_yaml
    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => samples.first&.context_data || {},
      "samples" => samples.map do |sample|
        {
          "id" => sample.id,
          "difficulty" => sample.difficulty,
          "tags" => sample.tags,
          "input" => sample.input_data,
          "expected" => sample.expected_output,
          "metadata" => sample.metadata
        }.compact
      end
    }.to_yaml
  end

  # Generate summary statistics.
  # @return [Hash] total sample count, counts per difficulty, and tag counts
  #   sorted by descending frequency
  def statistics
    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      by_tags: samples.flat_map(&:tags).tally.sort_by { |_, v| -v }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type.
  # @raise [RuntimeError] when eval_type is not one of the known types
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type.
  # @raise [RuntimeError] when eval_type is not one of the known types
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end
end
|
||||
226
app/models/eval/langfuse/client.rb
Normal file
226
app/models/eval/langfuse/client.rb
Normal file
@@ -0,0 +1,226 @@
|
||||
class Eval::Langfuse::Client
  # Minimal HTTP client for the Langfuse public API: datasets, dataset items,
  # dataset run items, traces (via the ingestion endpoint) and scores.
  # Authenticates with HTTP basic auth using a public/secret key pair that can
  # come from keyword arguments or LANGFUSE_* environment variables.
  BASE_URLS = {
    us: "https://us.cloud.langfuse.com/api/public",
    eu: "https://cloud.langfuse.com/api/public"
  }.freeze

  # TLS verification errors tolerated on OpenSSL 3.x (see execute_request).
  # Only CRL-availability problems are forgiven; every other verification
  # failure still rejects the connection.
  CRL_ERRORS = [
    OpenSSL::X509::V_ERR_UNABLE_TO_GET_CRL,
    OpenSSL::X509::V_ERR_CRL_HAS_EXPIRED,
    OpenSSL::X509::V_ERR_CRL_NOT_YET_VALID
  ].freeze

  class Error < StandardError; end
  class ConfigurationError < Error; end
  class ApiError < Error
    attr_reader :status, :body

    def initialize(message, status: nil, body: nil)
      super(message)
      @status = status
      @body = body
    end
  end

  # @param public_key [String, nil] falls back to ENV["LANGFUSE_PUBLIC_KEY"]
  # @param secret_key [String, nil] falls back to ENV["LANGFUSE_SECRET_KEY"]
  # @param region [String, Symbol, nil] :us or :eu (see BASE_URLS)
  # @param host [String, nil] explicit host, wins over region/env
  # @raise [ConfigurationError] when no credentials can be resolved
  def initialize(public_key: nil, secret_key: nil, region: nil, host: nil)
    @public_key = public_key || ENV["LANGFUSE_PUBLIC_KEY"]
    @secret_key = secret_key || ENV["LANGFUSE_SECRET_KEY"]
    @base_url = determine_base_url(region, host)

    validate_configuration!
  end

  # Dataset operations
  def create_dataset(name:, description: nil, metadata: {})
    post("/v2/datasets", {
      name: name,
      description: description,
      metadata: metadata
    }.compact)
  end

  def get_dataset(name:)
    get("/v2/datasets/#{encode(name)}")
  end

  def list_datasets(page: 1, limit: 50)
    get("/v2/datasets", page: page, limit: limit)
  end

  # Dataset item operations
  def create_dataset_item(dataset_name:, input:, expected_output: nil, metadata: {}, id: nil)
    post("/dataset-items", {
      datasetName: dataset_name,
      id: id,
      input: input,
      expectedOutput: expected_output,
      metadata: metadata
    }.compact)
  end

  def get_dataset_items(dataset_name:, page: 1, limit: 50)
    get("/dataset-items", datasetName: dataset_name, page: page, limit: limit)
  end

  # Dataset run operations (for experiments)
  def create_dataset_run_item(run_name:, dataset_item_id:, trace_id: nil, observation_id: nil, metadata: {})
    post("/dataset-run-items", {
      runName: run_name,
      datasetItemId: dataset_item_id,
      traceId: trace_id,
      observationId: observation_id,
      metadata: metadata
    }.compact)
  end

  # Trace operations
  # @return [String] the generated trace id (needed later to attach scores)
  def create_trace(name:, input: nil, output: nil, metadata: {}, session_id: nil, user_id: nil)
    # Generate trace ID upfront so we can return it
    trace_id = SecureRandom.uuid

    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "trace-create",
          timestamp: Time.current.iso8601,
          body: {
            id: trace_id,
            name: name,
            input: input,
            output: output,
            metadata: metadata,
            sessionId: session_id,
            userId: user_id
          }.compact
        }
      ]
    })

    # Return the trace ID we generated
    trace_id
  end

  # Score operations
  def create_score(trace_id:, name:, value:, comment: nil, data_type: "NUMERIC")
    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "score-create",
          timestamp: Time.current.iso8601,
          body: {
            id: SecureRandom.uuid,
            traceId: trace_id,
            name: name,
            value: value,
            comment: comment,
            dataType: data_type
          }.compact
        }
      ]
    })
  end

  # True when both API keys are present.
  def configured?
    @public_key.present? && @secret_key.present?
  end

  private

  def determine_base_url(region, host)
    # Priority: explicit host > LANGFUSE_HOST env > region > LANGFUSE_REGION env > default (eu)
    if host.present?
      host.chomp("/") + "/api/public"
    elsif ENV["LANGFUSE_HOST"].present?
      ENV["LANGFUSE_HOST"].chomp("/") + "/api/public"
    elsif region.present?
      BASE_URLS[region.to_sym] || BASE_URLS[:eu]
    elsif ENV["LANGFUSE_REGION"].present?
      BASE_URLS[ENV["LANGFUSE_REGION"].to_sym] || BASE_URLS[:eu]
    else
      # Default to EU as it's more common
      BASE_URLS[:eu]
    end
  end

  def validate_configuration!
    return if configured?

    raise ConfigurationError, <<~MSG
      Langfuse credentials not configured.
      Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY environment variables,
      or pass public_key and secret_key to the client.
    MSG
  end

  def get(path, params = {})
    uri = build_uri(path, params)
    request = Net::HTTP::Get.new(uri)
    execute_request(uri, request)
  end

  def post(path, body)
    uri = build_uri(path)
    request = Net::HTTP::Post.new(uri)
    request.body = body.to_json
    request["Content-Type"] = "application/json"
    execute_request(uri, request)
  end

  def build_uri(path, params = {})
    uri = URI("#{@base_url}#{path}")
    uri.query = URI.encode_www_form(params) if params.any?
    uri
  end

  # Sends the request with basic auth, retrying up to `retries` times on 429.
  # @raise [ApiError] on auth failures, missing resources, or other API errors
  def execute_request(uri, request, retries: 3)
    request.basic_auth(@public_key, @secret_key)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30
    http.open_timeout = 10

    # OpenSSL 3.x can fail chain verification when a certificate's CRL
    # distribution point is unreachable. Tolerate only CRL-availability
    # errors instead of accepting every certificate — the previous callback
    # returned true unconditionally, which silently disabled peer
    # verification despite VERIFY_PEER being set.
    # See: https://github.com/ruby/openssl/issues/619
    http.verify_mode = OpenSSL::SSL::VERIFY_PEER
    if OpenSSL::OPENSSL_VERSION_NUMBER >= 0x30000000
      http.verify_callback = lambda do |preverify_ok, store_ctx|
        next true if preverify_ok

        CRL_ERRORS.include?(store_ctx.error)
      end
    end

    response = http.request(request)

    case response.code.to_i
    when 200..299
      parse_json(response.body)
    when 401
      raise ApiError.new("Unauthorized - check your Langfuse API keys", status: 401, body: response.body)
    when 404
      raise ApiError.new("Resource not found", status: 404, body: response.body)
    when 409
      # Conflict - resource already exists, which is okay for idempotent operations
      parse_json(response.body)
    when 429
      # Rate limited - retry with exponential backoff
      if retries > 0
        retry_after = response["Retry-After"]&.to_i || (2 ** (3 - retries))
        Rails.logger.info("[Langfuse] Rate limited, waiting #{retry_after}s before retry...")
        sleep(retry_after)
        execute_request(uri, rebuild_request(request), retries: retries - 1)
      else
        raise ApiError.new("Rate limit exceeded after retries", status: 429, body: response.body)
      end
    else
      raise ApiError.new("API error: #{response.code} - #{response.body}", status: response.code.to_i, body: response.body)
    end
  end

  def rebuild_request(original_request)
    # Create a new request with the same properties (needed for retry since request body may be consumed)
    uri = URI(original_request.uri.to_s)
    new_request = original_request.class.new(uri)
    original_request.each_header { |key, value| new_request[key] = value }
    new_request.body = original_request.body
    new_request
  end

  # Best-effort JSON decoding: some endpoints return empty or non-JSON bodies
  # on success, which we normalize to an empty hash. TypeError covers a nil
  # body; only parse failures are swallowed (not arbitrary errors as before).
  def parse_json(body)
    JSON.parse(body)
  rescue JSON::ParserError, TypeError
    {}
  end

  def encode(value)
    ERB::Util.url_encode(value)
  end
end
|
||||
115
app/models/eval/langfuse/dataset_exporter.rb
Normal file
115
app/models/eval/langfuse/dataset_exporter.rb
Normal file
@@ -0,0 +1,115 @@
|
||||
class Eval::Langfuse::DatasetExporter
  # Exports a local Eval::Dataset (and all of its samples) to Langfuse so
  # experiments can reference it as a remote dataset. Safe to re-run: the
  # dataset create treats 409 conflicts as success and items reuse local
  # sample IDs for idempotency.
  attr_reader :dataset, :client

  # @param dataset [Eval::Dataset] the local dataset to export
  # @param client [Eval::Langfuse::Client, nil] optional preconfigured client
  def initialize(dataset, client: nil)
    @dataset = dataset
    @client = client || Eval::Langfuse::Client.new
  end

  # Create (or reuse) the Langfuse dataset, upload every sample as a dataset
  # item, and return a small summary.
  # @return [Hash] :dataset_name and :items_exported
  def export
    Rails.logger.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")

    # Create or update dataset in Langfuse
    create_langfuse_dataset

    # Export all samples as dataset items
    exported_count = export_samples

    Rails.logger.info("[Langfuse] Exported #{exported_count} items to dataset '#{langfuse_dataset_name}'")

    {
      dataset_name: langfuse_dataset_name,
      items_exported: exported_count
    }
  end

  private

  def langfuse_dataset_name
    # Use a consistent naming convention
    "eval_#{dataset.name}"
  end

  def create_langfuse_dataset
    client.create_dataset(
      name: langfuse_dataset_name,
      description: dataset.description || "Evaluation dataset: #{dataset.name}",
      metadata: {
        eval_type: dataset.eval_type,
        version: dataset.version,
        source: "sure_eval_framework",
        exported_at: Time.current.iso8601
      }
    )
  rescue Eval::Langfuse::Client::ApiError => e
    # Dataset might already exist (409 conflict), which is fine
    raise unless e.status == 409

    Rails.logger.info("[Langfuse] Dataset '#{langfuse_dataset_name}' already exists, updating items...")
  end

  # Uploads samples one at a time, throttled, and returns how many were sent.
  def export_samples
    count = 0

    dataset.samples.find_each do |sample|
      export_sample(sample)
      count += 1

      # Log progress every 25 samples
      if (count % 25).zero?
        Rails.logger.info("[Langfuse] Exported #{count}/#{dataset.sample_count} items...")
        print " Exported #{count}/#{dataset.sample_count} items...\r"
      end

      # Small delay to avoid rate limiting (Langfuse free tier has limits)
      sleep(0.1)
    end

    count
  end

  def export_sample(sample)
    client.create_dataset_item(
      dataset_name: langfuse_dataset_name,
      id: sample.id, # Use the same ID for idempotency
      input: build_input(sample),
      expected_output: build_expected_output(sample),
      metadata: build_metadata(sample)
    )
  end

  # Shapes the item input per eval type so downstream runners receive the
  # same structure they would build locally.
  def build_input(sample)
    case dataset.eval_type
    when "categorization"
      {
        transaction: sample.input_data,
        categories: sample.categories_context
      }
    when "merchant_detection"
      {
        transaction: sample.input_data,
        merchants: sample.merchants_context
      }
    when "chat"
      {
        prompt: sample.chat_prompt,
        mock_data: sample.mock_data
      }
    else
      sample.input_data
    end
  end

  def build_expected_output(sample)
    sample.expected_output
  end

  # Merge order means a sample's own metadata wins on key collisions.
  def build_metadata(sample)
    {
      difficulty: sample.difficulty,
      tags: sample.tags,
      eval_type: dataset.eval_type,
      sample_id: sample.id
    }.merge(sample.metadata || {})
  end
end
|
||||
468
app/models/eval/langfuse/experiment_runner.rb
Normal file
468
app/models/eval/langfuse/experiment_runner.rb
Normal file
@@ -0,0 +1,468 @@
|
||||
class Eval::Langfuse::ExperimentRunner
|
||||
attr_reader :dataset, :model, :provider, :client, :provider_config
|
||||
|
||||
BATCH_SIZE = 25
|
||||
|
||||
def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
|
||||
@dataset = dataset
|
||||
@model = model
|
||||
@provider = provider
|
||||
@client = client || Eval::Langfuse::Client.new
|
||||
@provider_config = provider_config
|
||||
end
|
||||
|
||||
def run(run_name: nil)
|
||||
@run_name = run_name || generate_run_name
|
||||
|
||||
Rails.logger.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
|
||||
Rails.logger.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
|
||||
Rails.logger.info("[Langfuse Experiment] Model: #{model}")
|
||||
|
||||
# Ensure dataset exists in Langfuse
|
||||
ensure_dataset_exported
|
||||
|
||||
# Get dataset items from Langfuse
|
||||
items = fetch_langfuse_items
|
||||
|
||||
# Run the experiment
|
||||
results = process_items(items)
|
||||
|
||||
# Calculate and report metrics
|
||||
metrics = calculate_metrics(results)
|
||||
|
||||
Rails.logger.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
|
||||
Rails.logger.info("[Langfuse Experiment] Accuracy: #{metrics[:accuracy]}%")
|
||||
|
||||
{
|
||||
run_name: @run_name,
|
||||
dataset_name: langfuse_dataset_name,
|
||||
model: model,
|
||||
samples_processed: results.size,
|
||||
metrics: metrics
|
||||
}
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def generate_run_name
|
||||
"#{dataset.name}_#{model.gsub('/', '_')}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
|
||||
end
|
||||
|
||||
def langfuse_dataset_name
|
||||
"eval_#{dataset.name}"
|
||||
end
|
||||
|
||||
def ensure_dataset_exported
|
||||
exporter = Eval::Langfuse::DatasetExporter.new(dataset, client: client)
|
||||
exporter.export
|
||||
end
|
||||
|
||||
def fetch_langfuse_items
|
||||
items = []
|
||||
page = 1
|
||||
|
||||
loop do
|
||||
response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: 50)
|
||||
batch = response["data"] || []
|
||||
items.concat(batch)
|
||||
|
||||
break if batch.size < 50
|
||||
|
||||
page += 1
|
||||
end
|
||||
|
||||
Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
|
||||
items
|
||||
end
|
||||
|
||||
def process_items(items)
|
||||
results = []
|
||||
|
||||
items.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
|
||||
Rails.logger.info("[Langfuse Experiment] Processing batch #{batch_idx + 1}/#{(items.size.to_f / BATCH_SIZE).ceil}")
|
||||
|
||||
batch_results = process_batch(batch)
|
||||
results.concat(batch_results)
|
||||
end
|
||||
|
||||
results
|
||||
end
|
||||
|
||||
def process_batch(items)
|
||||
case dataset.eval_type
|
||||
when "categorization"
|
||||
process_categorization_batch(items)
|
||||
when "merchant_detection"
|
||||
process_merchant_detection_batch(items)
|
||||
when "chat"
|
||||
process_chat_batch(items)
|
||||
else
|
||||
raise "Unsupported eval type: #{dataset.eval_type}"
|
||||
end
|
||||
end
|
||||
|
||||
def process_categorization_batch(items)
|
||||
transactions = items.map do |item|
|
||||
input = item["input"]
|
||||
txn = input["transaction"] || input
|
||||
txn.deep_symbolize_keys.merge(id: item["id"])
|
||||
end
|
||||
|
||||
categories = items.first.dig("input", "categories") || []
|
||||
categories = categories.map(&:deep_symbolize_keys)
|
||||
|
||||
# Determine effective JSON mode for this batch
|
||||
# If the batch has many expected nulls, force strict mode to prevent false retries
|
||||
effective_json_mode = json_mode_for_batch(items)
|
||||
|
||||
start_time = Time.current
|
||||
|
||||
response = llm_provider.auto_categorize(
|
||||
transactions: transactions,
|
||||
user_categories: categories,
|
||||
model: model,
|
||||
json_mode: effective_json_mode
|
||||
)
|
||||
|
||||
latency_ms = ((Time.current - start_time) * 1000).to_i
|
||||
|
||||
if response.success?
|
||||
items.map do |item|
|
||||
categorization = response.data.find { |c| c.transaction_id.to_s == item["id"].to_s }
|
||||
actual_category = normalize_null(categorization&.category_name)
|
||||
expected_category = item.dig("expectedOutput", "category_name")
|
||||
|
||||
correct = actual_category == expected_category
|
||||
score_value = correct ? 1.0 : 0.0
|
||||
|
||||
# Create trace and score in Langfuse
|
||||
trace_id = create_trace_for_item(item, actual_category, latency_ms)
|
||||
score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)
|
||||
|
||||
{
|
||||
item_id: item["id"],
|
||||
expected: expected_category,
|
||||
actual: actual_category,
|
||||
correct: correct,
|
||||
latency_ms: latency_ms / items.size
|
||||
}
|
||||
end
|
||||
else
|
||||
handle_batch_error(items, response.error)
|
||||
end
|
||||
rescue => e
|
||||
handle_batch_error(items, e)
|
||||
end
|
||||
|
||||
def process_merchant_detection_batch(items)
|
||||
transactions = items.map do |item|
|
||||
input = item["input"]
|
||||
txn = input["transaction"] || input
|
||||
txn.deep_symbolize_keys.merge(id: item["id"])
|
||||
end
|
||||
|
||||
merchants = items.first.dig("input", "merchants") || []
|
||||
merchants = merchants.map(&:deep_symbolize_keys)
|
||||
|
||||
start_time = Time.current
|
||||
|
||||
response = llm_provider.auto_detect_merchants(
|
||||
transactions: transactions,
|
||||
user_merchants: merchants,
|
||||
model: model
|
||||
)
|
||||
|
||||
latency_ms = ((Time.current - start_time) * 1000).to_i
|
||||
|
||||
if response.success?
|
||||
items.map do |item|
|
||||
detection = response.data.find { |m| m.transaction_id.to_s == item["id"].to_s }
|
||||
actual_name = normalize_null(detection&.business_name)
|
||||
actual_url = normalize_null(detection&.business_url)
|
||||
expected_name = item.dig("expectedOutput", "business_name")
|
||||
expected_url = item.dig("expectedOutput", "business_url")
|
||||
|
||||
name_match = actual_name == expected_name
|
||||
url_match = normalize_url(actual_url) == normalize_url(expected_url)
|
||||
correct = name_match && url_match
|
||||
score_value = correct ? 1.0 : 0.0
|
||||
|
||||
# Create trace and score in Langfuse
|
||||
actual_output = { business_name: actual_name, business_url: actual_url }
|
||||
trace_id = create_trace_for_item(item, actual_output, latency_ms)
|
||||
score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])
|
||||
|
||||
{
|
||||
item_id: item["id"],
|
||||
expected: { name: expected_name, url: expected_url },
|
||||
actual: { name: actual_name, url: actual_url },
|
||||
correct: correct,
|
||||
latency_ms: latency_ms / items.size
|
||||
}
|
||||
end
|
||||
else
|
||||
handle_batch_error(items, response.error)
|
||||
end
|
||||
rescue => e
|
||||
handle_batch_error(items, e)
|
||||
end
|
||||
|
||||
def process_chat_batch(items)
|
||||
# Chat is processed one at a time due to function calling complexity
|
||||
items.map do |item|
|
||||
process_chat_item(item)
|
||||
end
|
||||
end
|
||||
|
||||
def process_chat_item(item)
|
||||
prompt = item.dig("input", "prompt")
|
||||
expected_functions = item.dig("expectedOutput", "functions") || []
|
||||
|
||||
start_time = Time.current
|
||||
|
||||
response = llm_provider.chat_response(
|
||||
prompt,
|
||||
model: model,
|
||||
instructions: "You are a helpful personal finance assistant.",
|
||||
functions: build_available_functions
|
||||
)
|
||||
|
||||
latency_ms = ((Time.current - start_time) * 1000).to_i
|
||||
|
||||
actual_functions = extract_function_calls(response)
|
||||
correct = evaluate_function_match(actual_functions, expected_functions)
|
||||
score_value = correct ? 1.0 : 0.0
|
||||
|
||||
# Create trace and score in Langfuse
|
||||
trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
|
||||
score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)
|
||||
|
||||
{
|
||||
item_id: item["id"],
|
||||
expected: expected_functions,
|
||||
actual: actual_functions,
|
||||
correct: correct,
|
||||
latency_ms: latency_ms
|
||||
}
|
||||
rescue => e
|
||||
handle_item_error(item, e)
|
||||
end
|
||||
|
||||
def create_trace_for_item(item, output, latency_ms)
|
||||
trace_id = client.create_trace(
|
||||
name: "#{dataset.eval_type}_eval",
|
||||
input: item["input"],
|
||||
output: output,
|
||||
metadata: {
|
||||
run_name: @run_name,
|
||||
model: model,
|
||||
latency_ms: latency_ms,
|
||||
dataset_item_id: item["id"]
|
||||
}
|
||||
)
|
||||
|
||||
Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
|
||||
trace_id
|
||||
end
|
||||
|
||||
def score_result(trace_id, item_id, score_value, correct, actual, expected)
|
||||
return unless trace_id
|
||||
|
||||
# Score the accuracy
|
||||
client.create_score(
|
||||
trace_id: trace_id,
|
||||
name: "accuracy",
|
||||
value: score_value,
|
||||
comment: correct ? "Correct" : "Expected: #{expected.inspect}, Got: #{actual.inspect}"
|
||||
)
|
||||
|
||||
# Link to dataset run
|
||||
client.create_dataset_run_item(
|
||||
run_name: @run_name,
|
||||
dataset_item_id: item_id,
|
||||
trace_id: trace_id,
|
||||
metadata: {
|
||||
correct: correct,
|
||||
actual: actual,
|
||||
expected: expected
|
||||
}
|
||||
)
|
||||
rescue => e
|
||||
Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
|
||||
end
|
||||
|
||||
def handle_batch_error(items, error)
|
||||
error_message = error.is_a?(Exception) ? error.message : error.to_s
|
||||
Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")
|
||||
|
||||
items.map do |item|
|
||||
{
|
||||
item_id: item["id"],
|
||||
expected: item["expectedOutput"],
|
||||
actual: { error: error_message },
|
||||
correct: false,
|
||||
latency_ms: 0
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
def handle_item_error(item, error)
|
||||
Rails.logger.error("[Langfuse Experiment] Item #{item['id']} error: #{error.message}")
|
||||
|
||||
{
|
||||
item_id: item["id"],
|
||||
expected: item["expectedOutput"],
|
||||
actual: { error: error.message },
|
||||
correct: false,
|
||||
latency_ms: 0
|
||||
}
|
||||
end
|
||||
|
||||
def calculate_metrics(results)
|
||||
total = results.size
|
||||
|
||||
# Guard against empty results to avoid division by zero
|
||||
if total.zero?
|
||||
return {
|
||||
accuracy: 0.0,
|
||||
total: 0,
|
||||
correct: 0,
|
||||
incorrect: 0,
|
||||
avg_latency_ms: 0
|
||||
}
|
||||
end
|
||||
|
||||
correct = results.count { |r| r[:correct] }
|
||||
avg_latency = results.sum { |r| r[:latency_ms] } / total.to_f
|
||||
|
||||
{
|
||||
accuracy: (correct.to_f / total * 100).round(2),
|
||||
total: total,
|
||||
correct: correct,
|
||||
incorrect: total - correct,
|
||||
avg_latency_ms: avg_latency.round(0)
|
||||
}
|
||||
end
|
||||
|
||||
def llm_provider
|
||||
@llm_provider ||= build_provider
|
||||
end
|
||||
|
||||
def build_provider
|
||||
case provider
|
||||
when "openai"
|
||||
access_token = provider_config[:access_token] ||
|
||||
ENV["OPENAI_ACCESS_TOKEN"] ||
|
||||
Setting.openai_access_token
|
||||
|
||||
raise "OpenAI access token not configured" unless access_token.present?
|
||||
|
||||
uri_base = provider_config[:uri_base] ||
|
||||
ENV["OPENAI_URI_BASE"] ||
|
||||
Setting.openai_uri_base
|
||||
|
||||
Provider::Openai.new(access_token, uri_base: uri_base, model: model)
|
||||
else
|
||||
raise "Unsupported provider: #{provider}"
|
||||
end
|
||||
end
|
||||
|
||||
# Determine the effective JSON mode for a batch based on expected null ratio
|
||||
# This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
|
||||
def json_mode_for_batch(items)
|
||||
# If a specific mode is configured (not "auto"), always use it
|
||||
configured_mode = provider_config[:json_mode]
|
||||
return configured_mode if configured_mode.present? && configured_mode != "auto"
|
||||
|
||||
# Calculate expected null ratio for this batch
|
||||
expected_null_count = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
|
||||
expected_null_ratio = expected_null_count.to_f / items.size
|
||||
|
||||
# If >50% of the batch is expected to return null, force strict mode
|
||||
# This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
|
||||
# and prevents unnecessary retries when nulls are legitimate
|
||||
if expected_null_ratio > 0.5
|
||||
Rails.logger.info("[Langfuse Experiment] Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode")
|
||||
"strict"
|
||||
else
|
||||
# Use auto mode - let the auto-categorizer decide
|
||||
"auto"
|
||||
end
|
||||
end
|
||||
|
||||
def normalize_null(value)
|
||||
return nil if value.nil?
|
||||
return nil if value == "null"
|
||||
return nil if value.to_s.strip.empty?
|
||||
value
|
||||
end
|
||||
|
||||
def normalize_url(url)
|
||||
return nil if url.nil?
|
||||
url.to_s.downcase
|
||||
.gsub(/^(https?:\/\/)?(www\.)?/, "")
|
||||
.chomp("/")
|
||||
.strip
|
||||
end
|
||||
|
||||
def build_available_functions
|
||||
# Simplified function definitions for chat eval
|
||||
[
|
||||
{
|
||||
name: "get_accounts",
|
||||
description: "Get user's financial accounts",
|
||||
params_schema: { type: "object", properties: {}, required: [] }
|
||||
},
|
||||
{
|
||||
name: "get_transactions",
|
||||
description: "Get transactions with optional filters",
|
||||
params_schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
account_id: { type: "string" },
|
||||
start_date: { type: "string" },
|
||||
end_date: { type: "string" },
|
||||
category: { type: "string" }
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
name: "get_balance_summary",
|
||||
description: "Get balance summary across accounts",
|
||||
params_schema: { type: "object", properties: {} }
|
||||
},
|
||||
{
|
||||
name: "get_spending_by_category",
|
||||
description: "Get spending breakdown by category",
|
||||
params_schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
start_date: { type: "string" },
|
||||
end_date: { type: "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
end
|
||||
|
||||
def extract_function_calls(response)
|
||||
return [] unless response.respond_to?(:messages)
|
||||
|
||||
response.messages.flat_map do |msg|
|
||||
next [] unless msg.respond_to?(:function_calls)
|
||||
msg.function_calls.map do |fc|
|
||||
{ name: fc.name, arguments: fc.arguments }
|
||||
end
|
||||
end.compact
|
||||
end
|
||||
|
||||
# Order-insensitive comparison of called vs expected function sets.
# Accepts hashes keyed by either "name" or :name. Matches only when both
# sides name exactly the same multiset of functions.
def evaluate_function_match(actual, expected)
  return actual.empty? if expected.empty?
  return false if actual.empty?

  name_of = ->(fn) { fn["name"] || fn[:name] }
  expected.map(&name_of).sort == actual.map(&name_of).sort
end
|
||||
end
|
||||
68
app/models/eval/metrics/base.rb
Normal file
68
app/models/eval/metrics/base.rb
Normal file
@@ -0,0 +1,68 @@
|
||||
# Abstract base for eval-metric calculators. Holds an Eval::Run and exposes
# shared counting/aggregation helpers; subclasses implement #calculate and
# return a metrics hash.
class Eval::Metrics::Base
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Subclass hook: compute and return the metrics hash for this run.
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # All results for the run, with samples preloaded to avoid N+1 queries.
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  def samples
    @samples ||= eval_run.dataset.samples
  end

  def total_count
    results.count
  end

  def correct_count
    results.where(correct: true).count
  end

  def incorrect_count
    results.where(correct: false).count
  end

  # Percent of results marked correct; 0.0 when nothing was processed.
  def accuracy
    return 0.0 if total_count.zero?
    (correct_count.to_f / total_count * 100).round(2)
  end

  # Mean per-sample latency in ms; nil when there are no results.
  def avg_latency_ms
    return nil if total_count.zero?
    results.average(:latency_ms)&.round(0)
  end

  # Summed cost across results, as a Float rounded to 6 decimals.
  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  def cost_per_sample
    return nil if total_count.zero?
    (total_cost / total_count).round(6)
  end

  # Per-difficulty accuracy breakdown; buckets with no results are dropped.
  def metrics_by_difficulty
    %w[easy medium hard edge_case].index_with do |difficulty|
      bucket = results.joins(:sample).where(eval_samples: { difficulty: difficulty })
      next nil if bucket.empty?

      bucket_correct = bucket.where(correct: true).count
      bucket_total = bucket.count

      {
        count: bucket_total,
        correct: bucket_correct,
        accuracy: (bucket_correct.to_f / bucket_total * 100).round(2)
      }
    end.compact
  end
end
|
||||
101
app/models/eval/metrics/categorization_metrics.rb
Normal file
101
app/models/eval/metrics/categorization_metrics.rb
Normal file
@@ -0,0 +1,101 @@
|
||||
# Metrics for transaction auto-categorization eval runs.
# Adds precision/recall/F1, null handling, and per-category breakdowns on
# top of the counters from Eval::Metrics::Base.
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Percentage of results that exactly match the primary expected category.
  def exact_match_accuracy
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Number of results that matched an acceptable alternative category
  # rather than the primary expected one.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where a null category was expected.
  # Vacuously 100% when no such samples exist.
  def null_accuracy
    null_expected_results = results.where(null_expected: true)
    return 100.0 if null_expected_results.empty?

    correct = null_expected_results.where(null_returned: true).count
    (correct.to_f / null_expected_results.count * 100).round(2)
  end

  # Percentage of results matching at the hierarchy level (exact included).
  def hierarchical_accuracy
    return 0.0 if total_count.zero?
    (results.where(hierarchical_match: true).count.to_f / total_count * 100).round(2)
  end

  # Precision = TP / (TP + FP), as a percentage.
  #   TP: correct non-null predictions
  #   FP: wrong-category non-null predictions
  # Memoized: invoked from both #calculate and #f1_score, and each
  # evaluation issued two COUNT queries.
  def precision
    @precision ||= begin
      true_positives = results.where(correct: true, null_returned: false).count
      false_positives = results.where(correct: false, null_returned: false).count
      denominator = true_positives + false_positives

      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # Recall = TP / (TP + FN), as a percentage.
  #   TP: correct non-null predictions
  #   FN: returned null when a category was expected
  # Memoized for the same reason as #precision.
  def recall
    @recall ||= begin
      true_positives = results.where(correct: true, null_returned: false).count
      false_negatives = results.where(null_expected: false, null_returned: true).count
      denominator = true_positives + false_negatives

      denominator.zero? ? 0.0 : (true_positives.to_f / denominator * 100).round(2)
    end
  end

  # Harmonic mean of precision and recall; 0.0 when either is 0.
  def f1_score
    return 0.0 if precision.zero? || recall.zero?
    (2 * precision * recall / (precision + recall)).round(2)
  end

  # Per-expected-category accuracy, keyed by category name ("null" for
  # samples expecting no category).
  def metrics_by_category
    category_metrics = {}

    results.includes(:sample).each do |result|
      key = result.sample.expected_category_name || "null"

      category_metrics[key] ||= { correct: 0, total: 0 }
      category_metrics[key][:total] += 1
      category_metrics[key][:correct] += 1 if result.correct
    end

    category_metrics.transform_values do |metrics|
      metrics.merge(
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2)
      )
    end
  end
end
|
||||
125
app/models/eval/metrics/chat_metrics.rb
Normal file
125
app/models/eval/metrics/chat_metrics.rb
Normal file
@@ -0,0 +1,125 @@
|
||||
# Metrics for chat/function-calling eval runs: function selection accuracy,
# parameter accuracy, response relevance, and error rates.
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      function_selection_accuracy: function_selection_accuracy,
      parameter_accuracy: parameter_accuracy,
      response_relevance: response_relevance,
      exact_match_rate: exact_match_rate,
      error_rate: error_rate,
      avg_functions_per_response: avg_functions_per_response,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_function: metrics_by_function
    }
  end

  private

  # Results whose metadata carries no error marker. Memoized; replaces the
  # original double negative `where.not("metadata->>'error' IS NOT NULL")`
  # (same semantics, far clearer) that was repeated in three methods.
  def error_free_results
    @error_free_results ||= results.where("metadata->>'error' IS NULL")
  end

  # Percentage of error-free samples where the expected functions were called.
  def function_selection_accuracy
    return 0.0 if error_free_results.empty?

    correct = error_free_results.count do |r|
      r.metadata["function_selection_correct"] == true
    end

    (correct.to_f / error_free_results.count * 100).round(2)
  end

  # Mean per-sample parameter accuracy (stored 0..1), as a percentage.
  def parameter_accuracy
    return 0.0 if error_free_results.empty?

    scores = error_free_results.map { |r| r.metadata["parameter_accuracy"] || 0.0 }

    # .to_f guards against integer division when stored scores are integers.
    (scores.sum.to_f / scores.size * 100).round(2)
  end

  # Percentage of error-free samples whose response contained the expected
  # keywords; samples expecting no keywords count as relevant.
  def response_relevance
    return 0.0 if error_free_results.empty?

    relevant = error_free_results.count do |r|
      expected_keywords = r.metadata["expected_keywords"] || []
      expected_keywords.empty? || r.metadata["response_keywords_found"] == true
    end

    (relevant.to_f / error_free_results.count * 100).round(2)
  end

  def exact_match_rate
    return 0.0 if total_count.zero?
    (results.where(exact_match: true).count.to_f / total_count * 100).round(2)
  end

  # Percentage of results flagged with an error in metadata or actual output.
  def error_rate
    return 0.0 if total_count.zero?

    errors = results.count do |r|
      r.metadata["error"].present? || r.actual_output["error"].present?
    end

    (errors.to_f / total_count * 100).round(2)
  end

  # Mean number of function calls per non-errored response.
  def avg_functions_per_response
    valid_results = results.where("actual_output->>'error' IS NULL")
    return 0.0 if valid_results.empty?

    total_functions = valid_results.sum do |r|
      (r.actual_output["functions"] || []).size
    end

    (total_functions.to_f / valid_results.count).round(2)
  end

  # Per-function call accuracy and average parameter accuracy, keyed by the
  # expected function name.
  def metrics_by_function
    function_metrics = {}

    results.includes(:sample).each do |result|
      result.sample.expected_functions.each do |func|
        name = func["name"]
        next if name.nil?

        entry = (function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 })
        entry[:total] += 1

        # Credit the function when the model actually called it.
        actual_functions = result.actual_output["functions"] || []
        if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) }
          entry[:correct] += 1
          entry[:param_accuracy_sum] += (result.metadata["parameter_accuracy"] || 0.0)
        end
      end
    end

    function_metrics.transform_values do |metrics|
      {
        total: metrics[:total],
        correct: metrics[:correct],
        accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
        # .to_f guards against integer division on the accumulated sum.
        avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum].to_f / metrics[:correct] * 100).round(2) : 0.0
      }
    end
  end

  # Case/format-insensitive name comparison ("getAccounts" == "get_accounts").
  def normalize_name(name)
    return nil if name.nil?
    name.to_s.underscore.downcase
  end
end
|
||||
107
app/models/eval/metrics/merchant_detection_metrics.rb
Normal file
107
app/models/eval/metrics/merchant_detection_metrics.rb
Normal file
@@ -0,0 +1,107 @@
|
||||
# Metrics for merchant-detection eval runs: exact and fuzzy name matching,
# URL matching, and null handling (false positive/negative rates).
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Minimum fuzzy similarity score that counts as a name match.
  FUZZY_MATCH_THRESHOLD = 0.8

  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private

  # Results whose sample expects a non-null business name. Memoized: used
  # by both #name_accuracy and #fuzzy_name_accuracy (was computed twice).
  def named_merchant_results
    @named_merchant_results ||= results.includes(:sample).select do |r|
      r.sample.expected_business_name.present?
    end
  end

  # Exact business-name match rate; vacuously 100% with no named samples.
  # (The original `actual_output.dig("business_name") || actual_output["business_name"]`
  # was redundant — for a single key, Hash#dig is Hash#[].)
  def name_accuracy
    return 100.0 if named_merchant_results.empty?

    correct = named_merchant_results.count do |r|
      r.actual_output["business_name"] == r.sample.expected_business_name
    end

    (correct.to_f / named_merchant_results.size * 100).round(2)
  end

  # Fuzzy name match rate: fuzzy_score >= FUZZY_MATCH_THRESHOLD.
  def fuzzy_name_accuracy
    return 100.0 if named_merchant_results.empty?

    correct = named_merchant_results.count do |r|
      (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD
    end

    (correct.to_f / named_merchant_results.size * 100).round(2)
  end

  # Normalized-URL match rate for samples expecting a business URL.
  def url_accuracy
    url_results = results.includes(:sample).select do |r|
      r.sample.expected_business_url.present?
    end

    return 100.0 if url_results.empty?

    correct = url_results.count do |r|
      normalize_url(r.actual_output["business_url"]) == normalize_url(r.sample.expected_business_url)
    end

    (correct.to_f / url_results.size * 100).round(2)
  end

  # Rate of hallucinating a merchant when null was expected.
  def false_positive_rate
    null_expected_results = results.where(null_expected: true)
    return 0.0 if null_expected_results.empty?

    false_positives = null_expected_results.where(null_returned: false).count
    (false_positives.to_f / null_expected_results.count * 100).round(2)
  end

  # Rate of returning null when a merchant was expected.
  def false_negative_rate
    merchant_expected_results = results.where(null_expected: false)
    return 0.0 if merchant_expected_results.empty?

    false_negatives = merchant_expected_results.where(null_returned: true).count
    (false_negatives.to_f / merchant_expected_results.count * 100).round(2)
  end

  # Mean fuzzy score across results that have one; nil when none do.
  def avg_fuzzy_score
    scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
    return nil if scores.empty?

    # .to_f guards against integer division and normalizes decimals to Float,
    # matching the other Float-valued metrics.
    (scores.sum.to_f / scores.size).round(4)
  end

  # Canonicalize URLs before comparison.
  # Fixes the original's ordering bug: whitespace is stripped FIRST so the
  # \A-anchored scheme/www strip and the trailing-slash chomp actually apply.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.strip.downcase
      .gsub(/\A(https?:\/\/)?(www\.)?/, "")
      .chomp("/")
  end
end
|
||||
205
app/models/eval/reporters/comparison_reporter.rb
Normal file
205
app/models/eval/reporters/comparison_reporter.rb
Normal file
@@ -0,0 +1,205 @@
|
||||
# Side-by-side reporting across multiple eval runs: ASCII table for the
# terminal, CSV export, pairwise diffs, and best-model recommendations.
class Eval::Reporters::ComparisonReporter
  attr_reader :runs

  # @param runs [Eval::Run, Array<Eval::Run>] runs to compare (sorted by model)
  def initialize(runs)
    @runs = Array(runs).sort_by(&:model)
  end

  # Render an ASCII table of the runs for terminal display.
  def to_table
    return "No runs to compare" if runs.empty?

    headers = build_headers
    rows = runs.map { |run| build_row(run) }

    # Size each column to its widest cell, header included.
    all_rows = [ headers ] + rows
    widths = headers.each_index.map do |i|
      all_rows.map { |row| row[i].to_s.length }.max
    end

    separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+"

    lines = []
    lines << separator
    lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |"
    lines << separator

    rows.each do |row|
      lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |"
    end

    lines << separator
    lines.join("\n")
  end

  # Write one CSV row per run to file_path. Returns the path.
  def to_csv(file_path)
    require "csv"

    CSV.open(file_path, "wb") do |csv|
      csv << csv_headers
      runs.each { |run| csv << csv_row(run) }
    end

    file_path
  end

  # Best-in-class picks (accuracy, cost, speed) plus a textual
  # recommendation. Only completed runs with metrics are considered.
  def summary
    return {} if runs.empty?

    completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? }
    return {} if completed_runs.empty?

    best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 }
    lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY }
    fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY }

    {
      best_accuracy: {
        model: best_accuracy.model,
        value: best_accuracy.metrics["accuracy"],
        run_id: best_accuracy.id
      },
      lowest_cost: {
        model: lowest_cost.model,
        value: lowest_cost.total_cost&.to_f,
        run_id: lowest_cost.id
      },
      fastest: {
        model: fastest.model,
        value: fastest.metrics["avg_latency_ms"],
        run_id: fastest.id
      },
      recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest)
    }
  end

  # Full comparison payload: per-run summaries, pairwise diffs, and summary.
  def detailed_comparison
    return {} if runs.empty?

    {
      runs: runs.map(&:summary),
      comparison: pairwise_comparisons,
      summary: summary
    }
  end

  private

  def build_headers
    [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ]
  end

  def build_row(run)
    metrics = run.metrics || {}

    [
      run.model,
      run.status,
      format_percentage(metrics["accuracy"]),
      format_percentage(metrics["precision"]),
      format_percentage(metrics["recall"]),
      format_percentage(metrics["f1_score"]),
      metrics["avg_latency_ms"]&.round(0) || "-",
      format_cost(run.total_cost),
      run.results.count
    ]
  end

  def csv_headers
    [
      "Run ID", "Model", "Provider", "Dataset", "Status",
      "Accuracy", "Precision", "Recall", "F1 Score",
      "Null Accuracy", "Hierarchical Accuracy",
      "Avg Latency (ms)", "Total Cost", "Cost Per Sample",
      "Samples Processed", "Samples Correct",
      "Duration (s)", "Run Date"
    ]
  end

  def csv_row(run)
    metrics = run.metrics || {}

    [
      run.id,
      run.model,
      run.provider,
      run.dataset.name,
      run.status,
      metrics["accuracy"],
      metrics["precision"],
      metrics["recall"],
      metrics["f1_score"],
      metrics["null_accuracy"],
      metrics["hierarchical_accuracy"],
      metrics["avg_latency_ms"],
      run.total_cost&.to_f,
      metrics["cost_per_sample"],
      metrics["samples_processed"],
      metrics["samples_correct"],
      run.duration_seconds,
      run.completed_at&.iso8601
    ]
  end

  def format_percentage(value)
    return "-" if value.nil?
    "#{value}%"
  end

  def format_cost(value)
    return "-" if value.nil?
    "$#{value.to_f.round(4)}"
  end

  # Accuracy/cost/latency deltas for every pair of runs.
  # Fixed: guard metrics with `|| {}` — pending/failed runs can have nil
  # metrics, which previously raised NoMethodError here (every other method
  # in this class already guards this way).
  def pairwise_comparisons
    return [] if runs.size < 2

    runs.combination(2).map do |run1, run2|
      m1 = run1.metrics || {}
      m2 = run2.metrics || {}

      {
        models: [ run1.model, run2.model ],
        accuracy_diff: ((m1["accuracy"] || 0) - (m2["accuracy"] || 0)).round(2),
        cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6),
        latency_diff: ((m1["avg_latency_ms"] || 0) - (m2["avg_latency_ms"] || 0)).round(0)
      }
    end
  end

  # Build a human-readable recommendation from the three best-in-class runs.
  # All three arguments come from #summary's completed set, so metrics are
  # known to be present here.
  def generate_recommendation(best_accuracy, lowest_cost, fastest)
    parts = []

    # One model winning every category is an easy call.
    if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id
      return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)."
    end

    if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90
      parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)"
    end

    # Suggest the cheaper model only when it is meaningfully (>1.5x) cheaper.
    if lowest_cost.total_cost && lowest_cost.total_cost > 0
      cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost
      if cost_ratio > 1.5
        parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})"
      end
    end

    # Suggest the faster model only when it is meaningfully (>1.5x) faster.
    if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id
      latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1)
      if latency_ratio > 1.5
        parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)"
      end
    end

    parts.empty? ? "All models perform similarly." : parts.join(". ")
  end
end
|
||||
70
app/models/eval/result.rb
Normal file
70
app/models/eval/result.rb
Normal file
@@ -0,0 +1,70 @@
|
||||
# A single sample's outcome within an eval run. actual_output is a JSON hash
# whose shape depends on the eval type (categorization / merchant detection /
# chat).
#
# NOTE: the original accessors used `actual_output.dig("key") ||
# actual_output["key"]` — redundant, since for a single key Hash#dig is
# exactly Hash#[]; simplified below with identical behavior.
class Eval::Result < ApplicationRecord
  self.table_name = "eval_results"

  belongs_to :run, class_name: "Eval::Run", foreign_key: :eval_run_id
  belongs_to :sample, class_name: "Eval::Sample", foreign_key: :eval_sample_id

  validates :actual_output, presence: true
  validates :correct, inclusion: { in: [ true, false ] }

  scope :correct, -> { where(correct: true) }
  scope :incorrect, -> { where(correct: false) }
  scope :with_nulls_returned, -> { where(null_returned: true) }
  scope :with_nulls_expected, -> { where(null_expected: true) }
  scope :exact_matches, -> { where(exact_match: true) }
  scope :hierarchical_matches, -> { where(hierarchical_match: true) }

  # Predicted category name (categorization runs).
  def actual_category_name
    actual_output["category_name"]
  end

  # Detected merchant name (merchant-detection runs).
  def actual_business_name
    actual_output["business_name"]
  end

  # Detected merchant URL (merchant-detection runs).
  def actual_business_url
    actual_output["business_url"]
  end

  # Functions the model called (chat runs); [] when absent.
  def actual_functions
    actual_output["functions"] || []
  end

  # Free-text model response (chat runs).
  def actual_response_text
    actual_output["response_text"]
  end

  # Compact representation for listings.
  def summary
    {
      sample_id: sample_id,
      correct: correct,
      exact_match: exact_match,
      expected: sample.expected_output,
      actual: actual_output,
      latency_ms: latency_ms,
      cost: cost&.to_f
    }
  end

  # Full expected-vs-actual breakdown for drill-down views.
  def detailed_comparison
    {
      sample_difficulty: sample.difficulty,
      sample_tags: sample.tags,
      input: sample.input_data,
      expected: sample.expected_output,
      actual: actual_output,
      correct: correct,
      exact_match: exact_match,
      hierarchical_match: hierarchical_match,
      null_expected: null_expected,
      null_returned: null_returned,
      fuzzy_score: fuzzy_score
    }
  end
end
|
||||
88
app/models/eval/run.rb
Normal file
88
app/models/eval/run.rb
Normal file
@@ -0,0 +1,88 @@
|
||||
# One evaluation run: a dataset evaluated against a provider/model pair.
# Lifecycle: pending -> running -> completed | failed.
class Eval::Run < ApplicationRecord
  self.table_name = "eval_runs"

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy

  validates :provider, :model, :status, presence: true
  validates :status, inclusion: { in: %w[pending running completed failed] }

  scope :pending, -> { where(status: "pending") }
  scope :running, -> { where(status: "running") }
  scope :completed, -> { where(status: "completed") }
  scope :failed, -> { where(status: "failed") }
  scope :for_model, ->(model) { where(model: model) }
  scope :for_provider, ->(provider) { where(provider: provider) }

  # Wall-clock duration in whole seconds; nil until the run has both started
  # and finished.
  def duration_seconds
    return nil unless started_at && completed_at
    (completed_at - started_at).to_i
  end

  # Stored accuracy from the metrics snapshot, falling back to a live
  # calculation. Safe navigation added: runs are created as "pending" and may
  # not have a metrics hash yet, which previously raised NoMethodError.
  def accuracy
    metrics&.dig("accuracy") || calculate_accuracy
  end

  # Transition pending -> running, stamping the start time.
  def start!
    update!(status: "running", started_at: Time.current)
  end

  # Transition to completed, persisting metrics and token/cost totals.
  def complete!(calculated_metrics)
    update!(
      status: "completed",
      completed_at: Time.current,
      metrics: calculated_metrics,
      total_prompt_tokens: results.sum(:prompt_tokens),
      total_completion_tokens: results.sum(:completion_tokens),
      total_cost: results.sum(:cost)
    )
  end

  # Transition to failed, recording the error class and message.
  def fail!(error)
    update!(
      status: "failed",
      completed_at: Time.current,
      error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s
    )
  end

  # Compact representation for listings and reports.
  def summary
    {
      id: id,
      name: name,
      dataset: dataset.name,
      model: model,
      provider: provider,
      status: status,
      accuracy: accuracy,
      total_cost: total_cost&.to_f,
      duration: duration_seconds,
      samples_processed: results.count,
      samples_correct: results.where(correct: true).count,
      created_at: created_at
    }
  end

  # Accuracy/cost deltas versus another run.
  def compare_to(other_run)
    {
      accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0),
      cost_diff: (total_cost || 0) - (other_run.total_cost || 0),
      this_model: model,
      other_model: other_run.model
    }
  end

  private

  # Live accuracy from result rows; 0.0 when there are none.
  def calculate_accuracy
    return 0.0 if results.empty?
    (results.where(correct: true).count.to_f / results.count * 100).round(2)
  end
end
|
||||
82
app/models/eval/runners/base.rb
Normal file
82
app/models/eval/runners/base.rb
Normal file
@@ -0,0 +1,82 @@
|
||||
# Template-method base class for eval runners. Subclasses implement
# #process_samples and #calculate_metrics; #run drives the run lifecycle
# (running -> completed/failed) around them.
class Eval::Runners::Base
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Execute the run: mark it running, process every sample, then persist
  # the computed metrics. Any error marks the run failed and is re-raised.
  # Returns the eval_run.
  def run
    eval_run.start!

    begin
      process_samples
      eval_run.complete!(calculate_metrics)
    rescue => e
      eval_run.fail!(e)
      raise
    end

    eval_run
  end

  protected

  # Subclass hook: evaluate every sample and record results.
  def process_samples
    raise NotImplementedError, "Subclasses must implement #process_samples"
  end

  # Subclass hook: return the metrics hash passed to Eval::Run#complete!.
  def calculate_metrics
    raise NotImplementedError, "Subclasses must implement #calculate_metrics"
  end

  def samples
    eval_run.dataset.samples
  end

  # Lazily-built LLM provider client for this run.
  def provider
    @provider ||= build_provider
  end

  def model
    eval_run.model
  end

  private

  def build_provider
    case eval_run.provider
    when "openai" then build_openai_provider
    else raise "Unsupported provider: #{eval_run.provider}"
    end
  end

  # Resolve OpenAI credentials with precedence: per-run config, then ENV,
  # then the global Setting.
  def build_openai_provider
    token = eval_run.provider_config["access_token"].presence ||
      ENV["OPENAI_ACCESS_TOKEN"].presence ||
      Setting.openai_access_token

    raise "OpenAI access token not configured" unless token.present?

    base_url = eval_run.provider_config["uri_base"].presence ||
      ENV["OPENAI_URI_BASE"].presence ||
      Setting.openai_uri_base

    Provider::Openai.new(token, uri_base: base_url, model: model)
  end

  # Persist one result row for this run.
  def record_result(sample:, actual_output:, correct:, **attributes)
    eval_run.results.create!(
      sample: sample,
      actual_output: actual_output,
      correct: correct,
      **attributes
    )
  end

  def log_progress(message)
    Rails.logger.info("[Eval::Runner] #{message}")
  end
end
|
||||
199
app/models/eval/runners/categorization_runner.rb
Normal file
199
app/models/eval/runners/categorization_runner.rb
Normal file
@@ -0,0 +1,199 @@
|
||||
class Eval::Runners::CategorizationRunner < Eval::Runners::Base
|
||||
DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit
|
||||
|
||||
protected
|
||||
|
||||
def process_samples
|
||||
all_samples = samples.to_a
|
||||
batch_size = effective_batch_size
|
||||
log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}")
|
||||
|
||||
all_samples.each_slice(batch_size).with_index do |batch, batch_idx|
|
||||
log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / batch_size).ceil}")
|
||||
process_batch(batch)
|
||||
end
|
||||
end
|
||||
|
||||
# Use smaller batches for custom providers (local LLMs) to reduce context length
|
||||
def effective_batch_size
|
||||
eval_run.provider_config["batch_size"]&.to_i || DEFAULT_BATCH_SIZE
|
||||
end
|
||||
|
||||
# Get JSON mode from provider config (optional override)
|
||||
# Valid values: "strict", "json_object", "none"
|
||||
def json_mode
|
||||
eval_run.provider_config["json_mode"]
|
||||
end
|
||||
|
||||
def calculate_metrics
|
||||
Eval::Metrics::CategorizationMetrics.new(eval_run).calculate
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def process_batch(batch_samples)
|
||||
return if batch_samples.empty?
|
||||
|
||||
# Build inputs for the provider
|
||||
transactions = batch_samples.map do |sample|
|
||||
sample.to_transaction_input.merge(id: sample.id)
|
||||
end
|
||||
|
||||
# Get categories from first sample's context (should be shared)
|
||||
# Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys
|
||||
categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys)
|
||||
|
||||
# Determine effective JSON mode for this batch
|
||||
# If the batch has many expected nulls and we're using auto mode, force strict mode
|
||||
# to prevent the auto-categorizer from incorrectly retrying (it would see many nulls
|
||||
# and think strict mode is broken, when actually the nulls are expected)
|
||||
effective_json_mode = json_mode_for_batch(batch_samples)
|
||||
|
||||
start_time = Time.current
|
||||
|
||||
begin
|
||||
response = provider.auto_categorize(
|
||||
transactions: transactions,
|
||||
user_categories: categories,
|
||||
model: model,
|
||||
json_mode: effective_json_mode
|
||||
)
|
||||
|
||||
latency_ms = ((Time.current - start_time) * 1000).to_i
|
||||
per_sample_latency = latency_ms / batch_samples.size
|
||||
|
||||
if response.success?
|
||||
record_batch_results(batch_samples, response.data, per_sample_latency)
|
||||
else
|
||||
record_batch_errors(batch_samples, response.error, per_sample_latency)
|
||||
end
|
||||
rescue => e
|
||||
latency_ms = ((Time.current - start_time) * 1000).to_i
|
||||
per_sample_latency = latency_ms / batch_samples.size
|
||||
record_batch_errors(batch_samples, e, per_sample_latency)
|
||||
end
|
||||
end
|
||||
|
||||
# Matches each sample in the batch with its categorization from the provider
# response and records a scored result.
#
# batch_samples       - Array of Eval::Sample records that were sent.
# categorizations     - provider response objects exposing #transaction_id and
#                       #category_name (samples with no match score as nil).
# per_sample_latency  - Integer ms attributed to each sample.
def record_batch_results(batch_samples, categorizations, per_sample_latency)
  batch_samples.each do |sample|
    # Find the categorization result for this sample; IDs are compared as
    # strings since the provider may round-trip them in either form.
    categorization = categorizations.find { |c| c.transaction_id.to_s == sample.id.to_s }
    actual_category = categorization&.category_name

    # Normalize "null" string to nil
    actual_category = nil if actual_category == "null"

    expected_category = sample.expected_category_name
    acceptable_categories = sample.all_acceptable_categories

    # Evaluate correctness - check primary expected and alternatives
    correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories)
    exact_match = actual_category == expected_category
    # An alternative match means the model picked an accepted substitute, not
    # the primary expected category.
    alternative_match = acceptable_categories.include?(actual_category) && !exact_match
    hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample)

    record_result(
      sample: sample,
      actual_output: { "category_name" => actual_category },
      correct: correct,
      exact_match: exact_match,
      alternative_match: alternative_match,
      hierarchical_match: hierarchical,
      null_expected: expected_category.nil?,
      null_returned: actual_category.nil?,
      latency_ms: per_sample_latency
    )
  end
end
|
||||
|
||||
# Records a failed result for every sample in a batch whose provider call
# errored (either a non-success response or a raised exception).
#
# batch_samples      - Array of Eval::Sample records in the failed batch.
# error              - Exception or error object/string from the provider.
# per_sample_latency - Integer ms attributed to each sample.
#
# Every flag is recorded as a miss; null_returned is true because no
# categorization was produced. Passes alternative_match: false explicitly
# for parity with record_batch_results, which always supplies that flag.
def record_batch_errors(batch_samples, error, per_sample_latency)
  error_message = error.is_a?(Exception) ? error.message : error.to_s

  batch_samples.each do |sample|
    record_result(
      sample: sample,
      actual_output: { "error" => error_message },
      correct: false,
      exact_match: false,
      alternative_match: false,
      hierarchical_match: false,
      null_expected: sample.expected_category_name.nil?,
      null_returned: true,
      latency_ms: per_sample_latency,
      metadata: { "error" => error_message }
    )
  end
end
|
||||
|
||||
# Determine the effective JSON mode for a batch based on its expected null ratio.
# This prevents the auto-categorizer from incorrectly retrying when many nulls
# are legitimately expected by the dataset.
def json_mode_for_batch(batch_samples)
  # An explicitly configured mode (anything other than "auto") always wins.
  configured_mode = json_mode
  return configured_mode if configured_mode.present? && configured_mode != "auto"

  # Fraction of this batch whose ground truth is "no category".
  expected_null_ratio =
    batch_samples.count { |s| s.expected_category_name.nil? }.to_f / batch_samples.size

  # If >50% of the batch is expected to return null, force strict mode.
  # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer and
  # prevents unnecessary retries when nulls are legitimate; otherwise let
  # the auto-categorizer decide on its own ("auto").
  if expected_null_ratio > 0.5
    log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry")
    "strict"
  else
    "auto"
  end
end
|
||||
|
||||
# A prediction is correct only when it equals the expected value exactly.
# Two nils (both meaning "no category") count as a match; a nil on either
# side against a present value does not.
def evaluate_correctness(actual, expected)
  # nil == nil is true and nil never equals a present value, so a single
  # equality check covers every null/value combination the original guard
  # chain handled.
  actual == expected
end
|
||||
|
||||
# A prediction is correct when it names the expected category OR any of the
# accepted alternatives. Two nils (both "no category") also count as correct;
# a null on one side against a present value on the other never does.
def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories)
  # Agreeing on "no category" is a match.
  return true if actual.nil? && expected.nil?

  # Null on exactly one side (with a present value on the other) is a miss.
  null_mismatch = (expected.nil? && actual.present?) || (actual.nil? && expected.present?)
  return false if null_mismatch

  # Otherwise the answer is correct when it appears anywhere in the
  # acceptable set (primary expected + alternatives).
  acceptable_categories.include?(actual)
end
|
||||
|
||||
# Returns truthy when actual and expected are related in the category
# hierarchy: an exact match, actual being the parent of the expected
# category, or actual being a child of the expected category.
#
# Categories come from the sample's context and are assumed to be hashes
# with string keys "name", "id" and "parent_id" — TODO confirm against the
# dataset builder. Note: the parent-lookup branches can return nil (treated
# as false by callers) when a parent_id points at a missing category.
def evaluate_hierarchical_match(actual, expected, sample)
  return false if actual.nil? || expected.nil?
  return true if actual == expected

  # Check if actual matches parent of expected category
  categories = sample.categories_context

  # Find the expected category
  expected_cat = categories.find { |c| c["name"] == expected }
  return false unless expected_cat

  # If expected has a parent, check if actual matches the parent
  # (IDs compared as strings since context data may store them either way)
  if expected_cat["parent_id"]
    parent = categories.find { |c| c["id"].to_s == expected_cat["parent_id"].to_s }
    return parent && parent["name"] == actual
  end

  # Also check if actual is a subcategory of expected (reverse direction)
  actual_cat = categories.find { |c| c["name"] == actual }
  return false unless actual_cat

  if actual_cat["parent_id"]
    parent = categories.find { |c| c["id"].to_s == actual_cat["parent_id"].to_s }
    return parent && parent["name"] == expected
  end

  false
end
|
||||
end
|
||||
255
app/models/eval/runners/chat_runner.rb
Normal file
255
app/models/eval/runners/chat_runner.rb
Normal file
@@ -0,0 +1,255 @@
|
||||
# Runs chat-based evals: each sample is a prompt with expected function calls
# and expected response keywords. Scoring combines function-selection
# accuracy, parameter accuracy, and keyword presence in the response text.
class Eval::Runners::ChatRunner < Eval::Runners::Base
  # Chat samples are processed one at a time (not batched)
  # because each has unique context and function calling requirements

  protected

  # Iterates all samples sequentially, logging progress per sample.
  def process_samples
    all_samples = samples.to_a
    log_progress("Processing #{all_samples.size} chat samples")

    all_samples.each_with_index do |sample, idx|
      log_progress("Processing sample #{idx + 1}/#{all_samples.size}")
      process_sample(sample)
    end
  end

  # Delegates metric aggregation to the chat-specific metrics calculator.
  def calculate_metrics
    Eval::Metrics::ChatMetrics.new(eval_run).calculate
  end

  private

  # Sends one sample's prompt to the provider, measuring latency; any raised
  # exception or non-success response becomes an error result.
  def process_sample(sample)
    prompt = sample.chat_prompt
    start_time = Time.current

    begin
      response = provider.chat_response(
        prompt,
        model: model,
        instructions: build_instructions,
        functions: build_function_definitions
      )

      latency_ms = ((Time.current - start_time) * 1000).to_i

      if response.success?
        record_chat_result(sample, response.data, latency_ms)
      else
        record_error_result(sample, response.error, latency_ms)
      end
    rescue => e
      latency_ms = ((Time.current - start_time) * 1000).to_i
      record_error_result(sample, e, latency_ms)
    end
  end

  # Scores a successful chat response against the sample's expected functions
  # and expected response keywords, then records the result.
  def record_chat_result(sample, chat_response, latency_ms)
    # Extract function calls from response
    actual_functions = extract_functions_from_response(chat_response)

    # Extract response text
    response_text = extract_response_text(chat_response)

    # Evaluate function calling accuracy
    expected_functions = sample.expected_functions
    function_match = evaluate_function_match(actual_functions, expected_functions)

    # Evaluate response content
    expected_keywords = sample.expected_response_contains
    response_match = evaluate_response_contains(response_text, expected_keywords)

    # Overall correctness: functions are correct AND response contains expected keywords
    correct = function_match[:correct] && response_match

    record_result(
      sample: sample,
      actual_output: {
        "functions" => actual_functions,
        "response_text" => response_text,
        "function_match_details" => function_match
      },
      correct: correct,
      exact_match: function_match[:exact_match],
      latency_ms: latency_ms,
      metadata: {
        "function_selection_correct" => function_match[:selection_correct],
        "parameter_accuracy" => function_match[:parameter_accuracy],
        "response_keywords_found" => response_match,
        "expected_functions" => expected_functions,
        "expected_keywords" => expected_keywords
      }
    )
  end

  # Records a failed result for a sample whose provider call errored.
  def record_error_result(sample, error, latency_ms)
    error_message = error.is_a?(Exception) ? error.message : error.to_s

    record_result(
      sample: sample,
      actual_output: { "error" => error_message },
      correct: false,
      exact_match: false,
      latency_ms: latency_ms,
      metadata: { "error" => error_message }
    )
  end

  # Converts the provider's function_requests into simple hashes of
  # {"name" => ..., "params" => Hash}.
  def extract_functions_from_response(chat_response)
    # ChatResponse has function_requests array
    function_requests = chat_response.function_requests || []

    function_requests.map do |req|
      {
        "name" => req.function_name,
        "params" => parse_function_args(req.function_args)
      }
    end
  end

  # Normalizes function args to a Hash: passes hashes through, parses JSON
  # strings, and falls back to {} on nil or unparseable input.
  def parse_function_args(args)
    return {} if args.nil?
    return args if args.is_a?(Hash)
    JSON.parse(args)
  rescue JSON::ParserError
    {}
  end

  # Joins all message output_text values into a single newline-separated string.
  def extract_response_text(chat_response)
    # ChatResponse has messages array with output_text
    messages = chat_response.messages || []
    messages.map(&:output_text).compact.join("\n")
  end

  # Compares actual vs expected function calls and returns a hash with
  # :correct, :exact_match, :selection_correct and :parameter_accuracy.
  # Selection is correct when every expected function was called (extras are
  # tolerated, unless none were expected at all).
  def evaluate_function_match(actual_functions, expected_functions)
    return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty?
    return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any?

    # Check function selection accuracy
    expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
    actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact

    selection_correct = expected_names.all? { |name| actual_names.include?(name) }

    # Check parameter accuracy for matched functions
    param_scores = []
    expected_functions.each do |expected_func|
      expected_name = normalize_function_name(expected_func["name"])
      actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }

      if actual_func
        param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {})
        param_scores << param_score
      else
        # Missing expected function contributes a zero parameter score.
        param_scores << 0.0
      end
    end

    parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4)

    # Exact match requires same functions with same parameters
    exact_match = selection_correct && parameter_accuracy == 1.0

    # Correct if all expected functions were called (parameters don't have to be exact)
    correct = selection_correct

    {
      correct: correct,
      exact_match: exact_match,
      selection_correct: selection_correct,
      parameter_accuracy: parameter_accuracy
    }
  end

  # Normalizes a function name to snake_case lowercase for comparison.
  def normalize_function_name(name)
    return nil if name.nil?
    # Convert to snake_case and downcase
    name.to_s.underscore.downcase
  end

  # Returns the fraction (0.0..1.0, rounded to 4 places) of expected
  # parameters whose values match the actual call's parameters.
  def evaluate_parameters(actual_params, expected_params)
    return 1.0 if expected_params.empty?
    return 0.0 if actual_params.nil?

    actual_params = actual_params.stringify_keys
    expected_params = expected_params.stringify_keys

    matches = 0
    total = expected_params.size

    expected_params.each do |key, expected_value|
      actual_value = actual_params[key]

      if values_match?(actual_value, expected_value)
        matches += 1
      end
    end

    (matches.to_f / total).round(4)
  end

  # Lenient value comparison: exact, case-insensitive string, array
  # superset (actual must contain all expected, case-insensitively), or
  # date-parsed equality for ISO-date-shaped expected values.
  def values_match?(actual, expected)
    return true if actual == expected
    return true if actual.to_s.downcase == expected.to_s.downcase

    # For arrays, check if all expected values are present
    if expected.is_a?(Array) && actual.is_a?(Array)
      expected_normalized = expected.map { |v| v.to_s.downcase }
      actual_normalized = actual.map { |v| v.to_s.downcase }
      return expected_normalized.all? { |v| actual_normalized.include?(v) }
    end

    # For dates, try to parse and compare
    if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/
      begin
        expected_date = Date.parse(expected.to_s)
        actual_date = Date.parse(actual.to_s)
        return expected_date == actual_date
      rescue
        # Not valid dates, fall through
      end
    end

    false
  end

  # True when the response text contains every expected keyword
  # (case-insensitive). Vacuously true when no keywords are expected.
  def evaluate_response_contains(response_text, expected_keywords)
    return true if expected_keywords.empty?
    return false if response_text.nil? || response_text.empty?

    normalized_response = response_text.downcase

    expected_keywords.all? do |keyword|
      normalized_response.include?(keyword.to_s.downcase)
    end
  end

  # System instructions used for all eval chats (no real user/family context).
  def build_instructions
    # Simple instructions for evaluation - we don't have a real user/family context
    <<~PROMPT
      You are a financial assistant helping users understand their financial data.
      Use the functions available to answer questions about accounts, transactions, and financial statements.
      Today's date is #{Date.current}.
    PROMPT
  end

  # The fixed set of function definitions exposed to the model during evals,
  # mirroring what the production chat would normally offer.
  def build_function_definitions
    # Return the function definitions that the chat would normally have
    [
      build_function_definition("get_transactions", "Get paginated transactions with optional filters"),
      build_function_definition("get_accounts", "Get all accounts with balances and historical data"),
      build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"),
      build_function_definition("get_income_statement", "Get income and expenses by category for a period")
    ]
  end

  # Builds a single permissive (non-strict, open schema) function definition.
  def build_function_definition(name, description)
    {
      name: name,
      description: description,
      params_schema: { type: "object", properties: {}, additionalProperties: true },
      strict: false
    }
  end
end
|
||||
199
app/models/eval/runners/merchant_detection_runner.rb
Normal file
199
app/models/eval/runners/merchant_detection_runner.rb
Normal file
@@ -0,0 +1,199 @@
|
||||
# Runs merchant-detection evals: batches of transactions are sent to the
# provider's auto-merchant-detector and each detected business name/URL is
# scored against the sample's expectations (with fuzzy name matching).
class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
  BATCH_SIZE = 25 # Matches Provider::Openai limit
  # Minimum Levenshtein-based similarity for a business name to count as a match.
  FUZZY_MATCH_THRESHOLD = 0.8

  protected

  # Processes all samples in fixed-size batches, logging progress per batch.
  def process_samples
    all_samples = samples.to_a
    log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")

    all_samples.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
      log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / BATCH_SIZE).ceil}")
      process_batch(batch)
    end
  end

  # Delegates metric aggregation to the merchant-detection metrics calculator.
  def calculate_metrics
    Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
  end

  private

  # Sends one batch to the provider and records per-sample results or errors.
  # Latency for the API call is divided evenly across the batch.
  def process_batch(batch_samples)
    # Guard against an empty batch (consistent with CategorizationRunner and
    # avoids a ZeroDivisionError in the per-sample latency calculation).
    return if batch_samples.empty?

    # Build inputs for the provider
    transactions = batch_samples.map do |sample|
      sample.to_transaction_input.merge(id: sample.id)
    end

    # Get merchants from first sample's context (should be shared)
    # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
    merchants = batch_samples.first.merchants_context.map(&:deep_symbolize_keys)

    start_time = Time.current

    begin
      response = provider.auto_detect_merchants(
        transactions: transactions,
        user_merchants: merchants,
        model: model
      )

      latency_ms = ((Time.current - start_time) * 1000).to_i
      per_sample_latency = latency_ms / batch_samples.size

      if response.success?
        record_batch_results(batch_samples, response.data, per_sample_latency)
      else
        record_batch_errors(batch_samples, response.error, per_sample_latency)
      end
    rescue => e
      latency_ms = ((Time.current - start_time) * 1000).to_i
      per_sample_latency = latency_ms / batch_samples.size
      record_batch_errors(batch_samples, e, per_sample_latency)
    end
  end

  # Matches each sample with its detection result and records a scored result.
  # Detections expose #transaction_id, #business_name and #business_url.
  def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
    batch_samples.each do |sample|
      # Find the merchant detection result for this sample
      detection = merchants_detected.find { |m| m.transaction_id.to_s == sample.id.to_s }

      actual_name = normalize_null(detection&.business_name)
      actual_url = normalize_null(detection&.business_url)

      expected_name = sample.expected_business_name
      expected_url = sample.expected_business_url

      # Evaluate correctness
      name_match = evaluate_name_match(actual_name, expected_name)
      url_match = evaluate_url_match(actual_url, expected_url)
      fuzzy_score = calculate_fuzzy_score(actual_name, expected_name)

      # Overall correct if both name and URL match expectations
      correct = name_match && url_match

      # Exact match requires both to be exactly equal
      exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url)

      record_result(
        sample: sample,
        actual_output: { "business_name" => actual_name, "business_url" => actual_url },
        correct: correct,
        exact_match: exact_match,
        fuzzy_score: fuzzy_score,
        null_expected: expected_name.nil? && expected_url.nil?,
        null_returned: actual_name.nil? && actual_url.nil?,
        latency_ms: per_sample_latency
      )
    end
  end

  # Records a failed result for every sample in a batch whose provider call
  # errored.
  def record_batch_errors(batch_samples, error, per_sample_latency)
    error_message = error.is_a?(Exception) ? error.message : error.to_s

    batch_samples.each do |sample|
      record_result(
        sample: sample,
        actual_output: { "error" => error_message },
        correct: false,
        exact_match: false,
        fuzzy_score: 0.0,
        null_expected: sample.expected_business_name.nil?,
        null_returned: true,
        latency_ms: per_sample_latency,
        metadata: { "error" => error_message }
      )
    end
  end

  # Treats nil, the literal string "null", and blank strings as "no value".
  def normalize_null(value)
    return nil if value.nil?
    return nil if value == "null"
    return nil if value.to_s.strip.empty?
    value
  end

  # Business-name match: both nil is correct, a one-sided nil is wrong,
  # otherwise use fuzzy (Levenshtein similarity) matching.
  def evaluate_name_match(actual, expected)
    # Both null = correct
    return true if actual.nil? && expected.nil?
    # Expected null but got value = false positive
    return false if expected.nil? && actual.present?
    # Expected value but got null = false negative
    return false if actual.nil? && expected.present?
    # Use fuzzy matching for name comparison
    fuzzy_match?(actual, expected)
  end

  # URL match: both nil is correct, a one-sided nil is wrong, otherwise
  # compare after normalization (scheme/www stripped, trailing slash removed).
  def evaluate_url_match(actual, expected)
    # Both null = correct
    return true if actual.nil? && expected.nil?
    # Expected null but got value = false positive
    return false if expected.nil? && actual.present?
    # Expected value but got null = false negative
    return false if actual.nil? && expected.present?
    # Normalize and compare URLs
    normalize_url(actual) == normalize_url(expected)
  end

  # Strips scheme and leading "www.", lowercases, and drops a trailing slash
  # so equivalent URLs compare equal. Returns nil for nil input.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.downcase
       .gsub(/^(https?:\/\/)?(www\.)?/, "")
       .chomp("/")
       .strip
  end

  # True when the similarity score clears FUZZY_MATCH_THRESHOLD.
  def fuzzy_match?(actual, expected)
    return false if actual.nil? || expected.nil?
    calculate_fuzzy_score(actual, expected) >= FUZZY_MATCH_THRESHOLD
  end

  # Similarity in 0.0..1.0 based on Levenshtein distance over the longer
  # of the two normalized (downcased, stripped) strings.
  def calculate_fuzzy_score(actual, expected)
    return 1.0 if actual == expected
    return 0.0 if actual.nil? || expected.nil?

    # Simple Levenshtein distance-based similarity
    # Normalize strings for comparison
    a = actual.to_s.downcase.strip
    b = expected.to_s.downcase.strip

    return 1.0 if a == b

    # Calculate Levenshtein distance
    distance = levenshtein_distance(a, b)
    max_length = [ a.length, b.length ].max

    return 0.0 if max_length == 0

    # Convert distance to similarity score (0.0 to 1.0)
    (1.0 - (distance.to_f / max_length)).round(4)
  end

  # Classic dynamic-programming Levenshtein edit distance between two strings.
  def levenshtein_distance(s1, s2)
    m = s1.length
    n = s2.length

    return m if n == 0
    return n if m == 0

    # Create distance matrix
    d = Array.new(m + 1) { Array.new(n + 1) }

    (0..m).each { |i| d[i][0] = i }
    (0..n).each { |j| d[0][j] = j }

    (1..n).each do |j|
      (1..m).each do |i|
        cost = s1[i - 1] == s2[j - 1] ? 0 : 1
        d[i][j] = [
          d[i - 1][j] + 1,      # deletion
          d[i][j - 1] + 1,      # insertion
          d[i - 1][j - 1] + cost # substitution
        ].min
      end
    end

    d[m][n]
  end
end
|
||||
88
app/models/eval/sample.rb
Normal file
88
app/models/eval/sample.rb
Normal file
@@ -0,0 +1,88 @@
|
||||
# A single eval test case: input data, expected output, optional shared
# context (categories/merchants/mock data), a difficulty label, and tags.
# Accessor methods unwrap the JSON columns per eval type. The original code
# used the redundant pattern `data.dig("key") || data["key"]` — for a
# top-level key on a Hash both calls are identical, so each pair has been
# collapsed to a single lookup.
class Eval::Sample < ApplicationRecord
  self.table_name = "eval_samples"

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_sample_id, dependent: :destroy

  validates :input_data, :expected_output, presence: true
  validates :difficulty, inclusion: { in: %w[easy medium hard manual edge_case] }

  scope :easy, -> { where(difficulty: "easy") }
  scope :medium, -> { where(difficulty: "medium") }
  scope :hard, -> { where(difficulty: "hard") }
  scope :edge_cases, -> { where(difficulty: "edge_case") }
  # Samples whose tags array contains the given tag.
  scope :with_tag, ->(tag) { where("? = ANY(tags)", tag) }
  # Samples whose tags array overlaps with any of the given tags.
  scope :with_any_tags, ->(tags) { where("tags && ARRAY[?]::varchar[]", tags) }

  # Convert to format expected by AutoCategorizer
  def to_transaction_input
    input_data.deep_symbolize_keys
  end

  # Get categories from context (for categorization evals)
  def categories_context
    context_data["categories"] || []
  end

  # Get merchants from context (for merchant detection evals)
  def merchants_context
    context_data["merchants"] || []
  end

  # Get mock data from context (for chat evals); falls back to input_data.
  def mock_data
    context_data["mock_data"] || input_data["mock_data"] || {}
  end

  # Get the chat prompt (for chat evals)
  def chat_prompt
    input_data["prompt"]
  end

  # Get expected functions (for chat evals)
  def expected_functions
    expected_output["functions"] || []
  end

  # Get expected response keywords (for chat evals)
  def expected_response_contains
    expected_output["response_contains"] || []
  end

  # Get expected category name (for categorization evals)
  def expected_category_name
    expected_output["category_name"]
  end

  # Get acceptable alternative category names (for categorization evals)
  # These are categories that are also considered correct answers
  def acceptable_alternatives
    expected_output["acceptable_alternatives"] || []
  end

  # Get all acceptable category names (primary + alternatives)
  def all_acceptable_categories
    [ expected_category_name, *acceptable_alternatives ].compact
  end

  # Get expected merchant info (for merchant detection evals)
  def expected_business_name
    expected_output["business_name"]
  end

  def expected_business_url
    expected_output["business_url"]
  end

  # Check if null is expected (i.e. the correct answer is "no match"),
  # depending on the owning dataset's eval type.
  def expects_null?
    case dataset.eval_type
    when "categorization"
      expected_category_name.nil?
    when "merchant_detection"
      expected_business_name.nil? && expected_business_url.nil?
    else
      false
    end
  end
end
|
||||
@@ -51,7 +51,7 @@ class Provider::Openai < Provider
|
||||
@uri_base.present?
|
||||
end
|
||||
|
||||
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil)
|
||||
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil)
|
||||
with_provider_response do
|
||||
raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25
|
||||
if user_categories.blank?
|
||||
@@ -74,7 +74,8 @@ class Provider::Openai < Provider
|
||||
user_categories: user_categories,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_categorize
|
||||
|
||||
trace&.update(output: result.map(&:to_h))
|
||||
@@ -83,7 +84,7 @@ class Provider::Openai < Provider
|
||||
end
|
||||
end
|
||||
|
||||
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil)
|
||||
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil)
|
||||
with_provider_response do
|
||||
raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25
|
||||
|
||||
@@ -101,7 +102,8 @@ class Provider::Openai < Provider
|
||||
user_merchants: user_merchants,
|
||||
custom_provider: custom_provider?,
|
||||
langfuse_trace: trace,
|
||||
family: family
|
||||
family: family,
|
||||
json_mode: json_mode
|
||||
).auto_detect_merchants
|
||||
|
||||
trace&.update(output: result.map(&:to_h))
|
||||
|
||||
@@ -1,9 +1,22 @@
|
||||
class Provider::Openai::AutoCategorizer
|
||||
include Provider::Openai::Concerns::UsageRecorder
|
||||
|
||||
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family
|
||||
# JSON response format modes for custom providers
|
||||
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
|
||||
# - "json_object": Use json_object response format (broader compatibility)
|
||||
# - "none": No response format constraint (maximum compatibility with local LLMs)
|
||||
JSON_MODE_STRICT = "strict"
|
||||
JSON_MODE_OBJECT = "json_object"
|
||||
JSON_MODE_NONE = "none"
|
||||
JSON_MODE_AUTO = "auto"
|
||||
|
||||
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil)
|
||||
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
|
||||
# This is a heuristic to detect when strict JSON mode is breaking the model's ability to reason
|
||||
AUTO_MODE_NULL_THRESHOLD = 0.5
|
||||
|
||||
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family, :json_mode
|
||||
|
||||
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
|
||||
@client = client
|
||||
@model = model
|
||||
@transactions = transactions
|
||||
@@ -11,6 +24,32 @@ class Provider::Openai::AutoCategorizer
|
||||
@custom_provider = custom_provider
|
||||
@langfuse_trace = langfuse_trace
|
||||
@family = family
|
||||
@json_mode = json_mode || default_json_mode
|
||||
end
|
||||
|
||||
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
|
||||
|
||||
# Determine default JSON mode based on configuration hierarchy:
|
||||
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
|
||||
# 2. Setting.openai_json_mode - user-configured in app settings
|
||||
# 3. Default: auto mode (recommended for all providers)
|
||||
#
|
||||
# Mode descriptions:
|
||||
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
|
||||
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
|
||||
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
|
||||
# - "json_object": Middle ground, broader compatibility than strict
|
||||
def default_json_mode
|
||||
# 1. Check environment variable first (allows runtime override for testing)
|
||||
env_mode = ENV["LLM_JSON_MODE"]
|
||||
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
|
||||
|
||||
# 2. Check app settings (user-configured)
|
||||
setting_mode = Setting.openai_json_mode
|
||||
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
|
||||
|
||||
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
|
||||
JSON_MODE_AUTO
|
||||
end
|
||||
|
||||
def auto_categorize
|
||||
@@ -22,6 +61,40 @@ class Provider::Openai::AutoCategorizer
|
||||
end
|
||||
|
||||
def instructions
|
||||
if custom_provider
|
||||
simple_instructions
|
||||
else
|
||||
detailed_instructions
|
||||
end
|
||||
end
|
||||
|
||||
# Simplified instructions for smaller/local LLMs
|
||||
def simple_instructions
|
||||
<<~INSTRUCTIONS.strip_heredoc
|
||||
Categorize transactions into the given categories. Return JSON only. Do not explain your reasoning.
|
||||
|
||||
CRITICAL RULES:
|
||||
1. Match transaction_id exactly from input
|
||||
2. Use EXACT category_name from the provided list, or "null" if unsure
|
||||
3. Match expense transactions to expense categories only
|
||||
4. Match income transactions to income categories only
|
||||
5. Return "null" if the description is generic/ambiguous (e.g., "POS DEBIT", "ACH WITHDRAWAL", "CHECK #1234")
|
||||
6. Prefer MORE SPECIFIC subcategories over general parent categories when available
|
||||
|
||||
CATEGORY HIERARCHY NOTES:
|
||||
- Use "Restaurants" for sit-down restaurants, "Fast Food" for quick service chains
|
||||
- Use "Coffee Shops" for coffee places, "Food & Drink" only when type is unclear
|
||||
- Use "Shopping" for general retail, big-box stores, and online marketplaces
|
||||
- Use "Groceries" for dedicated grocery stores ONLY
|
||||
- For income: use "Salary" for payroll/employer deposits, "Income" for generic income sources
|
||||
|
||||
Output JSON format only (no markdown, no explanation):
|
||||
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
|
||||
INSTRUCTIONS
|
||||
end
|
||||
|
||||
# Detailed instructions for larger models like GPT-4
|
||||
def detailed_instructions
|
||||
<<~INSTRUCTIONS.strip_heredoc
|
||||
You are an assistant to a consumer personal finance app. You will be provided a list
|
||||
of the user's transactions and a list of the user's categories. Your job is to auto-categorize
|
||||
@@ -87,19 +160,68 @@ class Provider::Openai::AutoCategorizer
|
||||
end
|
||||
|
||||
def auto_categorize_openai_generic
|
||||
if json_mode == JSON_MODE_AUTO
|
||||
auto_categorize_with_auto_mode
|
||||
else
|
||||
auto_categorize_with_mode(json_mode)
|
||||
end
|
||||
rescue Faraday::BadRequestError => e
|
||||
# If strict mode fails (HTTP 400), fall back to none mode
|
||||
# This handles providers that don't support json_schema response format
|
||||
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
|
||||
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
|
||||
auto_categorize_with_mode(JSON_MODE_NONE)
|
||||
else
|
||||
raise
|
||||
end
|
||||
end
|
||||
|
||||
# Auto mode: try strict first, fall back to none if too many nulls or missing results
|
||||
#
|
||||
# This uses pure heuristics to detect when strict JSON mode is breaking the model's
|
||||
# ability to reason. Models that can't reason well in strict mode often:
|
||||
# 1. Return null for everything, OR
|
||||
# 2. Simply omit transactions they can't categorize (returning fewer results than input)
|
||||
#
|
||||
# The heuristic is simple: if >50% of results are null or missing, the model likely
|
||||
# needs the freedom to reason in its output (which strict mode prevents).
|
||||
def auto_categorize_with_auto_mode
|
||||
result = auto_categorize_with_mode(JSON_MODE_STRICT)
|
||||
|
||||
null_count = result.count { |r| r.category_name.nil? || r.category_name == "null" }
|
||||
missing_count = transactions.size - result.size
|
||||
failed_count = null_count + missing_count
|
||||
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
|
||||
|
||||
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
|
||||
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
|
||||
auto_categorize_with_mode(JSON_MODE_NONE)
|
||||
else
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
def auto_categorize_with_mode(mode)
|
||||
span = langfuse_trace&.span(name: "auto_categorize_api_call", input: {
|
||||
model: model.presence || Provider::Openai::DEFAULT_MODEL,
|
||||
transactions: transactions,
|
||||
user_categories: user_categories
|
||||
user_categories: user_categories,
|
||||
json_mode: mode
|
||||
})
|
||||
|
||||
response = client.chat(parameters: {
|
||||
# Build parameters with configurable JSON response format
|
||||
params = {
|
||||
model: model.presence || Provider::Openai::DEFAULT_MODEL,
|
||||
messages: [
|
||||
{ role: "system", content: instructions },
|
||||
{ role: "user", content: developer_message }
|
||||
],
|
||||
response_format: {
|
||||
{ role: "user", content: developer_message_for_generic }
|
||||
]
|
||||
}
|
||||
|
||||
# Add response format based on json_mode setting
|
||||
case mode
|
||||
when JSON_MODE_STRICT
|
||||
params[:response_format] = {
|
||||
type: "json_schema",
|
||||
json_schema: {
|
||||
name: "auto_categorize_personal_finance_transactions",
|
||||
@@ -107,9 +229,14 @@ class Provider::Openai::AutoCategorizer
|
||||
schema: json_schema
|
||||
}
|
||||
}
|
||||
})
|
||||
when JSON_MODE_OBJECT
|
||||
params[:response_format] = { type: "json_object" }
|
||||
# JSON_MODE_NONE: no response_format constraint
|
||||
end
|
||||
|
||||
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")}")
|
||||
response = client.chat(parameters: params)
|
||||
|
||||
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
|
||||
|
||||
categorizations = extract_categorizations_generic(response)
|
||||
result = build_response(categorizations)
|
||||
@@ -120,7 +247,8 @@ class Provider::Openai::AutoCategorizer
|
||||
operation: "auto_categorize",
|
||||
metadata: {
|
||||
transaction_count: transactions.size,
|
||||
category_count: user_categories.size
|
||||
category_count: user_categories.size,
|
||||
json_mode: mode
|
||||
}
|
||||
)
|
||||
|
||||
@@ -143,9 +271,72 @@ class Provider::Openai::AutoCategorizer
|
||||
end
|
||||
|
||||
def normalize_category_name(category_name)
|
||||
return nil if category_name == "null"
|
||||
# Convert to string to handle non-string LLM outputs (numbers, booleans, etc.)
|
||||
normalized = category_name.to_s.strip
|
||||
return nil if normalized.empty? || normalized == "null" || normalized.downcase == "null"
|
||||
|
||||
category_name
|
||||
# Try exact match first
|
||||
exact_match = user_categories.find { |c| c[:name] == normalized }
|
||||
return exact_match[:name] if exact_match
|
||||
|
||||
# Try case-insensitive match
|
||||
case_insensitive_match = user_categories.find { |c| c[:name].to_s.downcase == normalized.downcase }
|
||||
return case_insensitive_match[:name] if case_insensitive_match
|
||||
|
||||
# Try partial/fuzzy match (for common variations)
|
||||
fuzzy_match = find_fuzzy_category_match(normalized)
|
||||
return fuzzy_match if fuzzy_match
|
||||
|
||||
# Return normalized string if no match found (will be treated as uncategorized)
|
||||
normalized
|
||||
end
|
||||
|
||||
# Find a fuzzy match for category names with common variations
|
||||
def find_fuzzy_category_match(category_name)
|
||||
# Ensure string input for string operations
|
||||
input_str = category_name.to_s
|
||||
normalized_input = input_str.downcase.gsub(/[^a-z0-9]/, "")
|
||||
|
||||
user_categories.each do |cat|
|
||||
cat_name_str = cat[:name].to_s
|
||||
normalized_cat = cat_name_str.downcase.gsub(/[^a-z0-9]/, "")
|
||||
|
||||
# Check if one contains the other
|
||||
return cat[:name] if normalized_input.include?(normalized_cat) || normalized_cat.include?(normalized_input)
|
||||
|
||||
# Check common abbreviations/variations
|
||||
return cat[:name] if fuzzy_name_match?(input_str, cat_name_str)
|
||||
end
|
||||
|
||||
nil
|
||||
end
|
||||
|
||||
# Handle common naming variations
|
||||
def fuzzy_name_match?(input, category)
|
||||
variations = {
|
||||
"gas" => [ "gas & fuel", "gas and fuel", "fuel", "gasoline" ],
|
||||
"restaurants" => [ "restaurant", "dining", "food" ],
|
||||
"groceries" => [ "grocery", "supermarket", "food store" ],
|
||||
"streaming" => [ "streaming services", "streaming service" ],
|
||||
"rideshare" => [ "ride share", "ride-share", "uber", "lyft" ],
|
||||
"coffee" => [ "coffee shops", "coffee shop", "cafe" ],
|
||||
"fast food" => [ "fastfood", "quick service" ],
|
||||
"gym" => [ "gym & fitness", "fitness", "gym and fitness" ],
|
||||
"flights" => [ "flight", "airline", "airlines", "airfare" ],
|
||||
"hotels" => [ "hotel", "lodging", "accommodation" ]
|
||||
}
|
||||
|
||||
# Ensure string inputs for string operations
|
||||
input_lower = input.to_s.downcase
|
||||
category_lower = category.to_s.downcase
|
||||
|
||||
variations.each do |_key, synonyms|
|
||||
if synonyms.include?(input_lower) && synonyms.include?(category_lower)
|
||||
return true
|
||||
end
|
||||
end
|
||||
|
||||
false
|
||||
end
|
||||
|
||||
def extract_categorizations_native(response)
|
||||
@@ -162,9 +353,107 @@ class Provider::Openai::AutoCategorizer
|
||||
|
||||
def extract_categorizations_generic(response)
|
||||
raw = response.dig("choices", 0, "message", "content")
|
||||
JSON.parse(raw).dig("categorizations")
|
||||
rescue JSON::ParserError => e
|
||||
raise Provider::Openai::Error, "Invalid JSON in generic categorization: #{e.message}"
|
||||
parsed = parse_json_flexibly(raw)
|
||||
|
||||
# Handle different response formats from various LLMs
|
||||
categorizations = parsed.dig("categorizations") ||
|
||||
parsed.dig("results") ||
|
||||
(parsed.is_a?(Array) ? parsed : nil)
|
||||
|
||||
raise Provider::Openai::Error, "Could not find categorizations in response" if categorizations.nil?
|
||||
|
||||
# Normalize field names (some LLMs use different naming)
|
||||
categorizations.map do |cat|
|
||||
{
|
||||
"transaction_id" => cat["transaction_id"] || cat["id"] || cat["txn_id"],
|
||||
"category_name" => cat["category_name"] || cat["category"] || cat["name"]
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
# Flexible JSON parsing that handles common LLM output issues
|
||||
def parse_json_flexibly(raw)
|
||||
return {} if raw.blank?
|
||||
|
||||
# Strip thinking model tags if present (e.g., <think>...</think>)
|
||||
# The actual JSON output comes after the thinking block
|
||||
cleaned = strip_thinking_tags(raw)
|
||||
|
||||
# Try direct parse first
|
||||
JSON.parse(cleaned)
|
||||
rescue JSON::ParserError
|
||||
# Try multiple extraction strategies in order of preference
|
||||
|
||||
# Strategy 1: Closed markdown code blocks (```json...```)
|
||||
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
|
||||
matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
|
||||
matches.reverse_each do |match|
|
||||
begin
|
||||
return JSON.parse(match)
|
||||
rescue JSON::ParserError
|
||||
next
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
|
||||
# Pattern: ```json followed by JSON that goes to end of string
|
||||
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Continue to next strategy
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 3: Find JSON object with "categorizations" key
|
||||
if cleaned =~ /(\{"categorizations"\s*:\s*\[[\s\S]*\]\s*\})/m
|
||||
matches = cleaned.scan(/(\{"categorizations"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
|
||||
matches.reverse_each do |match|
|
||||
begin
|
||||
return JSON.parse(match)
|
||||
rescue JSON::ParserError
|
||||
next
|
||||
end
|
||||
end
|
||||
# Try greedy match if non-greedy failed
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Continue to next strategy
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 4: Find any JSON object (last resort)
|
||||
if cleaned =~ /(\{[\s\S]*\})/m
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Fall through to error
|
||||
end
|
||||
end
|
||||
|
||||
raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
|
||||
end
|
||||
|
||||
# Strip thinking model tags (<think>...</think>) from response
|
||||
# Some models like Qwen-thinking output reasoning in these tags before the actual response
|
||||
def strip_thinking_tags(raw)
|
||||
# Remove <think>...</think> blocks but keep content after them
|
||||
# If no closing tag, the model may have been cut off - try to extract JSON from inside
|
||||
if raw.include?("<think>")
|
||||
# Check if there's content after the thinking block
|
||||
if raw =~ /<\/think>\s*([\s\S]*)/m
|
||||
after_thinking = $1.strip
|
||||
return after_thinking if after_thinking.present?
|
||||
end
|
||||
# If no content after </think> or no closing tag, look inside the thinking block
|
||||
# The JSON might be the last thing in the thinking block
|
||||
if raw =~ /<think>([\s\S]*)/m
|
||||
return $1
|
||||
end
|
||||
end
|
||||
raw
|
||||
end
|
||||
|
||||
def json_schema
|
||||
@@ -213,4 +502,39 @@ class Provider::Openai::AutoCategorizer
|
||||
```
|
||||
MESSAGE
|
||||
end
|
||||
|
||||
# Concise developer message optimized for smaller/local LLMs
|
||||
# Uses pattern-based guidance instead of exhaustive examples
|
||||
def developer_message_for_generic
|
||||
<<~MESSAGE.strip_heredoc
|
||||
AVAILABLE CATEGORIES: #{user_categories.map { |c| c[:name] }.join(", ")}
|
||||
|
||||
TRANSACTIONS TO CATEGORIZE:
|
||||
#{format_transactions_simply}
|
||||
|
||||
CATEGORIZATION GUIDELINES:
|
||||
- Prefer specific subcategories over general parent categories when confident
|
||||
- Food delivery services should be categorized based on the underlying merchant type
|
||||
- Square payments (SQ *) should be inferred from the merchant name after the prefix
|
||||
- Warehouse/club stores should be categorized based on their primary purpose
|
||||
- Return "null" for generic transactions (e.g., POS terminals, wire transfers, checks, ATM withdrawals)
|
||||
|
||||
IMPORTANT:
|
||||
- Use EXACT category names from the list above
|
||||
- Return "null" (as a string) if you cannot confidently match a category
|
||||
- Match expense transactions only to expense categories
|
||||
- Match income transactions only to income categories
|
||||
- Do NOT include any explanation or reasoning - only output JSON
|
||||
|
||||
Respond with ONLY this JSON (no markdown code blocks, no other text):
|
||||
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
|
||||
MESSAGE
|
||||
end
|
||||
|
||||
# Format transactions in a simpler, more readable way for smaller LLMs
|
||||
def format_transactions_simply
|
||||
transactions.map do |t|
|
||||
"- ID: #{t[:id]}, Amount: #{t[:amount]}, Type: #{t[:classification]}, Description: \"#{t[:description]}\""
|
||||
end.join("\n")
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,9 +1,22 @@
|
||||
class Provider::Openai::AutoMerchantDetector
|
||||
include Provider::Openai::Concerns::UsageRecorder
|
||||
|
||||
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family
|
||||
# JSON response format modes for custom providers
|
||||
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
|
||||
# - "json_object": Use json_object response format (broader compatibility)
|
||||
# - "none": No response format constraint (maximum compatibility with local LLMs)
|
||||
# - "auto": Try strict first, fall back to none if poor results
|
||||
JSON_MODE_STRICT = "strict"
|
||||
JSON_MODE_OBJECT = "json_object"
|
||||
JSON_MODE_NONE = "none"
|
||||
JSON_MODE_AUTO = "auto"
|
||||
|
||||
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil)
|
||||
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
|
||||
AUTO_MODE_NULL_THRESHOLD = 0.5
|
||||
|
||||
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family, :json_mode
|
||||
|
||||
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
|
||||
@client = client
|
||||
@model = model
|
||||
@transactions = transactions
|
||||
@@ -11,6 +24,32 @@ class Provider::Openai::AutoMerchantDetector
|
||||
@custom_provider = custom_provider
|
||||
@langfuse_trace = langfuse_trace
|
||||
@family = family
|
||||
@json_mode = json_mode || default_json_mode
|
||||
end
|
||||
|
||||
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
|
||||
|
||||
# Determine default JSON mode based on configuration hierarchy:
|
||||
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
|
||||
# 2. Setting.openai_json_mode - user-configured in app settings
|
||||
# 3. Default: auto mode (recommended for all providers)
|
||||
#
|
||||
# Mode descriptions:
|
||||
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
|
||||
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
|
||||
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
|
||||
# - "json_object": Middle ground, broader compatibility than strict
|
||||
def default_json_mode
|
||||
# 1. Check environment variable first (allows runtime override for testing)
|
||||
env_mode = ENV["LLM_JSON_MODE"]
|
||||
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
|
||||
|
||||
# 2. Check app settings (user-configured)
|
||||
setting_mode = Setting.openai_json_mode
|
||||
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
|
||||
|
||||
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
|
||||
JSON_MODE_AUTO
|
||||
end
|
||||
|
||||
def auto_detect_merchants
|
||||
@@ -22,6 +61,32 @@ class Provider::Openai::AutoMerchantDetector
|
||||
end
|
||||
|
||||
def instructions
|
||||
if custom_provider
|
||||
simple_instructions
|
||||
else
|
||||
detailed_instructions
|
||||
end
|
||||
end
|
||||
|
||||
# Simplified instructions for smaller/local LLMs
|
||||
def simple_instructions
|
||||
<<~INSTRUCTIONS.strip_heredoc
|
||||
Detect business names and websites from transaction descriptions. Return JSON only.
|
||||
|
||||
Rules:
|
||||
1. Match transaction_id exactly from input
|
||||
2. Return business_name and business_url for known businesses
|
||||
3. Return "null" for both if uncertain or generic (e.g. "Paycheck", "Local diner")
|
||||
4. Don't include "www." in URLs (use "amazon.com" not "www.amazon.com")
|
||||
5. Favor "null" over guessing - only return values if 80%+ confident
|
||||
|
||||
Example output format:
|
||||
{"merchants": [{"transaction_id": "txn_001", "business_name": "Amazon", "business_url": "amazon.com"}]}
|
||||
INSTRUCTIONS
|
||||
end
|
||||
|
||||
# Detailed instructions for larger models like GPT-4
|
||||
def detailed_instructions
|
||||
<<~INSTRUCTIONS.strip_heredoc
|
||||
You are an assistant to a consumer personal finance app.
|
||||
|
||||
@@ -108,19 +173,64 @@ class Provider::Openai::AutoMerchantDetector
|
||||
end
|
||||
|
||||
def auto_detect_merchants_openai_generic
|
||||
if json_mode == JSON_MODE_AUTO
|
||||
auto_detect_merchants_with_auto_mode
|
||||
else
|
||||
auto_detect_merchants_with_mode(json_mode)
|
||||
end
|
||||
rescue Faraday::BadRequestError => e
|
||||
# If strict mode fails (HTTP 400), fall back to none mode
|
||||
# This handles providers that don't support json_schema response format
|
||||
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
|
||||
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
|
||||
auto_detect_merchants_with_mode(JSON_MODE_NONE)
|
||||
else
|
||||
raise
|
||||
end
|
||||
end
|
||||
|
||||
# Auto mode: try strict first, fall back to none if too many nulls or missing results
|
||||
def auto_detect_merchants_with_auto_mode
|
||||
result = auto_detect_merchants_with_mode(JSON_MODE_STRICT)
|
||||
|
||||
# Check if too many nulls OR missing results were returned
|
||||
# Models that can't reason in strict mode often:
|
||||
# 1. Return null for everything, OR
|
||||
# 2. Simply omit transactions they can't detect (returning fewer results than input)
|
||||
null_count = result.count { |r| r.business_name.nil? || r.business_name == "null" }
|
||||
missing_count = transactions.size - result.size
|
||||
failed_count = null_count + missing_count
|
||||
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
|
||||
|
||||
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
|
||||
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
|
||||
auto_detect_merchants_with_mode(JSON_MODE_NONE)
|
||||
else
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
def auto_detect_merchants_with_mode(mode)
|
||||
span = langfuse_trace&.span(name: "auto_detect_merchants_api_call", input: {
|
||||
model: model.presence || Provider::Openai::DEFAULT_MODEL,
|
||||
transactions: transactions,
|
||||
user_merchants: user_merchants
|
||||
user_merchants: user_merchants,
|
||||
json_mode: mode
|
||||
})
|
||||
|
||||
response = client.chat(parameters: {
|
||||
# Build parameters with configurable JSON response format
|
||||
params = {
|
||||
model: model.presence || Provider::Openai::DEFAULT_MODEL,
|
||||
messages: [
|
||||
{ role: "system", content: instructions },
|
||||
{ role: "user", content: developer_message }
|
||||
],
|
||||
response_format: {
|
||||
{ role: "user", content: developer_message_for_generic }
|
||||
]
|
||||
}
|
||||
|
||||
# Add response format based on json_mode setting
|
||||
case mode
|
||||
when JSON_MODE_STRICT
|
||||
params[:response_format] = {
|
||||
type: "json_schema",
|
||||
json_schema: {
|
||||
name: "auto_detect_personal_finance_merchants",
|
||||
@@ -128,9 +238,14 @@ class Provider::Openai::AutoMerchantDetector
|
||||
schema: json_schema
|
||||
}
|
||||
}
|
||||
})
|
||||
when JSON_MODE_OBJECT
|
||||
params[:response_format] = { type: "json_object" }
|
||||
# JSON_MODE_NONE: no response_format constraint
|
||||
end
|
||||
|
||||
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")}")
|
||||
response = client.chat(parameters: params)
|
||||
|
||||
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
|
||||
|
||||
merchants = extract_merchants_generic(response)
|
||||
result = build_response(merchants)
|
||||
@@ -141,7 +256,8 @@ class Provider::Openai::AutoMerchantDetector
|
||||
operation: "auto_detect_merchants",
|
||||
metadata: {
|
||||
transaction_count: transactions.size,
|
||||
merchant_count: user_merchants.size
|
||||
merchant_count: user_merchants.size,
|
||||
json_mode: mode
|
||||
}
|
||||
)
|
||||
|
||||
@@ -154,24 +270,40 @@ class Provider::Openai::AutoMerchantDetector
|
||||
|
||||
AutoDetectedMerchant = Provider::LlmConcept::AutoDetectedMerchant
|
||||
|
||||
def build_response(categorizations)
|
||||
categorizations.map do |categorization|
|
||||
def build_response(merchants)
|
||||
merchants.map do |merchant|
|
||||
AutoDetectedMerchant.new(
|
||||
transaction_id: categorization.dig("transaction_id"),
|
||||
business_name: normalize_ai_value(categorization.dig("business_name")),
|
||||
business_url: normalize_ai_value(categorization.dig("business_url")),
|
||||
transaction_id: merchant.dig("transaction_id"),
|
||||
business_name: normalize_merchant_value(merchant.dig("business_name")),
|
||||
business_url: normalize_merchant_value(merchant.dig("business_url")),
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
def normalize_ai_value(ai_value)
|
||||
return nil if ai_value == "null"
|
||||
def normalize_merchant_value(value)
|
||||
return nil if value.nil? || value == "null" || value.to_s.downcase == "null"
|
||||
|
||||
ai_value
|
||||
# Try to match against user merchants for name normalization
|
||||
if user_merchants.present?
|
||||
# Try exact match first
|
||||
exact_match = user_merchants.find { |m| m[:name] == value }
|
||||
return exact_match[:name] if exact_match
|
||||
|
||||
# Try case-insensitive match
|
||||
case_match = user_merchants.find { |m| m[:name].to_s.downcase == value.to_s.downcase }
|
||||
return case_match[:name] if case_match
|
||||
end
|
||||
|
||||
value
|
||||
end
|
||||
|
||||
def extract_merchants_native(response)
|
||||
raw = response.dig("output", 0, "content", 0, "text")
|
||||
# Find the message output (not reasoning output)
|
||||
message_output = response["output"]&.find { |o| o["type"] == "message" }
|
||||
raw = message_output&.dig("content", 0, "text")
|
||||
|
||||
raise Provider::Openai::Error, "No message content found in response" if raw.nil?
|
||||
|
||||
JSON.parse(raw).dig("merchants")
|
||||
rescue JSON::ParserError => e
|
||||
raise Provider::Openai::Error, "Invalid JSON in native merchant detection: #{e.message}"
|
||||
@@ -179,9 +311,100 @@ class Provider::Openai::AutoMerchantDetector
|
||||
|
||||
def extract_merchants_generic(response)
|
||||
raw = response.dig("choices", 0, "message", "content")
|
||||
JSON.parse(raw).dig("merchants")
|
||||
rescue JSON::ParserError => e
|
||||
raise Provider::Openai::Error, "Invalid JSON in generic merchant detection: #{e.message}"
|
||||
parsed = parse_json_flexibly(raw)
|
||||
|
||||
# Handle different response formats from various LLMs
|
||||
merchants = parsed.dig("merchants") ||
|
||||
parsed.dig("results") ||
|
||||
(parsed.is_a?(Array) ? parsed : nil)
|
||||
|
||||
raise Provider::Openai::Error, "Could not find merchants in response" if merchants.nil?
|
||||
|
||||
# Normalize field names (some LLMs use different naming)
|
||||
merchants.map do |m|
|
||||
{
|
||||
"transaction_id" => m["transaction_id"] || m["id"] || m["txn_id"],
|
||||
"business_name" => m["business_name"] || m["name"] || m["merchant_name"] || m["merchant"],
|
||||
"business_url" => m["business_url"] || m["url"] || m["website"]
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
# Flexible JSON parsing that handles common LLM output issues
|
||||
def parse_json_flexibly(raw)
|
||||
return {} if raw.blank?
|
||||
|
||||
# Strip thinking model tags if present (e.g., <think>...</think>)
|
||||
cleaned = strip_thinking_tags(raw)
|
||||
|
||||
# Try direct parse first
|
||||
JSON.parse(cleaned)
|
||||
rescue JSON::ParserError
|
||||
# Try multiple extraction strategies in order of preference
|
||||
|
||||
# Strategy 1: Closed markdown code blocks (```json...```)
|
||||
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
|
||||
matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
|
||||
matches.reverse_each do |match|
|
||||
begin
|
||||
return JSON.parse(match)
|
||||
rescue JSON::ParserError
|
||||
next
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
|
||||
if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Continue to next strategy
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 3: Find JSON object with "merchants" key
|
||||
if cleaned =~ /(\{"merchants"\s*:\s*\[[\s\S]*\]\s*\})/m
|
||||
matches = cleaned.scan(/(\{"merchants"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
|
||||
matches.reverse_each do |match|
|
||||
begin
|
||||
return JSON.parse(match)
|
||||
rescue JSON::ParserError
|
||||
next
|
||||
end
|
||||
end
|
||||
# Try greedy match if non-greedy failed
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Continue to next strategy
|
||||
end
|
||||
end
|
||||
|
||||
# Strategy 4: Find any JSON object (last resort)
|
||||
if cleaned =~ /(\{[\s\S]*\})/m
|
||||
begin
|
||||
return JSON.parse($1)
|
||||
rescue JSON::ParserError
|
||||
# Fall through to error
|
||||
end
|
||||
end
|
||||
|
||||
raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
|
||||
end
|
||||
|
||||
# Strip thinking model tags (<think>...</think>) from response
|
||||
def strip_thinking_tags(raw)
|
||||
if raw.include?("<think>")
|
||||
if raw =~ /<\/think>\s*([\s\S]*)/m
|
||||
after_thinking = $1.strip
|
||||
return after_thinking if after_thinking.present?
|
||||
end
|
||||
if raw =~ /<think>([\s\S]*)/m
|
||||
return $1
|
||||
end
|
||||
end
|
||||
raw
|
||||
end
|
||||
|
||||
def json_schema
|
||||
@@ -235,4 +458,40 @@ class Provider::Openai::AutoMerchantDetector
|
||||
Return "null" if you are not 80%+ confident in your answer.
|
||||
MESSAGE
|
||||
end
|
||||
|
||||
# Enhanced developer message with few-shot examples for smaller/local LLMs
|
||||
def developer_message_for_generic
|
||||
merchant_names = user_merchants.present? ? user_merchants.map { |m| m[:name] }.join(", ") : "(none provided)"
|
||||
|
||||
<<~MESSAGE.strip_heredoc
|
||||
USER'S KNOWN MERCHANTS: #{merchant_names}
|
||||
|
||||
TRANSACTIONS TO ANALYZE:
|
||||
#{format_transactions_simply}
|
||||
|
||||
EXAMPLES of correct merchant detection:
|
||||
- "AMAZON.COM*1A2B3C" → business_name: "Amazon", business_url: "amazon.com"
|
||||
- "STARBUCKS STORE #9876" → business_name: "Starbucks", business_url: "starbucks.com"
|
||||
- "NETFLIX.COM" → business_name: "Netflix", business_url: "netflix.com"
|
||||
- "UBER *TRIP" → business_name: "Uber", business_url: "uber.com"
|
||||
- "ACH WITHDRAWAL" → business_name: "null", business_url: "null" (generic)
|
||||
- "LOCAL DINER" → business_name: "null", business_url: "null" (generic/unknown)
|
||||
- "POS DEBIT 12345" → business_name: "null", business_url: "null" (generic)
|
||||
|
||||
IMPORTANT:
|
||||
- Return "null" (as a string) for BOTH name and URL if you cannot confidently identify the business
|
||||
- Don't include "www." in URLs
|
||||
- Generic descriptions like "Paycheck", "Transfer", "ATM" should return "null"
|
||||
|
||||
Respond with ONLY this JSON format (no other text):
|
||||
{"merchants": [{"transaction_id": "...", "business_name": "...", "business_url": "..."}]}
|
||||
MESSAGE
|
||||
end
|
||||
|
||||
# Format transactions in a simpler, more readable way for smaller LLMs
|
||||
def format_transactions_simply
|
||||
transactions.map do |t|
|
||||
"- ID: #{t[:id]}, Description: \"#{t[:name] || t[:description]}\""
|
||||
end.join("\n")
|
||||
end
|
||||
end
|
||||
|
||||
@@ -9,6 +9,7 @@ class Setting < RailsSettings::Base
|
||||
field :openai_access_token, type: :string, default: ENV["OPENAI_ACCESS_TOKEN"]
|
||||
field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"]
|
||||
field :openai_model, type: :string, default: ENV["OPENAI_MODEL"]
|
||||
field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"]
|
||||
field :brand_fetch_client_id, type: :string, default: ENV["BRAND_FETCH_CLIENT_ID"]
|
||||
|
||||
# Provider selection
|
||||
|
||||
@@ -47,5 +47,20 @@
|
||||
inputmode: "text",
|
||||
disabled: ENV["OPENAI_MODEL"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } %>
|
||||
|
||||
<%= form.select :openai_json_mode,
|
||||
options_for_select(
|
||||
[
|
||||
[t(".json_mode_auto"), ""],
|
||||
[t(".json_mode_strict"), "strict"],
|
||||
[t(".json_mode_none"), "none"],
|
||||
[t(".json_mode_json_object"), "json_object"]
|
||||
],
|
||||
Setting.openai_json_mode
|
||||
),
|
||||
{ label: t(".json_mode_label") },
|
||||
{ disabled: ENV["LLM_JSON_MODE"].present?,
|
||||
data: { "auto-submit-form-target": "auto" } } %>
|
||||
<p class="text-xs text-secondary mt-1"><%= t(".json_mode_help") %></p>
|
||||
<% end %>
|
||||
</div>
|
||||
|
||||
@@ -48,6 +48,12 @@ en:
|
||||
uri_base_placeholder: "https://api.openai.com/v1 (default)"
|
||||
model_label: Model (Optional)
|
||||
model_placeholder: "gpt-4.1 (default)"
|
||||
json_mode_label: JSON Mode
|
||||
json_mode_auto: Auto (recommended)
|
||||
json_mode_strict: Strict (best for thinking models)
|
||||
json_mode_none: None (best for standard models)
|
||||
json_mode_json_object: JSON Object
|
||||
json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)."
|
||||
title: OpenAI
|
||||
yahoo_finance_settings:
|
||||
title: Yahoo Finance
|
||||
|
||||
1344
db/eval_data/categorization_golden_v1.yml
Normal file
1344
db/eval_data/categorization_golden_v1.yml
Normal file
File diff suppressed because it is too large
Load Diff
769
db/eval_data/categorization_golden_v1_light.yml
Normal file
769
db/eval_data/categorization_golden_v1_light.yml
Normal file
@@ -0,0 +1,769 @@
|
||||
---
|
||||
name: categorization_golden_v1_light
|
||||
description: Lightweight golden dataset for quick transaction categorization evaluation
|
||||
eval_type: categorization
|
||||
version: "1.0"
|
||||
metadata:
|
||||
created_at: "2025-12-04"
|
||||
updated_at: "2025-12-04"
|
||||
source: manual_curation
|
||||
notes: |
|
||||
A compact 50-sample dataset designed for quick evaluation runs.
|
||||
Includes a balanced mix across:
|
||||
- All difficulty levels (easy, medium, hard, edge_case)
|
||||
- All major category types
|
||||
- Both US and European merchants
|
||||
- Representative edge cases
|
||||
|
||||
Difficulty distribution:
|
||||
- easy: 20 samples
|
||||
- medium: 15 samples
|
||||
- hard: 10 samples
|
||||
- edge_case: 5 samples
|
||||
|
||||
context:
|
||||
categories:
|
||||
- id: "income"
|
||||
name: "Income"
|
||||
classification: "income"
|
||||
is_subcategory: false
|
||||
- id: "salary"
|
||||
name: "Salary"
|
||||
classification: "income"
|
||||
is_subcategory: true
|
||||
parent_id: "income"
|
||||
- id: "food_and_drink"
|
||||
name: "Food & Drink"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "restaurants"
|
||||
name: "Restaurants"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "food_and_drink"
|
||||
- id: "fast_food"
|
||||
name: "Fast Food"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "food_and_drink"
|
||||
- id: "groceries"
|
||||
name: "Groceries"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "food_and_drink"
|
||||
- id: "coffee_shops"
|
||||
name: "Coffee Shops"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "food_and_drink"
|
||||
- id: "shopping"
|
||||
name: "Shopping"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "clothing"
|
||||
name: "Clothing"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "shopping"
|
||||
- id: "electronics"
|
||||
name: "Electronics"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "shopping"
|
||||
- id: "transportation"
|
||||
name: "Transportation"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "gas"
|
||||
name: "Gas & Fuel"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "transportation"
|
||||
- id: "rideshare"
|
||||
name: "Rideshare"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "transportation"
|
||||
- id: "public_transit"
|
||||
name: "Public Transit"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "transportation"
|
||||
- id: "entertainment"
|
||||
name: "Entertainment"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "streaming"
|
||||
name: "Streaming Services"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "entertainment"
|
||||
- id: "utilities"
|
||||
name: "Utilities"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "housing"
|
||||
name: "Housing"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "rent"
|
||||
name: "Rent"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "housing"
|
||||
- id: "health"
|
||||
name: "Health & Wellness"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "pharmacy"
|
||||
name: "Pharmacy"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "health"
|
||||
- id: "gym"
|
||||
name: "Gym & Fitness"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "health"
|
||||
- id: "travel"
|
||||
name: "Travel"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "flights"
|
||||
name: "Flights"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "travel"
|
||||
- id: "hotels"
|
||||
name: "Hotels"
|
||||
classification: "expense"
|
||||
is_subcategory: true
|
||||
parent_id: "travel"
|
||||
- id: "subscriptions"
|
||||
name: "Subscriptions"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "personal_care"
|
||||
name: "Personal Care"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
- id: "gifts"
|
||||
name: "Gifts & Donations"
|
||||
classification: "expense"
|
||||
is_subcategory: false
|
||||
|
||||
samples:
|
||||
# =============================================================================
|
||||
# EASY SAMPLES (20 samples) - Clear, unambiguous merchants
|
||||
# =============================================================================
|
||||
|
||||
# Fast Food
|
||||
- id: cat_light_easy_001
|
||||
difficulty: easy
|
||||
tags: [fast_food, us]
|
||||
input:
|
||||
id: txn_light_001
|
||||
amount: 12.99
|
||||
classification: expense
|
||||
description: "MCDONALD'S #12345"
|
||||
expected:
|
||||
category_name: "Fast Food"
|
||||
|
||||
- id: cat_light_easy_002
|
||||
difficulty: easy
|
||||
tags: [fast_food, us]
|
||||
input:
|
||||
id: txn_light_002
|
||||
amount: 14.50
|
||||
classification: expense
|
||||
description: "CHIPOTLE MEXICAN GRILL"
|
||||
expected:
|
||||
category_name: "Fast Food"
|
||||
|
||||
# Coffee Shops
|
||||
- id: cat_light_easy_003
|
||||
difficulty: easy
|
||||
tags: [coffee_shops, us]
|
||||
input:
|
||||
id: txn_light_003
|
||||
amount: 5.75
|
||||
classification: expense
|
||||
description: "STARBUCKS STORE #9876"
|
||||
expected:
|
||||
category_name: "Coffee Shops"
|
||||
|
||||
- id: cat_light_easy_004
|
||||
difficulty: easy
|
||||
tags: [coffee_shops, europe, uk]
|
||||
input:
|
||||
id: txn_light_004
|
||||
amount: 4.50
|
||||
classification: expense
|
||||
description: "COSTA COFFEE LTD"
|
||||
expected:
|
||||
category_name: "Coffee Shops"
|
||||
|
||||
# Groceries
|
||||
- id: cat_light_easy_005
|
||||
difficulty: easy
|
||||
tags: [groceries, us]
|
||||
input:
|
||||
id: txn_light_005
|
||||
amount: 156.32
|
||||
classification: expense
|
||||
description: "WHOLE FOODS MKT #10234"
|
||||
expected:
|
||||
category_name: "Groceries"
|
||||
|
||||
- id: cat_light_easy_006
|
||||
difficulty: easy
|
||||
tags: [groceries, europe, uk]
|
||||
input:
|
||||
id: txn_light_006
|
||||
amount: 87.50
|
||||
classification: expense
|
||||
description: "TESCO STORES LTD"
|
||||
expected:
|
||||
category_name: "Groceries"
|
||||
|
||||
- id: cat_light_easy_007
|
||||
difficulty: easy
|
||||
tags: [groceries, europe, germany]
|
||||
input:
|
||||
id: txn_light_007
|
||||
amount: 78.90
|
||||
classification: expense
|
||||
description: "LIDL DIENSTLEISTUNG"
|
||||
expected:
|
||||
category_name: "Groceries"
|
||||
|
||||
# Gas & Fuel
|
||||
- id: cat_light_easy_008
|
||||
difficulty: easy
|
||||
tags: [gas, us]
|
||||
input:
|
||||
id: txn_light_008
|
||||
amount: 45.00
|
||||
classification: expense
|
||||
description: "SHELL OIL 573849234"
|
||||
expected:
|
||||
category_name: "Gas & Fuel"
|
||||
|
||||
- id: cat_light_easy_009
|
||||
difficulty: easy
|
||||
tags: [gas, europe, uk]
|
||||
input:
|
||||
id: txn_light_009
|
||||
amount: 75.00
|
||||
classification: expense
|
||||
description: "BP OIL UK LTD"
|
||||
expected:
|
||||
category_name: "Gas & Fuel"
|
||||
|
||||
# Rideshare
|
||||
- id: cat_light_easy_010
|
||||
difficulty: easy
|
||||
tags: [rideshare, us]
|
||||
input:
|
||||
id: txn_light_010
|
||||
amount: 23.50
|
||||
classification: expense
|
||||
description: "UBER *TRIP HELP.UBER.COM"
|
||||
expected:
|
||||
category_name: "Rideshare"
|
||||
|
||||
# Streaming
|
||||
- id: cat_light_easy_011
|
||||
difficulty: easy
|
||||
tags: [streaming, us]
|
||||
input:
|
||||
id: txn_light_011
|
||||
amount: 15.99
|
||||
classification: expense
|
||||
description: "NETFLIX.COM"
|
||||
expected:
|
||||
category_name: "Streaming Services"
|
||||
|
||||
- id: cat_light_easy_012
|
||||
difficulty: easy
|
||||
tags: [streaming, us]
|
||||
input:
|
||||
id: txn_light_012
|
||||
amount: 10.99
|
||||
classification: expense
|
||||
description: "SPOTIFY USA"
|
||||
expected:
|
||||
category_name: "Streaming Services"
|
||||
|
||||
# Electronics
|
||||
- id: cat_light_easy_013
|
||||
difficulty: easy
|
||||
tags: [electronics, us]
|
||||
input:
|
||||
id: txn_light_013
|
||||
amount: 299.99
|
||||
classification: expense
|
||||
description: "BEST BUY 00000456"
|
||||
expected:
|
||||
category_name: "Electronics"
|
||||
acceptable_alternatives: ["Shopping"]
|
||||
|
||||
# Clothing
|
||||
- id: cat_light_easy_014
|
||||
difficulty: easy
|
||||
tags: [clothing, europe, spain]
|
||||
input:
|
||||
id: txn_light_014
|
||||
amount: 79.99
|
||||
classification: expense
|
||||
description: "ZARA ESPANA SA"
|
||||
expected:
|
||||
category_name: "Clothing"
|
||||
acceptable_alternatives: ["Shopping"]
|
||||
|
||||
# Pharmacy
|
||||
- id: cat_light_easy_015
|
||||
difficulty: easy
|
||||
tags: [pharmacy, us]
|
||||
input:
|
||||
id: txn_light_015
|
||||
amount: 24.99
|
||||
classification: expense
|
||||
description: "CVS/PHARMACY #4567"
|
||||
expected:
|
||||
category_name: "Pharmacy"
|
||||
|
||||
# Flights
|
||||
- id: cat_light_easy_016
|
||||
difficulty: easy
|
||||
tags: [flights, us]
|
||||
input:
|
||||
id: txn_light_016
|
||||
amount: 345.00
|
||||
classification: expense
|
||||
description: "UNITED AIRLINES 0162345678"
|
||||
expected:
|
||||
category_name: "Flights"
|
||||
|
||||
- id: cat_light_easy_017
|
||||
difficulty: easy
|
||||
tags: [flights, europe, ireland]
|
||||
input:
|
||||
id: txn_light_017
|
||||
amount: 89.99
|
||||
classification: expense
|
||||
description: "RYANAIR DAC"
|
||||
expected:
|
||||
category_name: "Flights"
|
||||
|
||||
# Hotels
|
||||
- id: cat_light_easy_018
|
||||
difficulty: easy
|
||||
tags: [hotels, us]
|
||||
input:
|
||||
id: txn_light_018
|
||||
amount: 189.00
|
||||
classification: expense
|
||||
description: "MARRIOTT HOTELS NYC"
|
||||
expected:
|
||||
category_name: "Hotels"
|
||||
|
||||
# Gym
|
||||
- id: cat_light_easy_019
|
||||
difficulty: easy
|
||||
tags: [gym, us]
|
||||
input:
|
||||
id: txn_light_019
|
||||
amount: 39.99
|
||||
classification: expense
|
||||
description: "PLANET FITNESS MONTHLY"
|
||||
expected:
|
||||
category_name: "Gym & Fitness"
|
||||
|
||||
# Income
|
||||
- id: cat_light_easy_020
|
||||
difficulty: easy
|
||||
tags: [income, salary, us]
|
||||
input:
|
||||
id: txn_light_020
|
||||
amount: 3500.00
|
||||
classification: income
|
||||
description: "ACME CORP PAYROLL"
|
||||
expected:
|
||||
category_name: "Salary"
|
||||
|
||||
# =============================================================================
|
||||
# MEDIUM SAMPLES (15 samples) - Requires domain knowledge
|
||||
# =============================================================================
|
||||
|
||||
# Restaurants
|
||||
- id: cat_light_med_001
|
||||
difficulty: medium
|
||||
tags: [restaurants, us]
|
||||
input:
|
||||
id: txn_light_med_001
|
||||
amount: 67.50
|
||||
classification: expense
|
||||
description: "OLIVE GARDEN #456"
|
||||
expected:
|
||||
category_name: "Restaurants"
|
||||
|
||||
- id: cat_light_med_002
|
||||
difficulty: medium
|
||||
tags: [restaurants, europe, uk]
|
||||
input:
|
||||
id: txn_light_med_002
|
||||
amount: 78.50
|
||||
classification: expense
|
||||
description: "WAGAMAMA LTD LONDON"
|
||||
expected:
|
||||
category_name: "Restaurants"
|
||||
|
||||
# Warehouse stores
|
||||
- id: cat_light_med_003
|
||||
difficulty: medium
|
||||
tags: [groceries, us, warehouse]
|
||||
input:
|
||||
id: txn_light_med_003
|
||||
amount: 234.56
|
||||
classification: expense
|
||||
description: "COSTCO WHSE #1234"
|
||||
expected:
|
||||
category_name: "Groceries"
|
||||
acceptable_alternatives: ["Shopping"]
|
||||
|
||||
# Utilities
|
||||
- id: cat_light_med_004
|
||||
difficulty: medium
|
||||
tags: [utilities, us]
|
||||
input:
|
||||
id: txn_light_med_004
|
||||
amount: 125.00
|
||||
classification: expense
|
||||
description: "CON EDISON PAYMENT"
|
||||
expected:
|
||||
category_name: "Utilities"
|
||||
|
||||
- id: cat_light_med_005
|
||||
difficulty: medium
|
||||
tags: [utilities, europe, uk]
|
||||
input:
|
||||
id: txn_light_med_005
|
||||
amount: 156.00
|
||||
classification: expense
|
||||
description: "BRITISH GAS SERVICES"
|
||||
expected:
|
||||
category_name: "Utilities"
|
||||
|
||||
- id: cat_light_med_006
|
||||
difficulty: medium
|
||||
tags: [utilities, us]
|
||||
input:
|
||||
id: txn_light_med_006
|
||||
amount: 89.00
|
||||
classification: expense
|
||||
description: "AT&T WIRELESS"
|
||||
expected:
|
||||
category_name: "Utilities"
|
||||
|
||||
# Public Transit
|
||||
- id: cat_light_med_007
|
||||
difficulty: medium
|
||||
tags: [public_transit, us]
|
||||
input:
|
||||
id: txn_light_med_007
|
||||
amount: 127.00
|
||||
classification: expense
|
||||
description: "MTA *METROCARD"
|
||||
expected:
|
||||
category_name: "Public Transit"
|
||||
|
||||
- id: cat_light_med_008
|
||||
difficulty: medium
|
||||
tags: [public_transit, europe, uk]
|
||||
input:
|
||||
id: txn_light_med_008
|
||||
amount: 156.50
|
||||
classification: expense
|
||||
description: "TFL TRAVEL LONDON"
|
||||
expected:
|
||||
category_name: "Public Transit"
|
||||
|
||||
# Housing
|
||||
- id: cat_light_med_009
|
||||
difficulty: medium
|
||||
tags: [rent, us]
|
||||
input:
|
||||
id: txn_light_med_009
|
||||
amount: 2100.00
|
||||
classification: expense
|
||||
description: "AVALON APARTMENTS RENT"
|
||||
expected:
|
||||
category_name: "Rent"
|
||||
acceptable_alternatives: ["Housing"]
|
||||
|
||||
# Subscriptions
|
||||
- id: cat_light_med_010
|
||||
difficulty: medium
|
||||
tags: [subscriptions, us]
|
||||
input:
|
||||
id: txn_light_med_010
|
||||
amount: 9.99
|
||||
classification: expense
|
||||
description: "APPLE.COM/BILL"
|
||||
expected:
|
||||
category_name: "Subscriptions"
|
||||
|
||||
# Gifts & Donations
|
||||
- id: cat_light_med_011
|
||||
difficulty: medium
|
||||
tags: [gifts, us, donation]
|
||||
input:
|
||||
id: txn_light_med_011
|
||||
amount: 50.00
|
||||
classification: expense
|
||||
description: "RED CROSS DONATION"
|
||||
expected:
|
||||
category_name: "Gifts & Donations"
|
||||
|
||||
# Entertainment
|
||||
- id: cat_light_med_012
|
||||
difficulty: medium
|
||||
tags: [entertainment, us]
|
||||
input:
|
||||
id: txn_light_med_012
|
||||
amount: 89.00
|
||||
classification: expense
|
||||
description: "TICKETMASTER *EVENT"
|
||||
expected:
|
||||
category_name: "Entertainment"
|
||||
|
||||
# Travel
|
||||
- id: cat_light_med_013
|
||||
difficulty: medium
|
||||
tags: [hotels, us]
|
||||
input:
|
||||
id: txn_light_med_013
|
||||
amount: 234.00
|
||||
classification: expense
|
||||
description: "AIRBNB *HMQT5J6QQJ"
|
||||
expected:
|
||||
category_name: "Hotels"
|
||||
acceptable_alternatives: ["Travel"]
|
||||
|
||||
# Personal Care
|
||||
- id: cat_light_med_014
|
||||
difficulty: medium
|
||||
tags: [personal_care, us]
|
||||
input:
|
||||
id: txn_light_med_014
|
||||
amount: 45.00
|
||||
classification: expense
|
||||
description: "SUPERCUTS #1234"
|
||||
expected:
|
||||
category_name: "Personal Care"
|
||||
|
||||
# Income
|
||||
- id: cat_light_med_015
|
||||
difficulty: medium
|
||||
tags: [income, us]
|
||||
input:
|
||||
id: txn_light_med_015
|
||||
amount: 500.00
|
||||
classification: income
|
||||
description: "VENMO CASHOUT"
|
||||
expected:
|
||||
category_name: "Income"
|
||||
|
||||
# =============================================================================
|
||||
# HARD SAMPLES (10 samples) - Ambiguous, multiple interpretations
|
||||
# =============================================================================
|
||||
|
||||
# Big-box stores
|
||||
- id: cat_light_hard_001
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, multi_purpose_retailer]
|
||||
input:
|
||||
id: txn_light_hard_001
|
||||
amount: 156.78
|
||||
classification: expense
|
||||
description: "TARGET #1234"
|
||||
expected:
|
||||
category_name: "Shopping"
|
||||
acceptable_alternatives: ["Groceries"]
|
||||
|
||||
- id: cat_light_hard_002
|
||||
difficulty: hard
|
||||
tags: [ambiguous, europe, uk, multi_purpose_retailer]
|
||||
input:
|
||||
id: txn_light_hard_002
|
||||
amount: 156.00
|
||||
classification: expense
|
||||
description: "MARKS & SPENCER PLC"
|
||||
expected:
|
||||
category_name: "Shopping"
|
||||
acceptable_alternatives: ["Groceries", "Clothing"]
|
||||
|
||||
# Online marketplaces
|
||||
- id: cat_light_hard_003
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, online_marketplace]
|
||||
input:
|
||||
id: txn_light_hard_003
|
||||
amount: 89.99
|
||||
classification: expense
|
||||
description: "AMAZON.COM*1A2B3C4D"
|
||||
expected:
|
||||
category_name: "Shopping"
|
||||
|
||||
# Payment processors (should be null)
|
||||
- id: cat_light_hard_004
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, payment_processor]
|
||||
input:
|
||||
id: txn_light_hard_004
|
||||
amount: 78.00
|
||||
classification: expense
|
||||
description: "PAYPAL *JOHNSMITH"
|
||||
expected:
|
||||
category_name: null
|
||||
|
||||
# Fast-casual
|
||||
- id: cat_light_hard_005
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, fast_casual]
|
||||
input:
|
||||
id: txn_light_hard_005
|
||||
amount: 34.50
|
||||
classification: expense
|
||||
description: "PANERA BREAD #567"
|
||||
expected:
|
||||
category_name: "Restaurants"
|
||||
acceptable_alternatives: ["Fast Food"]
|
||||
|
||||
# Delivery services
|
||||
- id: cat_light_hard_006
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, delivery_service]
|
||||
input:
|
||||
id: txn_light_hard_006
|
||||
amount: 45.00
|
||||
classification: expense
|
||||
description: "DOORDASH*CHIPOTLE"
|
||||
expected:
|
||||
category_name: "Fast Food"
|
||||
acceptable_alternatives: ["Restaurants"]
|
||||
|
||||
- id: cat_light_hard_007
|
||||
difficulty: hard
|
||||
tags: [ambiguous, europe, uk, delivery_service]
|
||||
input:
|
||||
id: txn_light_hard_007
|
||||
amount: 32.50
|
||||
classification: expense
|
||||
description: "DELIVEROO UK LTD"
|
||||
expected:
|
||||
category_name: "Restaurants"
|
||||
acceptable_alternatives: ["Fast Food"]
|
||||
|
||||
# Amazon Prime
|
||||
- id: cat_light_hard_008
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, amazon]
|
||||
input:
|
||||
id: txn_light_hard_008
|
||||
amount: 14.99
|
||||
classification: expense
|
||||
description: "AMAZON PRIME*1A2B3C"
|
||||
expected:
|
||||
category_name: "Subscriptions"
|
||||
|
||||
# Convenience store
|
||||
- id: cat_light_hard_009
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, convenience_store]
|
||||
input:
|
||||
id: txn_light_hard_009
|
||||
amount: 12.50
|
||||
classification: expense
|
||||
description: "7-ELEVEN #34567"
|
||||
expected:
|
||||
category_name: "Groceries"
|
||||
acceptable_alternatives: ["Fast Food"]
|
||||
|
||||
# Streaming vs Subscription
|
||||
- id: cat_light_hard_010
|
||||
difficulty: hard
|
||||
tags: [ambiguous, us, streaming_subscription]
|
||||
input:
|
||||
id: txn_light_hard_010
|
||||
amount: 15.99
|
||||
classification: expense
|
||||
description: "HBO MAX"
|
||||
expected:
|
||||
category_name: "Streaming Services"
|
||||
acceptable_alternatives: ["Subscriptions"]
|
||||
|
||||
# =============================================================================
|
||||
# EDGE CASES (5 samples) - Should return null
|
||||
# =============================================================================
|
||||
|
||||
# Generic POS
|
||||
- id: cat_light_edge_001
|
||||
difficulty: edge_case
|
||||
tags: [should_be_null, generic_pos]
|
||||
input:
|
||||
id: txn_light_edge_001
|
||||
amount: 15.00
|
||||
classification: expense
|
||||
description: "POS DEBIT 12345"
|
||||
expected:
|
||||
category_name: null
|
||||
|
||||
# ACH transfer
|
||||
- id: cat_light_edge_002
|
||||
difficulty: edge_case
|
||||
tags: [should_be_null, transfer]
|
||||
input:
|
||||
id: txn_light_edge_002
|
||||
amount: 100.00
|
||||
classification: expense
|
||||
description: "ACH WITHDRAWAL"
|
||||
expected:
|
||||
category_name: null
|
||||
|
||||
# ATM
|
||||
- id: cat_light_edge_003
|
||||
difficulty: edge_case
|
||||
tags: [should_be_null, atm]
|
||||
input:
|
||||
id: txn_light_edge_003
|
||||
amount: 200.00
|
||||
classification: expense
|
||||
description: "ATM WITHDRAWAL 12345"
|
||||
expected:
|
||||
category_name: null
|
||||
|
||||
# Check
|
||||
- id: cat_light_edge_004
|
||||
difficulty: edge_case
|
||||
tags: [should_be_null, check]
|
||||
input:
|
||||
id: txn_light_edge_004
|
||||
amount: 350.00
|
||||
classification: expense
|
||||
description: "CHECK #1234"
|
||||
expected:
|
||||
category_name: null
|
||||
|
||||
# Cryptic
|
||||
- id: cat_light_edge_005
|
||||
difficulty: edge_case
|
||||
tags: [should_be_null, cryptic]
|
||||
input:
|
||||
id: txn_light_edge_005
|
||||
amount: 45.67
|
||||
classification: expense
|
||||
description: "TXN*89234*AUTH"
|
||||
expected:
|
||||
category_name: null
|
||||
2559
db/eval_data/categorization_golden_v2.yml
Normal file
2559
db/eval_data/categorization_golden_v2.yml
Normal file
File diff suppressed because it is too large
Load Diff
825
db/eval_data/chat_golden_v1.yml
Normal file
825
db/eval_data/chat_golden_v1.yml
Normal file
@@ -0,0 +1,825 @@
|
||||
---
|
||||
name: chat_golden_v1
|
||||
description: Golden dataset for chat/assistant function calling evaluation
|
||||
eval_type: chat
|
||||
version: "1.0"
|
||||
metadata:
|
||||
created_at: "2024-12-01"
|
||||
source: manual_curation
|
||||
|
||||
samples:
|
||||
# ===== EASY - Simple single function calls =====
|
||||
- id: chat_easy_001
|
||||
difficulty: easy
|
||||
tags: [get_accounts, simple]
|
||||
input:
|
||||
prompt: "What accounts do I have?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_002
|
||||
difficulty: easy
|
||||
tags: [get_accounts, simple]
|
||||
input:
|
||||
prompt: "Show me my accounts"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_003
|
||||
difficulty: easy
|
||||
tags: [get_accounts, balance]
|
||||
input:
|
||||
prompt: "What's my account balance?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_004
|
||||
difficulty: easy
|
||||
tags: [get_transactions, simple]
|
||||
input:
|
||||
prompt: "Show me my recent transactions"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_005
|
||||
difficulty: easy
|
||||
tags: [get_transactions, simple]
|
||||
input:
|
||||
prompt: "What are my latest transactions?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_006
|
||||
difficulty: easy
|
||||
tags: [get_balance_sheet, simple]
|
||||
input:
|
||||
prompt: "What's my net worth?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_007
|
||||
difficulty: easy
|
||||
tags: [get_balance_sheet, simple]
|
||||
input:
|
||||
prompt: "Show me my assets and liabilities"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_008
|
||||
difficulty: easy
|
||||
tags: [get_income_statement, simple]
|
||||
input:
|
||||
prompt: "What were my expenses last month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_009
|
||||
difficulty: easy
|
||||
tags: [get_income_statement, simple]
|
||||
input:
|
||||
prompt: "How much income did I make this month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_010
|
||||
difficulty: easy
|
||||
tags: [get_accounts, simple]
|
||||
input:
|
||||
prompt: "How many accounts do I have?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_011
|
||||
difficulty: easy
|
||||
tags: [get_transactions, simple]
|
||||
input:
|
||||
prompt: "List my transactions"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_012
|
||||
difficulty: easy
|
||||
tags: [get_balance_sheet, simple]
|
||||
input:
|
||||
prompt: "How much do I owe?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_013
|
||||
difficulty: easy
|
||||
tags: [get_balance_sheet, simple]
|
||||
input:
|
||||
prompt: "What are my total assets?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_014
|
||||
difficulty: easy
|
||||
tags: [get_income_statement, simple]
|
||||
input:
|
||||
prompt: "Show my spending"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_015
|
||||
difficulty: easy
|
||||
tags: [get_income_statement, simple]
|
||||
input:
|
||||
prompt: "How much did I spend?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
# ===== MEDIUM - With filtering or specific parameters =====
|
||||
- id: chat_medium_001
|
||||
difficulty: medium
|
||||
tags: [get_transactions, filtering]
|
||||
input:
|
||||
prompt: "Show me my restaurant spending"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_002
|
||||
difficulty: medium
|
||||
tags: [get_transactions, filtering]
|
||||
input:
|
||||
prompt: "What did I spend on groceries?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_003
|
||||
difficulty: medium
|
||||
tags: [get_transactions, filtering]
|
||||
input:
|
||||
prompt: "Show transactions over $100"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_004
|
||||
difficulty: medium
|
||||
tags: [get_transactions, filtering]
|
||||
input:
|
||||
prompt: "What did I spend at Amazon?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_005
|
||||
difficulty: medium
|
||||
tags: [get_transactions, date_range]
|
||||
input:
|
||||
prompt: "Show me last week's transactions"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_006
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, date_range]
|
||||
input:
|
||||
prompt: "What was my income in January?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_007
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, comparison]
|
||||
input:
|
||||
prompt: "How much did I save last month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_008
|
||||
difficulty: medium
|
||||
tags: [get_accounts, specific]
|
||||
input:
|
||||
prompt: "What's the balance in my checking account?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_009
|
||||
difficulty: medium
|
||||
tags: [get_accounts, specific]
|
||||
input:
|
||||
prompt: "How much do I have in savings?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_010
|
||||
difficulty: medium
|
||||
tags: [get_transactions, category]
|
||||
input:
|
||||
prompt: "Show me all my subscription payments"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_011
|
||||
difficulty: medium
|
||||
tags: [get_transactions, search]
|
||||
input:
|
||||
prompt: "Find transactions from Uber"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_012
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, category]
|
||||
input:
|
||||
prompt: "How much do I spend on entertainment?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_013
|
||||
difficulty: medium
|
||||
tags: [get_balance_sheet, trend]
|
||||
input:
|
||||
prompt: "How has my net worth changed over time?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_014
|
||||
difficulty: medium
|
||||
tags: [get_transactions, amount]
|
||||
input:
|
||||
prompt: "What's my largest expense this month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_015
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, breakdown]
|
||||
input:
|
||||
prompt: "Break down my expenses by category"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_016
|
||||
difficulty: medium
|
||||
tags: [get_transactions, recurring]
|
||||
input:
|
||||
prompt: "Show me my recurring payments"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_017
|
||||
difficulty: medium
|
||||
tags: [get_accounts, credit]
|
||||
input:
|
||||
prompt: "What's my credit card balance?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_018
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, specific]
|
||||
input:
|
||||
prompt: "How much did I spend on food last month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_019
|
||||
difficulty: medium
|
||||
tags: [get_transactions, date]
|
||||
input:
|
||||
prompt: "Show transactions from December"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_020
|
||||
difficulty: medium
|
||||
tags: [get_balance_sheet, liability]
|
||||
input:
|
||||
prompt: "What are my debts?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
# ===== HARD - Analysis, comparisons, insights =====
|
||||
- id: chat_hard_001
|
||||
difficulty: hard
|
||||
tags: [analysis, spending_trend]
|
||||
input:
|
||||
prompt: "Am I spending more than I make?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_002
|
||||
difficulty: hard
|
||||
tags: [comparison, month_over_month]
|
||||
input:
|
||||
prompt: "How does my spending this month compare to last month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_003
|
||||
difficulty: hard
|
||||
tags: [analysis, budget]
|
||||
input:
|
||||
prompt: "Where can I cut expenses?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_004
|
||||
difficulty: hard
|
||||
tags: [analysis, savings]
|
||||
input:
|
||||
prompt: "What's my savings rate?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_005
|
||||
difficulty: hard
|
||||
tags: [analysis, trend]
|
||||
input:
|
||||
prompt: "Are my expenses trending up or down?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_006
|
||||
difficulty: hard
|
||||
tags: [analysis, category]
|
||||
input:
|
||||
prompt: "What category do I spend the most on?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_007
|
||||
difficulty: hard
|
||||
tags: [analysis, unusual]
|
||||
input:
|
||||
prompt: "Are there any unusual transactions this month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_008
|
||||
difficulty: hard
|
||||
tags: [analysis, debt]
|
||||
input:
|
||||
prompt: "How long will it take to pay off my credit card?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_009
|
||||
difficulty: hard
|
||||
tags: [analysis, financial_health]
|
||||
input:
|
||||
prompt: "What's my debt-to-income ratio?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_010
|
||||
difficulty: hard
|
||||
tags: [analysis, goals]
|
||||
input:
|
||||
prompt: "Can I afford to save $500 more per month?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_011
|
||||
difficulty: hard
|
||||
tags: [comparison, year_over_year]
|
||||
input:
|
||||
prompt: "How does this year compare to last year?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_012
|
||||
difficulty: hard
|
||||
tags: [analysis, pattern]
|
||||
input:
|
||||
prompt: "Do I have any spending patterns I should know about?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_013
|
||||
difficulty: hard
|
||||
tags: [advice, budget]
|
||||
input:
|
||||
prompt: "How should I allocate my income?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_014
|
||||
difficulty: hard
|
||||
tags: [analysis, efficiency]
|
||||
input:
|
||||
prompt: "Am I overspending on subscriptions?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_015
|
||||
difficulty: hard
|
||||
tags: [forecast, projection]
|
||||
input:
|
||||
prompt: "At this rate, how much will I have saved by year end?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
# ===== EDGE CASES - Unclear intent, no function needed =====
|
||||
- id: chat_edge_001
|
||||
difficulty: edge_case
|
||||
tags: [no_function, greeting]
|
||||
input:
|
||||
prompt: "Hello"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_002
|
||||
difficulty: edge_case
|
||||
tags: [no_function, thanks]
|
||||
input:
|
||||
prompt: "Thank you!"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_003
|
||||
difficulty: edge_case
|
||||
tags: [no_function, general]
|
||||
input:
|
||||
prompt: "What can you help me with?"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_004
|
||||
difficulty: edge_case
|
||||
tags: [no_function, advice]
|
||||
input:
|
||||
prompt: "Should I invest in stocks?"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_005
|
||||
difficulty: edge_case
|
||||
tags: [no_function, external]
|
||||
input:
|
||||
prompt: "What's the weather like?"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_006
|
||||
difficulty: edge_case
|
||||
tags: [ambiguous]
|
||||
input:
|
||||
prompt: "Tell me about my money"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_007
|
||||
difficulty: edge_case
|
||||
tags: [ambiguous]
|
||||
input:
|
||||
prompt: "How am I doing financially?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_008
|
||||
difficulty: edge_case
|
||||
tags: [ambiguous]
|
||||
input:
|
||||
prompt: "Give me a summary"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_009
|
||||
difficulty: edge_case
|
||||
tags: [no_function, off_topic]
|
||||
input:
|
||||
prompt: "What's 2 + 2?"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
- id: chat_edge_010
|
||||
difficulty: edge_case
|
||||
tags: [no_function, general]
|
||||
input:
|
||||
prompt: "Who are you?"
|
||||
expected:
|
||||
functions: []
|
||||
response_contains: []
|
||||
|
||||
# Additional samples
|
||||
- id: chat_easy_016
|
||||
difficulty: easy
|
||||
tags: [get_transactions]
|
||||
input:
|
||||
prompt: "Pull up my transactions"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_017
|
||||
difficulty: easy
|
||||
tags: [get_accounts]
|
||||
input:
|
||||
prompt: "Show all my bank accounts"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_018
|
||||
difficulty: easy
|
||||
tags: [get_balance_sheet]
|
||||
input:
|
||||
prompt: "What do I own?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_019
|
||||
difficulty: easy
|
||||
tags: [get_income_statement]
|
||||
input:
|
||||
prompt: "What's my income?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_easy_020
|
||||
difficulty: easy
|
||||
tags: [get_transactions]
|
||||
input:
|
||||
prompt: "Recent purchases"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_021
|
||||
difficulty: medium
|
||||
tags: [get_transactions, merchant]
|
||||
input:
|
||||
prompt: "How much have I spent at Starbucks?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_022
|
||||
difficulty: medium
|
||||
tags: [get_transactions, category]
|
||||
input:
|
||||
prompt: "Show transportation expenses"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_023
|
||||
difficulty: medium
|
||||
tags: [get_income_statement, period]
|
||||
input:
|
||||
prompt: "Quarterly expense report"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_024
|
||||
difficulty: medium
|
||||
tags: [get_accounts, type]
|
||||
input:
|
||||
prompt: "Show my investment accounts"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_medium_025
|
||||
difficulty: medium
|
||||
tags: [get_transactions, amount]
|
||||
input:
|
||||
prompt: "Transactions under $50"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_transactions"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_016
|
||||
difficulty: hard
|
||||
tags: [analysis, discretionary]
|
||||
input:
|
||||
prompt: "How much discretionary spending do I have?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_017
|
||||
difficulty: hard
|
||||
tags: [analysis, fixed_vs_variable]
|
||||
input:
|
||||
prompt: "What are my fixed vs variable expenses?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_018
|
||||
difficulty: hard
|
||||
tags: [analysis, emergency_fund]
|
||||
input:
|
||||
prompt: "Do I have enough for an emergency fund?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_balance_sheet"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_019
|
||||
difficulty: hard
|
||||
tags: [analysis, liquidity]
|
||||
input:
|
||||
prompt: "How liquid are my assets?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_accounts"
|
||||
params: {}
|
||||
response_contains: []
|
||||
|
||||
- id: chat_hard_020
|
||||
difficulty: hard
|
||||
tags: [comparison, benchmark]
|
||||
input:
|
||||
prompt: "Am I spending too much on housing?"
|
||||
expected:
|
||||
functions:
|
||||
- name: "get_income_statement"
|
||||
params: {}
|
||||
response_contains: []
|
||||
1117
db/eval_data/merchant_detection_golden_v1.yml
Normal file
1117
db/eval_data/merchant_detection_golden_v1.yml
Normal file
File diff suppressed because it is too large
Load Diff
81
db/migrate/20251201084101_create_eval_tables.rb
Normal file
81
db/migrate/20251201084101_create_eval_tables.rb
Normal file
@@ -0,0 +1,81 @@
|
||||
class CreateEvalTables < ActiveRecord::Migration[7.2]
|
||||
def change
|
||||
# Eval Datasets - Golden dataset containers
|
||||
create_table :eval_datasets, id: :uuid do |t|
|
||||
t.string :name, null: false
|
||||
t.string :description
|
||||
t.string :eval_type, null: false
|
||||
t.string :version, null: false, default: "1.0"
|
||||
t.integer :sample_count, default: 0
|
||||
t.jsonb :metadata, default: {}
|
||||
t.boolean :active, default: true
|
||||
|
||||
t.timestamps
|
||||
end
|
||||
|
||||
add_index :eval_datasets, :name, unique: true
|
||||
add_index :eval_datasets, [ :eval_type, :active ]
|
||||
|
||||
# Eval Samples - Individual test cases
|
||||
create_table :eval_samples, id: :uuid do |t|
|
||||
t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
|
||||
t.jsonb :input_data, null: false
|
||||
t.jsonb :expected_output, null: false
|
||||
t.jsonb :context_data, default: {}
|
||||
t.string :difficulty, default: "medium"
|
||||
t.string :tags, array: true, default: []
|
||||
t.jsonb :metadata, default: {}
|
||||
|
||||
t.timestamps
|
||||
end
|
||||
|
||||
add_index :eval_samples, [ :eval_dataset_id, :difficulty ]
|
||||
add_index :eval_samples, :tags, using: :gin
|
||||
|
||||
# Eval Runs - Evaluation execution records
|
||||
create_table :eval_runs, id: :uuid do |t|
|
||||
t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
|
||||
t.string :name
|
||||
t.string :status, null: false, default: "pending"
|
||||
t.string :provider, null: false
|
||||
t.string :model, null: false
|
||||
t.jsonb :provider_config, default: {}
|
||||
t.jsonb :metrics, default: {}
|
||||
t.integer :total_prompt_tokens, default: 0
|
||||
t.integer :total_completion_tokens, default: 0
|
||||
t.decimal :total_cost, precision: 10, scale: 6, default: 0.0
|
||||
t.datetime :started_at
|
||||
t.datetime :completed_at
|
||||
t.text :error_message
|
||||
|
||||
t.timestamps
|
||||
end
|
||||
|
||||
add_index :eval_runs, [ :eval_dataset_id, :model ]
|
||||
add_index :eval_runs, [ :provider, :model ]
|
||||
add_index :eval_runs, :status
|
||||
|
||||
# Eval Results - Individual sample results
|
||||
create_table :eval_results, id: :uuid do |t|
|
||||
t.references :eval_run, null: false, foreign_key: true, type: :uuid
|
||||
t.references :eval_sample, null: false, foreign_key: true, type: :uuid
|
||||
t.jsonb :actual_output, null: false
|
||||
t.boolean :correct, null: false
|
||||
t.boolean :exact_match, default: false
|
||||
t.boolean :hierarchical_match, default: false
|
||||
t.boolean :null_expected, default: false
|
||||
t.boolean :null_returned, default: false
|
||||
t.float :fuzzy_score
|
||||
t.integer :latency_ms
|
||||
t.integer :prompt_tokens
|
||||
t.integer :completion_tokens
|
||||
t.decimal :cost, precision: 10, scale: 6
|
||||
t.jsonb :metadata, default: {}
|
||||
|
||||
t.timestamps
|
||||
end
|
||||
|
||||
add_index :eval_results, [ :eval_run_id, :correct ]
|
||||
# eval_sample_id index is automatically created by t.references
|
||||
end
|
||||
end
|
||||
@@ -0,0 +1,5 @@
|
||||
class AddAlternativeMatchToEvalResults < ActiveRecord::Migration[7.2]
|
||||
def change
|
||||
add_column :eval_results, :alternative_match, :boolean, default: false
|
||||
end
|
||||
end
|
||||
96
db/schema.rb
generated
96
db/schema.rb
generated
@@ -307,6 +307,80 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
|
||||
t.index ["import_id"], name: "index_entries_on_import_id"
|
||||
end
|
||||
|
||||
create_table "eval_datasets", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.string "name", null: false
|
||||
t.string "description"
|
||||
t.string "eval_type", null: false
|
||||
t.string "version", default: "1.0", null: false
|
||||
t.integer "sample_count", default: 0
|
||||
t.jsonb "metadata", default: {}
|
||||
t.boolean "active", default: true
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.index ["eval_type", "active"], name: "index_eval_datasets_on_eval_type_and_active"
|
||||
t.index ["name"], name: "index_eval_datasets_on_name", unique: true
|
||||
end
|
||||
|
||||
create_table "eval_results", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.uuid "eval_run_id", null: false
|
||||
t.uuid "eval_sample_id", null: false
|
||||
t.jsonb "actual_output", null: false
|
||||
t.boolean "correct", null: false
|
||||
t.boolean "exact_match", default: false
|
||||
t.boolean "hierarchical_match", default: false
|
||||
t.boolean "null_expected", default: false
|
||||
t.boolean "null_returned", default: false
|
||||
t.float "fuzzy_score"
|
||||
t.integer "latency_ms"
|
||||
t.integer "prompt_tokens"
|
||||
t.integer "completion_tokens"
|
||||
t.decimal "cost", precision: 10, scale: 6
|
||||
t.jsonb "metadata", default: {}
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.boolean "alternative_match", default: false
|
||||
t.index ["eval_run_id", "correct"], name: "index_eval_results_on_eval_run_id_and_correct"
|
||||
t.index ["eval_run_id"], name: "index_eval_results_on_eval_run_id"
|
||||
t.index ["eval_sample_id"], name: "index_eval_results_on_eval_sample_id"
|
||||
end
|
||||
|
||||
create_table "eval_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.uuid "eval_dataset_id", null: false
|
||||
t.string "name"
|
||||
t.string "status", default: "pending", null: false
|
||||
t.string "provider", null: false
|
||||
t.string "model", null: false
|
||||
t.jsonb "provider_config", default: {}
|
||||
t.jsonb "metrics", default: {}
|
||||
t.integer "total_prompt_tokens", default: 0
|
||||
t.integer "total_completion_tokens", default: 0
|
||||
t.decimal "total_cost", precision: 10, scale: 6, default: "0.0"
|
||||
t.datetime "started_at"
|
||||
t.datetime "completed_at"
|
||||
t.text "error_message"
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.index ["eval_dataset_id", "model"], name: "index_eval_runs_on_eval_dataset_id_and_model"
|
||||
t.index ["eval_dataset_id"], name: "index_eval_runs_on_eval_dataset_id"
|
||||
t.index ["provider", "model"], name: "index_eval_runs_on_provider_and_model"
|
||||
t.index ["status"], name: "index_eval_runs_on_status"
|
||||
end
|
||||
|
||||
create_table "eval_samples", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.uuid "eval_dataset_id", null: false
|
||||
t.jsonb "input_data", null: false
|
||||
t.jsonb "expected_output", null: false
|
||||
t.jsonb "context_data", default: {}
|
||||
t.string "difficulty", default: "medium"
|
||||
t.string "tags", default: [], array: true
|
||||
t.jsonb "metadata", default: {}
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.index ["eval_dataset_id", "difficulty"], name: "index_eval_samples_on_eval_dataset_id_and_difficulty"
|
||||
t.index ["eval_dataset_id"], name: "index_eval_samples_on_eval_dataset_id"
|
||||
t.index ["tags"], name: "index_eval_samples_on_tags", using: :gin
|
||||
end
|
||||
|
||||
create_table "exchange_rates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.string "from_currency", null: false
|
||||
t.string "to_currency", null: false
|
||||
@@ -789,6 +863,21 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
|
||||
t.index ["rule_id"], name: "index_rule_conditions_on_rule_id"
|
||||
end
|
||||
|
||||
create_table "rule_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.uuid "rule_id", null: false
|
||||
t.string "execution_type", null: false
|
||||
t.string "status", null: false
|
||||
t.integer "transactions_processed", default: 0, null: false
|
||||
t.integer "transactions_modified", default: 0, null: false
|
||||
t.datetime "executed_at", null: false
|
||||
t.text "error_message"
|
||||
t.datetime "created_at", null: false
|
||||
t.datetime "updated_at", null: false
|
||||
t.index ["executed_at"], name: "index_rule_runs_on_executed_at"
|
||||
t.index ["rule_id", "executed_at"], name: "index_rule_runs_on_rule_id_and_executed_at"
|
||||
t.index ["rule_id"], name: "index_rule_runs_on_rule_id"
|
||||
end
|
||||
|
||||
create_table "rules", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
|
||||
t.uuid "family_id", null: false
|
||||
t.string "resource_type", null: false
|
||||
@@ -991,6 +1080,8 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
|
||||
t.datetime "updated_at", null: false
|
||||
t.string "currency"
|
||||
t.jsonb "locked_attributes", default: {}
|
||||
t.uuid "category_id"
|
||||
t.index ["category_id"], name: "index_trades_on_category_id"
|
||||
t.index ["security_id"], name: "index_trades_on_security_id"
|
||||
end
|
||||
|
||||
@@ -1095,6 +1186,10 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
|
||||
add_foreign_key "enable_banking_items", "families"
|
||||
add_foreign_key "entries", "accounts", on_delete: :cascade
|
||||
add_foreign_key "entries", "imports"
|
||||
add_foreign_key "eval_results", "eval_runs"
|
||||
add_foreign_key "eval_results", "eval_samples"
|
||||
add_foreign_key "eval_runs", "eval_datasets"
|
||||
add_foreign_key "eval_samples", "eval_datasets"
|
||||
add_foreign_key "family_exports", "families"
|
||||
add_foreign_key "holdings", "account_providers"
|
||||
add_foreign_key "holdings", "accounts", on_delete: :cascade
|
||||
@@ -1136,6 +1231,7 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
|
||||
add_foreign_key "taggings", "tags"
|
||||
add_foreign_key "tags", "families"
|
||||
add_foreign_key "tool_calls", "messages"
|
||||
add_foreign_key "trades", "categories"
|
||||
add_foreign_key "trades", "securities"
|
||||
add_foreign_key "transactions", "categories", on_delete: :nullify
|
||||
add_foreign_key "transactions", "merchants"
|
||||
|
||||
739
lib/tasks/evals.rake
Normal file
739
lib/tasks/evals.rake
Normal file
@@ -0,0 +1,739 @@
|
||||
namespace :evals do
|
||||
desc "List all evaluation datasets"
|
||||
task list_datasets: :environment do
|
||||
datasets = Eval::Dataset.order(:eval_type, :name)
|
||||
|
||||
if datasets.empty?
|
||||
puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]"
|
||||
next
|
||||
end
|
||||
|
||||
puts "=" * 80
|
||||
puts "Available Evaluation Datasets"
|
||||
puts "=" * 80
|
||||
puts
|
||||
|
||||
datasets.group_by(&:eval_type).each do |eval_type, type_datasets|
|
||||
puts "#{eval_type.titleize}:"
|
||||
puts "-" * 40
|
||||
|
||||
type_datasets.each do |dataset|
|
||||
status = dataset.active ? "active" : "inactive"
|
||||
puts " #{dataset.name} (v#{dataset.version}) - #{dataset.sample_count} samples [#{status}]"
|
||||
puts " #{dataset.description}" if dataset.description.present?
|
||||
end
|
||||
puts
|
||||
end
|
||||
end
|
||||
|
||||
desc "Import dataset from YAML file"
|
||||
task :import_dataset, [ :file_path ] => :environment do |_t, args|
|
||||
file_path = args[:file_path] || ENV["FILE"]
|
||||
|
||||
if file_path.blank?
|
||||
puts "Usage: rake evals:import_dataset[path/to/file.yml]"
|
||||
puts " or: FILE=path/to/file.yml rake evals:import_dataset"
|
||||
exit 1
|
||||
end
|
||||
|
||||
unless File.exist?(file_path)
|
||||
puts "Error: File not found: #{file_path}"
|
||||
exit 1
|
||||
end
|
||||
|
||||
puts "Importing dataset from #{file_path}..."
|
||||
|
||||
dataset = Eval::Dataset.import_from_yaml(file_path)
|
||||
|
||||
puts "Successfully imported dataset:"
|
||||
puts " Name: #{dataset.name}"
|
||||
puts " Type: #{dataset.eval_type}"
|
||||
puts " Version: #{dataset.version}"
|
||||
puts " Samples: #{dataset.sample_count}"
|
||||
|
||||
stats = dataset.statistics
|
||||
puts " By difficulty: #{stats[:by_difficulty].map { |k, v| "#{k}=#{v}" }.join(', ')}"
|
||||
end
|
||||
|
||||
desc "Run evaluation against a model"
|
||||
task :run, [ :dataset_name, :model ] => :environment do |_t, args|
|
||||
dataset_name = args[:dataset_name] || ENV["DATASET"]
|
||||
model = args[:model] || ENV["MODEL"] || "gpt-4.1"
|
||||
provider = ENV["PROVIDER"] || "openai"
|
||||
|
||||
if dataset_name.blank?
|
||||
puts "Usage: rake evals:run[dataset_name,model]"
|
||||
puts " or: DATASET=name MODEL=gpt-4 rake evals:run"
|
||||
exit 1
|
||||
end
|
||||
|
||||
dataset = Eval::Dataset.find_by(name: dataset_name)
|
||||
|
||||
if dataset.nil?
|
||||
puts "Error: Dataset '#{dataset_name}' not found"
|
||||
puts "Available datasets:"
|
||||
Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
|
||||
exit 1
|
||||
end
|
||||
|
||||
run_name = "#{dataset_name}_#{model}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
|
||||
|
||||
puts "=" * 80
|
||||
puts "Starting Evaluation Run"
|
||||
puts "=" * 80
|
||||
puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
|
||||
puts " Type: #{dataset.eval_type}"
|
||||
puts " Model: #{model}"
|
||||
puts " Provider: #{provider}"
|
||||
puts " Run Name: #{run_name}"
|
||||
puts
|
||||
|
||||
eval_run = Eval::Run.create!(
|
||||
dataset: dataset,
|
||||
provider: provider,
|
||||
model: model,
|
||||
name: run_name,
|
||||
status: "pending"
|
||||
)
|
||||
|
||||
runner = dataset.runner_class.new(eval_run)
|
||||
|
||||
puts "Running evaluation..."
|
||||
start_time = Time.current
|
||||
|
||||
begin
|
||||
result = runner.run
|
||||
duration = (Time.current - start_time).round(1)
|
||||
|
||||
puts
|
||||
puts "=" * 80
|
||||
puts "Evaluation Complete"
|
||||
puts "=" * 80
|
||||
puts " Status: #{result.status}"
|
||||
puts " Duration: #{duration}s"
|
||||
puts " Run ID: #{result.id}"
|
||||
puts
|
||||
puts "Metrics:"
|
||||
result.metrics.each do |key, value|
|
||||
next if value.is_a?(Hash) # Skip nested metrics for summary
|
||||
puts " #{key}: #{format_metric_value(value)}"
|
||||
end
|
||||
|
||||
# Show difficulty breakdown if available
|
||||
if result.metrics["by_difficulty"].present?
|
||||
puts
|
||||
puts "By Difficulty:"
|
||||
result.metrics["by_difficulty"].each do |difficulty, stats|
|
||||
puts " #{difficulty}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})"
|
||||
end
|
||||
end
|
||||
rescue => e
|
||||
puts
|
||||
puts "Evaluation FAILED: #{e.message}"
|
||||
puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
|
||||
exit 1
|
||||
end
|
||||
end
|
||||
|
||||
desc "Compare multiple models on a dataset"
|
||||
task :compare, [ :dataset_name ] => :environment do |_t, args|
|
||||
dataset_name = args[:dataset_name] || ENV["DATASET"]
|
||||
models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip)
|
||||
provider = ENV["PROVIDER"] || "openai"
|
||||
|
||||
if dataset_name.blank?
|
||||
puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]"
|
||||
exit 1
|
||||
end
|
||||
|
||||
dataset = Eval::Dataset.find_by!(name: dataset_name)
|
||||
|
||||
puts "=" * 80
|
||||
puts "Model Comparison"
|
||||
puts "=" * 80
|
||||
puts " Dataset: #{dataset.name}"
|
||||
puts " Models: #{models.join(', ')}"
|
||||
puts
|
||||
|
||||
runs = models.map do |model|
|
||||
puts "Running evaluation for #{model}..."
|
||||
|
||||
eval_run = Eval::Run.create!(
|
||||
dataset: dataset,
|
||||
provider: provider,
|
||||
model: model,
|
||||
name: "compare_#{model}_#{Time.current.to_i}",
|
||||
status: "pending"
|
||||
)
|
||||
|
||||
runner = dataset.runner_class.new(eval_run)
|
||||
runner.run
|
||||
end
|
||||
|
||||
puts
|
||||
puts "=" * 80
|
||||
puts "Comparison Results"
|
||||
puts "=" * 80
|
||||
puts
|
||||
|
||||
reporter = Eval::Reporters::ComparisonReporter.new(runs)
|
||||
puts reporter.to_table
|
||||
|
||||
summary = reporter.summary
|
||||
if summary.present?
|
||||
puts
|
||||
puts "Recommendations:"
|
||||
puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
|
||||
puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
|
||||
puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
|
||||
puts
|
||||
puts " #{summary[:recommendation]}"
|
||||
end
|
||||
|
||||
# Export to CSV if requested
|
||||
if ENV["CSV"].present?
|
||||
csv_path = reporter.to_csv(ENV["CSV"])
|
||||
puts
|
||||
puts "Exported to: #{csv_path}"
|
||||
end
|
||||
end
|
||||
|
||||
desc "Generate report for specific runs"
|
||||
task :report, [ :run_ids ] => :environment do |_t, args|
|
||||
run_ids = (args[:run_ids] || ENV["RUN_IDS"])&.split(",")
|
||||
|
||||
runs = if run_ids.present?
|
||||
Eval::Run.where(id: run_ids)
|
||||
else
|
||||
Eval::Run.completed.order(created_at: :desc).limit(5)
|
||||
end
|
||||
|
||||
if runs.empty?
|
||||
puts "No runs found."
|
||||
exit 1
|
||||
end
|
||||
|
||||
reporter = Eval::Reporters::ComparisonReporter.new(runs)
|
||||
|
||||
puts reporter.to_table
|
||||
|
||||
summary = reporter.summary
|
||||
if summary.present?
|
||||
puts
|
||||
puts "Summary:"
|
||||
puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
|
||||
puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
|
||||
puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
|
||||
end
|
||||
|
||||
if ENV["CSV"].present?
|
||||
csv_path = reporter.to_csv(ENV["CSV"])
|
||||
puts
|
||||
puts "Exported to: #{csv_path}"
|
||||
end
|
||||
end
|
||||
|
||||
desc "Quick smoke test to verify provider configuration"
|
||||
task smoke_test: :environment do
|
||||
puts "Running smoke test..."
|
||||
|
||||
provider = Provider::Registry.get_provider(:openai)
|
||||
|
||||
unless provider
|
||||
puts "FAIL: OpenAI provider not configured"
|
||||
puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings"
|
||||
exit 1
|
||||
end
|
||||
|
||||
puts " Provider: #{provider.provider_name}"
|
||||
puts " Model: #{provider.instance_variable_get(:@default_model)}"
|
||||
|
||||
# Test with a single categorization sample
|
||||
result = provider.auto_categorize(
|
||||
transactions: [
|
||||
{ id: "test", amount: 10, classification: "expense", description: "McDonalds" }
|
||||
],
|
||||
user_categories: [
|
||||
{ id: "1", name: "Food & Drink", classification: "expense" }
|
||||
]
|
||||
)
|
||||
|
||||
if result.success?
|
||||
category = result.data.first&.category_name
|
||||
puts " Test result: #{category || 'null'}"
|
||||
puts
|
||||
puts "PASS: Provider is working correctly"
|
||||
else
|
||||
puts "FAIL: #{result.error.message}"
|
||||
exit 1
|
||||
end
|
||||
end
|
||||
|
||||
desc "Run CI regression test"
|
||||
task ci_regression: :environment do
|
||||
dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1"
|
||||
model = ENV["EVAL_MODEL"] || "gpt-4.1-mini"
|
||||
threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f
|
||||
|
||||
dataset = Eval::Dataset.find_by(name: dataset_name)
|
||||
|
||||
unless dataset
|
||||
puts "Dataset '#{dataset_name}' not found. Skipping CI regression test."
|
||||
exit 0
|
||||
end
|
||||
|
||||
# Get baseline from last successful run
|
||||
baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first
|
||||
|
||||
# Run new evaluation
|
||||
eval_run = Eval::Run.create!(
|
||||
dataset: dataset,
|
||||
provider: "openai",
|
||||
model: model,
|
||||
name: "ci_regression_#{Time.current.to_i}",
|
||||
status: "pending"
|
||||
)
|
||||
|
||||
runner = dataset.runner_class.new(eval_run)
|
||||
result = runner.run
|
||||
|
||||
current_accuracy = result.metrics["accuracy"] || 0
|
||||
|
||||
puts "CI Regression Test Results:"
|
||||
puts " Model: #{model}"
|
||||
puts " Current Accuracy: #{current_accuracy}%"
|
||||
|
||||
if baseline_run
|
||||
baseline_accuracy = baseline_run.metrics["accuracy"] || 0
|
||||
puts " Baseline Accuracy: #{baseline_accuracy}%"
|
||||
|
||||
accuracy_diff = current_accuracy - baseline_accuracy
|
||||
|
||||
if accuracy_diff < -5
|
||||
puts
|
||||
puts "REGRESSION DETECTED!"
|
||||
puts "Accuracy dropped by #{accuracy_diff.abs}% (threshold: 5%)"
|
||||
exit 1
|
||||
end
|
||||
|
||||
puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%"
|
||||
end
|
||||
|
||||
if current_accuracy < threshold
|
||||
puts
|
||||
puts "BELOW THRESHOLD!"
|
||||
puts "Accuracy #{current_accuracy}% is below required #{threshold}%"
|
||||
exit 1
|
||||
end
|
||||
|
||||
puts
|
||||
puts "CI Regression Test PASSED"
|
||||
end
|
||||
|
||||
desc "List recent evaluation runs"
|
||||
task list_runs: :environment do
|
||||
runs = Eval::Run.order(created_at: :desc).limit(20)
|
||||
|
||||
if runs.empty?
|
||||
puts "No runs found."
|
||||
next
|
||||
end
|
||||
|
||||
puts "=" * 100
|
||||
puts "Recent Evaluation Runs"
|
||||
puts "=" * 100
|
||||
|
||||
runs.each do |run|
|
||||
status_icon = case run.status
|
||||
when "completed" then "[OK]"
|
||||
when "failed" then "[FAIL]"
|
||||
when "running" then "[...]"
|
||||
else "[?]"
|
||||
end
|
||||
|
||||
accuracy = run.metrics["accuracy"] ? "#{run.metrics['accuracy']}%" : "-"
|
||||
|
||||
puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}"
|
||||
end
|
||||
end
|
||||
|
||||
desc "Show details for a specific run"
|
||||
task :show_run, [ :run_id ] => :environment do |_t, args|
|
||||
run_id = args[:run_id] || ENV["RUN_ID"]
|
||||
|
||||
if run_id.blank?
|
||||
puts "Usage: rake evals:show_run[run_id]"
|
||||
exit 1
|
||||
end
|
||||
|
||||
run = Eval::Run.find_by(id: run_id) || Eval::Run.find_by("id::text LIKE ?", "#{run_id}%")
|
||||
|
||||
unless run
|
||||
puts "Run not found: #{run_id}"
|
||||
exit 1
|
||||
end
|
||||
|
||||
puts "=" * 80
|
||||
puts "Evaluation Run Details"
|
||||
puts "=" * 80
|
||||
puts
|
||||
puts "Run ID: #{run.id}"
|
||||
puts "Name: #{run.name}"
|
||||
puts "Dataset: #{run.dataset.name}"
|
||||
puts "Model: #{run.model}"
|
||||
puts "Provider: #{run.provider}"
|
||||
puts "Status: #{run.status}"
|
||||
puts "Created: #{run.created_at}"
|
||||
puts "Duration: #{run.duration_seconds}s" if run.duration_seconds
|
||||
|
||||
if run.error_message.present?
|
||||
puts
|
||||
puts "Error: #{run.error_message}"
|
||||
end
|
||||
|
||||
if run.metrics.present?
|
||||
puts
|
||||
puts "Metrics:"
|
||||
run.metrics.each do |key, value|
|
||||
if value.is_a?(Hash)
|
||||
puts " #{key}:"
|
||||
value.each { |k, v| puts " #{k}: #{v}" }
|
||||
else
|
||||
puts " #{key}: #{format_metric_value(value)}"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Show sample of incorrect results
|
||||
incorrect = run.results.incorrect.limit(5)
|
||||
if incorrect.any?
|
||||
puts
|
||||
puts "Sample Incorrect Results (#{run.results.incorrect.count} total):"
|
||||
incorrect.each do |result|
|
||||
puts " Sample: #{result.sample_id[0..7]}"
|
||||
puts " Expected: #{result.sample.expected_output}"
|
||||
puts " Actual: #{result.actual_output}"
|
||||
puts
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# =============================================================================
|
||||
# Langfuse Integration
|
||||
# =============================================================================
|
||||
|
||||
namespace :langfuse do
|
||||
desc "Check Langfuse configuration"
# Verifies that Langfuse credentials are configured and that the API is
# reachable. Instantiating the client raises ConfigurationError when
# credentials are missing; a minimal list call proves connectivity.
# Exits with status 1 on either failure.
task check: :environment do
  begin
    client = Eval::Langfuse::Client.new
    puts "✓ Langfuse credentials configured"

    # Try to list datasets to verify connection. A one-item listing is
    # the cheapest round-trip; the response body itself is not needed,
    # so the previous unused `response =` assignment was dropped.
    client.list_datasets(limit: 1)
    puts "✓ Successfully connected to Langfuse"
    puts " Region: #{ENV['LANGFUSE_REGION'] || 'us (default)'}"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "✗ #{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Failed to connect to Langfuse: #{e.message}"
    exit 1
  end
end
|
||||
|
||||
desc "Upload dataset to Langfuse"
# Exports a local Eval::Dataset (looked up by name) to Langfuse via
# Eval::Langfuse::DatasetExporter. The dataset name comes from the task
# argument or the DATASET env var. Exits 1 on missing/unknown dataset
# or on any Langfuse configuration/API error.
task :upload_dataset, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]"
    puts " or: DATASET=name rake evals:langfuse:upload_dataset"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)

  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    # List every known dataset name to help the operator retry.
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end

  puts "=" * 80
  puts "Uploading Dataset to Langfuse"
  puts "=" * 80
  puts " Dataset: #{dataset.name}"
  puts " Type: #{dataset.eval_type}"
  puts " Samples: #{dataset.sample_count}"
  puts

  begin
    exporter = Eval::Langfuse::DatasetExporter.new(dataset)
    # Exporter returns a hash; keys :dataset_name and :items_exported
    # are read below.
    result = exporter.export

    puts
    puts "✓ Successfully uploaded dataset to Langfuse"
    puts " Langfuse dataset name: #{result[:dataset_name]}"
    puts " Items exported: #{result[:items_exported]}"
    puts
    puts "View in Langfuse: https://cloud.langfuse.com/project/datasets"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "✗ #{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Langfuse API error: #{e.message}"
    exit 1
  end
end
|
||||
|
||||
desc "Run experiment in Langfuse"
# Runs an evaluation experiment against a named dataset using
# Eval::Langfuse::ExperimentRunner and prints the resulting metrics.
# Inputs: dataset name and model via task args or DATASET/MODEL env
# vars; PROVIDER (default "openai") and RUN_NAME are env-only.
task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV["MODEL"] || "gpt-4.1"
  provider = ENV["PROVIDER"] || "openai"
  run_name = ENV["RUN_NAME"]

  if dataset_name.blank?
    puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment"
    puts
    puts "Optional environment variables:"
    puts " PROVIDER=openai (default)"
    puts " RUN_NAME=custom_run_name"
    exit 1
  end

  dataset = Eval::Dataset.find_by(name: dataset_name)

  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end

  puts "=" * 80
  puts "Running Langfuse Experiment"
  puts "=" * 80
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts

  begin
    runner = Eval::Langfuse::ExperimentRunner.new(
      dataset,
      model: model,
      provider: provider
    )

    # Wall-clock timing around the whole run; rounded to 0.1s for display.
    start_time = Time.current
    # run_name may be nil — presumably the runner generates one; the
    # value printed below comes from result[:run_name]. TODO confirm.
    result = runner.run(run_name: run_name)
    duration = (Time.current - start_time).round(1)

    puts
    puts "=" * 80
    puts "Experiment Complete"
    puts "=" * 80
    puts " Run Name: #{result[:run_name]}"
    puts " Duration: #{duration}s"
    puts
    puts "Results:"
    puts " Accuracy: #{result[:metrics][:accuracy]}%"
    puts " Correct: #{result[:metrics][:correct]}/#{result[:metrics][:total]}"
    puts " Avg Latency: #{result[:metrics][:avg_latency_ms]}ms"
    puts
    puts "View in Langfuse:"
    puts " Dataset: https://cloud.langfuse.com/project/datasets"
    puts " Traces: https://cloud.langfuse.com/project/traces"
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "✗ #{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Langfuse API error: #{e.message}"
    exit 1
  rescue => e
    # Catch-all for unexpected runner failures; backtrace only when
    # DEBUG is set to keep normal output clean.
    puts "✗ Error: #{e.message}"
    puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
|
||||
|
||||
desc "List datasets in Langfuse"
# Fetches up to 100 datasets from the Langfuse API and prints their
# name, description, creation time, and metadata.
task list_datasets: :environment do
  begin
    client = Eval::Langfuse::Client.new
    response = client.list_datasets(limit: 100)

    # API response wraps the dataset list under "data".
    datasets = response["data"] || []

    if datasets.empty?
      puts "No datasets found in Langfuse."
      puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]"
      # `next` (not `return`) exits a rake task's do/end block early.
      next
    end

    puts "=" * 80
    puts "Langfuse Datasets"
    puts "=" * 80
    puts

    datasets.each do |ds|
      puts " #{ds['name']}"
      puts " Description: #{ds['description']}" if ds["description"].present?
      puts " Created: #{ds['createdAt']}"
      puts " Metadata: #{ds['metadata']}" if ds["metadata"].present?
      puts
    end
  rescue Eval::Langfuse::Client::ConfigurationError => e
    puts "✗ #{e.message}"
    exit 1
  rescue Eval::Langfuse::Client::ApiError => e
    puts "✗ Langfuse API error: #{e.message}"
    exit 1
  end
end
|
||||
end
|
||||
|
||||
desc "Export manually categorized transactions as golden data"
# Exports a family's manually categorized transactions as a YAML golden
# dataset suitable for re-import via evals:import_dataset.
# Inputs: family id (task arg or FAMILY_ID env var); OUTPUT path and
# LIMIT (default 500) via env vars.
task :export_manual_categories, [ :family_id ] => :environment do |_t, args|
  family_id = args[:family_id] || ENV["FAMILY_ID"]
  output_path = ENV["OUTPUT"] || "db/eval_data/categorization_manual_export.yml"
  limit = (ENV["LIMIT"] || 500).to_i

  if family_id.blank?
    puts "Usage: rake evals:export_manual_categories[family_id]"
    puts " or: FAMILY_ID=uuid rake evals:export_manual_categories"
    puts
    puts "Optional environment variables:"
    puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)"
    puts " LIMIT=500 (default: 500)"
    exit 1
  end

  family = Family.find_by(id: family_id)

  if family.nil?
    puts "Error: Family '#{family_id}' not found"
    exit 1
  end

  puts "=" * 80
  puts "Exporting Manually Categorized Transactions"
  puts "=" * 80
  puts " Family: #{family.name}"
  puts " Output: #{output_path}"
  puts " Limit: #{limit}"
  puts

  # Find transactions that have:
  # 1. A category assigned
  # 2. locked_attributes contains "category_id" (meaning user manually set it)
  # 3. No DataEnrichment record for category_id (meaning it wasn't set by AI/rules/etc)
  #
  # NOTE(review): the string condition below is a raw SQL fragment (no
  # bind args), so the `?` is the Postgres jsonb key-existence operator,
  # not a Rails placeholder — do not add bind parameters to this where.
  manually_categorized = Transaction
    .joins(:entry)
    .joins("INNER JOIN accounts ON accounts.id = entries.account_id")
    .where(accounts: { family_id: family_id })
    .where.not(category_id: nil)
    .where("transactions.locked_attributes ? 'category_id'")
    .where.not(
      id: DataEnrichment
        .where(enrichable_type: "Transaction", attribute_name: "category_id")
        .select(:enrichable_id)
    )
    .includes(:category, entry: :account)
    .limit(limit)

  count = manually_categorized.count

  if count == 0
    puts "No manually categorized transactions found."
    puts
    puts "Manually categorized transactions are those where:"
    puts " - User set a category manually (locked_attributes contains 'category_id')"
    puts " - Category was NOT set by AI, rules, or data enrichment sources"
    exit 0
  end

  puts "Found #{count} manually categorized transactions"
  puts

  # Build category context from family's categories
  categories = family.categories.includes(:parent).map do |cat|
    {
      "id" => cat.id.to_s,
      "name" => cat.name,
      "classification" => cat.classification,
      "is_subcategory" => cat.subcategory?,
      # compact drops the "parent_id" key entirely for root categories
      # (parent_id is nil).
      "parent_id" => cat.parent_id&.to_s
    }.compact
  end

  # Build samples
  samples = manually_categorized.map.with_index do |txn, idx|
    entry = txn.entry
    sample_id = "manual_#{idx + 1}"

    {
      "id" => sample_id,
      "difficulty" => "manual",
      "tags" => [ txn.category.name.parameterize.underscore, "manual_export" ],
      "input" => {
        "id" => txn.id.to_s,
        # Absolute value: sign is conveyed by "classification" instead.
        "amount" => entry.amount.to_f.abs,
        "classification" => entry.classification,
        "description" => entry.name
      },
      "expected" => {
        "category_name" => txn.category.name
      }
    }
  end

  # Build output structure
  output = {
    "name" => "categorization_manual_export",
    "description" => "Golden dataset exported from manually categorized user transactions",
    "eval_type" => "categorization",
    "version" => "1.0",
    "metadata" => {
      "created_at" => Time.current.strftime("%Y-%m-%d"),
      "source" => "manual_export",
      "family_id" => family_id,
      "exported_count" => samples.size
    },
    "context" => {
      "categories" => categories
    },
    "samples" => samples
  }

  # Write to file
  FileUtils.mkdir_p(File.dirname(output_path))
  File.write(output_path, output.to_yaml)

  puts "✓ Successfully exported #{samples.size} samples"
  puts " Difficulty: manual"
  puts
  puts "Output written to: #{output_path}"
  puts
  puts "To import this dataset, run:"
  puts " rake evals:import_dataset[#{output_path}]"
end
|
||||
|
||||
private
|
||||
|
||||
# Normalizes a metric value for display: Float and BigDecimal values
# are rounded to four decimal places (BigDecimal converted to Float
# first); any other value is returned unchanged.
def format_metric_value(value)
  if value.is_a?(BigDecimal)
    value.to_f.round(4)
  elsif value.is_a?(Float)
    value.round(4)
  else
    value
  end
end
|
||||
end
|
||||
118
test/models/eval/dataset_test.rb
Normal file
118
test/models/eval/dataset_test.rb
Normal file
@@ -0,0 +1,118 @@
|
||||
require "test_helper"
|
||||
|
||||
# Model tests for Eval::Dataset: validations, eval_type scopes, YAML
# import, aggregate statistics, and runner-class dispatch.
class Eval::DatasetTest < ActiveSupport::TestCase
  test "validates presence of name and eval_type" do
    dataset = Eval::Dataset.new

    assert_not dataset.valid?
    assert_includes dataset.errors[:name], "can't be blank"
    assert_includes dataset.errors[:eval_type], "can't be blank"
  end

  test "validates eval_type is one of allowed values" do
    dataset = Eval::Dataset.new(name: "test", eval_type: "invalid")

    assert_not dataset.valid?
    assert_includes dataset.errors[:eval_type], "is not included in the list"

    # Re-validating with an allowed value clears the eval_type errors.
    dataset.eval_type = "categorization"
    dataset.valid?
    assert_empty dataset.errors[:eval_type]
  end

  test "validates name uniqueness" do
    Eval::Dataset.create!(name: "unique_test", eval_type: "categorization")

    duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization")
    assert_not duplicate.valid?
    assert_includes duplicate.errors[:name], "has already been taken"
  end

  test "scopes filter by eval_type" do
    cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization")
    merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat")

    assert_includes Eval::Dataset.for_categorization, cat_dataset
    assert_not_includes Eval::Dataset.for_categorization, merch_dataset

    assert_includes Eval::Dataset.for_merchant_detection, merch_dataset
    assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset

    assert_includes Eval::Dataset.for_chat, chat_dataset
    assert_not_includes Eval::Dataset.for_chat, cat_dataset
  end

  test "import_from_yaml creates dataset with samples" do
    # Minimal valid dataset fixture: one category, one sample.
    # NOTE(review): YAML nesting reconstructed — confirm against the
    # dataset schema expected by import_from_yaml.
    yaml_content = <<~YAML
      name: test_import
      description: Test dataset
      eval_type: categorization
      version: "1.0"
      context:
        categories:
          - id: "food"
            name: "Food"
            classification: "expense"
      samples:
        - id: sample_1
          difficulty: easy
          tags: [test]
          input:
            id: txn_1
            amount: 10
            classification: expense
            description: "Test transaction"
          expected:
            category_name: "Food"
    YAML

    file_path = Rails.root.join("tmp", "test_import.yml")
    File.write(file_path, yaml_content)

    dataset = Eval::Dataset.import_from_yaml(file_path)

    assert_equal "test_import", dataset.name
    assert_equal "categorization", dataset.eval_type
    assert_equal 1, dataset.samples.count
    assert_equal "easy", dataset.samples.first.difficulty
    assert_equal "Food", dataset.samples.first.expected_output["category_name"]
  ensure
    # Always remove the temp fixture, even on assertion failure.
    File.delete(file_path) if File.exist?(file_path)
  end

  test "statistics returns sample breakdown" do
    dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization")

    dataset.samples.create!(
      input_data: { id: "1" },
      expected_output: { category_name: "Food" },
      difficulty: "easy",
      tags: [ "food" ]
    )

    dataset.samples.create!(
      input_data: { id: "2" },
      expected_output: { category_name: "Travel" },
      difficulty: "medium",
      tags: [ "travel" ]
    )

    stats = dataset.statistics

    assert_equal 2, stats[:total_samples]
    assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty])
    assert_includes stats[:by_tags], "food"
    assert_includes stats[:by_tags], "travel"
  end

  test "runner_class returns correct class for each eval_type" do
    cat_dataset = Eval::Dataset.new(eval_type: "categorization")
    merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.new(eval_type: "chat")

    assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class
    assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class
    assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class
  end
end
|
||||
212
test/models/eval/runners/categorization_runner_test.rb
Normal file
212
test/models/eval/runners/categorization_runner_test.rb
Normal file
@@ -0,0 +1,212 @@
|
||||
require "test_helper"
|
||||
|
||||
# Tests for Eval::Runners::CategorizationRunner. The OpenAI provider is
# stubbed (Mocha any_instance) so no network calls happen; each test
# builds its own dataset/run with a random name suffix to avoid
# uniqueness collisions across tests.
class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase
  include ProviderTestHelper

  setup do
    # Shared category context: "Fast Food" is a subcategory of
    # "Food & Drink" (via parent_id), used by the hierarchical test.
    @categories = [
      { "id" => "food", "name" => "Food & Drink", "classification" => "expense" },
      { "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" }
    ]
  end


  test "run processes all samples and calculates metrics" do
    dataset = Eval::Dataset.create!(
      name: "test_cat_#{SecureRandom.hex(4)}",
      eval_type: "categorization",
      version: "1.0"
    )

    sample1 = dataset.samples.create!(
      input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
      expected_output: { "category_name" => "Fast Food" },
      context_data: { "categories" => @categories },
      difficulty: "easy"
    )

    sample2 = dataset.samples.create!(
      input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
      expected_output: { "category_name" => nil },
      context_data: { "categories" => @categories },
      difficulty: "edge_case"
    )

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: "openai",
      model: "gpt-4.1",
      name: "test_run",
      provider_config: { "access_token" => "test-token" },
      status: "pending"
    )

    # The provider is expected to echo the sample ids back; the string
    # "null" represents "no category" in the provider response.
    mock_response = provider_success_response([
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"),
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null")
    ])

    Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)

    runner = Eval::Runners::CategorizationRunner.new(eval_run)
    result = runner.run

    assert_equal "completed", result.status
    assert_equal 2, result.results.count
    assert result.metrics["accuracy"].present?
  end

  test "records correct result when category matches" do
    dataset = Eval::Dataset.create!(
      name: "test_cat_match_#{SecureRandom.hex(4)}",
      eval_type: "categorization",
      version: "1.0"
    )

    sample = dataset.samples.create!(
      input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
      expected_output: { "category_name" => "Fast Food" },
      context_data: { "categories" => @categories },
      difficulty: "easy"
    )

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: "openai",
      model: "gpt-4.1",
      name: "test_run",
      provider_config: { "access_token" => "test-token" },
      status: "pending"
    )

    mock_response = provider_success_response([
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food")
    ])

    Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)

    runner = Eval::Runners::CategorizationRunner.new(eval_run)
    runner.run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    # An exact category name match is both correct and an exact_match.
    assert result.correct
    assert result.exact_match
    assert_equal "Fast Food", result.actual_output["category_name"]
  end

  test "records hierarchical match when parent category returned" do
    dataset = Eval::Dataset.create!(
      name: "test_cat_hier_#{SecureRandom.hex(4)}",
      eval_type: "categorization",
      version: "1.0"
    )

    sample = dataset.samples.create!(
      input_data: { "id" => "txn_3", "amount" => 50, "classification" => "expense", "description" => "Olive Garden" },
      expected_output: { "category_name" => "Fast Food" },
      context_data: { "categories" => @categories },
      difficulty: "medium"
    )

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: "openai",
      model: "gpt-4.1",
      name: "test_hierarchical",
      provider_config: { "access_token" => "test-token" },
      status: "pending"
    )

    # Model returns parent category instead of subcategory
    mock_response = provider_success_response([
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink")
    ])

    Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)

    runner = Eval::Runners::CategorizationRunner.new(eval_run)
    runner.run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    # Parent-of-expected counts as hierarchical_match, not exact_match.
    assert_not result.exact_match
    assert result.hierarchical_match
  end

  test "handles null correctly when expected" do
    dataset = Eval::Dataset.create!(
      name: "test_cat_null_#{SecureRandom.hex(4)}",
      eval_type: "categorization",
      version: "1.0"
    )

    sample = dataset.samples.create!(
      input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
      expected_output: { "category_name" => nil },
      context_data: { "categories" => @categories },
      difficulty: "edge_case"
    )

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: "openai",
      model: "gpt-4.1",
      name: "test_run",
      provider_config: { "access_token" => "test-token" },
      status: "pending"
    )

    # Provider signals "no category" with the literal string "null".
    mock_response = provider_success_response([
      Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null")
    ])

    Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)

    runner = Eval::Runners::CategorizationRunner.new(eval_run)
    runner.run

    result = eval_run.results.find_by(eval_sample_id: sample.id)

    assert result.correct
    assert result.null_expected
    assert result.null_returned
  end

  test "records error results on provider error but completes run" do
    dataset = Eval::Dataset.create!(
      name: "test_cat_err_#{SecureRandom.hex(4)}",
      eval_type: "categorization",
      version: "1.0"
    )

    sample = dataset.samples.create!(
      input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
      expected_output: { "category_name" => "Fast Food" },
      context_data: { "categories" => @categories },
      difficulty: "easy"
    )

    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: "openai",
      model: "gpt-4.1",
      name: "test_run",
      provider_config: { "access_token" => "test-token" },
      status: "pending"
    )

    Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error"))

    runner = Eval::Runners::CategorizationRunner.new(eval_run)
    result = runner.run

    # Run completes but with error results
    assert_equal "completed", result.status
    assert_equal 1, result.results.count

    error_result = result.results.find_by(eval_sample_id: sample.id)
    assert_not error_result.correct
    assert_includes error_result.actual_output["error"], "API Error"
  end
end
|
||||
Reference in New Issue
Block a user