Small llms improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for llms

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow export manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
This commit is contained in:
soky srm
2025-12-07 18:11:34 +01:00
committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions

View File

@@ -82,6 +82,10 @@ class Settings::HostingsController < ApplicationController
Setting.openai_model = hosting_params[:openai_model]
end
if hosting_params.key?(:openai_json_mode)
Setting.openai_json_mode = hosting_params[:openai_json_mode].presence
end
redirect_to settings_hosting_path, notice: t(".success")
rescue Setting::ValidationError => error
flash.now[:alert] = error.message
@@ -95,7 +99,7 @@ class Settings::HostingsController < ApplicationController
private
def hosting_params
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :exchange_rate_provider, :securities_provider)
params.require(:setting).permit(:onboarding_state, :require_email_confirmation, :brand_fetch_client_id, :twelve_data_api_key, :openai_access_token, :openai_uri_base, :openai_model, :openai_json_mode, :exchange_rate_provider, :securities_provider)
end
def ensure_admin

113
app/models/eval/dataset.rb Normal file
View File

@@ -0,0 +1,113 @@
# An evaluation dataset: a named, versioned collection of Eval::Sample records
# plus the Eval::Run records executed against it. Supports round-tripping
# to/from YAML and dispatching to the eval_type-specific runner/metrics classes.
class Eval::Dataset < ApplicationRecord
  self.table_name = "eval_datasets"

  has_many :samples, class_name: "Eval::Sample", foreign_key: :eval_dataset_id, dependent: :destroy
  has_many :runs, class_name: "Eval::Run", foreign_key: :eval_dataset_id, dependent: :destroy

  validates :name, presence: true, uniqueness: true
  validates :eval_type, presence: true, inclusion: { in: %w[categorization merchant_detection chat] }
  validates :version, presence: true

  scope :active, -> { where(active: true) }
  scope :for_categorization, -> { where(eval_type: "categorization") }
  scope :for_merchant_detection, -> { where(eval_type: "merchant_detection") }
  scope :for_chat, -> { where(eval_type: "chat") }

  # Import dataset from a YAML file.
  # Creates the dataset if missing, otherwise updates it in place; existing
  # samples are destroyed and re-created, so a re-import fully replaces content.
  # Wrapped in a transaction so a partial import never persists.
  # Returns the imported dataset.
  def self.import_from_yaml(file_path)
    data = YAML.load_file(file_path, permitted_classes: [ Symbol, Date, Time ])
    transaction do
      dataset = find_or_initialize_by(name: data["name"])
      dataset.assign_attributes(
        description: data["description"],
        eval_type: data["eval_type"],
        version: data["version"] || "1.0",
        metadata: data["metadata"] || {},
        active: true
      )
      dataset.save!
      # Clear existing samples if reimporting
      dataset.samples.destroy_all
      # Shared context for all samples (a per-sample "context" key overrides it)
      shared_context = data["context"] || {}
      # Import samples
      samples_data = data["samples"] || []
      samples_data.each do |sample_data|
        dataset.samples.create!(
          input_data: sample_data["input"],
          expected_output: sample_data["expected"],
          context_data: sample_data["context"] || shared_context,
          difficulty: sample_data["difficulty"] || "medium",
          tags: sample_data["tags"] || [],
          metadata: sample_data["metadata"] || {}
        )
      end
      # Cache the sample count on the dataset row for cheap display
      dataset.update!(sample_count: dataset.samples.count)
      dataset
    end
  end

  # Export dataset to YAML format (the inverse of .import_from_yaml).
  # NOTE(review): "context" is taken from the first sample only — this assumes
  # all samples share the same context; verify for mixed-context datasets.
  def export_to_yaml
    {
      "name" => name,
      "description" => description,
      "eval_type" => eval_type,
      "version" => version,
      "metadata" => metadata,
      "context" => samples.first&.context_data || {},
      "samples" => samples.map do |sample|
        {
          "id" => sample.id,
          "difficulty" => sample.difficulty,
          "tags" => sample.tags,
          "input" => sample.input_data,
          "expected" => sample.expected_output,
          "metadata" => sample.metadata
        }.compact # drop keys whose value is nil
      end
    }.to_yaml
  end

  # Generate summary statistics: totals plus per-difficulty and per-tag counts
  # (tags sorted by descending frequency).
  def statistics
    {
      total_samples: samples.count,
      by_difficulty: samples.group(:difficulty).count,
      by_tags: samples.flat_map(&:tags).tally.sort_by { |_, v| -v }.to_h
    }
  end

  # Get the appropriate runner class for this dataset type
  def runner_class
    case eval_type
    when "categorization"
      Eval::Runners::CategorizationRunner
    when "merchant_detection"
      Eval::Runners::MerchantDetectionRunner
    when "chat"
      Eval::Runners::ChatRunner
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end

  # Get the appropriate metrics class for this dataset type
  def metrics_class
    case eval_type
    when "categorization"
      Eval::Metrics::CategorizationMetrics
    when "merchant_detection"
      Eval::Metrics::MerchantDetectionMetrics
    when "chat"
      Eval::Metrics::ChatMetrics
    else
      raise "Unknown eval_type: #{eval_type}"
    end
  end
end

View File

@@ -0,0 +1,226 @@
# Thin HTTP client for the Langfuse public API.
#
# Covers the subset of endpoints the eval framework needs: datasets, dataset
# items, dataset-run items, trace ingestion and score ingestion. All requests
# authenticate via HTTP basic auth with the project's public/secret key pair.
class Eval::Langfuse::Client
  BASE_URLS = {
    us: "https://us.cloud.langfuse.com/api/public",
    eu: "https://cloud.langfuse.com/api/public"
  }.freeze

  class Error < StandardError; end
  class ConfigurationError < Error; end

  # Raised for unrecoverable API responses; exposes HTTP status and raw body.
  class ApiError < Error
    attr_reader :status, :body

    def initialize(message, status: nil, body: nil)
      super(message)
      @status = status
      @body = body
    end
  end

  # Keys fall back to LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY env vars.
  # Base URL resolution priority is described in #determine_base_url.
  # Raises ConfigurationError when credentials are missing.
  def initialize(public_key: nil, secret_key: nil, region: nil, host: nil)
    @public_key = public_key || ENV["LANGFUSE_PUBLIC_KEY"]
    @secret_key = secret_key || ENV["LANGFUSE_SECRET_KEY"]
    @base_url = determine_base_url(region, host)
    validate_configuration!
  end

  # Dataset operations

  def create_dataset(name:, description: nil, metadata: {})
    post("/v2/datasets", {
      name: name,
      description: description,
      metadata: metadata
    }.compact)
  end

  def get_dataset(name:)
    get("/v2/datasets/#{encode(name)}")
  end

  def list_datasets(page: 1, limit: 50)
    get("/v2/datasets", page: page, limit: limit)
  end

  # Dataset item operations

  # Pass an explicit +id+ to make repeated exports idempotent.
  def create_dataset_item(dataset_name:, input:, expected_output: nil, metadata: {}, id: nil)
    post("/dataset-items", {
      datasetName: dataset_name,
      id: id,
      input: input,
      expectedOutput: expected_output,
      metadata: metadata
    }.compact)
  end

  def get_dataset_items(dataset_name:, page: 1, limit: 50)
    get("/dataset-items", datasetName: dataset_name, page: page, limit: limit)
  end

  # Dataset run operations (for experiments)

  def create_dataset_run_item(run_name:, dataset_item_id:, trace_id: nil, observation_id: nil, metadata: {})
    post("/dataset-run-items", {
      runName: run_name,
      datasetItemId: dataset_item_id,
      traceId: trace_id,
      observationId: observation_id,
      metadata: metadata
    }.compact)
  end

  # Trace operations

  # Creates a trace via the batched ingestion endpoint and returns the
  # client-generated trace ID (the ingestion response does not echo it back).
  def create_trace(name:, input: nil, output: nil, metadata: {}, session_id: nil, user_id: nil)
    # Generate trace ID upfront so we can return it
    trace_id = SecureRandom.uuid
    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "trace-create",
          timestamp: Time.current.iso8601,
          body: {
            id: trace_id,
            name: name,
            input: input,
            output: output,
            metadata: metadata,
            sessionId: session_id,
            userId: user_id
          }.compact
        }
      ]
    })
    # Return the trace ID we generated
    trace_id
  end

  # Score operations

  def create_score(trace_id:, name:, value:, comment: nil, data_type: "NUMERIC")
    post("/ingestion", {
      batch: [
        {
          id: SecureRandom.uuid,
          type: "score-create",
          timestamp: Time.current.iso8601,
          body: {
            id: SecureRandom.uuid,
            traceId: trace_id,
            name: name,
            value: value,
            comment: comment,
            dataType: data_type
          }.compact
        }
      ]
    })
  end

  def configured?
    @public_key.present? && @secret_key.present?
  end

  private

  # Priority: explicit host > LANGFUSE_HOST env > region > LANGFUSE_REGION env > default (eu)
  def determine_base_url(region, host)
    if host.present?
      host.chomp("/") + "/api/public"
    elsif ENV["LANGFUSE_HOST"].present?
      ENV["LANGFUSE_HOST"].chomp("/") + "/api/public"
    elsif region.present?
      BASE_URLS[region.to_sym] || BASE_URLS[:eu]
    elsif ENV["LANGFUSE_REGION"].present?
      BASE_URLS[ENV["LANGFUSE_REGION"].to_sym] || BASE_URLS[:eu]
    else
      # Default to EU as it's more common
      BASE_URLS[:eu]
    end
  end

  def validate_configuration!
    return if configured?
    raise ConfigurationError, <<~MSG
      Langfuse credentials not configured.
      Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY environment variables,
      or pass public_key and secret_key to the client.
    MSG
  end

  def get(path, params = {})
    uri = build_uri(path, params)
    request = Net::HTTP::Get.new(uri)
    execute_request(uri, request)
  end

  def post(path, body)
    uri = build_uri(path)
    request = Net::HTTP::Post.new(uri)
    request.body = body.to_json
    request["Content-Type"] = "application/json"
    execute_request(uri, request)
  end

  def build_uri(path, params = {})
    uri = URI("#{@base_url}#{path}")
    uri.query = URI.encode_www_form(params) if params.any?
    uri
  end

  # Sends the request, retrying up to +retries+ times on HTTP 429.
  # Returns the parsed JSON body ({} if unparseable); raises ApiError otherwise.
  def execute_request(uri, request, retries: 3)
    request.basic_auth(@public_key, @secret_key)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30
    http.open_timeout = 10
    http.verify_mode = OpenSSL::SSL::VERIFY_PEER
    # Workaround for OpenSSL 3.x CRL checking issues
    # See: https://github.com/ruby/openssl/issues/619
    # BUGFIX: the previous callback returned true unconditionally, which
    # silently disabled certificate verification altogether (any cert,
    # including self-signed MITM certs, would be accepted). Only tolerate
    # the missing-CRL error; every other verification failure stays fatal.
    if OpenSSL::OPENSSL_VERSION_NUMBER >= 0x30000000
      http.verify_callback = lambda do |preverify_ok, store_ctx|
        preverify_ok || store_ctx.error == OpenSSL::X509::V_ERR_UNABLE_TO_GET_CRL
      end
    end
    response = http.request(request)
    case response.code.to_i
    when 200..299
      JSON.parse(response.body) rescue {}
    when 401
      raise ApiError.new("Unauthorized - check your Langfuse API keys", status: 401, body: response.body)
    when 404
      raise ApiError.new("Resource not found", status: 404, body: response.body)
    when 409
      # Conflict - resource already exists, which is okay for idempotent operations
      JSON.parse(response.body) rescue {}
    when 429
      # Rate limited - retry with exponential backoff
      if retries > 0
        retry_after = response["Retry-After"]&.to_i || (2 ** (3 - retries))
        Rails.logger.info("[Langfuse] Rate limited, waiting #{retry_after}s before retry...")
        sleep(retry_after)
        execute_request(uri, rebuild_request(request), retries: retries - 1)
      else
        raise ApiError.new("Rate limit exceeded after retries", status: 429, body: response.body)
      end
    else
      raise ApiError.new("API error: #{response.code} - #{response.body}", status: response.code.to_i, body: response.body)
    end
  end

  # Create a new request with the same properties (needed for retry since request body may be consumed)
  def rebuild_request(original_request)
    uri = URI(original_request.uri.to_s)
    new_request = original_request.class.new(uri)
    original_request.each_header { |key, value| new_request[key] = value }
    new_request.body = original_request.body
    new_request
  end

  def encode(value)
    ERB::Util.url_encode(value)
  end
end

View File

@@ -0,0 +1,115 @@
# Uploads a local Eval::Dataset and all of its samples to Langfuse, where the
# samples become dataset items usable in Langfuse experiments.
class Eval::Langfuse::DatasetExporter
  attr_reader :dataset, :client

  def initialize(dataset, client: nil)
    @dataset = dataset
    @client = client || Eval::Langfuse::Client.new
  end

  # Creates (or reuses) the remote dataset, then uploads every sample.
  # Returns a summary hash: { dataset_name:, items_exported: }.
  def export
    Rails.logger.info("[Langfuse] Exporting dataset '#{dataset.name}' to Langfuse...")
    create_langfuse_dataset
    uploaded = export_samples
    Rails.logger.info("[Langfuse] Exported #{uploaded} items to dataset '#{langfuse_dataset_name}'")
    {
      dataset_name: langfuse_dataset_name,
      items_exported: uploaded
    }
  end

  private

  # Remote datasets follow a consistent "eval_" naming convention.
  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def create_langfuse_dataset
    client.create_dataset(
      name: langfuse_dataset_name,
      description: dataset.description || "Evaluation dataset: #{dataset.name}",
      metadata: {
        eval_type: dataset.eval_type,
        version: dataset.version,
        source: "sure_eval_framework",
        exported_at: Time.current.iso8601
      }
    )
  rescue Eval::Langfuse::Client::ApiError => e
    # A 409 conflict just means the dataset already exists; anything else is fatal.
    raise unless e.status == 409
    Rails.logger.info("[Langfuse] Dataset '#{langfuse_dataset_name}' already exists, updating items...")
  end

  # Uploads every sample, throttled to respect rate limits.
  # Returns the number of items uploaded.
  def export_samples
    uploaded = 0
    dataset.samples.find_each do |sample|
      export_sample(sample)
      uploaded += 1
      report_progress(uploaded) if (uploaded % 25).zero?
      # Small delay to avoid rate limiting (Langfuse free tier has limits)
      sleep(0.1)
    end
    uploaded
  end

  # Emits a progress line to both the Rails log and stdout.
  def report_progress(count)
    Rails.logger.info("[Langfuse] Exported #{count}/#{dataset.sample_count} items...")
    print " Exported #{count}/#{dataset.sample_count} items...\r"
  end

  def export_sample(sample)
    client.create_dataset_item(
      dataset_name: langfuse_dataset_name,
      id: sample.id, # Reusing the sample ID keeps repeated exports idempotent
      input: build_input(sample),
      expected_output: build_expected_output(sample),
      metadata: build_metadata(sample)
    )
  end

  # Shapes the Langfuse item input according to the dataset's eval type.
  def build_input(sample)
    type = dataset.eval_type
    return { transaction: sample.input_data, categories: sample.categories_context } if type == "categorization"
    return { transaction: sample.input_data, merchants: sample.merchants_context } if type == "merchant_detection"
    return { prompt: sample.chat_prompt, mock_data: sample.mock_data } if type == "chat"
    sample.input_data
  end

  def build_expected_output(sample)
    sample.expected_output
  end

  def build_metadata(sample)
    base = {
      difficulty: sample.difficulty,
      tags: sample.tags,
      eval_type: dataset.eval_type,
      sample_id: sample.id
    }
    base.merge(sample.metadata || {})
  end
end

View File

@@ -0,0 +1,468 @@
# Runs an evaluation experiment against a Langfuse dataset: exports the local
# dataset if needed, processes items in batches through an LLM provider, and
# records traces, scores and dataset-run links back in Langfuse.
class Eval::Langfuse::ExperimentRunner
  attr_reader :dataset, :model, :provider, :client, :provider_config

  # Items are sent to the LLM provider in slices of this size.
  BATCH_SIZE = 25

  def initialize(dataset, model:, provider: "openai", client: nil, provider_config: {})
    @dataset = dataset
    @model = model
    @provider = provider
    @client = client || Eval::Langfuse::Client.new
    @provider_config = provider_config
  end

  # Executes the full experiment end-to-end.
  # Returns a summary hash: run name, dataset name, model, count and metrics.
  def run(run_name: nil)
    @run_name = run_name || generate_run_name
    Rails.logger.info("[Langfuse Experiment] Starting experiment '#{@run_name}'")
    Rails.logger.info("[Langfuse Experiment] Dataset: #{dataset.name} (#{dataset.sample_count} samples)")
    Rails.logger.info("[Langfuse Experiment] Model: #{model}")
    # Ensure dataset exists in Langfuse
    ensure_dataset_exported
    # Get dataset items from Langfuse
    items = fetch_langfuse_items
    # Run the experiment
    results = process_items(items)
    # Calculate and report metrics
    metrics = calculate_metrics(results)
    Rails.logger.info("[Langfuse Experiment] Experiment '#{@run_name}' complete")
    Rails.logger.info("[Langfuse Experiment] Accuracy: #{metrics[:accuracy]}%")
    {
      run_name: @run_name,
      dataset_name: langfuse_dataset_name,
      model: model,
      samples_processed: results.size,
      metrics: metrics
    }
  end

  private

  # e.g. "my_dataset_gpt-4o_20250101_120000" (slashes in model names flattened)
  def generate_run_name
    "#{dataset.name}_#{model.gsub('/', '_')}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  end

  # Must match the naming convention used by Eval::Langfuse::DatasetExporter.
  def langfuse_dataset_name
    "eval_#{dataset.name}"
  end

  def ensure_dataset_exported
    exporter = Eval::Langfuse::DatasetExporter.new(dataset, client: client)
    exporter.export
  end

  # Pages through the Langfuse dataset-items endpoint until a short page
  # signals the end.
  def fetch_langfuse_items
    items = []
    page = 1
    loop do
      response = client.get_dataset_items(dataset_name: langfuse_dataset_name, page: page, limit: 50)
      batch = response["data"] || []
      items.concat(batch)
      break if batch.size < 50
      page += 1
    end
    Rails.logger.info("[Langfuse Experiment] Fetched #{items.size} items from Langfuse")
    items
  end

  def process_items(items)
    results = []
    items.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
      Rails.logger.info("[Langfuse Experiment] Processing batch #{batch_idx + 1}/#{(items.size.to_f / BATCH_SIZE).ceil}")
      batch_results = process_batch(batch)
      results.concat(batch_results)
    end
    results
  end

  # Dispatches a batch to the eval-type-specific processor.
  def process_batch(items)
    case dataset.eval_type
    when "categorization"
      process_categorization_batch(items)
    when "merchant_detection"
      process_merchant_detection_batch(items)
    when "chat"
      process_chat_batch(items)
    else
      raise "Unsupported eval type: #{dataset.eval_type}"
    end
  end

  # Scores one batch of categorization items with a single LLM call.
  # Returns one result hash per item; falls back to handle_batch_error on failure.
  def process_categorization_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end
    # Categories are shared batch context — taken from the first item.
    categories = items.first.dig("input", "categories") || []
    categories = categories.map(&:deep_symbolize_keys)
    # Determine effective JSON mode for this batch
    # If the batch has many expected nulls, force strict mode to prevent false retries
    effective_json_mode = json_mode_for_batch(items)
    start_time = Time.current
    response = llm_provider.auto_categorize(
      transactions: transactions,
      user_categories: categories,
      model: model,
      json_mode: effective_json_mode
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    if response.success?
      items.map do |item|
        categorization = response.data.find { |c| c.transaction_id.to_s == item["id"].to_s }
        actual_category = normalize_null(categorization&.category_name)
        expected_category = item.dig("expectedOutput", "category_name")
        correct = actual_category == expected_category
        score_value = correct ? 1.0 : 0.0
        # Create trace and score in Langfuse
        trace_id = create_trace_for_item(item, actual_category, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_category, expected_category)
        {
          item_id: item["id"],
          expected: expected_category,
          actual: actual_category,
          correct: correct,
          # Batch latency split evenly per item (integer division)
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  # Scores one batch of merchant-detection items; correctness requires both
  # the business name and the (normalized) URL to match.
  def process_merchant_detection_batch(items)
    transactions = items.map do |item|
      input = item["input"]
      txn = input["transaction"] || input
      txn.deep_symbolize_keys.merge(id: item["id"])
    end
    # Known merchants are shared batch context — taken from the first item.
    merchants = items.first.dig("input", "merchants") || []
    merchants = merchants.map(&:deep_symbolize_keys)
    start_time = Time.current
    response = llm_provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    if response.success?
      items.map do |item|
        detection = response.data.find { |m| m.transaction_id.to_s == item["id"].to_s }
        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)
        expected_name = item.dig("expectedOutput", "business_name")
        expected_url = item.dig("expectedOutput", "business_url")
        name_match = actual_name == expected_name
        url_match = normalize_url(actual_url) == normalize_url(expected_url)
        correct = name_match && url_match
        score_value = correct ? 1.0 : 0.0
        # Create trace and score in Langfuse
        actual_output = { business_name: actual_name, business_url: actual_url }
        trace_id = create_trace_for_item(item, actual_output, latency_ms)
        score_result(trace_id, item["id"], score_value, correct, actual_output, item["expectedOutput"])
        {
          item_id: item["id"],
          expected: { name: expected_name, url: expected_url },
          actual: { name: actual_name, url: actual_url },
          # Batch latency split evenly per item (integer division)
          correct: correct,
          latency_ms: latency_ms / items.size
        }
      end
    else
      handle_batch_error(items, response.error)
    end
  rescue => e
    handle_batch_error(items, e)
  end

  def process_chat_batch(items)
    # Chat is processed one at a time due to function calling complexity
    items.map do |item|
      process_chat_item(item)
    end
  end

  # Runs a single chat prompt and checks the model called the expected functions.
  def process_chat_item(item)
    prompt = item.dig("input", "prompt")
    expected_functions = item.dig("expectedOutput", "functions") || []
    start_time = Time.current
    response = llm_provider.chat_response(
      prompt,
      model: model,
      instructions: "You are a helpful personal finance assistant.",
      functions: build_available_functions
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    actual_functions = extract_function_calls(response)
    correct = evaluate_function_match(actual_functions, expected_functions)
    score_value = correct ? 1.0 : 0.0
    # Create trace and score in Langfuse
    trace_id = create_trace_for_item(item, { functions: actual_functions }, latency_ms)
    score_result(trace_id, item["id"], score_value, correct, actual_functions, expected_functions)
    {
      item_id: item["id"],
      expected: expected_functions,
      actual: actual_functions,
      correct: correct,
      latency_ms: latency_ms
    }
  rescue => e
    handle_item_error(item, e)
  end

  # Records a trace in Langfuse for one processed item; returns the trace ID.
  def create_trace_for_item(item, output, latency_ms)
    trace_id = client.create_trace(
      name: "#{dataset.eval_type}_eval",
      input: item["input"],
      output: output,
      metadata: {
        run_name: @run_name,
        model: model,
        latency_ms: latency_ms,
        dataset_item_id: item["id"]
      }
    )
    Rails.logger.debug("[Langfuse Experiment] Created trace #{trace_id} for item #{item['id']}")
    trace_id
  end

  # Attaches an accuracy score to the trace and links it to the dataset run.
  # Scoring failures are logged but never abort the experiment.
  def score_result(trace_id, item_id, score_value, correct, actual, expected)
    return unless trace_id
    # Score the accuracy
    client.create_score(
      trace_id: trace_id,
      name: "accuracy",
      value: score_value,
      comment: correct ? "Correct" : "Expected: #{expected.inspect}, Got: #{actual.inspect}"
    )
    # Link to dataset run
    client.create_dataset_run_item(
      run_name: @run_name,
      dataset_item_id: item_id,
      trace_id: trace_id,
      metadata: {
        correct: correct,
        actual: actual,
        expected: expected
      }
    )
  rescue => e
    Rails.logger.warn("[Langfuse Experiment] Failed to score item #{item_id}: #{e.message}")
  end

  # Marks every item in a failed batch as incorrect with the error attached.
  def handle_batch_error(items, error)
    error_message = error.is_a?(Exception) ? error.message : error.to_s
    Rails.logger.error("[Langfuse Experiment] Batch error: #{error_message}")
    items.map do |item|
      {
        item_id: item["id"],
        expected: item["expectedOutput"],
        actual: { error: error_message },
        correct: false,
        latency_ms: 0
      }
    end
  end

  # Marks a single failed item as incorrect with the error attached.
  def handle_item_error(item, error)
    Rails.logger.error("[Langfuse Experiment] Item #{item['id']} error: #{error.message}")
    {
      item_id: item["id"],
      expected: item["expectedOutput"],
      actual: { error: error.message },
      correct: false,
      latency_ms: 0
    }
  end

  # Aggregates per-item results into accuracy/latency summary metrics.
  def calculate_metrics(results)
    total = results.size
    # Guard against empty results to avoid division by zero
    if total.zero?
      return {
        accuracy: 0.0,
        total: 0,
        correct: 0,
        incorrect: 0,
        avg_latency_ms: 0
      }
    end
    correct = results.count { |r| r[:correct] }
    avg_latency = results.sum { |r| r[:latency_ms] } / total.to_f
    {
      accuracy: (correct.to_f / total * 100).round(2),
      total: total,
      correct: correct,
      incorrect: total - correct,
      avg_latency_ms: avg_latency.round(0)
    }
  end

  def llm_provider
    @llm_provider ||= build_provider
  end

  # Builds the LLM provider from provider_config, ENV, or app Settings
  # (in that priority order). Only "openai" is currently supported.
  def build_provider
    case provider
    when "openai"
      access_token = provider_config[:access_token] ||
        ENV["OPENAI_ACCESS_TOKEN"] ||
        Setting.openai_access_token
      raise "OpenAI access token not configured" unless access_token.present?
      uri_base = provider_config[:uri_base] ||
        ENV["OPENAI_URI_BASE"] ||
        Setting.openai_uri_base
      Provider::Openai.new(access_token, uri_base: uri_base, model: model)
    else
      raise "Unsupported provider: #{provider}"
    end
  end

  # Determine the effective JSON mode for a batch based on expected null ratio
  # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
  def json_mode_for_batch(items)
    # If a specific mode is configured (not "auto"), always use it
    configured_mode = provider_config[:json_mode]
    return configured_mode if configured_mode.present? && configured_mode != "auto"
    # Calculate expected null ratio for this batch
    expected_null_count = items.count { |item| item.dig("expectedOutput", "category_name").nil? }
    expected_null_ratio = expected_null_count.to_f / items.size
    # If >50% of the batch is expected to return null, force strict mode
    # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
    # and prevents unnecessary retries when nulls are legitimate
    if expected_null_ratio > 0.5
      Rails.logger.info("[Langfuse Experiment] Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode")
      "strict"
    else
      # Use auto mode - let the auto-categorizer decide
      "auto"
    end
  end

  # Treats nil, the literal string "null" and blank strings as nil.
  def normalize_null(value)
    return nil if value.nil?
    return nil if value == "null"
    return nil if value.to_s.strip.empty?
    value
  end

  # Canonicalizes URLs for comparison: lowercase, no scheme/www, no trailing slash.
  def normalize_url(url)
    return nil if url.nil?
    url.to_s.downcase
      .gsub(/^(https?:\/\/)?(www\.)?/, "")
      .chomp("/")
      .strip
  end

  def build_available_functions
    # Simplified function definitions for chat eval
    [
      {
        name: "get_accounts",
        description: "Get user's financial accounts",
        params_schema: { type: "object", properties: {}, required: [] }
      },
      {
        name: "get_transactions",
        description: "Get transactions with optional filters",
        params_schema: {
          type: "object",
          properties: {
            account_id: { type: "string" },
            start_date: { type: "string" },
            end_date: { type: "string" },
            category: { type: "string" }
          }
        }
      },
      {
        name: "get_balance_summary",
        description: "Get balance summary across accounts",
        params_schema: { type: "object", properties: {} }
      },
      {
        name: "get_spending_by_category",
        description: "Get spending breakdown by category",
        params_schema: {
          type: "object",
          properties: {
            start_date: { type: "string" },
            end_date: { type: "string" }
          }
        }
      }
    ]
  end

  # Collects { name:, arguments: } pairs from every message in the response.
  def extract_function_calls(response)
    return [] unless response.respond_to?(:messages)
    response.messages.flat_map do |msg|
      next [] unless msg.respond_to?(:function_calls)
      msg.function_calls.map do |fc|
        { name: fc.name, arguments: fc.arguments }
      end
    end.compact
  end

  # True when the sets of called function names match the expected set
  # (order-insensitive; arguments are not compared).
  def evaluate_function_match(actual, expected)
    return true if expected.empty? && actual.empty?
    return false if expected.empty? != actual.empty?
    expected_names = expected.map { |f| f["name"] || f[:name] }.sort
    actual_names = actual.map { |f| f["name"] || f[:name] }.sort
    expected_names == actual_names
  end
end

View File

@@ -0,0 +1,68 @@
# Common plumbing shared by all eval metric calculators. Subclasses implement
# #calculate and build on the protected query helpers below.
class Eval::Metrics::Base
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  # Subclasses must return a hash of metric name => value.
  def calculate
    raise NotImplementedError, "Subclasses must implement #calculate"
  end

  protected

  # All results for the run, with samples eager-loaded (memoized).
  def results
    @results ||= eval_run.results.includes(:sample)
  end

  def samples
    @samples ||= eval_run.dataset.samples
  end

  def total_count
    results.count
  end

  def correct_count
    results.where(correct: true).count
  end

  def incorrect_count
    results.where(correct: false).count
  end

  # Overall accuracy as a rounded percentage; 0.0 when there are no results.
  def accuracy
    total_count.zero? ? 0.0 : (correct_count.to_f / total_count * 100).round(2)
  end

  # Mean latency in whole milliseconds; nil when there are no results.
  def avg_latency_ms
    total_count.zero? ? nil : results.average(:latency_ms)&.round(0)
  end

  def total_cost
    results.sum(:cost)&.to_f&.round(6)
  end

  def cost_per_sample
    total_count.zero? ? nil : (total_cost / total_count).round(6)
  end

  # Per-difficulty accuracy breakdown; difficulties with no results are omitted.
  def metrics_by_difficulty
    %w[easy medium hard edge_case].each_with_object({}) do |level, breakdown|
      scoped = results.joins(:sample).where(eval_samples: { difficulty: level })
      next if scoped.empty?
      hits = scoped.where(correct: true).count
      size = scoped.count
      breakdown[level] = {
        count: size,
        correct: hits,
        accuracy: (hits.to_f / size * 100).round(2)
      }
    end
  end
end

View File

@@ -0,0 +1,101 @@
# Categorization-specific metrics layered on Eval::Metrics::Base.
class Eval::Metrics::CategorizationMetrics < Eval::Metrics::Base
  def calculate
    {
      accuracy: accuracy,
      exact_match_accuracy: exact_match_accuracy,
      alternative_match_count: alternative_match_count,
      precision: precision,
      recall: recall,
      f1_score: f1_score,
      null_accuracy: null_accuracy,
      hierarchical_accuracy: hierarchical_accuracy,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      by_difficulty: metrics_by_difficulty,
      by_category: metrics_by_category
    }
  end

  private

  # Share of results that matched the primary expected category exactly.
  def exact_match_accuracy
    percentage(results.where(exact_match: true).count, total_count)
  end

  # How many results matched an alternative (but not the primary) category.
  def alternative_match_count
    results.where(alternative_match: true).count
  end

  # Accuracy restricted to samples where null was the expected answer;
  # vacuously 100% when no sample expected null.
  def null_accuracy
    expected_null = results.where(null_expected: true)
    return 100.0 if expected_null.empty?
    matched = expected_null.where(null_returned: true).count
    (matched.to_f / expected_null.count * 100).round(2)
  end

  # Share of results matching at the hierarchical level (includes exact matches).
  def hierarchical_accuracy
    percentage(results.where(hierarchical_match: true).count, total_count)
  end

  # TP / (TP + FP): correct non-null predictions over all non-null predictions.
  def precision
    tp = results.where(correct: true, null_returned: false).count
    fp = results.where(correct: false, null_returned: false).count
    percentage(tp, tp + fp)
  end

  # TP / (TP + FN): FN are results that returned null when a category was expected.
  def recall
    tp = results.where(correct: true, null_returned: false).count
    fn = results.where(null_expected: false, null_returned: true).count
    percentage(tp, tp + fn)
  end

  # Harmonic mean of precision and recall (both already percentages).
  def f1_score
    p = precision
    r = recall
    return 0.0 if p.zero? || r.zero?
    (2 * p * r / (p + r)).round(2)
  end

  # Accuracy broken down by expected category name ("null" for none expected).
  def metrics_by_category
    tallies = results.includes(:sample).each_with_object({}) do |result, acc|
      key = result.sample.expected_category_name || "null"
      bucket = (acc[key] ||= { correct: 0, total: 0 })
      bucket[:total] += 1
      bucket[:correct] += 1 if result.correct
    end
    tallies.transform_values do |bucket|
      bucket.merge(accuracy: (bucket[:correct].to_f / bucket[:total] * 100).round(2))
    end
  end

  # Rounded percentage helper; 0.0 when the denominator is zero.
  def percentage(numerator, denominator)
    return 0.0 if denominator.zero?
    (numerator.to_f / denominator * 100).round(2)
  end
end

View File

@@ -0,0 +1,125 @@
class Eval::Metrics::ChatMetrics < Eval::Metrics::Base
def calculate
{
accuracy: accuracy,
function_selection_accuracy: function_selection_accuracy,
parameter_accuracy: parameter_accuracy,
response_relevance: response_relevance,
exact_match_rate: exact_match_rate,
error_rate: error_rate,
avg_functions_per_response: avg_functions_per_response,
samples_processed: total_count,
samples_correct: correct_count,
avg_latency_ms: avg_latency_ms,
total_cost: total_cost,
cost_per_sample: cost_per_sample,
by_difficulty: metrics_by_difficulty,
by_function: metrics_by_function
}
end
private
def function_selection_accuracy
# Percentage of samples where correct functions were called
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
correct = valid_results.count do |r|
r.metadata.dig("function_selection_correct") == true
end
(correct.to_f / valid_results.count * 100).round(2)
end
def parameter_accuracy
# Average parameter accuracy across all samples
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
scores = valid_results.map do |r|
r.metadata.dig("parameter_accuracy") || 0.0
end
(scores.sum / scores.size * 100).round(2)
end
def response_relevance
# Percentage of samples where response contained expected keywords
valid_results = results.where.not("metadata->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
correct = valid_results.count do |r|
# If no keywords expected, consider it relevant
expected_keywords = r.metadata.dig("expected_keywords") || []
expected_keywords.empty? || r.metadata.dig("response_keywords_found") == true
end
(correct.to_f / valid_results.count * 100).round(2)
end
def exact_match_rate
return 0.0 if total_count.zero?
(results.where(exact_match: true).count.to_f / total_count * 100).round(2)
end
def error_rate
return 0.0 if total_count.zero?
errors = results.count do |r|
r.metadata.dig("error").present? || r.actual_output.dig("error").present?
end
(errors.to_f / total_count * 100).round(2)
end
def avg_functions_per_response
valid_results = results.where.not("actual_output->>'error' IS NOT NULL")
return 0.0 if valid_results.empty?
total_functions = valid_results.sum do |r|
functions = r.actual_output.dig("functions") || []
functions.size
end
(total_functions.to_f / valid_results.count).round(2)
end
# Per-function accuracy breakdown: for every function a sample expected,
# tallies how often the model actually called it (name-normalized match)
# and accumulates the sample-level parameter accuracy for those hits.
# Returns a hash of function name => { total:, correct:, accuracy:, avg_param_accuracy: }.
def metrics_by_function
  # Group results by expected function and calculate accuracy
  function_metrics = {}
  results.includes(:sample).each do |result|
    expected_functions = result.sample.expected_functions
    expected_functions.each do |func|
      name = func["name"]
      next if name.nil?
      function_metrics[name] ||= { correct: 0, total: 0, param_accuracy_sum: 0 }
      function_metrics[name][:total] += 1
      # Check if this specific function was called correctly
      actual_functions = result.actual_output.dig("functions") || []
      if actual_functions.any? { |f| normalize_name(f["name"]) == normalize_name(name) }
        function_metrics[name][:correct] += 1
        # NOTE(review): parameter_accuracy is sample-wide, not per-function, so
        # a multi-function sample contributes the same score to each matched function.
        function_metrics[name][:param_accuracy_sum] += (result.metadata.dig("parameter_accuracy") || 0.0)
      end
    end
  end
  function_metrics.transform_values do |metrics|
    {
      total: metrics[:total],
      correct: metrics[:correct],
      accuracy: (metrics[:correct].to_f / metrics[:total] * 100).round(2),
      # Average parameter accuracy only over samples where the function was called.
      avg_param_accuracy: metrics[:correct] > 0 ? (metrics[:param_accuracy_sum] / metrics[:correct] * 100).round(2) : 0.0
    }
  end
end
# Canonicalizes a function name for comparison: snake_case, lowercased.
# Returns nil for nil input.
def normalize_name(name)
  name&.to_s&.underscore&.downcase
end
end

View File

@@ -0,0 +1,107 @@
# Metrics for merchant-detection eval runs: exact and fuzzy business-name
# accuracy, URL accuracy, and false positive/negative rates for null handling.
# Shared aggregates (accuracy, counts, latency, cost, by_difficulty) come from
# Eval::Metrics::Base.
class Eval::Metrics::MerchantDetectionMetrics < Eval::Metrics::Base
  # Minimum similarity score (0.0..1.0) for a fuzzy name match to count as correct.
  FUZZY_MATCH_THRESHOLD = 0.8

  # Returns the full metrics hash persisted onto the eval run.
  def calculate
    {
      accuracy: accuracy,
      name_accuracy: name_accuracy,
      fuzzy_name_accuracy: fuzzy_name_accuracy,
      url_accuracy: url_accuracy,
      false_positive_rate: false_positive_rate,
      false_negative_rate: false_negative_rate,
      samples_processed: total_count,
      samples_correct: correct_count,
      avg_latency_ms: avg_latency_ms,
      total_cost: total_cost,
      cost_per_sample: cost_per_sample,
      avg_fuzzy_score: avg_fuzzy_score,
      by_difficulty: metrics_by_difficulty
    }
  end

  private
    # Exact name match accuracy over samples that expect a business name.
    # Returns 100.0 when no sample expects a name (vacuously correct).
    def name_accuracy
      name_results = results.includes(:sample).select do |r|
        r.sample.expected_business_name.present?
      end
      return 100.0 if name_results.empty?
      correct = name_results.count do |r|
        # dig with a single key is equivalent to []; no fallback needed.
        actual = r.actual_output.dig("business_name")
        expected = r.sample.expected_business_name
        actual == expected
      end
      (correct.to_f / name_results.size * 100).round(2)
    end

    # Fuzzy name accuracy: counts results whose stored fuzzy_score meets the threshold.
    def fuzzy_name_accuracy
      name_results = results.includes(:sample).select do |r|
        r.sample.expected_business_name.present?
      end
      return 100.0 if name_results.empty?
      correct = name_results.count do |r|
        (r.fuzzy_score || 0) >= FUZZY_MATCH_THRESHOLD
      end
      (correct.to_f / name_results.size * 100).round(2)
    end

    # URL accuracy over samples expecting a URL, after normalization
    # (scheme/www stripped, lowercased, trailing slash removed).
    def url_accuracy
      url_results = results.includes(:sample).select do |r|
        r.sample.expected_business_url.present?
      end
      return 100.0 if url_results.empty?
      correct = url_results.count do |r|
        actual = r.actual_output.dig("business_url")
        expected = r.sample.expected_business_url
        normalize_url(actual) == normalize_url(expected)
      end
      (correct.to_f / url_results.size * 100).round(2)
    end

    # Rate of returning a merchant when null was expected (hallucination rate).
    def false_positive_rate
      null_expected_results = results.where(null_expected: true)
      return 0.0 if null_expected_results.empty?
      false_positives = null_expected_results.where(null_returned: false).count
      (false_positives.to_f / null_expected_results.count * 100).round(2)
    end

    # Rate of returning null when a merchant was expected (missed-detection rate).
    def false_negative_rate
      merchant_expected_results = results.where(null_expected: false)
      return 0.0 if merchant_expected_results.empty?
      false_negatives = merchant_expected_results.where(null_returned: true).count
      (false_negatives.to_f / merchant_expected_results.count * 100).round(2)
    end

    # Mean fuzzy score across results that have one; nil when none do.
    def avg_fuzzy_score
      scores = results.where.not(fuzzy_score: nil).pluck(:fuzzy_score)
      return nil if scores.empty?
      (scores.sum / scores.size).round(4)
    end

    # Strips scheme and leading "www.", lowercases, and drops a trailing slash
    # so equivalent URLs compare equal.
    def normalize_url(url)
      return nil if url.nil?
      url.to_s.downcase
        .gsub(/^(https?:\/\/)?(www\.)?/, "")
        .chomp("/")
        .strip
    end
end

View File

@@ -0,0 +1,205 @@
# Compares multiple Eval::Run records side by side: renders a terminal table,
# exports CSV, and produces a best-model summary with a plain-English
# recommendation.
class Eval::Reporters::ComparisonReporter
  attr_reader :runs

  # @param runs [Eval::Run, Array<Eval::Run>] one or more runs; stored sorted by model name
  def initialize(runs)
    @runs = Array(runs).sort_by(&:model)
  end

  # Generate a text table for terminal display
  def to_table
    return "No runs to compare" if runs.empty?
    headers = build_headers
    rows = runs.map { |run| build_row(run) }
    # Column width = widest cell in that column, header included.
    all_rows = [ headers ] + rows
    widths = headers.each_index.map do |i|
      all_rows.map { |row| row[i].to_s.length }.max
    end
    # Build table
    separator = "+" + widths.map { |w| "-" * (w + 2) }.join("+") + "+"
    lines = []
    lines << separator
    lines << "| " + headers.each_with_index.map { |h, i| h.to_s.ljust(widths[i]) }.join(" | ") + " |"
    lines << separator
    rows.each do |row|
      lines << "| " + row.each_with_index.map { |c, i| c.to_s.ljust(widths[i]) }.join(" | ") + " |"
    end
    lines << separator
    lines.join("\n")
  end

  # Export all runs to a CSV file; returns the file path.
  def to_csv(file_path)
    require "csv"
    CSV.open(file_path, "wb") do |csv|
      csv << csv_headers
      runs.each { |run| csv << csv_row(run) }
    end
    file_path
  end

  # Generate summary with best model recommendations.
  # Only completed runs with metrics participate; returns {} otherwise.
  def summary
    return {} if runs.empty?
    completed_runs = runs.select { |r| r.status == "completed" && r.metrics.present? }
    return {} if completed_runs.empty?
    best_accuracy = completed_runs.max_by { |r| r.metrics["accuracy"] || 0 }
    lowest_cost = completed_runs.min_by { |r| r.total_cost || Float::INFINITY }
    fastest = completed_runs.min_by { |r| r.metrics["avg_latency_ms"] || Float::INFINITY }
    {
      best_accuracy: {
        model: best_accuracy.model,
        value: best_accuracy.metrics["accuracy"],
        run_id: best_accuracy.id
      },
      lowest_cost: {
        model: lowest_cost.model,
        value: lowest_cost.total_cost&.to_f,
        run_id: lowest_cost.id
      },
      fastest: {
        model: fastest.model,
        value: fastest.metrics["avg_latency_ms"],
        run_id: fastest.id
      },
      recommendation: generate_recommendation(best_accuracy, lowest_cost, fastest)
    }
  end

  # Generate detailed comparison between runs
  def detailed_comparison
    return {} if runs.empty?
    {
      runs: runs.map(&:summary),
      comparison: pairwise_comparisons,
      summary: summary
    }
  end

  private
    def build_headers
      [ "Model", "Status", "Accuracy", "Precision", "Recall", "F1", "Latency (ms)", "Cost ($)", "Samples" ]
    end

    # One table row per run; metrics may be absent for pending/failed runs.
    def build_row(run)
      metrics = run.metrics || {}
      [
        run.model,
        run.status,
        format_percentage(metrics["accuracy"]),
        format_percentage(metrics["precision"]),
        format_percentage(metrics["recall"]),
        format_percentage(metrics["f1_score"]),
        metrics["avg_latency_ms"]&.round(0) || "-",
        format_cost(run.total_cost),
        run.results.count
      ]
    end

    def csv_headers
      [
        "Run ID", "Model", "Provider", "Dataset", "Status",
        "Accuracy", "Precision", "Recall", "F1 Score",
        "Null Accuracy", "Hierarchical Accuracy",
        "Avg Latency (ms)", "Total Cost", "Cost Per Sample",
        "Samples Processed", "Samples Correct",
        "Duration (s)", "Run Date"
      ]
    end

    def csv_row(run)
      metrics = run.metrics || {}
      [
        run.id,
        run.model,
        run.provider,
        run.dataset.name,
        run.status,
        metrics["accuracy"],
        metrics["precision"],
        metrics["recall"],
        metrics["f1_score"],
        metrics["null_accuracy"],
        metrics["hierarchical_accuracy"],
        metrics["avg_latency_ms"],
        run.total_cost&.to_f,
        metrics["cost_per_sample"],
        metrics["samples_processed"],
        metrics["samples_correct"],
        run.duration_seconds,
        run.completed_at&.iso8601
      ]
    end

    def format_percentage(value)
      return "-" if value.nil?
      "#{value}%"
    end

    def format_cost(value)
      return "-" if value.nil?
      "$#{value.to_f.round(4)}"
    end

    # Pairwise accuracy/cost/latency deltas between every pair of runs.
    # FIX: guard nil metrics (pending/failed runs) like build_row does —
    # previously this raised NoMethodError on `run.metrics["accuracy"]`.
    def pairwise_comparisons
      return [] if runs.size < 2
      comparisons = []
      runs.combination(2).each do |run1, run2|
        m1 = run1.metrics || {}
        m2 = run2.metrics || {}
        comparisons << {
          models: [ run1.model, run2.model ],
          accuracy_diff: ((m1["accuracy"] || 0) - (m2["accuracy"] || 0)).round(2),
          cost_diff: ((run1.total_cost || 0) - (run2.total_cost || 0)).to_f.round(6),
          latency_diff: ((m1["avg_latency_ms"] || 0) - (m2["avg_latency_ms"] || 0)).round(0)
        }
      end
      comparisons
    end

    # Builds a short English recommendation from the per-dimension winners.
    # Callers (summary) only pass completed runs with metrics present.
    def generate_recommendation(best_accuracy, lowest_cost, fastest)
      parts = []
      # If one model wins all categories
      if best_accuracy.id == lowest_cost.id && lowest_cost.id == fastest.id
        return "#{best_accuracy.model} is the best choice overall (highest accuracy, lowest cost, fastest)."
      end
      # Accuracy recommendation
      if best_accuracy.metrics["accuracy"] && best_accuracy.metrics["accuracy"] >= 90
        parts << "For maximum accuracy, use #{best_accuracy.model} (#{best_accuracy.metrics['accuracy']}% accuracy)"
      end
      # Cost recommendation if significantly cheaper
      if lowest_cost.total_cost && lowest_cost.total_cost > 0
        cost_ratio = (best_accuracy.total_cost || 0) / lowest_cost.total_cost
        if cost_ratio > 1.5
          parts << "For cost efficiency, consider #{lowest_cost.model} (#{format_cost(lowest_cost.total_cost)} vs #{format_cost(best_accuracy.total_cost)})"
        end
      end
      # Speed recommendation
      if fastest.metrics["avg_latency_ms"] && fastest.id != best_accuracy.id
        latency_ratio = (best_accuracy.metrics["avg_latency_ms"] || 0) / (fastest.metrics["avg_latency_ms"] || 1)
        if latency_ratio > 1.5
          parts << "For speed, consider #{fastest.model} (#{fastest.metrics['avg_latency_ms']}ms vs #{best_accuracy.metrics['avg_latency_ms']}ms)"
        end
      end
      parts.empty? ? "All models perform similarly." : parts.join(". ")
    end
end

70
app/models/eval/result.rb Normal file
View File

@@ -0,0 +1,70 @@
# One evaluated sample's outcome within an Eval::Run: the model's actual
# output (JSON) plus correctness flags, latency and cost.
class Eval::Result < ApplicationRecord
  self.table_name = "eval_results"
  belongs_to :run, class_name: "Eval::Run", foreign_key: :eval_run_id
  belongs_to :sample, class_name: "Eval::Sample", foreign_key: :eval_sample_id
  validates :actual_output, presence: true
  validates :correct, inclusion: { in: [ true, false ] }
  scope :correct, -> { where(correct: true) }
  scope :incorrect, -> { where(correct: false) }
  scope :with_nulls_returned, -> { where(null_returned: true) }
  scope :with_nulls_expected, -> { where(null_expected: true) }
  scope :exact_matches, -> { where(exact_match: true) }
  scope :hierarchical_matches, -> { where(hierarchical_match: true) }

  # NOTE: `dig("key")` with a single key behaves exactly like `["key"]`, so the
  # previous `dig(...) || [...]` fallbacks were redundant and are simplified below.

  # Get actual category (for categorization results)
  def actual_category_name
    actual_output["category_name"]
  end

  # Get actual merchant info (for merchant detection results)
  def actual_business_name
    actual_output["business_name"]
  end

  def actual_business_url
    actual_output["business_url"]
  end

  # Get actual functions called (for chat results); always an array.
  def actual_functions
    actual_output["functions"] || []
  end

  # Get actual response text (for chat results)
  def actual_response_text
    actual_output["response_text"]
  end

  # Compact summary for display/listing.
  def summary
    {
      sample_id: sample_id,
      correct: correct,
      exact_match: exact_match,
      expected: sample.expected_output,
      actual: actual_output,
      latency_ms: latency_ms,
      cost: cost&.to_f
    }
  end

  # Full expected-vs-actual comparison, including null-handling flags.
  def detailed_comparison
    {
      sample_difficulty: sample.difficulty,
      sample_tags: sample.tags,
      input: sample.input_data,
      expected: sample.expected_output,
      actual: actual_output,
      correct: correct,
      exact_match: exact_match,
      hierarchical_match: hierarchical_match,
      null_expected: null_expected,
      null_returned: null_returned,
      fuzzy_score: fuzzy_score
    }
  end
end

88
app/models/eval/run.rb Normal file
View File

@@ -0,0 +1,88 @@
# A single evaluation run of one model/provider over a dataset, with a
# pending -> running -> completed/failed lifecycle and aggregated metrics.
class Eval::Run < ApplicationRecord
  self.table_name = "eval_runs"
  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_run_id, dependent: :destroy
  validates :provider, :model, :status, presence: true
  validates :status, inclusion: { in: %w[pending running completed failed] }
  scope :pending, -> { where(status: "pending") }
  scope :running, -> { where(status: "running") }
  scope :completed, -> { where(status: "completed") }
  scope :failed, -> { where(status: "failed") }
  scope :for_model, ->(model) { where(model: model) }
  scope :for_provider, ->(provider) { where(provider: provider) }

  # Wall-clock duration in whole seconds; nil until the run has both timestamps.
  def duration_seconds
    return nil unless started_at && completed_at
    (completed_at - started_at).to_i
  end

  # Accuracy from stored metrics, falling back to computing it from results.
  # FIX: safe-navigate metrics — a pending run may have nil metrics, which
  # previously raised NoMethodError on `metrics.dig`.
  def accuracy
    metrics&.dig("accuracy") || calculate_accuracy
  end

  # Transition pending -> running and stamp the start time.
  def start!
    update!(status: "running", started_at: Time.current)
  end

  # Transition to completed, persisting metrics and token/cost totals
  # aggregated from the individual results.
  def complete!(calculated_metrics)
    update!(
      status: "completed",
      completed_at: Time.current,
      metrics: calculated_metrics,
      total_prompt_tokens: results.sum(:prompt_tokens),
      total_completion_tokens: results.sum(:completion_tokens),
      total_cost: results.sum(:cost)
    )
  end

  # Transition to failed, recording the error (exception or message string).
  def fail!(error)
    update!(
      status: "failed",
      completed_at: Time.current,
      error_message: error.is_a?(Exception) ? "#{error.class}: #{error.message}" : error.to_s
    )
  end

  # Compact summary for display/reporting.
  def summary
    {
      id: id,
      name: name,
      dataset: dataset.name,
      model: model,
      provider: provider,
      status: status,
      accuracy: accuracy,
      total_cost: total_cost&.to_f,
      duration: duration_seconds,
      samples_processed: results.count,
      samples_correct: results.where(correct: true).count,
      created_at: created_at
    }
  end

  # Accuracy/cost deltas versus another run.
  def compare_to(other_run)
    {
      accuracy_diff: (accuracy || 0) - (other_run.accuracy || 0),
      cost_diff: (total_cost || 0) - (other_run.total_cost || 0),
      this_model: model,
      other_model: other_run.model
    }
  end

  private
    # Percentage of correct results; 0.0 when there are none yet.
    def calculate_accuracy
      return 0.0 if results.empty?
      (results.where(correct: true).count.to_f / results.count * 100).round(2)
    end
end

View File

@@ -0,0 +1,82 @@
# Abstract base for eval runners. Owns the run lifecycle (start!, complete!,
# fail!) and provider construction; subclasses implement #process_samples and
# #calculate_metrics.
class Eval::Runners::Base
attr_reader :eval_run
# @param eval_run [Eval::Run] the run to execute
def initialize(eval_run)
@eval_run = eval_run
end
# Executes the run: marks it running, processes all samples, then persists
# metrics. On any error the run is marked failed and the exception is
# re-raised so callers/jobs still see it.
def run
eval_run.start!
begin
process_samples
metrics = calculate_metrics
eval_run.complete!(metrics)
rescue => e
eval_run.fail!(e)
raise
end
eval_run
end
protected
# Subclass hook: iterate dataset samples and record results.
def process_samples
raise NotImplementedError, "Subclasses must implement #process_samples"
end
# Subclass hook: return the metrics hash passed to Eval::Run#complete!.
def calculate_metrics
raise NotImplementedError, "Subclasses must implement #calculate_metrics"
end
def samples
eval_run.dataset.samples
end
# Memoized provider client built from the run's provider config.
def provider
@provider ||= build_provider
end
def model
eval_run.model
end
private
def build_provider
case eval_run.provider
when "openai"
build_openai_provider
else
raise "Unsupported provider: #{eval_run.provider}"
end
end
# Credential precedence (intentional order): per-run provider_config,
# then environment variable, then global Setting.
def build_openai_provider
access_token = eval_run.provider_config["access_token"].presence ||
ENV["OPENAI_ACCESS_TOKEN"].presence ||
Setting.openai_access_token
raise "OpenAI access token not configured" unless access_token.present?
uri_base = eval_run.provider_config["uri_base"].presence ||
ENV["OPENAI_URI_BASE"].presence ||
Setting.openai_uri_base
Provider::Openai.new(access_token, uri_base: uri_base, model: model)
end
# Persists one Eval::Result row for this run.
def record_result(sample:, actual_output:, correct:, **attributes)
eval_run.results.create!(
sample: sample,
actual_output: actual_output,
correct: correct,
**attributes
)
end
def log_progress(message)
Rails.logger.info("[Eval::Runner] #{message}")
end
end

View File

@@ -0,0 +1,199 @@
# Runs transaction-categorization evals in batches via the provider's
# auto_categorize endpoint, scoring exact, alternative, and hierarchical
# category matches, with special JSON-mode handling for null-heavy batches.
class Eval::Runners::CategorizationRunner < Eval::Runners::Base
DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit
protected
# Slices samples into batches and categorizes each batch in one provider call.
def process_samples
all_samples = samples.to_a
batch_size = effective_batch_size
log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}")
all_samples.each_slice(batch_size).with_index do |batch, batch_idx|
log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / batch_size).ceil}")
process_batch(batch)
end
end
# Use smaller batches for custom providers (local LLMs) to reduce context length
def effective_batch_size
eval_run.provider_config["batch_size"]&.to_i || DEFAULT_BATCH_SIZE
end
# Get JSON mode from provider config (optional override)
# Valid values: "strict", "json_object", "none"
def json_mode
eval_run.provider_config["json_mode"]
end
def calculate_metrics
Eval::Metrics::CategorizationMetrics.new(eval_run).calculate
end
private
# Categorizes one batch and records per-sample results; a batch-level error
# (failed response or raised exception) is recorded against every sample.
def process_batch(batch_samples)
return if batch_samples.empty?
# Build inputs for the provider
transactions = batch_samples.map do |sample|
sample.to_transaction_input.merge(id: sample.id)
end
# Get categories from first sample's context (should be shared)
# Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys
categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys)
# Determine effective JSON mode for this batch
# If the batch has many expected nulls and we're using auto mode, force strict mode
# to prevent the auto-categorizer from incorrectly retrying (it would see many nulls
# and think strict mode is broken, when actually the nulls are expected)
effective_json_mode = json_mode_for_batch(batch_samples)
start_time = Time.current
begin
response = provider.auto_categorize(
transactions: transactions,
user_categories: categories,
model: model,
json_mode: effective_json_mode
)
latency_ms = ((Time.current - start_time) * 1000).to_i
# Integer division: per-sample latency is an approximation of the batch latency split evenly.
per_sample_latency = latency_ms / batch_samples.size
if response.success?
record_batch_results(batch_samples, response.data, per_sample_latency)
else
record_batch_errors(batch_samples, response.error, per_sample_latency)
end
rescue => e
latency_ms = ((Time.current - start_time) * 1000).to_i
per_sample_latency = latency_ms / batch_samples.size
record_batch_errors(batch_samples, e, per_sample_latency)
end
end
# Matches each categorization back to its sample by transaction id and scores
# exact / alternative / hierarchical correctness plus null handling.
def record_batch_results(batch_samples, categorizations, per_sample_latency)
batch_samples.each do |sample|
# Find the categorization result for this sample
categorization = categorizations.find { |c| c.transaction_id.to_s == sample.id.to_s }
actual_category = categorization&.category_name
# Normalize "null" string to nil
actual_category = nil if actual_category == "null"
expected_category = sample.expected_category_name
acceptable_categories = sample.all_acceptable_categories
# Evaluate correctness - check primary expected and alternatives
correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories)
exact_match = actual_category == expected_category
alternative_match = acceptable_categories.include?(actual_category) && !exact_match
hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample)
record_result(
sample: sample,
actual_output: { "category_name" => actual_category },
correct: correct,
exact_match: exact_match,
alternative_match: alternative_match,
hierarchical_match: hierarchical,
null_expected: expected_category.nil?,
null_returned: actual_category.nil?,
latency_ms: per_sample_latency
)
end
end
# Records an identical error result for every sample in a failed batch.
def record_batch_errors(batch_samples, error, per_sample_latency)
error_message = error.is_a?(Exception) ? error.message : error.to_s
batch_samples.each do |sample|
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
hierarchical_match: false,
null_expected: sample.expected_category_name.nil?,
null_returned: true,
latency_ms: per_sample_latency,
metadata: { "error" => error_message }
)
end
end
# Determine the effective JSON mode for a batch based on expected null ratio
# This prevents the auto-categorizer from incorrectly retrying when many nulls are expected
def json_mode_for_batch(batch_samples)
# If a specific mode is configured (not "auto"), always use it
return json_mode if json_mode.present? && json_mode != "auto"
# Calculate expected null ratio for this batch
expected_null_count = batch_samples.count { |s| s.expected_category_name.nil? }
expected_null_ratio = expected_null_count.to_f / batch_samples.size
# If >50% of the batch is expected to return null, force strict mode
# This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
# and prevents unnecessary retries when nulls are legitimate
if expected_null_ratio > 0.5
log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry")
"strict"
else
# Use auto mode - let the auto-categorizer decide
"auto"
end
end
# NOTE(review): appears unused in this file — superseded by
# #evaluate_correctness_with_alternatives; confirm before removing.
def evaluate_correctness(actual, expected)
# Both null = correct
return true if actual.nil? && expected.nil?
# Expected null but got value = incorrect
return false if expected.nil? && actual.present?
# Expected value but got null = incorrect
return false if actual.nil? && expected.present?
# Compare values
actual == expected
end
# Correct when actual matches the primary expected category OR any listed
# acceptable alternative; null handling mirrors #evaluate_correctness.
def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories)
# Both null = correct
return true if actual.nil? && expected.nil?
# Expected null but got value = incorrect
return false if expected.nil? && actual.present?
# Expected value but got null = incorrect
return false if actual.nil? && expected.present?
# Check if actual matches any acceptable category (primary or alternatives)
acceptable_categories.include?(actual)
end
# True when actual equals expected, is expected's parent, or is a child of
# expected (checked in both directions via parent_id links in the sample's
# categories_context).
def evaluate_hierarchical_match(actual, expected, sample)
return false if actual.nil? || expected.nil?
return true if actual == expected
# Check if actual matches parent of expected category
categories = sample.categories_context
# Find the expected category
expected_cat = categories.find { |c| c["name"] == expected }
return false unless expected_cat
# If expected has a parent, check if actual matches the parent
if expected_cat["parent_id"]
parent = categories.find { |c| c["id"].to_s == expected_cat["parent_id"].to_s }
return parent && parent["name"] == actual
end
# Also check if actual is a subcategory of expected (reverse direction)
actual_cat = categories.find { |c| c["name"] == actual }
return false unless actual_cat
if actual_cat["parent_id"]
parent = categories.find { |c| c["id"].to_s == actual_cat["parent_id"].to_s }
return parent && parent["name"] == expected
end
false
end
end

View File

@@ -0,0 +1,255 @@
# Runs chat/function-calling evals one sample at a time, scoring function
# selection, parameter accuracy, and response keyword relevance.
class Eval::Runners::ChatRunner < Eval::Runners::Base
# Chat samples are processed one at a time (not batched)
# because each has unique context and function calling requirements
protected
def process_samples
all_samples = samples.to_a
log_progress("Processing #{all_samples.size} chat samples")
all_samples.each_with_index do |sample, idx|
log_progress("Processing sample #{idx + 1}/#{all_samples.size}")
process_sample(sample)
end
end
def calculate_metrics
Eval::Metrics::ChatMetrics.new(eval_run).calculate
end
private
# Sends one sample's prompt to the provider and records the scored result;
# failures (error responses or exceptions) are recorded as error results.
def process_sample(sample)
prompt = sample.chat_prompt
start_time = Time.current
begin
response = provider.chat_response(
prompt,
model: model,
instructions: build_instructions,
functions: build_function_definitions
)
latency_ms = ((Time.current - start_time) * 1000).to_i
if response.success?
record_chat_result(sample, response.data, latency_ms)
else
record_error_result(sample, response.error, latency_ms)
end
rescue => e
latency_ms = ((Time.current - start_time) * 1000).to_i
record_error_result(sample, e, latency_ms)
end
end
# Scores a successful chat response against the sample's expectations and
# persists the result with scoring details in metadata.
def record_chat_result(sample, chat_response, latency_ms)
# Extract function calls from response
actual_functions = extract_functions_from_response(chat_response)
# Extract response text
response_text = extract_response_text(chat_response)
# Evaluate function calling accuracy
expected_functions = sample.expected_functions
function_match = evaluate_function_match(actual_functions, expected_functions)
# Evaluate response content
expected_keywords = sample.expected_response_contains
response_match = evaluate_response_contains(response_text, expected_keywords)
# Overall correctness: functions are correct AND response contains expected keywords
correct = function_match[:correct] && response_match
record_result(
sample: sample,
actual_output: {
"functions" => actual_functions,
"response_text" => response_text,
"function_match_details" => function_match
},
correct: correct,
exact_match: function_match[:exact_match],
latency_ms: latency_ms,
metadata: {
"function_selection_correct" => function_match[:selection_correct],
"parameter_accuracy" => function_match[:parameter_accuracy],
"response_keywords_found" => response_match,
"expected_functions" => expected_functions,
"expected_keywords" => expected_keywords
}
)
end
def record_error_result(sample, error, latency_ms)
error_message = error.is_a?(Exception) ? error.message : error.to_s
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
latency_ms: latency_ms,
metadata: { "error" => error_message }
)
end
# Maps the response's function requests to { "name", "params" } hashes.
def extract_functions_from_response(chat_response)
# ChatResponse has function_requests array
function_requests = chat_response.function_requests || []
function_requests.map do |req|
{
"name" => req.function_name,
"params" => parse_function_args(req.function_args)
}
end
end
# Accepts a Hash or a JSON string; unparsable input yields {}.
def parse_function_args(args)
return {} if args.nil?
return args if args.is_a?(Hash)
JSON.parse(args)
rescue JSON::ParserError
{}
end
def extract_response_text(chat_response)
# ChatResponse has messages array with output_text
messages = chat_response.messages || []
messages.map(&:output_text).compact.join("\n")
end
# Scores function calling: selection is correct when every expected function
# was called (extras beyond an empty expectation fail); parameter accuracy is
# averaged across expected functions (0.0 for any expected-but-missing call).
def evaluate_function_match(actual_functions, expected_functions)
return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty?
return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any?
# Check function selection accuracy
expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact
selection_correct = expected_names.all? { |name| actual_names.include?(name) }
# Check parameter accuracy for matched functions
param_scores = []
expected_functions.each do |expected_func|
expected_name = normalize_function_name(expected_func["name"])
actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }
if actual_func
param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {})
param_scores << param_score
else
param_scores << 0.0
end
end
parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4)
# Exact match requires same functions with same parameters
exact_match = selection_correct && parameter_accuracy == 1.0
# Correct if all expected functions were called (parameters don't have to be exact)
correct = selection_correct
{
correct: correct,
exact_match: exact_match,
selection_correct: selection_correct,
parameter_accuracy: parameter_accuracy
}
end
def normalize_function_name(name)
return nil if name.nil?
# Convert to snake_case and downcase
name.to_s.underscore.downcase
end
# Fraction (0.0..1.0) of expected parameters whose values match the actual call.
def evaluate_parameters(actual_params, expected_params)
return 1.0 if expected_params.empty?
return 0.0 if actual_params.nil?
actual_params = actual_params.stringify_keys
expected_params = expected_params.stringify_keys
matches = 0
total = expected_params.size
expected_params.each do |key, expected_value|
actual_value = actual_params[key]
if values_match?(actual_value, expected_value)
matches += 1
end
end
(matches.to_f / total).round(4)
end
# Lenient value comparison: case-insensitive strings, subset match for
# arrays, and date-aware comparison for YYYY-MM-DD expected values.
def values_match?(actual, expected)
return true if actual == expected
return true if actual.to_s.downcase == expected.to_s.downcase
# For arrays, check if all expected values are present
if expected.is_a?(Array) && actual.is_a?(Array)
expected_normalized = expected.map { |v| v.to_s.downcase }
actual_normalized = actual.map { |v| v.to_s.downcase }
return expected_normalized.all? { |v| actual_normalized.include?(v) }
end
# For dates, try to parse and compare
if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/
begin
expected_date = Date.parse(expected.to_s)
actual_date = Date.parse(actual.to_s)
return expected_date == actual_date
rescue
# Not valid dates, fall through
end
end
false
end
# True when the response contains every expected keyword (case-insensitive);
# vacuously true with no expected keywords.
def evaluate_response_contains(response_text, expected_keywords)
return true if expected_keywords.empty?
return false if response_text.nil? || response_text.empty?
normalized_response = response_text.downcase
expected_keywords.all? do |keyword|
normalized_response.include?(keyword.to_s.downcase)
end
end
def build_instructions
# Simple instructions for evaluation - we don't have a real user/family context
<<~PROMPT
You are a financial assistant helping users understand their financial data.
Use the functions available to answer questions about accounts, transactions, and financial statements.
Today's date is #{Date.current}.
PROMPT
end
def build_function_definitions
# Return the function definitions that the chat would normally have
[
build_function_definition("get_transactions", "Get paginated transactions with optional filters"),
build_function_definition("get_accounts", "Get all accounts with balances and historical data"),
build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"),
build_function_definition("get_income_statement", "Get income and expenses by category for a period")
]
end
# Open schema: parameters are not validated during evals, only compared post-hoc.
def build_function_definition(name, description)
{
name: name,
description: description,
params_schema: { type: "object", properties: {}, additionalProperties: true },
strict: false
}
end
end

View File

@@ -0,0 +1,199 @@
class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
BATCH_SIZE = 25 # Matches Provider::Openai limit
FUZZY_MATCH_THRESHOLD = 0.8
protected
# Slices all dataset samples into fixed-size batches and detects merchants
# for each batch with a single provider call.
def process_samples
all_samples = samples.to_a
log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")
all_samples.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / BATCH_SIZE).ceil}")
process_batch(batch)
end
end
# Aggregates this run's results into merchant-detection metrics.
def calculate_metrics
Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
end
private
# Detects merchants for one batch via the provider and records per-sample
# results; a batch-level failure (error response or exception) is recorded
# against every sample in the batch.
def process_batch(batch_samples)
  # Guard empty batches (consistent with CategorizationRunner#process_batch):
  # avoids `nil.merchants_context` and division by zero below.
  return if batch_samples.empty?
  # Build inputs for the provider
  transactions = batch_samples.map do |sample|
    sample.to_transaction_input.merge(id: sample.id)
  end
  # Get merchants from first sample's context (should be shared)
  # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
  merchants = batch_samples.first.merchants_context.map(&:deep_symbolize_keys)
  start_time = Time.current
  begin
    response = provider.auto_detect_merchants(
      transactions: transactions,
      user_merchants: merchants,
      model: model
    )
    latency_ms = ((Time.current - start_time) * 1000).to_i
    # Integer division: approximate per-sample share of the batch latency.
    per_sample_latency = latency_ms / batch_samples.size
    if response.success?
      record_batch_results(batch_samples, response.data, per_sample_latency)
    else
      record_batch_errors(batch_samples, response.error, per_sample_latency)
    end
  rescue => e
    latency_ms = ((Time.current - start_time) * 1000).to_i
    per_sample_latency = latency_ms / batch_samples.size
    record_batch_errors(batch_samples, e, per_sample_latency)
  end
end
# Matches each detection back to its sample by transaction id and records
# name/URL correctness, fuzzy score, and null-handling flags.
def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
batch_samples.each do |sample|
# Find the merchant detection result for this sample
detection = merchants_detected.find { |m| m.transaction_id.to_s == sample.id.to_s }
# A missing detection yields nil name/url, treated as a returned null below.
actual_name = normalize_null(detection&.business_name)
actual_url = normalize_null(detection&.business_url)
expected_name = sample.expected_business_name
expected_url = sample.expected_business_url
# Evaluate correctness
name_match = evaluate_name_match(actual_name, expected_name)
url_match = evaluate_url_match(actual_url, expected_url)
fuzzy_score = calculate_fuzzy_score(actual_name, expected_name)
# Overall correct if both name and URL match expectations
correct = name_match && url_match
# Exact match requires both to be exactly equal
exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url)
record_result(
sample: sample,
actual_output: { "business_name" => actual_name, "business_url" => actual_url },
correct: correct,
exact_match: exact_match,
fuzzy_score: fuzzy_score,
null_expected: expected_name.nil? && expected_url.nil?,
null_returned: actual_name.nil? && actual_url.nil?,
latency_ms: per_sample_latency
)
end
end
# Records an identical error result for every sample in a failed batch.
def record_batch_errors(batch_samples, error, per_sample_latency)
error_message = error.is_a?(Exception) ? error.message : error.to_s
batch_samples.each do |sample|
record_result(
sample: sample,
actual_output: { "error" => error_message },
correct: false,
exact_match: false,
fuzzy_score: 0.0,
null_expected: sample.expected_business_name.nil?,
null_returned: true,
latency_ms: per_sample_latency,
metadata: { "error" => error_message }
)
end
end
# Normalizes an LLM-returned value to nil when it represents "no answer".
#
# Treats nil, blank/whitespace-only strings, and the literal string "null"
# in ANY casing as nil. The case-insensitive check brings this eval helper
# in line with the production merchant normalizer, which already accepts
# "NULL"/"Null" (some models capitalize the sentinel).
# Any other value is returned unchanged.
def normalize_null(value)
  return nil if value.nil?

  text = value.to_s.strip
  return nil if text.empty?
  # Case-insensitive sentinel check (previously only exact "null" matched)
  return nil if text.casecmp?("null")

  value
end
# Compares a detected business name against the expected one.
# Both nil => correct (model rightly declined). A value where none was
# expected (false positive) or nothing where one was expected (false
# negative) => incorrect. Otherwise defer to fuzzy string matching.
def evaluate_name_match(actual, expected)
  return true if actual.nil? && expected.nil?

  false_positive = expected.nil? && actual.present?
  false_negative = actual.nil? && expected.present?
  return false if false_positive || false_negative

  # Use fuzzy matching for name comparison
  fuzzy_match?(actual, expected)
end
# Compares a detected business URL against the expected one.
# Both nil => correct. One-sided answers (false positive / false negative)
# => incorrect. Otherwise compare after canonicalization, which strips
# scheme, "www." and trailing slash.
def evaluate_url_match(actual, expected)
  return true if actual.nil? && expected.nil?

  false_positive = expected.nil? && actual.present?
  false_negative = actual.nil? && expected.present?
  return false if false_positive || false_negative

  # Normalize and compare URLs
  normalize_url(actual) == normalize_url(expected)
end
# Canonicalizes a URL for comparison: trims surrounding whitespace,
# lowercases, strips an optional http(s) scheme and "www." prefix, and
# drops a single trailing slash. Returns nil for nil input.
#
# Fix: whitespace is now trimmed BEFORE prefix/suffix removal. Previously
# `.strip` ran last, so " https://example.com/ " kept both its scheme
# (the ^-anchored gsub saw a leading space) and its trailing slash (chomp
# saw a trailing space). Also anchors with \A (whole string, not per-line)
# and uses sub since the prefix can occur at most once.
def normalize_url(url)
  return nil if url.nil?

  url.to_s.strip.downcase
     .sub(/\A(https?:\/\/)?(www\.)?/, "")
     .chomp("/")
end
# True when the Levenshtein-based similarity of the two names reaches
# FUZZY_MATCH_THRESHOLD. A nil on either side never matches.
def fuzzy_match?(actual, expected)
  return false if actual.nil?
  return false if expected.nil?

  similarity = calculate_fuzzy_score(actual, expected)
  similarity >= FUZZY_MATCH_THRESHOLD
end
# Similarity score in [0.0, 1.0] between two strings:
# 1.0 for equal values (raw or after downcase+strip), 0.0 when either side
# is nil, otherwise 1 - (Levenshtein distance / longer length), rounded to
# 4 decimal places.
def calculate_fuzzy_score(actual, expected)
  return 1.0 if actual == expected
  return 0.0 if actual.nil? || expected.nil?

  # Compare case-insensitively, ignoring surrounding whitespace
  left = actual.to_s.downcase.strip
  right = expected.to_s.downcase.strip
  return 1.0 if left == right

  longest = [ left.length, right.length ].max
  return 0.0 if longest.zero?

  # Convert edit distance to a similarity ratio
  similarity = 1.0 - levenshtein_distance(left, right).fdiv(longest)
  similarity.round(4)
end
# Computes the Levenshtein (edit) distance between two strings: the minimum
# number of single-character insertions, deletions and substitutions needed
# to transform s1 into s2.
#
# Improvement: uses a rolling pair of rows instead of the full (m+1)x(n+1)
# matrix, cutting memory from O(m*n) to O(n) while producing identical
# results — merchant names are short, but eval runs call this many times.
def levenshtein_distance(s1, s2)
  m = s1.length
  n = s2.length
  return m if n == 0
  return n if m == 0

  # prev_row[j] = distance between s1[0, i-1] and s2[0, j]
  prev_row = (0..n).to_a

  (1..m).each do |i|
    curr_row = [ i ] # distance from s1[0, i] to the empty prefix of s2
    (1..n).each do |j|
      cost = s1[i - 1] == s2[j - 1] ? 0 : 1
      curr_row << [
        prev_row[j] + 1,        # deletion
        curr_row[j - 1] + 1,    # insertion
        prev_row[j - 1] + cost  # substitution
      ].min
    end
    prev_row = curr_row
  end

  prev_row[n]
end
end

88
app/models/eval/sample.rb Normal file
View File

@@ -0,0 +1,88 @@
# A single evaluation case: the input handed to the system under test, the
# expected output, and optional context (categories, merchants, mock data)
# consumed by the different eval types.
class Eval::Sample < ApplicationRecord
  self.table_name = "eval_samples"

  belongs_to :dataset, class_name: "Eval::Dataset", foreign_key: :eval_dataset_id
  has_many :results, class_name: "Eval::Result", foreign_key: :eval_sample_id, dependent: :destroy

  validates :input_data, :expected_output, presence: true
  validates :difficulty, inclusion: { in: %w[easy medium hard manual edge_case] }

  scope :easy, -> { where(difficulty: "easy") }
  scope :medium, -> { where(difficulty: "medium") }
  scope :hard, -> { where(difficulty: "hard") }
  scope :edge_cases, -> { where(difficulty: "edge_case") }
  scope :with_tag, ->(tag) { where("? = ANY(tags)", tag) }
  scope :with_any_tags, ->(tags) { where("tags && ARRAY[?]::varchar[]", tags) }

  # Convert to format expected by AutoCategorizer
  def to_transaction_input
    input_data.deep_symbolize_keys
  end

  # Get categories from context (for categorization evals).
  # Safe-navigated so samples with no context_data return [] instead of raising.
  def categories_context
    context_data&.dig("categories") || []
  end

  # Get merchants from context (for merchant detection evals)
  def merchants_context
    context_data&.dig("merchants") || []
  end

  # Get mock data from context (for chat evals), falling back to input_data
  def mock_data
    context_data&.dig("mock_data") || input_data.dig("mock_data") || {}
  end

  # Get the chat prompt (for chat evals).
  # NOTE: the previous `dig("prompt") || ["prompt"]` fallback was redundant —
  # both calls are identical for a top-level hash key.
  def chat_prompt
    input_data["prompt"]
  end

  # Get expected functions (for chat evals)
  def expected_functions
    expected_output["functions"] || []
  end

  # Get expected response keywords (for chat evals)
  def expected_response_contains
    expected_output["response_contains"] || []
  end

  # Get expected category name (for categorization evals)
  def expected_category_name
    expected_output["category_name"]
  end

  # Get acceptable alternative category names (for categorization evals)
  # These are categories that are also considered correct answers
  def acceptable_alternatives
    expected_output["acceptable_alternatives"] || []
  end

  # Get all acceptable category names (primary + alternatives)
  def all_acceptable_categories
    [ expected_category_name, *acceptable_alternatives ].compact
  end

  # Get expected merchant info (for merchant detection evals)
  def expected_business_name
    expected_output["business_name"]
  end

  def expected_business_url
    expected_output["business_url"]
  end

  # Check if a null (no-answer) result is the expected outcome for this sample
  def expects_null?
    case dataset.eval_type
    when "categorization"
      expected_category_name.nil?
    when "merchant_detection"
      expected_business_name.nil? && expected_business_url.nil?
    else
      false
    end
  end
end

View File

@@ -51,7 +51,7 @@ class Provider::Openai < Provider
@uri_base.present?
end
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil)
def auto_categorize(transactions: [], user_categories: [], model: "", family: nil, json_mode: nil)
with_provider_response do
raise Error, "Too many transactions to auto-categorize. Max is 25 per request." if transactions.size > 25
if user_categories.blank?
@@ -74,7 +74,8 @@ class Provider::Openai < Provider
user_categories: user_categories,
custom_provider: custom_provider?,
langfuse_trace: trace,
family: family
family: family,
json_mode: json_mode
).auto_categorize
trace&.update(output: result.map(&:to_h))
@@ -83,7 +84,7 @@ class Provider::Openai < Provider
end
end
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil)
def auto_detect_merchants(transactions: [], user_merchants: [], model: "", family: nil, json_mode: nil)
with_provider_response do
raise Error, "Too many transactions to auto-detect merchants. Max is 25 per request." if transactions.size > 25
@@ -101,7 +102,8 @@ class Provider::Openai < Provider
user_merchants: user_merchants,
custom_provider: custom_provider?,
langfuse_trace: trace,
family: family
family: family,
json_mode: json_mode
).auto_detect_merchants
trace&.update(output: result.map(&:to_h))

View File

@@ -1,9 +1,22 @@
class Provider::Openai::AutoCategorizer
include Provider::Openai::Concerns::UsageRecorder
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family
# JSON response format modes for custom providers
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
# - "json_object": Use json_object response format (broader compatibility)
# - "none": No response format constraint (maximum compatibility with local LLMs)
JSON_MODE_STRICT = "strict"
JSON_MODE_OBJECT = "json_object"
JSON_MODE_NONE = "none"
JSON_MODE_AUTO = "auto"
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil)
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
# This is a heuristic to detect when strict JSON mode is breaking the model's ability to reason
AUTO_MODE_NULL_THRESHOLD = 0.5
attr_reader :client, :model, :transactions, :user_categories, :custom_provider, :langfuse_trace, :family, :json_mode
def initialize(client, model: "", transactions: [], user_categories: [], custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoCategorizer
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_categorize
@@ -22,6 +61,40 @@ class Provider::Openai::AutoCategorizer
end
# Selects the system prompt variant: custom (typically local/small-model)
# providers get the compact instructions, the default path gets the
# detailed ones.
def instructions
  custom_provider ? simple_instructions : detailed_instructions
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Categorize transactions into the given categories. Return JSON only. Do not explain your reasoning.
CRITICAL RULES:
1. Match transaction_id exactly from input
2. Use EXACT category_name from the provided list, or "null" if unsure
3. Match expense transactions to expense categories only
4. Match income transactions to income categories only
5. Return "null" if the description is generic/ambiguous (e.g., "POS DEBIT", "ACH WITHDRAWAL", "CHECK #1234")
6. Prefer MORE SPECIFIC subcategories over general parent categories when available
CATEGORY HIERARCHY NOTES:
- Use "Restaurants" for sit-down restaurants, "Fast Food" for quick service chains
- Use "Coffee Shops" for coffee places, "Food & Drink" only when type is unclear
- Use "Shopping" for general retail, big-box stores, and online marketplaces
- Use "Groceries" for dedicated grocery stores ONLY
- For income: use "Salary" for payroll/employer deposits, "Income" for generic income sources
Output JSON format only (no markdown, no explanation):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app. You will be provided a list
of the user's transactions and a list of the user's categories. Your job is to auto-categorize
@@ -87,19 +160,68 @@ class Provider::Openai::AutoCategorizer
end
def auto_categorize_openai_generic
if json_mode == JSON_MODE_AUTO
auto_categorize_with_auto_mode
else
auto_categorize_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_categorize_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
#
# This uses pure heuristics to detect when strict JSON mode is breaking the model's
# ability to reason. Models that can't reason well in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't categorize (returning fewer results than input)
#
# The heuristic is simple: if >50% of results are null or missing, the model likely
# needs the freedom to reason in its output (which strict mode prevents).
def auto_categorize_with_auto_mode
result = auto_categorize_with_mode(JSON_MODE_STRICT)
null_count = result.count { |r| r.category_name.nil? || r.category_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_categorize_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_categorize_with_mode(mode)
span = langfuse_trace&.span(name: "auto_categorize_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
user_categories: user_categories
user_categories: user_categories,
json_mode: mode
})
response = client.chat(parameters: {
# Build parameters with configurable JSON response format
params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
{ role: "user", content: developer_message }
],
response_format: {
{ role: "user", content: developer_message_for_generic }
]
}
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_categorize_personal_finance_transactions",
@@ -107,9 +229,14 @@ class Provider::Openai::AutoCategorizer
schema: json_schema
}
}
})
when JSON_MODE_OBJECT
params[:response_format] = { type: "json_object" }
# JSON_MODE_NONE: no response_format constraint
end
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")}")
response = client.chat(parameters: params)
Rails.logger.info("Tokens used to auto-categorize transactions: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
categorizations = extract_categorizations_generic(response)
result = build_response(categorizations)
@@ -120,7 +247,8 @@ class Provider::Openai::AutoCategorizer
operation: "auto_categorize",
metadata: {
transaction_count: transactions.size,
category_count: user_categories.size
category_count: user_categories.size,
json_mode: mode
}
)
@@ -143,9 +271,72 @@ class Provider::Openai::AutoCategorizer
end
def normalize_category_name(category_name)
return nil if category_name == "null"
# Convert to string to handle non-string LLM outputs (numbers, booleans, etc.)
normalized = category_name.to_s.strip
return nil if normalized.empty? || normalized == "null" || normalized.downcase == "null"
category_name
# Try exact match first
exact_match = user_categories.find { |c| c[:name] == normalized }
return exact_match[:name] if exact_match
# Try case-insensitive match
case_insensitive_match = user_categories.find { |c| c[:name].to_s.downcase == normalized.downcase }
return case_insensitive_match[:name] if case_insensitive_match
# Try partial/fuzzy match (for common variations)
fuzzy_match = find_fuzzy_category_match(normalized)
return fuzzy_match if fuzzy_match
# Return normalized string if no match found (will be treated as uncategorized)
normalized
end
# Find a fuzzy match for category names with common variations
def find_fuzzy_category_match(category_name)
# Ensure string input for string operations
input_str = category_name.to_s
normalized_input = input_str.downcase.gsub(/[^a-z0-9]/, "")
user_categories.each do |cat|
cat_name_str = cat[:name].to_s
normalized_cat = cat_name_str.downcase.gsub(/[^a-z0-9]/, "")
# Check if one contains the other
return cat[:name] if normalized_input.include?(normalized_cat) || normalized_cat.include?(normalized_input)
# Check common abbreviations/variations
return cat[:name] if fuzzy_name_match?(input_str, cat_name_str)
end
nil
end
# Handle common naming variations
def fuzzy_name_match?(input, category)
variations = {
"gas" => [ "gas & fuel", "gas and fuel", "fuel", "gasoline" ],
"restaurants" => [ "restaurant", "dining", "food" ],
"groceries" => [ "grocery", "supermarket", "food store" ],
"streaming" => [ "streaming services", "streaming service" ],
"rideshare" => [ "ride share", "ride-share", "uber", "lyft" ],
"coffee" => [ "coffee shops", "coffee shop", "cafe" ],
"fast food" => [ "fastfood", "quick service" ],
"gym" => [ "gym & fitness", "fitness", "gym and fitness" ],
"flights" => [ "flight", "airline", "airlines", "airfare" ],
"hotels" => [ "hotel", "lodging", "accommodation" ]
}
# Ensure string inputs for string operations
input_lower = input.to_s.downcase
category_lower = category.to_s.downcase
variations.each do |_key, synonyms|
if synonyms.include?(input_lower) && synonyms.include?(category_lower)
return true
end
end
false
end
def extract_categorizations_native(response)
@@ -162,9 +353,107 @@ class Provider::Openai::AutoCategorizer
def extract_categorizations_generic(response)
raw = response.dig("choices", 0, "message", "content")
JSON.parse(raw).dig("categorizations")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in generic categorization: #{e.message}"
parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
categorizations = parsed.dig("categorizations") ||
parsed.dig("results") ||
(parsed.is_a?(Array) ? parsed : nil)
raise Provider::Openai::Error, "Could not find categorizations in response" if categorizations.nil?
# Normalize field names (some LLMs use different naming)
categorizations.map do |cat|
{
"transaction_id" => cat["transaction_id"] || cat["id"] || cat["txn_id"],
"category_name" => cat["category_name"] || cat["category"] || cat["name"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues.
#
# Takes the raw assistant message text and returns the parsed object.
# Attempt order: strip <think> blocks, direct JSON.parse, then four fallback
# extraction strategies (closed code fences, unclosed fences, an object keyed
# by "categorizations", any brace-delimited object).
# Raises Provider::Openai::Error when nothing parseable is found.
def parse_json_flexibly(raw)
  return {} if raw.blank?

  # Strip thinking model tags if present (e.g., <think>...</think>)
  # The actual JSON output comes after the thinking block
  cleaned = strip_thinking_tags(raw)

  # Try direct parse first
  JSON.parse(cleaned)
rescue JSON::ParserError
  # Try multiple extraction strategies in order of preference

  # Strategy 1: Closed markdown code blocks (```json...```).
  # All fenced objects are scanned and tried last-first — a later fence is
  # more likely to be the final answer than earlier scratch output.
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
    matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end
  end

  # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
  # Pattern: ```json followed by JSON that goes to end of string
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 3: Find JSON object with "categorizations" key
  if cleaned =~ /(\{"categorizations"\s*:\s*\[[\s\S]*\]\s*\})/m
    matches = cleaned.scan(/(\{"categorizations"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end

    # Try greedy match if non-greedy failed
    # NOTE(review): by this point $1 holds the LAST match of the scan above
    # (scan resets the regexp globals), not the greedy =~ capture from the
    # enclosing `if` — so this retry likely re-parses a non-greedy match.
    # Verify and, if intended, re-run the greedy match before using $1.
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 4: Find any JSON object (last resort)
  if cleaned =~ /(\{[\s\S]*\})/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Fall through to error
    end
  end

  raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response.
# Some models like Qwen-thinking output reasoning in these tags before the actual response.
#
# Returns the text after the closing </think> when it is non-empty; if the
# closing tag is missing (model likely truncated) or nothing follows it,
# returns everything after <think> so JSON emitted inside the block can
# still be extracted by the caller. Input without <think> passes through.
def strip_thinking_tags(raw)
  # Remove <think>...</think> blocks but keep content after them
  # If no closing tag, the model may have been cut off - try to extract JSON from inside
  if raw.include?("<think>")
    # Check if there's content after the thinking block
    if raw =~ /<\/think>\s*([\s\S]*)/m
      after_thinking = $1.strip
      return after_thinking if after_thinking.present?
    end

    # If no content after </think> or no closing tag, look inside the thinking block
    # The JSON might be the last thing in the thinking block
    if raw =~ /<think>([\s\S]*)/m
      return $1
    end
  end

  raw
end
def json_schema
@@ -213,4 +502,39 @@ class Provider::Openai::AutoCategorizer
```
MESSAGE
end
# Concise developer message optimized for smaller/local LLMs
# Uses pattern-based guidance instead of exhaustive examples
def developer_message_for_generic
<<~MESSAGE.strip_heredoc
AVAILABLE CATEGORIES: #{user_categories.map { |c| c[:name] }.join(", ")}
TRANSACTIONS TO CATEGORIZE:
#{format_transactions_simply}
CATEGORIZATION GUIDELINES:
- Prefer specific subcategories over general parent categories when confident
- Food delivery services should be categorized based on the underlying merchant type
- Square payments (SQ *) should be inferred from the merchant name after the prefix
- Warehouse/club stores should be categorized based on their primary purpose
- Return "null" for generic transactions (e.g., POS terminals, wire transfers, checks, ATM withdrawals)
IMPORTANT:
- Use EXACT category names from the list above
- Return "null" (as a string) if you cannot confidently match a category
- Match expense transactions only to expense categories
- Match income transactions only to income categories
- Do NOT include any explanation or reasoning - only output JSON
Respond with ONLY this JSON (no markdown code blocks, no other text):
{"categorizations": [{"transaction_id": "...", "category_name": "..."}]}
MESSAGE
end
# Renders each transaction as one compact bullet line — small/local models
# follow short plain-text listings more reliably than nested JSON.
def format_transactions_simply
  lines = transactions.map do |txn|
    format('- ID: %s, Amount: %s, Type: %s, Description: "%s"',
           txn[:id], txn[:amount], txn[:classification], txn[:description])
  end
  lines.join("\n")
end
end

View File

@@ -1,9 +1,22 @@
class Provider::Openai::AutoMerchantDetector
include Provider::Openai::Concerns::UsageRecorder
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family
# JSON response format modes for custom providers
# - "strict": Use strict JSON schema (requires full OpenAI API compatibility)
# - "json_object": Use json_object response format (broader compatibility)
# - "none": No response format constraint (maximum compatibility with local LLMs)
# - "auto": Try strict first, fall back to none if poor results
JSON_MODE_STRICT = "strict"
JSON_MODE_OBJECT = "json_object"
JSON_MODE_NONE = "none"
JSON_MODE_AUTO = "auto"
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil)
# Threshold for auto mode: if more than this percentage returns null, retry with none mode
AUTO_MODE_NULL_THRESHOLD = 0.5
attr_reader :client, :model, :transactions, :user_merchants, :custom_provider, :langfuse_trace, :family, :json_mode
def initialize(client, model: "", transactions:, user_merchants:, custom_provider: false, langfuse_trace: nil, family: nil, json_mode: nil)
@client = client
@model = model
@transactions = transactions
@@ -11,6 +24,32 @@ class Provider::Openai::AutoMerchantDetector
@custom_provider = custom_provider
@langfuse_trace = langfuse_trace
@family = family
@json_mode = json_mode || default_json_mode
end
VALID_JSON_MODES = [ JSON_MODE_STRICT, JSON_MODE_OBJECT, JSON_MODE_NONE, JSON_MODE_AUTO ].freeze
# Determine default JSON mode based on configuration hierarchy:
# 1. Environment variable (LLM_JSON_MODE) - highest priority, for testing/override
# 2. Setting.openai_json_mode - user-configured in app settings
# 3. Default: auto mode (recommended for all providers)
#
# Mode descriptions:
# - "auto": Tries strict first, falls back to none if >50% fail (recommended default)
# - "strict": Best for thinking models (qwen-thinking, deepseek-reasoner) - skips verbose <think> tags
# - "none": Best for non-thinking models (gpt-oss, llama, mistral) - allows reasoning in output
# - "json_object": Middle ground, broader compatibility than strict
def default_json_mode
# 1. Check environment variable first (allows runtime override for testing)
env_mode = ENV["LLM_JSON_MODE"]
return env_mode if env_mode.present? && VALID_JSON_MODES.include?(env_mode)
# 2. Check app settings (user-configured)
setting_mode = Setting.openai_json_mode
return setting_mode if setting_mode.present? && VALID_JSON_MODES.include?(setting_mode)
# 3. Default: auto mode for all providers (tries strict first, falls back to none if needed)
JSON_MODE_AUTO
end
def auto_detect_merchants
@@ -22,6 +61,32 @@ class Provider::Openai::AutoMerchantDetector
end
def instructions
if custom_provider
simple_instructions
else
detailed_instructions
end
end
# Simplified instructions for smaller/local LLMs
def simple_instructions
<<~INSTRUCTIONS.strip_heredoc
Detect business names and websites from transaction descriptions. Return JSON only.
Rules:
1. Match transaction_id exactly from input
2. Return business_name and business_url for known businesses
3. Return "null" for both if uncertain or generic (e.g. "Paycheck", "Local diner")
4. Don't include "www." in URLs (use "amazon.com" not "www.amazon.com")
5. Favor "null" over guessing - only return values if 80%+ confident
Example output format:
{"merchants": [{"transaction_id": "txn_001", "business_name": "Amazon", "business_url": "amazon.com"}]}
INSTRUCTIONS
end
# Detailed instructions for larger models like GPT-4
def detailed_instructions
<<~INSTRUCTIONS.strip_heredoc
You are an assistant to a consumer personal finance app.
@@ -108,19 +173,64 @@ class Provider::Openai::AutoMerchantDetector
end
def auto_detect_merchants_openai_generic
if json_mode == JSON_MODE_AUTO
auto_detect_merchants_with_auto_mode
else
auto_detect_merchants_with_mode(json_mode)
end
rescue Faraday::BadRequestError => e
# If strict mode fails (HTTP 400), fall back to none mode
# This handles providers that don't support json_schema response format
if json_mode == JSON_MODE_STRICT || json_mode == JSON_MODE_AUTO
Rails.logger.warn("Strict JSON mode failed, falling back to none mode: #{e.message}")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
raise
end
end
# Auto mode: try strict first, fall back to none if too many nulls or missing results
def auto_detect_merchants_with_auto_mode
result = auto_detect_merchants_with_mode(JSON_MODE_STRICT)
# Check if too many nulls OR missing results were returned
# Models that can't reason in strict mode often:
# 1. Return null for everything, OR
# 2. Simply omit transactions they can't detect (returning fewer results than input)
null_count = result.count { |r| r.business_name.nil? || r.business_name == "null" }
missing_count = transactions.size - result.size
failed_count = null_count + missing_count
failed_ratio = transactions.size > 0 ? failed_count.to_f / transactions.size : 0.0
if failed_ratio > AUTO_MODE_NULL_THRESHOLD
Rails.logger.info("Auto mode: #{(failed_ratio * 100).round}% failed (#{null_count} nulls, #{missing_count} missing) in strict mode, retrying with none mode")
auto_detect_merchants_with_mode(JSON_MODE_NONE)
else
result
end
end
def auto_detect_merchants_with_mode(mode)
span = langfuse_trace&.span(name: "auto_detect_merchants_api_call", input: {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
transactions: transactions,
user_merchants: user_merchants
user_merchants: user_merchants,
json_mode: mode
})
response = client.chat(parameters: {
# Build parameters with configurable JSON response format
params = {
model: model.presence || Provider::Openai::DEFAULT_MODEL,
messages: [
{ role: "system", content: instructions },
{ role: "user", content: developer_message }
],
response_format: {
{ role: "user", content: developer_message_for_generic }
]
}
# Add response format based on json_mode setting
case mode
when JSON_MODE_STRICT
params[:response_format] = {
type: "json_schema",
json_schema: {
name: "auto_detect_personal_finance_merchants",
@@ -128,9 +238,14 @@ class Provider::Openai::AutoMerchantDetector
schema: json_schema
}
}
})
when JSON_MODE_OBJECT
params[:response_format] = { type: "json_object" }
# JSON_MODE_NONE: no response_format constraint
end
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")}")
response = client.chat(parameters: params)
Rails.logger.info("Tokens used to auto-detect merchants: #{response.dig("usage", "total_tokens")} (json_mode: #{mode})")
merchants = extract_merchants_generic(response)
result = build_response(merchants)
@@ -141,7 +256,8 @@ class Provider::Openai::AutoMerchantDetector
operation: "auto_detect_merchants",
metadata: {
transaction_count: transactions.size,
merchant_count: user_merchants.size
merchant_count: user_merchants.size,
json_mode: mode
}
)
@@ -154,24 +270,40 @@ class Provider::Openai::AutoMerchantDetector
AutoDetectedMerchant = Provider::LlmConcept::AutoDetectedMerchant
def build_response(categorizations)
categorizations.map do |categorization|
def build_response(merchants)
merchants.map do |merchant|
AutoDetectedMerchant.new(
transaction_id: categorization.dig("transaction_id"),
business_name: normalize_ai_value(categorization.dig("business_name")),
business_url: normalize_ai_value(categorization.dig("business_url")),
transaction_id: merchant.dig("transaction_id"),
business_name: normalize_merchant_value(merchant.dig("business_name")),
business_url: normalize_merchant_value(merchant.dig("business_url")),
)
end
end
def normalize_ai_value(ai_value)
return nil if ai_value == "null"
def normalize_merchant_value(value)
return nil if value.nil? || value == "null" || value.to_s.downcase == "null"
ai_value
# Try to match against user merchants for name normalization
if user_merchants.present?
# Try exact match first
exact_match = user_merchants.find { |m| m[:name] == value }
return exact_match[:name] if exact_match
# Try case-insensitive match
case_match = user_merchants.find { |m| m[:name].to_s.downcase == value.to_s.downcase }
return case_match[:name] if case_match
end
value
end
def extract_merchants_native(response)
raw = response.dig("output", 0, "content", 0, "text")
# Find the message output (not reasoning output)
message_output = response["output"]&.find { |o| o["type"] == "message" }
raw = message_output&.dig("content", 0, "text")
raise Provider::Openai::Error, "No message content found in response" if raw.nil?
JSON.parse(raw).dig("merchants")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in native merchant detection: #{e.message}"
@@ -179,9 +311,100 @@ class Provider::Openai::AutoMerchantDetector
def extract_merchants_generic(response)
raw = response.dig("choices", 0, "message", "content")
JSON.parse(raw).dig("merchants")
rescue JSON::ParserError => e
raise Provider::Openai::Error, "Invalid JSON in generic merchant detection: #{e.message}"
parsed = parse_json_flexibly(raw)
# Handle different response formats from various LLMs
merchants = parsed.dig("merchants") ||
parsed.dig("results") ||
(parsed.is_a?(Array) ? parsed : nil)
raise Provider::Openai::Error, "Could not find merchants in response" if merchants.nil?
# Normalize field names (some LLMs use different naming)
merchants.map do |m|
{
"transaction_id" => m["transaction_id"] || m["id"] || m["txn_id"],
"business_name" => m["business_name"] || m["name"] || m["merchant_name"] || m["merchant"],
"business_url" => m["business_url"] || m["url"] || m["website"]
}
end
end
# Flexible JSON parsing that handles common LLM output issues.
#
# Mirrors the categorizer's parser but targets the "merchants" key.
# Attempt order: strip <think> blocks, direct JSON.parse, then fallback
# extraction (closed fences, unclosed fences, object keyed by "merchants",
# any brace-delimited object). Raises Provider::Openai::Error on failure.
def parse_json_flexibly(raw)
  return {} if raw.blank?

  # Strip thinking model tags if present (e.g., <think>...</think>)
  cleaned = strip_thinking_tags(raw)

  # Try direct parse first
  JSON.parse(cleaned)
rescue JSON::ParserError
  # Try multiple extraction strategies in order of preference

  # Strategy 1: Closed markdown code blocks (```json...```), tried last-first
  # since a later fence is more likely to be the final answer.
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*?\})\s*```/m
    matches = cleaned.scan(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end
  end

  # Strategy 2: Unclosed markdown code blocks (thinking models often forget to close)
  if cleaned =~ /```(?:json)?\s*(\{[\s\S]*\})\s*$/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 3: Find JSON object with "merchants" key
  if cleaned =~ /(\{"merchants"\s*:\s*\[[\s\S]*\]\s*\})/m
    matches = cleaned.scan(/(\{"merchants"\s*:\s*\[[\s\S]*?\]\s*\})/m).flatten
    matches.reverse_each do |match|
      begin
        return JSON.parse(match)
      rescue JSON::ParserError
        next
      end
    end

    # Try greedy match if non-greedy failed
    # NOTE(review): $1 here comes from the scan's last match (scan resets the
    # regexp globals), not from the greedy =~ above — the greedy retry likely
    # never fires as intended; verify.
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Continue to next strategy
    end
  end

  # Strategy 4: Find any JSON object (last resort)
  if cleaned =~ /(\{[\s\S]*\})/m
    begin
      return JSON.parse($1)
    rescue JSON::ParserError
      # Fall through to error
    end
  end

  raise Provider::Openai::Error, "Could not parse JSON from response: #{raw.truncate(200)}"
end
# Strip thinking model tags (<think>...</think>) from response.
# Mirrors the categorizer's implementation: prefer the text after the closing
# </think>; when the tag is unclosed (model truncated) or nothing follows it,
# fall back to the tag body so embedded JSON can still be extracted.
def strip_thinking_tags(raw)
  if raw.include?("<think>")
    # Prefer whatever follows the closing tag — that is the real answer
    if raw =~ /<\/think>\s*([\s\S]*)/m
      after_thinking = $1.strip
      return after_thinking if after_thinking.present?
    end
    # No closing tag or empty tail: the JSON may be the last thing inside
    # the thinking block itself
    if raw =~ /<think>([\s\S]*)/m
      return $1
    end
  end
  raw
end
def json_schema
@@ -235,4 +458,40 @@ class Provider::Openai::AutoMerchantDetector
Return "null" if you are not 80%+ confident in your answer.
MESSAGE
end
# Enhanced developer message with few-shot examples for smaller/local LLMs.
#
# Small models follow instructions more reliably when given explicit
# examples and a rigid output contract, so this prompt spells out both,
# along with the user's known merchants and the transactions to analyze.
#
# @return [String] the developer/system prompt for generic (non-OpenAI) models
def developer_message_for_generic
  merchant_names = user_merchants.present? ? user_merchants.map { |m| m[:name] }.join(", ") : "(none provided)"

  # <<~ already removes the common leading indentation, so an extra
  # ActiveSupport `.strip_heredoc` call here would be a redundant no-op.
  <<~MESSAGE
    USER'S KNOWN MERCHANTS: #{merchant_names}
    TRANSACTIONS TO ANALYZE:
    #{format_transactions_simply}
    EXAMPLES of correct merchant detection:
    - "AMAZON.COM*1A2B3C" business_name: "Amazon", business_url: "amazon.com"
    - "STARBUCKS STORE #9876" business_name: "Starbucks", business_url: "starbucks.com"
    - "NETFLIX.COM" business_name: "Netflix", business_url: "netflix.com"
    - "UBER *TRIP" business_name: "Uber", business_url: "uber.com"
    - "ACH WITHDRAWAL" business_name: "null", business_url: "null" (generic)
    - "LOCAL DINER" business_name: "null", business_url: "null" (generic/unknown)
    - "POS DEBIT 12345" business_name: "null", business_url: "null" (generic)
    IMPORTANT:
    - Return "null" (as a string) for BOTH name and URL if you cannot confidently identify the business
    - Don't include "www." in URLs
    - Generic descriptions like "Paycheck", "Transfer", "ATM" should return "null"
    Respond with ONLY this JSON format (no other text):
    {"merchants": [{"transaction_id": "...", "business_name": "...", "business_url": "..."}]}
  MESSAGE
end
# Format transactions in a simpler, more readable way for smaller LLMs.
#
# Produces one bullet line per transaction in the form
# `- ID: <id>, Description: "<text>"`, falling back to :description when
# :name is missing.
#
# @return [String] newline-joined bullet list of all transactions
def format_transactions_simply
  bullet_lines = transactions.map { |txn|
    label = txn[:name] || txn[:description]
    %(- ID: #{txn[:id]}, Description: "#{label}")
  }
  bullet_lines.join("\n")
end
end

View File

@@ -9,6 +9,7 @@ class Setting < RailsSettings::Base
field :openai_access_token, type: :string, default: ENV["OPENAI_ACCESS_TOKEN"]
field :openai_uri_base, type: :string, default: ENV["OPENAI_URI_BASE"]
field :openai_model, type: :string, default: ENV["OPENAI_MODEL"]
field :openai_json_mode, type: :string, default: ENV["LLM_JSON_MODE"]
field :brand_fetch_client_id, type: :string, default: ENV["BRAND_FETCH_CLIENT_ID"]
# Provider selection

View File

@@ -47,5 +47,20 @@
inputmode: "text",
disabled: ENV["OPENAI_MODEL"].present?,
data: { "auto-submit-form-target": "auto" } %>
<%= form.select :openai_json_mode,
options_for_select(
[
[t(".json_mode_auto"), ""],
[t(".json_mode_strict"), "strict"],
[t(".json_mode_none"), "none"],
[t(".json_mode_json_object"), "json_object"]
],
Setting.openai_json_mode
),
{ label: t(".json_mode_label") },
{ disabled: ENV["LLM_JSON_MODE"].present?,
data: { "auto-submit-form-target": "auto" } } %>
<p class="text-xs text-secondary mt-1"><%= t(".json_mode_help") %></p>
<% end %>
</div>

View File

@@ -48,6 +48,12 @@ en:
uri_base_placeholder: "https://api.openai.com/v1 (default)"
model_label: Model (Optional)
model_placeholder: "gpt-4.1 (default)"
json_mode_label: JSON Mode
json_mode_auto: Auto (recommended)
json_mode_strict: Strict (best for thinking models)
json_mode_none: None (best for standard models)
json_mode_json_object: JSON Object
json_mode_help: "Strict mode works best with thinking models (qwen-thinking, deepseek-reasoner). None mode works best with standard models (llama, mistral, gpt-oss)."
title: OpenAI
yahoo_finance_settings:
title: Yahoo Finance

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,769 @@
---
name: categorization_golden_v1_light
description: Lightweight golden dataset for quick transaction categorization evaluation
eval_type: categorization
version: "1.0"
metadata:
created_at: "2025-12-04"
updated_at: "2025-12-04"
source: manual_curation
notes: |
A compact 50-sample dataset designed for quick evaluation runs.
Includes a balanced mix across:
- All difficulty levels (easy, medium, hard, edge_case)
- All major category types
- Both US and European merchants
- Representative edge cases
Difficulty distribution:
- easy: 20 samples
- medium: 15 samples
- hard: 10 samples
- edge_case: 5 samples
context:
categories:
- id: "income"
name: "Income"
classification: "income"
is_subcategory: false
- id: "salary"
name: "Salary"
classification: "income"
is_subcategory: true
parent_id: "income"
- id: "food_and_drink"
name: "Food & Drink"
classification: "expense"
is_subcategory: false
- id: "restaurants"
name: "Restaurants"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "fast_food"
name: "Fast Food"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "groceries"
name: "Groceries"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "coffee_shops"
name: "Coffee Shops"
classification: "expense"
is_subcategory: true
parent_id: "food_and_drink"
- id: "shopping"
name: "Shopping"
classification: "expense"
is_subcategory: false
- id: "clothing"
name: "Clothing"
classification: "expense"
is_subcategory: true
parent_id: "shopping"
- id: "electronics"
name: "Electronics"
classification: "expense"
is_subcategory: true
parent_id: "shopping"
- id: "transportation"
name: "Transportation"
classification: "expense"
is_subcategory: false
- id: "gas"
name: "Gas & Fuel"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "rideshare"
name: "Rideshare"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "public_transit"
name: "Public Transit"
classification: "expense"
is_subcategory: true
parent_id: "transportation"
- id: "entertainment"
name: "Entertainment"
classification: "expense"
is_subcategory: false
- id: "streaming"
name: "Streaming Services"
classification: "expense"
is_subcategory: true
parent_id: "entertainment"
- id: "utilities"
name: "Utilities"
classification: "expense"
is_subcategory: false
- id: "housing"
name: "Housing"
classification: "expense"
is_subcategory: false
- id: "rent"
name: "Rent"
classification: "expense"
is_subcategory: true
parent_id: "housing"
- id: "health"
name: "Health & Wellness"
classification: "expense"
is_subcategory: false
- id: "pharmacy"
name: "Pharmacy"
classification: "expense"
is_subcategory: true
parent_id: "health"
- id: "gym"
name: "Gym & Fitness"
classification: "expense"
is_subcategory: true
parent_id: "health"
- id: "travel"
name: "Travel"
classification: "expense"
is_subcategory: false
- id: "flights"
name: "Flights"
classification: "expense"
is_subcategory: true
parent_id: "travel"
- id: "hotels"
name: "Hotels"
classification: "expense"
is_subcategory: true
parent_id: "travel"
- id: "subscriptions"
name: "Subscriptions"
classification: "expense"
is_subcategory: false
- id: "personal_care"
name: "Personal Care"
classification: "expense"
is_subcategory: false
- id: "gifts"
name: "Gifts & Donations"
classification: "expense"
is_subcategory: false
samples:
# =============================================================================
# EASY SAMPLES (20 samples) - Clear, unambiguous merchants
# =============================================================================
# Fast Food
- id: cat_light_easy_001
difficulty: easy
tags: [fast_food, us]
input:
id: txn_light_001
amount: 12.99
classification: expense
description: "MCDONALD'S #12345"
expected:
category_name: "Fast Food"
- id: cat_light_easy_002
difficulty: easy
tags: [fast_food, us]
input:
id: txn_light_002
amount: 14.50
classification: expense
description: "CHIPOTLE MEXICAN GRILL"
expected:
category_name: "Fast Food"
# Coffee Shops
- id: cat_light_easy_003
difficulty: easy
tags: [coffee_shops, us]
input:
id: txn_light_003
amount: 5.75
classification: expense
description: "STARBUCKS STORE #9876"
expected:
category_name: "Coffee Shops"
- id: cat_light_easy_004
difficulty: easy
tags: [coffee_shops, europe, uk]
input:
id: txn_light_004
amount: 4.50
classification: expense
description: "COSTA COFFEE LTD"
expected:
category_name: "Coffee Shops"
# Groceries
- id: cat_light_easy_005
difficulty: easy
tags: [groceries, us]
input:
id: txn_light_005
amount: 156.32
classification: expense
description: "WHOLE FOODS MKT #10234"
expected:
category_name: "Groceries"
- id: cat_light_easy_006
difficulty: easy
tags: [groceries, europe, uk]
input:
id: txn_light_006
amount: 87.50
classification: expense
description: "TESCO STORES LTD"
expected:
category_name: "Groceries"
- id: cat_light_easy_007
difficulty: easy
tags: [groceries, europe, germany]
input:
id: txn_light_007
amount: 78.90
classification: expense
description: "LIDL DIENSTLEISTUNG"
expected:
category_name: "Groceries"
# Gas & Fuel
- id: cat_light_easy_008
difficulty: easy
tags: [gas, us]
input:
id: txn_light_008
amount: 45.00
classification: expense
description: "SHELL OIL 573849234"
expected:
category_name: "Gas & Fuel"
- id: cat_light_easy_009
difficulty: easy
tags: [gas, europe, uk]
input:
id: txn_light_009
amount: 75.00
classification: expense
description: "BP OIL UK LTD"
expected:
category_name: "Gas & Fuel"
# Rideshare
- id: cat_light_easy_010
difficulty: easy
tags: [rideshare, us]
input:
id: txn_light_010
amount: 23.50
classification: expense
description: "UBER *TRIP HELP.UBER.COM"
expected:
category_name: "Rideshare"
# Streaming
- id: cat_light_easy_011
difficulty: easy
tags: [streaming, us]
input:
id: txn_light_011
amount: 15.99
classification: expense
description: "NETFLIX.COM"
expected:
category_name: "Streaming Services"
- id: cat_light_easy_012
difficulty: easy
tags: [streaming, us]
input:
id: txn_light_012
amount: 10.99
classification: expense
description: "SPOTIFY USA"
expected:
category_name: "Streaming Services"
# Electronics
- id: cat_light_easy_013
difficulty: easy
tags: [electronics, us]
input:
id: txn_light_013
amount: 299.99
classification: expense
description: "BEST BUY 00000456"
expected:
category_name: "Electronics"
acceptable_alternatives: ["Shopping"]
# Clothing
- id: cat_light_easy_014
difficulty: easy
tags: [clothing, europe, spain]
input:
id: txn_light_014
amount: 79.99
classification: expense
description: "ZARA ESPANA SA"
expected:
category_name: "Clothing"
acceptable_alternatives: ["Shopping"]
# Pharmacy
- id: cat_light_easy_015
difficulty: easy
tags: [pharmacy, us]
input:
id: txn_light_015
amount: 24.99
classification: expense
description: "CVS/PHARMACY #4567"
expected:
category_name: "Pharmacy"
# Flights
- id: cat_light_easy_016
difficulty: easy
tags: [flights, us]
input:
id: txn_light_016
amount: 345.00
classification: expense
description: "UNITED AIRLINES 0162345678"
expected:
category_name: "Flights"
- id: cat_light_easy_017
difficulty: easy
tags: [flights, europe, ireland]
input:
id: txn_light_017
amount: 89.99
classification: expense
description: "RYANAIR DAC"
expected:
category_name: "Flights"
# Hotels
- id: cat_light_easy_018
difficulty: easy
tags: [hotels, us]
input:
id: txn_light_018
amount: 189.00
classification: expense
description: "MARRIOTT HOTELS NYC"
expected:
category_name: "Hotels"
# Gym
- id: cat_light_easy_019
difficulty: easy
tags: [gym, us]
input:
id: txn_light_019
amount: 39.99
classification: expense
description: "PLANET FITNESS MONTHLY"
expected:
category_name: "Gym & Fitness"
# Income
- id: cat_light_easy_020
difficulty: easy
tags: [income, salary, us]
input:
id: txn_light_020
amount: 3500.00
classification: income
description: "ACME CORP PAYROLL"
expected:
category_name: "Salary"
# =============================================================================
# MEDIUM SAMPLES (15 samples) - Requires domain knowledge
# =============================================================================
# Restaurants
- id: cat_light_med_001
difficulty: medium
tags: [restaurants, us]
input:
id: txn_light_med_001
amount: 67.50
classification: expense
description: "OLIVE GARDEN #456"
expected:
category_name: "Restaurants"
- id: cat_light_med_002
difficulty: medium
tags: [restaurants, europe, uk]
input:
id: txn_light_med_002
amount: 78.50
classification: expense
description: "WAGAMAMA LTD LONDON"
expected:
category_name: "Restaurants"
# Warehouse stores
- id: cat_light_med_003
difficulty: medium
tags: [groceries, us, warehouse]
input:
id: txn_light_med_003
amount: 234.56
classification: expense
description: "COSTCO WHSE #1234"
expected:
category_name: "Groceries"
acceptable_alternatives: ["Shopping"]
# Utilities
- id: cat_light_med_004
difficulty: medium
tags: [utilities, us]
input:
id: txn_light_med_004
amount: 125.00
classification: expense
description: "CON EDISON PAYMENT"
expected:
category_name: "Utilities"
- id: cat_light_med_005
difficulty: medium
tags: [utilities, europe, uk]
input:
id: txn_light_med_005
amount: 156.00
classification: expense
description: "BRITISH GAS SERVICES"
expected:
category_name: "Utilities"
- id: cat_light_med_006
difficulty: medium
tags: [utilities, us]
input:
id: txn_light_med_006
amount: 89.00
classification: expense
description: "AT&T WIRELESS"
expected:
category_name: "Utilities"
# Public Transit
- id: cat_light_med_007
difficulty: medium
tags: [public_transit, us]
input:
id: txn_light_med_007
amount: 127.00
classification: expense
description: "MTA *METROCARD"
expected:
category_name: "Public Transit"
- id: cat_light_med_008
difficulty: medium
tags: [public_transit, europe, uk]
input:
id: txn_light_med_008
amount: 156.50
classification: expense
description: "TFL TRAVEL LONDON"
expected:
category_name: "Public Transit"
# Housing
- id: cat_light_med_009
difficulty: medium
tags: [rent, us]
input:
id: txn_light_med_009
amount: 2100.00
classification: expense
description: "AVALON APARTMENTS RENT"
expected:
category_name: "Rent"
acceptable_alternatives: ["Housing"]
# Subscriptions
- id: cat_light_med_010
difficulty: medium
tags: [subscriptions, us]
input:
id: txn_light_med_010
amount: 9.99
classification: expense
description: "APPLE.COM/BILL"
expected:
category_name: "Subscriptions"
# Gifts & Donations
- id: cat_light_med_011
difficulty: medium
tags: [gifts, us, donation]
input:
id: txn_light_med_011
amount: 50.00
classification: expense
description: "RED CROSS DONATION"
expected:
category_name: "Gifts & Donations"
# Entertainment
- id: cat_light_med_012
difficulty: medium
tags: [entertainment, us]
input:
id: txn_light_med_012
amount: 89.00
classification: expense
description: "TICKETMASTER *EVENT"
expected:
category_name: "Entertainment"
# Travel
- id: cat_light_med_013
difficulty: medium
tags: [hotels, us]
input:
id: txn_light_med_013
amount: 234.00
classification: expense
description: "AIRBNB *HMQT5J6QQJ"
expected:
category_name: "Hotels"
acceptable_alternatives: ["Travel"]
# Personal Care
- id: cat_light_med_014
difficulty: medium
tags: [personal_care, us]
input:
id: txn_light_med_014
amount: 45.00
classification: expense
description: "SUPERCUTS #1234"
expected:
category_name: "Personal Care"
# Income
- id: cat_light_med_015
difficulty: medium
tags: [income, us]
input:
id: txn_light_med_015
amount: 500.00
classification: income
description: "VENMO CASHOUT"
expected:
category_name: "Income"
# =============================================================================
# HARD SAMPLES (10 samples) - Ambiguous, multiple interpretations
# =============================================================================
# Big-box stores
- id: cat_light_hard_001
difficulty: hard
tags: [ambiguous, us, multi_purpose_retailer]
input:
id: txn_light_hard_001
amount: 156.78
classification: expense
description: "TARGET #1234"
expected:
category_name: "Shopping"
acceptable_alternatives: ["Groceries"]
- id: cat_light_hard_002
difficulty: hard
tags: [ambiguous, europe, uk, multi_purpose_retailer]
input:
id: txn_light_hard_002
amount: 156.00
classification: expense
description: "MARKS & SPENCER PLC"
expected:
category_name: "Shopping"
acceptable_alternatives: ["Groceries", "Clothing"]
# Online marketplaces
- id: cat_light_hard_003
difficulty: hard
tags: [ambiguous, us, online_marketplace]
input:
id: txn_light_hard_003
amount: 89.99
classification: expense
description: "AMAZON.COM*1A2B3C4D"
expected:
category_name: "Shopping"
# Payment processors (should be null)
- id: cat_light_hard_004
difficulty: hard
tags: [ambiguous, us, payment_processor]
input:
id: txn_light_hard_004
amount: 78.00
classification: expense
description: "PAYPAL *JOHNSMITH"
expected:
category_name: null
# Fast-casual
- id: cat_light_hard_005
difficulty: hard
tags: [ambiguous, us, fast_casual]
input:
id: txn_light_hard_005
amount: 34.50
classification: expense
description: "PANERA BREAD #567"
expected:
category_name: "Restaurants"
acceptable_alternatives: ["Fast Food"]
# Delivery services
- id: cat_light_hard_006
difficulty: hard
tags: [ambiguous, us, delivery_service]
input:
id: txn_light_hard_006
amount: 45.00
classification: expense
description: "DOORDASH*CHIPOTLE"
expected:
category_name: "Fast Food"
acceptable_alternatives: ["Restaurants"]
- id: cat_light_hard_007
difficulty: hard
tags: [ambiguous, europe, uk, delivery_service]
input:
id: txn_light_hard_007
amount: 32.50
classification: expense
description: "DELIVEROO UK LTD"
expected:
category_name: "Restaurants"
acceptable_alternatives: ["Fast Food"]
# Amazon Prime
- id: cat_light_hard_008
difficulty: hard
tags: [ambiguous, us, amazon]
input:
id: txn_light_hard_008
amount: 14.99
classification: expense
description: "AMAZON PRIME*1A2B3C"
expected:
category_name: "Subscriptions"
# Convenience store
- id: cat_light_hard_009
difficulty: hard
tags: [ambiguous, us, convenience_store]
input:
id: txn_light_hard_009
amount: 12.50
classification: expense
description: "7-ELEVEN #34567"
expected:
category_name: "Groceries"
acceptable_alternatives: ["Fast Food"]
# Streaming vs Subscription
- id: cat_light_hard_010
difficulty: hard
tags: [ambiguous, us, streaming_subscription]
input:
id: txn_light_hard_010
amount: 15.99
classification: expense
description: "HBO MAX"
expected:
category_name: "Streaming Services"
acceptable_alternatives: ["Subscriptions"]
# =============================================================================
# EDGE CASES (5 samples) - Should return null
# =============================================================================
# Generic POS
- id: cat_light_edge_001
difficulty: edge_case
tags: [should_be_null, generic_pos]
input:
id: txn_light_edge_001
amount: 15.00
classification: expense
description: "POS DEBIT 12345"
expected:
category_name: null
# ACH transfer
- id: cat_light_edge_002
difficulty: edge_case
tags: [should_be_null, transfer]
input:
id: txn_light_edge_002
amount: 100.00
classification: expense
description: "ACH WITHDRAWAL"
expected:
category_name: null
# ATM
- id: cat_light_edge_003
difficulty: edge_case
tags: [should_be_null, atm]
input:
id: txn_light_edge_003
amount: 200.00
classification: expense
description: "ATM WITHDRAWAL 12345"
expected:
category_name: null
# Check
- id: cat_light_edge_004
difficulty: edge_case
tags: [should_be_null, check]
input:
id: txn_light_edge_004
amount: 350.00
classification: expense
description: "CHECK #1234"
expected:
category_name: null
# Cryptic
- id: cat_light_edge_005
difficulty: edge_case
tags: [should_be_null, cryptic]
input:
id: txn_light_edge_005
amount: 45.67
classification: expense
description: "TXN*89234*AUTH"
expected:
category_name: null

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,825 @@
---
name: chat_golden_v1
description: Golden dataset for chat/assistant function calling evaluation
eval_type: chat
version: "1.0"
metadata:
created_at: "2024-12-01"
source: manual_curation
samples:
# ===== EASY - Simple single function calls =====
- id: chat_easy_001
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "What accounts do I have?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_002
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "Show me my accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_003
difficulty: easy
tags: [get_accounts, balance]
input:
prompt: "What's my account balance?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_004
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "Show me my recent transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_005
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "What are my latest transactions?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_006
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "What's my net worth?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_007
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "Show me my assets and liabilities"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_008
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "What were my expenses last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_009
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "How much income did I make this month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_010
difficulty: easy
tags: [get_accounts, simple]
input:
prompt: "How many accounts do I have?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_011
difficulty: easy
tags: [get_transactions, simple]
input:
prompt: "List my transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_012
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "How much do I owe?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_013
difficulty: easy
tags: [get_balance_sheet, simple]
input:
prompt: "What are my total assets?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_014
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "Show my spending"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_015
difficulty: easy
tags: [get_income_statement, simple]
input:
prompt: "How much did I spend?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
# ===== MEDIUM - With filtering or specific parameters =====
- id: chat_medium_001
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "Show me my restaurant spending"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_002
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "What did I spend on groceries?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_003
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "Show transactions over $100"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_004
difficulty: medium
tags: [get_transactions, filtering]
input:
prompt: "What did I spend at Amazon?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_005
difficulty: medium
tags: [get_transactions, date_range]
input:
prompt: "Show me last week's transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_006
difficulty: medium
tags: [get_income_statement, date_range]
input:
prompt: "What was my income in January?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_007
difficulty: medium
tags: [get_income_statement, comparison]
input:
prompt: "How much did I save last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_008
difficulty: medium
tags: [get_accounts, specific]
input:
prompt: "What's the balance in my checking account?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_009
difficulty: medium
tags: [get_accounts, specific]
input:
prompt: "How much do I have in savings?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_010
difficulty: medium
tags: [get_transactions, category]
input:
prompt: "Show me all my subscription payments"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_011
difficulty: medium
tags: [get_transactions, search]
input:
prompt: "Find transactions from Uber"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_012
difficulty: medium
tags: [get_income_statement, category]
input:
prompt: "How much do I spend on entertainment?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_013
difficulty: medium
tags: [get_balance_sheet, trend]
input:
prompt: "How has my net worth changed over time?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_medium_014
difficulty: medium
tags: [get_transactions, amount]
input:
prompt: "What's my largest expense this month?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_015
difficulty: medium
tags: [get_income_statement, breakdown]
input:
prompt: "Break down my expenses by category"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_016
difficulty: medium
tags: [get_transactions, recurring]
input:
prompt: "Show me my recurring payments"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_017
difficulty: medium
tags: [get_accounts, credit]
input:
prompt: "What's my credit card balance?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_018
difficulty: medium
tags: [get_income_statement, specific]
input:
prompt: "How much did I spend on food last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_019
difficulty: medium
tags: [get_transactions, date]
input:
prompt: "Show transactions from December"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_020
difficulty: medium
tags: [get_balance_sheet, liability]
input:
prompt: "What are my debts?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
# ===== HARD - Analysis, comparisons, insights =====
- id: chat_hard_001
difficulty: hard
tags: [analysis, spending_trend]
input:
prompt: "Am I spending more than I make?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_002
difficulty: hard
tags: [comparison, month_over_month]
input:
prompt: "How does my spending this month compare to last month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_003
difficulty: hard
tags: [analysis, budget]
input:
prompt: "Where can I cut expenses?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_004
difficulty: hard
tags: [analysis, savings]
input:
prompt: "What's my savings rate?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_005
difficulty: hard
tags: [analysis, trend]
input:
prompt: "Are my expenses trending up or down?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_006
difficulty: hard
tags: [analysis, category]
input:
prompt: "What category do I spend the most on?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_007
difficulty: hard
tags: [analysis, unusual]
input:
prompt: "Are there any unusual transactions this month?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_008
difficulty: hard
tags: [analysis, debt]
input:
prompt: "How long will it take to pay off my credit card?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_hard_009
difficulty: hard
tags: [analysis, financial_health]
input:
prompt: "What's my debt-to-income ratio?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_hard_010
difficulty: hard
tags: [analysis, goals]
input:
prompt: "Can I afford to save $500 more per month?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_011
difficulty: hard
tags: [comparison, year_over_year]
input:
prompt: "How does this year compare to last year?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_012
difficulty: hard
tags: [analysis, pattern]
input:
prompt: "Do I have any spending patterns I should know about?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_013
difficulty: hard
tags: [advice, budget]
input:
prompt: "How should I allocate my income?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_014
difficulty: hard
tags: [analysis, efficiency]
input:
prompt: "Am I overspending on subscriptions?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_015
difficulty: hard
tags: [forecast, projection]
input:
prompt: "At this rate, how much will I have saved by year end?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
# ===== EDGE CASES - Unclear intent, no function needed =====
- id: chat_edge_001
difficulty: edge_case
tags: [no_function, greeting]
input:
prompt: "Hello"
expected:
functions: []
response_contains: []
- id: chat_edge_002
difficulty: edge_case
tags: [no_function, thanks]
input:
prompt: "Thank you!"
expected:
functions: []
response_contains: []
- id: chat_edge_003
difficulty: edge_case
tags: [no_function, general]
input:
prompt: "What can you help me with?"
expected:
functions: []
response_contains: []
- id: chat_edge_004
difficulty: edge_case
tags: [no_function, advice]
input:
prompt: "Should I invest in stocks?"
expected:
functions: []
response_contains: []
- id: chat_edge_005
difficulty: edge_case
tags: [no_function, external]
input:
prompt: "What's the weather like?"
expected:
functions: []
response_contains: []
- id: chat_edge_006
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "Tell me about my money"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_007
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "How am I doing financially?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_008
difficulty: edge_case
tags: [ambiguous]
input:
prompt: "Give me a summary"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_edge_009
difficulty: edge_case
tags: [no_function, off_topic]
input:
prompt: "What's 2 + 2?"
expected:
functions: []
response_contains: []
- id: chat_edge_010
difficulty: edge_case
tags: [no_function, general]
input:
prompt: "Who are you?"
expected:
functions: []
response_contains: []
# Additional samples
- id: chat_easy_016
difficulty: easy
tags: [get_transactions]
input:
prompt: "Pull up my transactions"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_easy_017
difficulty: easy
tags: [get_accounts]
input:
prompt: "Show all my bank accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_easy_018
difficulty: easy
tags: [get_balance_sheet]
input:
prompt: "What do I own?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_easy_019
difficulty: easy
tags: [get_income_statement]
input:
prompt: "What's my income?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_easy_020
difficulty: easy
tags: [get_transactions]
input:
prompt: "Recent purchases"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_021
difficulty: medium
tags: [get_transactions, merchant]
input:
prompt: "How much have I spent at Starbucks?"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_022
difficulty: medium
tags: [get_transactions, category]
input:
prompt: "Show transportation expenses"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_medium_023
difficulty: medium
tags: [get_income_statement, period]
input:
prompt: "Quarterly expense report"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_medium_024
difficulty: medium
tags: [get_accounts, type]
input:
prompt: "Show my investment accounts"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_medium_025
difficulty: medium
tags: [get_transactions, amount]
input:
prompt: "Transactions under $50"
expected:
functions:
- name: "get_transactions"
params: {}
response_contains: []
- id: chat_hard_016
difficulty: hard
tags: [analysis, discretionary]
input:
prompt: "How much discretionary spending do I have?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_017
difficulty: hard
tags: [analysis, fixed_vs_variable]
input:
prompt: "What are my fixed vs variable expenses?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []
- id: chat_hard_018
difficulty: hard
tags: [analysis, emergency_fund]
input:
prompt: "Do I have enough for an emergency fund?"
expected:
functions:
- name: "get_balance_sheet"
params: {}
response_contains: []
- id: chat_hard_019
difficulty: hard
tags: [analysis, liquidity]
input:
prompt: "How liquid are my assets?"
expected:
functions:
- name: "get_accounts"
params: {}
response_contains: []
- id: chat_hard_020
difficulty: hard
tags: [comparison, benchmark]
input:
prompt: "Am I spending too much on housing?"
expected:
functions:
- name: "get_income_statement"
params: {}
response_contains: []

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,81 @@
# Creates the four tables backing the LLM evaluation framework:
# eval_datasets (golden dataset containers), eval_samples (individual test
# cases), eval_runs (one evaluation execution per provider/model), and
# eval_results (per-sample outcomes within a run).
class CreateEvalTables < ActiveRecord::Migration[7.2]
  def change
    # Eval Datasets - Golden dataset containers
    create_table :eval_datasets, id: :uuid do |t|
      t.string :name, null: false
      t.string :description
      # Discriminator used to pick the runner class (e.g. "categorization", "chat").
      t.string :eval_type, null: false
      t.string :version, null: false, default: "1.0"
      # Denormalized count of associated samples, maintained by the application.
      t.integer :sample_count, default: 0
      t.jsonb :metadata, default: {}
      t.boolean :active, default: true
      t.timestamps
    end
    add_index :eval_datasets, :name, unique: true
    add_index :eval_datasets, [ :eval_type, :active ]
    # Eval Samples - Individual test cases
    create_table :eval_samples, id: :uuid do |t|
      t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
      t.jsonb :input_data, null: false
      t.jsonb :expected_output, null: false
      # Extra context (e.g. available categories) supplied alongside the input.
      t.jsonb :context_data, default: {}
      t.string :difficulty, default: "medium"
      t.string :tags, array: true, default: []
      t.jsonb :metadata, default: {}
      t.timestamps
    end
    add_index :eval_samples, [ :eval_dataset_id, :difficulty ]
    # GIN index keeps array-containment queries on tags fast.
    add_index :eval_samples, :tags, using: :gin
    # Eval Runs - Evaluation execution records
    create_table :eval_runs, id: :uuid do |t|
      t.references :eval_dataset, null: false, foreign_key: true, type: :uuid
      t.string :name
      t.string :status, null: false, default: "pending"
      t.string :provider, null: false
      t.string :model, null: false
      t.jsonb :provider_config, default: {}
      t.jsonb :metrics, default: {}
      t.integer :total_prompt_tokens, default: 0
      t.integer :total_completion_tokens, default: 0
      # scale: 6 accommodates sub-cent per-token pricing.
      t.decimal :total_cost, precision: 10, scale: 6, default: 0.0
      t.datetime :started_at
      t.datetime :completed_at
      t.text :error_message
      t.timestamps
    end
    add_index :eval_runs, [ :eval_dataset_id, :model ]
    add_index :eval_runs, [ :provider, :model ]
    add_index :eval_runs, :status
    # Eval Results - Individual sample results
    create_table :eval_results, id: :uuid do |t|
      t.references :eval_run, null: false, foreign_key: true, type: :uuid
      t.references :eval_sample, null: false, foreign_key: true, type: :uuid
      t.jsonb :actual_output, null: false
      t.boolean :correct, null: false
      # Match-type breakdown flags: exact vs hierarchical (parent-category)
      # match, plus whether a null category was expected / returned.
      t.boolean :exact_match, default: false
      t.boolean :hierarchical_match, default: false
      t.boolean :null_expected, default: false
      t.boolean :null_returned, default: false
      t.float :fuzzy_score
      t.integer :latency_ms
      t.integer :prompt_tokens
      t.integer :completion_tokens
      t.decimal :cost, precision: 10, scale: 6
      t.jsonb :metadata, default: {}
      t.timestamps
    end
    add_index :eval_results, [ :eval_run_id, :correct ]
    # eval_sample_id index is automatically created by t.references
  end
end

View File

@@ -0,0 +1,5 @@
# Adds a flag recording that the model's answer matched an accepted
# alternative category rather than the primary expected one.
class AddAlternativeMatchToEvalResults < ActiveRecord::Migration[7.2]
  def change
    add_column :eval_results, :alternative_match, :boolean, default: false
  end
end

96
db/schema.rb generated
View File

@@ -307,6 +307,80 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.index ["import_id"], name: "index_entries_on_import_id"
end
create_table "eval_datasets", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.string "name", null: false
t.string "description"
t.string "eval_type", null: false
t.string "version", default: "1.0", null: false
t.integer "sample_count", default: 0
t.jsonb "metadata", default: {}
t.boolean "active", default: true
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_type", "active"], name: "index_eval_datasets_on_eval_type_and_active"
t.index ["name"], name: "index_eval_datasets_on_name", unique: true
end
create_table "eval_results", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_run_id", null: false
t.uuid "eval_sample_id", null: false
t.jsonb "actual_output", null: false
t.boolean "correct", null: false
t.boolean "exact_match", default: false
t.boolean "hierarchical_match", default: false
t.boolean "null_expected", default: false
t.boolean "null_returned", default: false
t.float "fuzzy_score"
t.integer "latency_ms"
t.integer "prompt_tokens"
t.integer "completion_tokens"
t.decimal "cost", precision: 10, scale: 6
t.jsonb "metadata", default: {}
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.boolean "alternative_match", default: false
t.index ["eval_run_id", "correct"], name: "index_eval_results_on_eval_run_id_and_correct"
t.index ["eval_run_id"], name: "index_eval_results_on_eval_run_id"
t.index ["eval_sample_id"], name: "index_eval_results_on_eval_sample_id"
end
create_table "eval_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_dataset_id", null: false
t.string "name"
t.string "status", default: "pending", null: false
t.string "provider", null: false
t.string "model", null: false
t.jsonb "provider_config", default: {}
t.jsonb "metrics", default: {}
t.integer "total_prompt_tokens", default: 0
t.integer "total_completion_tokens", default: 0
t.decimal "total_cost", precision: 10, scale: 6, default: "0.0"
t.datetime "started_at"
t.datetime "completed_at"
t.text "error_message"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_dataset_id", "model"], name: "index_eval_runs_on_eval_dataset_id_and_model"
t.index ["eval_dataset_id"], name: "index_eval_runs_on_eval_dataset_id"
t.index ["provider", "model"], name: "index_eval_runs_on_provider_and_model"
t.index ["status"], name: "index_eval_runs_on_status"
end
create_table "eval_samples", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "eval_dataset_id", null: false
t.jsonb "input_data", null: false
t.jsonb "expected_output", null: false
t.jsonb "context_data", default: {}
t.string "difficulty", default: "medium"
t.string "tags", default: [], array: true
t.jsonb "metadata", default: {}
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["eval_dataset_id", "difficulty"], name: "index_eval_samples_on_eval_dataset_id_and_difficulty"
t.index ["eval_dataset_id"], name: "index_eval_samples_on_eval_dataset_id"
t.index ["tags"], name: "index_eval_samples_on_tags", using: :gin
end
create_table "exchange_rates", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.string "from_currency", null: false
t.string "to_currency", null: false
@@ -789,6 +863,21 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.index ["rule_id"], name: "index_rule_conditions_on_rule_id"
end
create_table "rule_runs", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "rule_id", null: false
t.string "execution_type", null: false
t.string "status", null: false
t.integer "transactions_processed", default: 0, null: false
t.integer "transactions_modified", default: 0, null: false
t.datetime "executed_at", null: false
t.text "error_message"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["executed_at"], name: "index_rule_runs_on_executed_at"
t.index ["rule_id", "executed_at"], name: "index_rule_runs_on_rule_id_and_executed_at"
t.index ["rule_id"], name: "index_rule_runs_on_rule_id"
end
create_table "rules", id: :uuid, default: -> { "gen_random_uuid()" }, force: :cascade do |t|
t.uuid "family_id", null: false
t.string "resource_type", null: false
@@ -991,6 +1080,8 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
t.datetime "updated_at", null: false
t.string "currency"
t.jsonb "locked_attributes", default: {}
t.uuid "category_id"
t.index ["category_id"], name: "index_trades_on_category_id"
t.index ["security_id"], name: "index_trades_on_security_id"
end
@@ -1095,6 +1186,10 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
add_foreign_key "enable_banking_items", "families"
add_foreign_key "entries", "accounts", on_delete: :cascade
add_foreign_key "entries", "imports"
add_foreign_key "eval_results", "eval_runs"
add_foreign_key "eval_results", "eval_samples"
add_foreign_key "eval_runs", "eval_datasets"
add_foreign_key "eval_samples", "eval_datasets"
add_foreign_key "family_exports", "families"
add_foreign_key "holdings", "account_providers"
add_foreign_key "holdings", "accounts", on_delete: :cascade
@@ -1136,6 +1231,7 @@ ActiveRecord::Schema[7.2].define(version: 2025_12_06_131244) do
add_foreign_key "taggings", "tags"
add_foreign_key "tags", "families"
add_foreign_key "tool_calls", "messages"
add_foreign_key "trades", "categories"
add_foreign_key "trades", "securities"
add_foreign_key "transactions", "categories", on_delete: :nullify
add_foreign_key "transactions", "merchants"

739
lib/tasks/evals.rake Normal file
View File

@@ -0,0 +1,739 @@
namespace :evals do
# Prints every Eval::Dataset grouped by eval_type, with version,
# sample count and active/inactive status.
desc "List all evaluation datasets"
task list_datasets: :environment do
  datasets = Eval::Dataset.order(:eval_type, :name)
  if datasets.empty?
    puts "No datasets found. Import a dataset with: rake evals:import_dataset[path/to/file.yml]"
    next
  end
  puts "=" * 80
  puts "Available Evaluation Datasets"
  puts "=" * 80
  puts
  datasets.group_by(&:eval_type).each do |eval_type, type_datasets|
    puts "#{eval_type.titleize}:"
    puts "-" * 40
    type_datasets.each do |dataset|
      status = dataset.active ? "active" : "inactive"
      puts " #{dataset.name} (v#{dataset.version}) - #{dataset.sample_count} samples [#{status}]"
      puts " #{dataset.description}" if dataset.description.present?
    end
    puts
  end
end
# Imports a golden dataset (and its samples) from a YAML definition file.
# The path comes from the task argument or the FILE env var.
desc "Import dataset from YAML file"
task :import_dataset, [ :file_path ] => :environment do |_t, args|
  file_path = args[:file_path] || ENV["FILE"]
  if file_path.blank?
    puts "Usage: rake evals:import_dataset[path/to/file.yml]"
    puts " or: FILE=path/to/file.yml rake evals:import_dataset"
    exit 1
  end
  unless File.exist?(file_path)
    puts "Error: File not found: #{file_path}"
    exit 1
  end
  puts "Importing dataset from #{file_path}..."
  dataset = Eval::Dataset.import_from_yaml(file_path)
  puts "Successfully imported dataset:"
  puts " Name: #{dataset.name}"
  puts " Type: #{dataset.eval_type}"
  puts " Version: #{dataset.version}"
  puts " Samples: #{dataset.sample_count}"
  stats = dataset.statistics
  puts " By difficulty: #{stats[:by_difficulty].map { |k, v| "#{k}=#{v}" }.join(', ')}"
end
# Runs a full evaluation of one model against one dataset and prints the
# resulting metrics. Arguments may also be supplied via the DATASET /
# MODEL / PROVIDER environment variables.
desc "Run evaluation against a model"
task :run, [ :dataset_name, :model ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  model = args[:model] || ENV["MODEL"] || "gpt-4.1"
  provider = ENV["PROVIDER"] || "openai"
  if dataset_name.blank?
    puts "Usage: rake evals:run[dataset_name,model]"
    puts " or: DATASET=name MODEL=gpt-4 rake evals:run"
    exit 1
  end
  dataset = Eval::Dataset.find_by(name: dataset_name)
  if dataset.nil?
    puts "Error: Dataset '#{dataset_name}' not found"
    puts "Available datasets:"
    Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
    exit 1
  end
  run_name = "#{dataset_name}_#{model}_#{Time.current.strftime('%Y%m%d_%H%M%S')}"
  puts "=" * 80
  puts "Starting Evaluation Run"
  puts "=" * 80
  puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
  puts " Type: #{dataset.eval_type}"
  puts " Model: #{model}"
  puts " Provider: #{provider}"
  puts " Run Name: #{run_name}"
  puts
  # Created as "pending"; the runner is responsible for advancing status.
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: provider,
    model: model,
    name: run_name,
    status: "pending"
  )
  # Each eval_type maps to its own runner class (Eval::Dataset#runner_class).
  runner = dataset.runner_class.new(eval_run)
  puts "Running evaluation..."
  start_time = Time.current
  begin
    result = runner.run
    duration = (Time.current - start_time).round(1)
    puts
    puts "=" * 80
    puts "Evaluation Complete"
    puts "=" * 80
    puts " Status: #{result.status}"
    puts " Duration: #{duration}s"
    puts " Run ID: #{result.id}"
    puts
    puts "Metrics:"
    result.metrics.each do |key, value|
      next if value.is_a?(Hash) # Skip nested metrics for summary
      puts " #{key}: #{format_metric_value(value)}"
    end
    # Show difficulty breakdown if available
    if result.metrics["by_difficulty"].present?
      puts
      puts "By Difficulty:"
      result.metrics["by_difficulty"].each do |difficulty, stats|
        puts " #{difficulty}: #{stats['accuracy']}% accuracy (#{stats['correct']}/#{stats['count']})"
      end
    end
  rescue => e
    puts
    puts "Evaluation FAILED: #{e.message}"
    puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
    exit 1
  end
end
# Evaluates several models (MODELS=a,b,c) against the same dataset
# sequentially, then prints a side-by-side comparison table plus
# best-accuracy / lowest-cost / fastest recommendations.
desc "Compare multiple models on a dataset"
task :compare, [ :dataset_name ] => :environment do |_t, args|
  dataset_name = args[:dataset_name] || ENV["DATASET"]
  models = (ENV["MODELS"] || "gpt-4.1,gpt-4o-mini").split(",").map(&:strip)
  provider = ENV["PROVIDER"] || "openai"
  if dataset_name.blank?
    puts "Usage: MODELS=model1,model2 rake evals:compare[dataset_name]"
    exit 1
  end
  dataset = Eval::Dataset.find_by!(name: dataset_name)
  puts "=" * 80
  puts "Model Comparison"
  puts "=" * 80
  puts " Dataset: #{dataset.name}"
  puts " Models: #{models.join(', ')}"
  puts
  # One completed Eval::Run per model, in the order given.
  runs = models.map do |model|
    puts "Running evaluation for #{model}..."
    eval_run = Eval::Run.create!(
      dataset: dataset,
      provider: provider,
      model: model,
      name: "compare_#{model}_#{Time.current.to_i}",
      status: "pending"
    )
    runner = dataset.runner_class.new(eval_run)
    runner.run
  end
  puts
  puts "=" * 80
  puts "Comparison Results"
  puts "=" * 80
  puts
  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table
  summary = reporter.summary
  if summary.present?
    puts
    puts "Recommendations:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
    puts
    puts " #{summary[:recommendation]}"
  end
  # Export to CSV if requested
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Prints a comparison table for specific run IDs (comma-separated), or
# for the five most recent completed runs when none are given.
desc "Generate report for specific runs"
task :report, [ :run_ids ] => :environment do |_t, args|
  run_ids = (args[:run_ids] || ENV["RUN_IDS"])&.split(",")
  runs = if run_ids.present?
    Eval::Run.where(id: run_ids)
  else
    Eval::Run.completed.order(created_at: :desc).limit(5)
  end
  if runs.empty?
    puts "No runs found."
    exit 1
  end
  reporter = Eval::Reporters::ComparisonReporter.new(runs)
  puts reporter.to_table
  summary = reporter.summary
  if summary.present?
    puts
    puts "Summary:"
    puts " Best Accuracy: #{summary[:best_accuracy][:model]} (#{summary[:best_accuracy][:value]}%)"
    puts " Lowest Cost: #{summary[:lowest_cost][:model]} ($#{summary[:lowest_cost][:value]})"
    puts " Fastest: #{summary[:fastest][:model]} (#{summary[:fastest][:value]}ms)"
  end
  if ENV["CSV"].present?
    csv_path = reporter.to_csv(ENV["CSV"])
    puts
    puts "Exported to: #{csv_path}"
  end
end
# Sends one tiny categorization request through the configured OpenAI
# provider to verify credentials and connectivity end-to-end.
desc "Quick smoke test to verify provider configuration"
task smoke_test: :environment do
  puts "Running smoke test..."
  provider = Provider::Registry.get_provider(:openai)
  unless provider
    puts "FAIL: OpenAI provider not configured"
    puts "Set OPENAI_ACCESS_TOKEN environment variable or configure in settings"
    exit 1
  end
  puts " Provider: #{provider.provider_name}"
  # NOTE(review): reaches into provider internals; a public accessor for the
  # default model would be cleaner — confirm none exists before changing.
  puts " Model: #{provider.instance_variable_get(:@default_model)}"
  # Test with a single categorization sample
  result = provider.auto_categorize(
    transactions: [
      { id: "test", amount: 10, classification: "expense", description: "McDonalds" }
    ],
    user_categories: [
      { id: "1", name: "Food & Drink", classification: "expense" }
    ]
  )
  if result.success?
    category = result.data.first&.category_name
    puts " Test result: #{category || 'null'}"
    puts
    puts "PASS: Provider is working correctly"
  else
    puts "FAIL: #{result.error.message}"
    exit 1
  end
end
# CI gate: runs the eval and fails (exit 1) when accuracy drops more than
# 5 points below the latest completed baseline run for the same model, or
# falls under EVAL_THRESHOLD (default 80%). A missing dataset exits 0 so
# CI environments without seeded eval data can skip the check.
desc "Run CI regression test"
task ci_regression: :environment do
  dataset_name = ENV["EVAL_DATASET"] || "categorization_golden_v1"
  model = ENV["EVAL_MODEL"] || "gpt-4.1-mini"
  threshold = (ENV["EVAL_THRESHOLD"] || "80").to_f
  dataset = Eval::Dataset.find_by(name: dataset_name)
  unless dataset
    puts "Dataset '#{dataset_name}' not found. Skipping CI regression test."
    exit 0
  end
  # Get baseline from last successful run
  baseline_run = dataset.runs.completed.for_model(model).order(created_at: :desc).first
  # Run new evaluation
  eval_run = Eval::Run.create!(
    dataset: dataset,
    provider: "openai",
    model: model,
    name: "ci_regression_#{Time.current.to_i}",
    status: "pending"
  )
  runner = dataset.runner_class.new(eval_run)
  result = runner.run
  current_accuracy = result.metrics["accuracy"] || 0
  puts "CI Regression Test Results:"
  puts " Model: #{model}"
  puts " Current Accuracy: #{current_accuracy}%"
  if baseline_run
    baseline_accuracy = baseline_run.metrics["accuracy"] || 0
    puts " Baseline Accuracy: #{baseline_accuracy}%"
    accuracy_diff = current_accuracy - baseline_accuracy
    # Hard regression gate: more than 5 points below baseline fails the build.
    if accuracy_diff < -5
      puts
      puts "REGRESSION DETECTED!"
      puts "Accuracy dropped by #{accuracy_diff.abs}% (threshold: 5%)"
      exit 1
    end
    puts " Difference: #{accuracy_diff > 0 ? '+' : ''}#{accuracy_diff.round(2)}%"
  end
  if current_accuracy < threshold
    puts
    puts "BELOW THRESHOLD!"
    puts "Accuracy #{current_accuracy}% is below required #{threshold}%"
    exit 1
  end
  puts
  puts "CI Regression Test PASSED"
end
# Lists the 20 most recent runs, one line each: status icon, short run id,
# model, dataset, accuracy and timestamp, aligned with ljust/rjust.
desc "List recent evaluation runs"
task list_runs: :environment do
  runs = Eval::Run.order(created_at: :desc).limit(20)
  if runs.empty?
    puts "No runs found."
    next
  end
  puts "=" * 100
  puts "Recent Evaluation Runs"
  puts "=" * 100
  runs.each do |run|
    status_icon = case run.status
    when "completed" then "[OK]"
    when "failed" then "[FAIL]"
    when "running" then "[...]"
    else "[?]"
    end
    accuracy = run.metrics["accuracy"] ? "#{run.metrics['accuracy']}%" : "-"
    puts "#{status_icon} #{run.id[0..7]} | #{run.model.ljust(15)} | #{run.dataset.name.ljust(25)} | #{accuracy.rjust(8)} | #{run.created_at.strftime('%Y-%m-%d %H:%M')}"
  end
end
# Shows full details for one run: metadata, metrics, and a sample of up to
# five incorrect results. The run can be referenced by full UUID or prefix.
desc "Show details for a specific run"
task :show_run, [ :run_id ] => :environment do |_t, args|
  run_id = args[:run_id] || ENV["RUN_ID"]
  if run_id.blank?
    puts "Usage: rake evals:show_run[run_id]"
    exit 1
  end
  # Falls back to a UUID-prefix match (LIKE on id::text) for convenience.
  run = Eval::Run.find_by(id: run_id) || Eval::Run.find_by("id::text LIKE ?", "#{run_id}%")
  unless run
    puts "Run not found: #{run_id}"
    exit 1
  end
  puts "=" * 80
  puts "Evaluation Run Details"
  puts "=" * 80
  puts
  puts "Run ID: #{run.id}"
  puts "Name: #{run.name}"
  puts "Dataset: #{run.dataset.name}"
  puts "Model: #{run.model}"
  puts "Provider: #{run.provider}"
  puts "Status: #{run.status}"
  puts "Created: #{run.created_at}"
  puts "Duration: #{run.duration_seconds}s" if run.duration_seconds
  if run.error_message.present?
    puts
    puts "Error: #{run.error_message}"
  end
  if run.metrics.present?
    puts
    puts "Metrics:"
    run.metrics.each do |key, value|
      if value.is_a?(Hash)
        puts " #{key}:"
        value.each { |k, v| puts " #{k}: #{v}" }
      else
        puts " #{key}: #{format_metric_value(value)}"
      end
    end
  end
  # Show sample of incorrect results
  incorrect = run.results.incorrect.limit(5)
  if incorrect.any?
    puts
    puts "Sample Incorrect Results (#{run.results.incorrect.count} total):"
    incorrect.each do |result|
      puts " Sample: #{result.sample_id[0..7]}"
      puts " Expected: #{result.sample.expected_output}"
      puts " Actual: #{result.actual_output}"
      puts
    end
  end
end
# =============================================================================
# Langfuse Integration
# =============================================================================
namespace :langfuse do
  # Verifies credentials and connectivity by issuing a one-item dataset list.
  desc "Check Langfuse configuration"
  task check: :environment do
    begin
      client = Eval::Langfuse::Client.new
      puts "✓ Langfuse credentials configured"
      # Try to list datasets to verify connection
      # (the response itself is unused — the call exists for its side effect
      # of exercising the authenticated API).
      response = client.list_datasets(limit: 1)
      puts "✓ Successfully connected to Langfuse"
      puts " Region: #{ENV['LANGFUSE_REGION'] || 'us (default)'}"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Failed to connect to Langfuse: #{e.message}"
      exit 1
    end
  end
  # Pushes a local golden dataset (and its samples) to Langfuse.
  desc "Upload dataset to Langfuse"
  task :upload_dataset, [ :dataset_name ] => :environment do |_t, args|
    dataset_name = args[:dataset_name] || ENV["DATASET"]
    if dataset_name.blank?
      puts "Usage: rake evals:langfuse:upload_dataset[dataset_name]"
      puts " or: DATASET=name rake evals:langfuse:upload_dataset"
      exit 1
    end
    dataset = Eval::Dataset.find_by(name: dataset_name)
    if dataset.nil?
      puts "Error: Dataset '#{dataset_name}' not found"
      puts "Available datasets:"
      Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
      exit 1
    end
    puts "=" * 80
    puts "Uploading Dataset to Langfuse"
    puts "=" * 80
    puts " Dataset: #{dataset.name}"
    puts " Type: #{dataset.eval_type}"
    puts " Samples: #{dataset.sample_count}"
    puts
    begin
      exporter = Eval::Langfuse::DatasetExporter.new(dataset)
      result = exporter.export
      puts
      puts "✓ Successfully uploaded dataset to Langfuse"
      puts " Langfuse dataset name: #{result[:dataset_name]}"
      puts " Items exported: #{result[:items_exported]}"
      puts
      puts "View in Langfuse: https://cloud.langfuse.com/project/datasets"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    end
  end
  # Runs a model against a dataset with results recorded as a Langfuse
  # experiment run (traces + scores), rather than locally.
  desc "Run experiment in Langfuse"
  task :run_experiment, [ :dataset_name, :model ] => :environment do |_t, args|
    dataset_name = args[:dataset_name] || ENV["DATASET"]
    model = args[:model] || ENV["MODEL"] || "gpt-4.1"
    provider = ENV["PROVIDER"] || "openai"
    run_name = ENV["RUN_NAME"]
    if dataset_name.blank?
      puts "Usage: rake evals:langfuse:run_experiment[dataset_name,model]"
      puts " or: DATASET=name MODEL=gpt-4.1 rake evals:langfuse:run_experiment"
      puts
      puts "Optional environment variables:"
      puts " PROVIDER=openai (default)"
      puts " RUN_NAME=custom_run_name"
      exit 1
    end
    dataset = Eval::Dataset.find_by(name: dataset_name)
    if dataset.nil?
      puts "Error: Dataset '#{dataset_name}' not found"
      puts "Available datasets:"
      Eval::Dataset.pluck(:name).each { |n| puts " - #{n}" }
      exit 1
    end
    puts "=" * 80
    puts "Running Langfuse Experiment"
    puts "=" * 80
    puts " Dataset: #{dataset.name} (#{dataset.sample_count} samples)"
    puts " Type: #{dataset.eval_type}"
    puts " Model: #{model}"
    puts " Provider: #{provider}"
    puts
    begin
      runner = Eval::Langfuse::ExperimentRunner.new(
        dataset,
        model: model,
        provider: provider
      )
      start_time = Time.current
      result = runner.run(run_name: run_name)
      duration = (Time.current - start_time).round(1)
      puts
      puts "=" * 80
      puts "Experiment Complete"
      puts "=" * 80
      puts " Run Name: #{result[:run_name]}"
      puts " Duration: #{duration}s"
      puts
      puts "Results:"
      puts " Accuracy: #{result[:metrics][:accuracy]}%"
      puts " Correct: #{result[:metrics][:correct]}/#{result[:metrics][:total]}"
      puts " Avg Latency: #{result[:metrics][:avg_latency_ms]}ms"
      puts
      puts "View in Langfuse:"
      puts " Dataset: https://cloud.langfuse.com/project/datasets"
      puts " Traces: https://cloud.langfuse.com/project/traces"
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    rescue => e
      puts "✗ Error: #{e.message}"
      puts e.backtrace.first(5).join("\n") if ENV["DEBUG"]
      exit 1
    end
  end
  # Lists up to 100 datasets that already exist on the Langfuse side.
  desc "List datasets in Langfuse"
  task list_datasets: :environment do
    begin
      client = Eval::Langfuse::Client.new
      response = client.list_datasets(limit: 100)
      datasets = response["data"] || []
      if datasets.empty?
        puts "No datasets found in Langfuse."
        puts "Upload a dataset with: rake evals:langfuse:upload_dataset[dataset_name]"
        next
      end
      puts "=" * 80
      puts "Langfuse Datasets"
      puts "=" * 80
      puts
      datasets.each do |ds|
        puts " #{ds['name']}"
        puts " Description: #{ds['description']}" if ds["description"].present?
        puts " Created: #{ds['createdAt']}"
        puts " Metadata: #{ds['metadata']}" if ds["metadata"].present?
        puts
      end
    rescue Eval::Langfuse::Client::ConfigurationError => e
      puts "#{e.message}"
      exit 1
    rescue Eval::Langfuse::Client::ApiError => e
      puts "✗ Langfuse API error: #{e.message}"
      exit 1
    end
  end
end
# Exports transactions whose category was set by hand as a golden YAML
# dataset re-importable via evals:import_dataset. "Manually categorized"
# means locked_attributes contains "category_id" AND no DataEnrichment
# record exists for that attribute (i.e. not set by AI/rules/providers).
#
# Fix: the relation is now materialized once with to_a. Previously
# `.count` on the eager-loaded, limited relation issued an extra
# (DISTINCT-wrapped) COUNT query and the subsequent `.map` re-ran the
# full query a second time.
desc "Export manually categorized transactions as golden data"
task :export_manual_categories, [ :family_id ] => :environment do |_t, args|
  family_id = args[:family_id] || ENV["FAMILY_ID"]
  output_path = ENV["OUTPUT"] || "db/eval_data/categorization_manual_export.yml"
  limit = (ENV["LIMIT"] || 500).to_i
  if family_id.blank?
    puts "Usage: rake evals:export_manual_categories[family_id]"
    puts " or: FAMILY_ID=uuid rake evals:export_manual_categories"
    puts
    puts "Optional environment variables:"
    puts " OUTPUT=path/to/output.yml (default: db/eval_data/categorization_manual_export.yml)"
    puts " LIMIT=500 (default: 500)"
    exit 1
  end
  family = Family.find_by(id: family_id)
  if family.nil?
    puts "Error: Family '#{family_id}' not found"
    exit 1
  end
  puts "=" * 80
  puts "Exporting Manually Categorized Transactions"
  puts "=" * 80
  puts " Family: #{family.name}"
  puts " Output: #{output_path}"
  puts " Limit: #{limit}"
  puts
  # Find transactions that have:
  # 1. A category assigned
  # 2. locked_attributes contains "category_id" (meaning user manually set it)
  # 3. No DataEnrichment record for category_id (meaning it wasn't set by AI/rules/etc)
  manually_categorized = Transaction
    .joins(:entry)
    .joins("INNER JOIN accounts ON accounts.id = entries.account_id")
    .where(accounts: { family_id: family_id })
    .where.not(category_id: nil)
    .where("transactions.locked_attributes ? 'category_id'")
    .where.not(
      id: DataEnrichment
        .where(enrichable_type: "Transaction", attribute_name: "category_id")
        .select(:enrichable_id)
    )
    .includes(:category, entry: :account)
    .limit(limit)
  # Materialize once so the count and the sample-building loop below share
  # a single query execution.
  records = manually_categorized.to_a
  count = records.size
  if count == 0
    puts "No manually categorized transactions found."
    puts
    puts "Manually categorized transactions are those where:"
    puts " - User set a category manually (locked_attributes contains 'category_id')"
    puts " - Category was NOT set by AI, rules, or data enrichment sources"
    exit 0
  end
  puts "Found #{count} manually categorized transactions"
  puts
  # Build category context from family's categories
  categories = family.categories.includes(:parent).map do |cat|
    {
      "id" => cat.id.to_s,
      "name" => cat.name,
      "classification" => cat.classification,
      "is_subcategory" => cat.subcategory?,
      # compact drops the nil parent_id for top-level categories.
      "parent_id" => cat.parent_id&.to_s
    }.compact
  end
  # Build samples: one YAML sample per transaction, keyed manual_1..manual_N.
  samples = records.map.with_index do |txn, idx|
    entry = txn.entry
    sample_id = "manual_#{idx + 1}"
    {
      "id" => sample_id,
      "difficulty" => "manual",
      "tags" => [ txn.category.name.parameterize.underscore, "manual_export" ],
      "input" => {
        "id" => txn.id.to_s,
        "amount" => entry.amount.to_f.abs,
        "classification" => entry.classification,
        "description" => entry.name
      },
      "expected" => {
        "category_name" => txn.category.name
      }
    }
  end
  # Build output structure
  output = {
    "name" => "categorization_manual_export",
    "description" => "Golden dataset exported from manually categorized user transactions",
    "eval_type" => "categorization",
    "version" => "1.0",
    "metadata" => {
      "created_at" => Time.current.strftime("%Y-%m-%d"),
      "source" => "manual_export",
      "family_id" => family_id,
      "exported_count" => samples.size
    },
    "context" => {
      "categories" => categories
    },
    "samples" => samples
  }
  # Write to file
  FileUtils.mkdir_p(File.dirname(output_path))
  File.write(output_path, output.to_yaml)
  puts "✓ Successfully exported #{samples.size} samples"
  puts " Difficulty: manual"
  puts
  puts "Output written to: #{output_path}"
  puts
  puts "To import this dataset, run:"
  puts " rake evals:import_dataset[#{output_path}]"
end
private

# Formats a metric value for console display: floating-point quantities
# (Float or BigDecimal) are rounded to four decimal places; every other
# value is passed through unchanged.
def format_metric_value(value)
  if value.is_a?(Float) || value.is_a?(BigDecimal)
    value.to_f.round(4)
  else
    value
  end
end
end

View File

@@ -0,0 +1,118 @@
require "test_helper"
# Unit tests for Eval::Dataset: validations, eval_type scopes, YAML import,
# statistics rollups, and the eval_type -> runner class mapping.
class Eval::DatasetTest < ActiveSupport::TestCase
  test "validates presence of name and eval_type" do
    dataset = Eval::Dataset.new
    assert_not dataset.valid?
    assert_includes dataset.errors[:name], "can't be blank"
    assert_includes dataset.errors[:eval_type], "can't be blank"
  end
  test "validates eval_type is one of allowed values" do
    dataset = Eval::Dataset.new(name: "test", eval_type: "invalid")
    assert_not dataset.valid?
    assert_includes dataset.errors[:eval_type], "is not included in the list"
    # A recognised eval_type clears the inclusion error.
    dataset.eval_type = "categorization"
    dataset.valid?
    assert_empty dataset.errors[:eval_type]
  end
  test "validates name uniqueness" do
    Eval::Dataset.create!(name: "unique_test", eval_type: "categorization")
    duplicate = Eval::Dataset.new(name: "unique_test", eval_type: "categorization")
    assert_not duplicate.valid?
    assert_includes duplicate.errors[:name], "has already been taken"
  end
  test "scopes filter by eval_type" do
    cat_dataset = Eval::Dataset.create!(name: "cat_test", eval_type: "categorization")
    merch_dataset = Eval::Dataset.create!(name: "merch_test", eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.create!(name: "chat_test", eval_type: "chat")
    assert_includes Eval::Dataset.for_categorization, cat_dataset
    assert_not_includes Eval::Dataset.for_categorization, merch_dataset
    assert_includes Eval::Dataset.for_merchant_detection, merch_dataset
    assert_not_includes Eval::Dataset.for_merchant_detection, cat_dataset
    assert_includes Eval::Dataset.for_chat, chat_dataset
    assert_not_includes Eval::Dataset.for_chat, cat_dataset
  end
  test "import_from_yaml creates dataset with samples" do
    # Minimal but complete dataset definition: one category, one sample.
    yaml_content = <<~YAML
      name: test_import
      description: Test dataset
      eval_type: categorization
      version: "1.0"
      context:
        categories:
          - id: "food"
            name: "Food"
            classification: "expense"
      samples:
        - id: sample_1
          difficulty: easy
          tags: [test]
          input:
            id: txn_1
            amount: 10
            classification: expense
            description: "Test transaction"
          expected:
            category_name: "Food"
    YAML
    file_path = Rails.root.join("tmp", "test_import.yml")
    File.write(file_path, yaml_content)
    dataset = Eval::Dataset.import_from_yaml(file_path)
    assert_equal "test_import", dataset.name
    assert_equal "categorization", dataset.eval_type
    assert_equal 1, dataset.samples.count
    assert_equal "easy", dataset.samples.first.difficulty
    assert_equal "Food", dataset.samples.first.expected_output["category_name"]
  ensure
    # Always clean up the temp fixture, even when an assertion fails.
    File.delete(file_path) if File.exist?(file_path)
  end
  test "statistics returns sample breakdown" do
    dataset = Eval::Dataset.create!(name: "stats_test", eval_type: "categorization")
    dataset.samples.create!(
      input_data: { id: "1" },
      expected_output: { category_name: "Food" },
      difficulty: "easy",
      tags: [ "food" ]
    )
    dataset.samples.create!(
      input_data: { id: "2" },
      expected_output: { category_name: "Travel" },
      difficulty: "medium",
      tags: [ "travel" ]
    )
    stats = dataset.statistics
    assert_equal 2, stats[:total_samples]
    assert_equal({ "easy" => 1, "medium" => 1 }, stats[:by_difficulty])
    assert_includes stats[:by_tags], "food"
    assert_includes stats[:by_tags], "travel"
  end
  test "runner_class returns correct class for each eval_type" do
    cat_dataset = Eval::Dataset.new(eval_type: "categorization")
    merch_dataset = Eval::Dataset.new(eval_type: "merchant_detection")
    chat_dataset = Eval::Dataset.new(eval_type: "chat")
    assert_equal Eval::Runners::CategorizationRunner, cat_dataset.runner_class
    assert_equal Eval::Runners::MerchantDetectionRunner, merch_dataset.runner_class
    assert_equal Eval::Runners::ChatRunner, chat_dataset.runner_class
  end
end

View File

@@ -0,0 +1,212 @@
require "test_helper"
class Eval::Runners::CategorizationRunnerTest < ActiveSupport::TestCase
include ProviderTestHelper
setup do
  # Two-level category tree: "Fast Food" is a child of "Food & Drink",
  # exercising hierarchical matching in the runner.
  @categories = [
    { "id" => "food", "name" => "Food & Drink", "classification" => "expense" },
    { "id" => "fast_food", "name" => "Fast Food", "classification" => "expense", "parent_id" => "food" }
  ]
end
test "run processes all samples and calculates metrics" do
dataset = Eval::Dataset.create!(
name: "test_cat_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample1 = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
sample2 = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample1.id, category_name: "Fast Food"),
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample2.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
assert_equal "completed", result.status
assert_equal 2, result.results.count
assert result.metrics["accuracy"].present?
end
test "records correct result when category matches" do
dataset = Eval::Dataset.create!(
name: "test_cat_match_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Fast Food")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.exact_match
assert_equal "Fast Food", result.actual_output["category_name"]
end
test "records hierarchical match when parent category returned" do
dataset = Eval::Dataset.create!(
name: "test_cat_hier_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_3", "amount" => 50, "classification" => "expense", "description" => "Olive Garden" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "medium"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_hierarchical",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
# Model returns parent category instead of subcategory
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "Food & Drink")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert_not result.exact_match
assert result.hierarchical_match
end
test "handles null correctly when expected" do
dataset = Eval::Dataset.create!(
name: "test_cat_null_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_2", "amount" => 100, "classification" => "expense", "description" => "ATM Withdrawal" },
expected_output: { "category_name" => nil },
context_data: { "categories" => @categories },
difficulty: "edge_case"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
mock_response = provider_success_response([
Provider::LlmConcept::AutoCategorization.new(transaction_id: sample.id, category_name: "null")
])
Provider::Openai.any_instance.stubs(:auto_categorize).returns(mock_response)
runner = Eval::Runners::CategorizationRunner.new(eval_run)
runner.run
result = eval_run.results.find_by(eval_sample_id: sample.id)
assert result.correct
assert result.null_expected
assert result.null_returned
end
test "records error results on provider error but completes run" do
dataset = Eval::Dataset.create!(
name: "test_cat_err_#{SecureRandom.hex(4)}",
eval_type: "categorization",
version: "1.0"
)
sample = dataset.samples.create!(
input_data: { "id" => "txn_1", "amount" => 10, "classification" => "expense", "description" => "McDonalds" },
expected_output: { "category_name" => "Fast Food" },
context_data: { "categories" => @categories },
difficulty: "easy"
)
eval_run = Eval::Run.create!(
dataset: dataset,
provider: "openai",
model: "gpt-4.1",
name: "test_run",
provider_config: { "access_token" => "test-token" },
status: "pending"
)
Provider::Openai.any_instance.stubs(:auto_categorize).raises(StandardError.new("API Error"))
runner = Eval::Runners::CategorizationRunner.new(eval_run)
result = runner.run
# Run completes but with error results
assert_equal "completed", result.status
assert_equal 1, result.results.count
error_result = result.results.find_by(eval_sample_id: sample.id)
assert_not error_result.correct
assert_includes error_result.actual_output["error"], "API Error"
end
end