mirror of
https://github.com/we-promise/sure
synced 2026-04-25 17:15:07 +02:00
feat(vector-store): Implement pgvector adapter for self-hosted RAG (#1211)
* Add conditional migration for vector_store_chunks table Creates the pgvector-backed chunks table when VECTOR_STORE_PROVIDER=pgvector. Enables the vector extension, adds store_id/file_id indexes, and uses vector(1024) column type for embeddings. * Add VectorStore::Embeddable concern for text extraction and embedding Shared concern providing extract_text (PDF via pdf-reader, plain-text as-is), paragraph-boundary chunking (~2000 chars, ~200 overlap), and embed/embed_batch via OpenAI-compatible /v1/embeddings endpoint using Faraday. Configurable via EMBEDDING_MODEL, EMBEDDING_URI_BASE, with fallback to OPENAI_* env vars. * Implement VectorStore::Pgvector adapter with raw SQL Replaces the stub with a full implementation using ActiveRecord::Base.connection with parameterized binds. Supports create_store, delete_store, upload_file (extract+chunk+embed+insert), remove_file, and cosine-similarity search via the <=> operator. * Add registry test for pgvector adapter selection * Configure pgvector in compose.example.ai.yml Switch db image to pgvector/pgvector:pg16, add VECTOR_STORE_PROVIDER, EMBEDDING_MODEL, and EMBEDDING_DIMENSIONS env vars, and include nomic-embed-text in Ollama's pre-loaded models. * Update pgvector docs from scaffolded to ready Document env vars, embedding model setup, pgvector Docker image requirement, and Ollama pull instructions. * Address PR review feedback - Migration: remove env guard, use pgvector_available? check so it runs on plain Postgres (CI) but creates the table on pgvector-capable servers. Add NOT NULL constraints on content/embedding/metadata, unique index on (store_id, file_id, chunk_index). - Pgvector adapter: wrap chunk inserts in a DB transaction to prevent partial file writes. Override supported_extensions to match formats that extract_text can actually parse. - Embeddable: add hard_split fallback for paragraphs exceeding CHUNK_SIZE to avoid overflowing embedding model token limits. 
* Bump schema version to include vector_store_chunks migration CI uses db:schema:load which checks the version — without this bump, the migration is detected as pending and tests fail to start. * Update 20260316120000_create_vector_store_chunks.rb --------- Co-authored-by: sokiee <sokysrm@gmail.com>
This commit is contained in:
152
app/models/vector_store/embeddable.rb
Normal file
152
app/models/vector_store/embeddable.rb
Normal file
@@ -0,0 +1,152 @@
|
||||
# Shared text-extraction, chunking, and embedding behavior for vector-store
# adapters.
#
# Provides:
#   - extract_text  — PDF (via PDF::Reader) and plain-text extraction
#   - chunk_text    — paragraph-boundary chunking with overlap
#   - embed / embed_batch — OpenAI-compatible /v1/embeddings calls via Faraday
#
# Configured through EMBEDDING_MODEL, EMBEDDING_DIMENSIONS,
# EMBEDDING_URI_BASE, EMBEDDING_ACCESS_TOKEN — with OPENAI_* fallbacks.
module VectorStore::Embeddable
  extend ActiveSupport::Concern

  CHUNK_SIZE = 2000     # target max characters per chunk
  CHUNK_OVERLAP = 200   # characters carried from one chunk into the next
  EMBED_BATCH_SIZE = 50 # texts per embeddings API request

  TEXT_EXTENSIONS = %w[
    .txt .md .csv .json .xml .html .css
    .js .ts .py .rb .go .java .php .c .cpp .sh .tex
  ].freeze

  private

  # Dispatch by extension: PDF via PDF::Reader, plain-text types as-is
  # (re-encoded to UTF-8, replacing invalid bytes).
  # Returns nil for unsupported binary formats.
  def extract_text(file_content, filename)
    case File.extname(filename).downcase
    when ".pdf"
      extract_pdf_text(file_content)
    when *TEXT_EXTENSIONS
      file_content.to_s.encode("UTF-8", invalid: :replace, undef: :replace)
    else
      nil
    end
  end

  # Read every page of the PDF and join page texts with blank lines.
  # Returns nil (and logs the error) when the PDF cannot be parsed.
  def extract_pdf_text(file_content)
    io = StringIO.new(file_content)
    reader = PDF::Reader.new(io)
    reader.pages.map(&:text).join("\n\n")
  rescue => e
    Rails.logger.error("VectorStore::Embeddable PDF extraction error: #{e.message}")
    nil
  end

  # Split text on paragraph boundaries (~CHUNK_SIZE char chunks with
  # ~CHUNK_OVERLAP char overlap). Paragraphs longer than CHUNK_SIZE are
  # hard-split to avoid overflowing embedding model token limits.
  def chunk_text(text)
    return [] if text.blank?

    chunks = []
    current_chunk = +""

    text.split(/\n\s*\n/).each do |para|
      para = para.strip
      next if para.empty?

      # Hard-split oversized paragraphs into CHUNK_SIZE slices with overlap
      slices = para.length > CHUNK_SIZE ? hard_split(para) : [ para ]

      slices.each do |slice|
        if current_chunk.empty?
          current_chunk << slice
        elsif (current_chunk.length + slice.length + 2) <= CHUNK_SIZE
          current_chunk << "\n\n" << slice
        else
          chunks << current_chunk.freeze
          # Seed the next chunk with the tail of the previous one so
          # context straddling the boundary is not lost.
          overlap = current_chunk.last(CHUNK_OVERLAP)
          current_chunk = +""
          current_chunk << overlap << "\n\n" << slice
        end
      end
    end

    chunks << current_chunk.freeze unless current_chunk.empty?
    chunks
  end

  # Hard-split a single long string into CHUNK_SIZE slices with CHUNK_OVERLAP.
  def hard_split(text)
    slices = []
    offset = 0
    while offset < text.length
      slices << text[offset, CHUNK_SIZE]
      offset += CHUNK_SIZE - CHUNK_OVERLAP
    end
    slices
  end

  # Embed a single text string → vector (array of floats).
  # Raises VectorStore::Error when the endpoint returns no embedding.
  def embed(text)
    request_embeddings(text, "Embedding request failed").first["embedding"]
  end

  # Batch embed, processing in groups of EMBED_BATCH_SIZE.
  # Returns vectors in the same order as the input texts.
  def embed_batch(texts)
    vectors = []

    texts.each_slice(EMBED_BATCH_SIZE) do |batch|
      data = request_embeddings(batch, "Batch embedding request failed")

      # Sort by index to preserve input order within the batch
      sorted = data.sort_by { |d| d["index"] }
      vectors.concat(sorted.map { |d| d["embedding"] })
    end

    vectors
  end

  # POST to the embeddings endpoint and return the response's "data" array.
  # Raises VectorStore::Error (prefixed with error_prefix) when the response
  # is malformed or contains no embeddings — previously an empty "data"
  # array slipped past the guard and surfaced as a NoMethodError in #embed.
  def request_embeddings(input, error_prefix)
    response = embedding_client.post("embeddings") do |req|
      req.body = {
        model: embedding_model,
        input: input
      }
    end

    data = response.body
    unless data.is_a?(Hash) && data["data"].is_a?(Array) && data["data"].any?
      raise VectorStore::Error, "#{error_prefix}: #{data}"
    end

    data["data"]
  end

  # Memoized Faraday connection to the OpenAI-compatible embeddings endpoint.
  def embedding_client
    @embedding_client ||= Faraday.new(url: embedding_uri_base) do |f|
      f.request :json
      f.response :json
      f.headers["Authorization"] = "Bearer #{embedding_access_token}" if embedding_access_token.present?
      f.options.timeout = 120
      f.options.open_timeout = 10
    end
  end

  def embedding_model
    ENV.fetch("EMBEDDING_MODEL", "nomic-embed-text")
  end

  def embedding_dimensions
    ENV.fetch("EMBEDDING_DIMENSIONS", "1024").to_i
  end

  def embedding_uri_base
    ENV["EMBEDDING_URI_BASE"].presence || ENV["OPENAI_URI_BASE"].presence || "https://api.openai.com/v1/"
  end

  def embedding_access_token
    ENV["EMBEDDING_ACCESS_TOKEN"].presence || ENV["OPENAI_ACCESS_TOKEN"].presence
  end
end
|
||||
@@ -2,88 +2,137 @@
|
||||
#
|
||||
# This keeps all data on your own infrastructure — no external vector-store
|
||||
# service required. You still need an embedding provider (e.g. OpenAI, or a
|
||||
# local model served via an OpenAI-compatible endpoint) to turn text into
|
||||
# vectors before insertion and at query time.
|
||||
# local model served via an OpenAI-compatible endpoint such as Ollama) to turn
|
||||
# text into vectors before insertion and at query time.
|
||||
#
|
||||
# Requirements (not yet wired up):
|
||||
# - PostgreSQL with the `vector` extension enabled
|
||||
# - gem "neighbor" (for ActiveRecord integration) or raw SQL
|
||||
# - An embedding model endpoint (EMBEDDING_MODEL_URL / EMBEDDING_MODEL_NAME)
|
||||
# - A chunking strategy (see #chunk_file below)
|
||||
#
|
||||
# Schema sketch (for reference — migration not included):
|
||||
#
|
||||
# create_table :vector_store_chunks do |t|
|
||||
# t.string :store_id, null: false # logical namespace
|
||||
# t.string :file_id, null: false
|
||||
# t.string :filename
|
||||
# t.text :content # the original text chunk
|
||||
# t.vector :embedding, limit: 1536 # adjust dimensions to your model
|
||||
# t.jsonb :metadata, default: {}
|
||||
# t.timestamps
|
||||
# end
|
||||
# add_index :vector_store_chunks, :store_id
|
||||
# add_index :vector_store_chunks, :file_id
|
||||
# Requirements:
|
||||
# - PostgreSQL with the `vector` extension enabled (use pgvector/pgvector Docker image)
|
||||
# - An embedding model endpoint (EMBEDDING_URI_BASE / EMBEDDING_MODEL)
|
||||
# - Migration: CreateVectorStoreChunks (creates the table automatically when the database server has the pgvector extension available)
|
||||
#
|
||||
class VectorStore::Pgvector < VectorStore::Base
  include VectorStore::Embeddable

  # Only formats extract_text can actually parse: plain-text types plus PDF.
  PGVECTOR_SUPPORTED_EXTENSIONS = (VectorStore::Embeddable::TEXT_EXTENSIONS + [ ".pdf" ]).uniq.freeze

  def supported_extensions
    PGVECTOR_SUPPORTED_EXTENSIONS
  end

  # A "store" is just a logical namespace (a UUID).
  # No external resource to create.
  def create_store(name:)
    with_response do
      { id: SecureRandom.uuid }
    end
  end

  # Remove every chunk belonging to the store.
  def delete_store(store_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1",
        "VectorStore::Pgvector DeleteStore",
        [ bind_param("store_id", store_id) ]
      )
    end
  end

  # Extract → chunk → embed → insert. All chunk inserts run inside a single
  # transaction so a failure cannot leave a partially-written file.
  # Returns { file_id: } on success; raises VectorStore::Error when the file
  # yields no text or no chunks.
  def upload_file(store_id:, file_content:, filename:)
    with_response do
      text = extract_text(file_content, filename)
      raise VectorStore::Error, "Could not extract text from #{filename}" if text.blank?

      chunks = chunk_text(text)
      raise VectorStore::Error, "No chunks produced from #{filename}" if chunks.empty?

      vectors = embed_batch(chunks)
      file_id = SecureRandom.uuid
      now = Time.current

      connection.transaction do
        chunks.each_with_index do |chunk_content, index|
          # pgvector accepts vectors as a "[v1,v2,...]" text literal
          embedding_literal = "[#{vectors[index].join(',')}]"

          connection.exec_insert(
            <<~SQL,
              INSERT INTO vector_store_chunks
                (id, store_id, file_id, filename, chunk_index, content, embedding, metadata, created_at, updated_at)
              VALUES
                ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
            SQL
            "VectorStore::Pgvector InsertChunk",
            [
              bind_param("id", SecureRandom.uuid),
              bind_param("store_id", store_id),
              bind_param("file_id", file_id),
              bind_param("filename", filename),
              bind_param("chunk_index", index),
              bind_param("content", chunk_content),
              bind_param("embedding", embedding_literal, ActiveRecord::Type::String.new),
              bind_param("metadata", "{}"),
              bind_param("created_at", now),
              bind_param("updated_at", now)
            ]
          )
        end
      end

      { file_id: file_id }
    end
  end

  # Delete all chunks of a single file within a store.
  def remove_file(store_id:, file_id:)
    with_response do
      connection.exec_delete(
        "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2",
        "VectorStore::Pgvector RemoveFile",
        [
          bind_param("store_id", store_id),
          bind_param("file_id", file_id)
        ]
      )
    end
  end

  # Cosine-similarity search: embed the query, then rank chunks by the
  # pgvector <=> (cosine distance) operator. Score is 1 - distance, so
  # higher is more similar.
  def search(store_id:, query:, max_results: 10)
    with_response do
      query_vector = embed(query)
      vector_literal = "[#{query_vector.join(',')}]"

      results = connection.exec_query(
        <<~SQL,
          SELECT content, filename, file_id,
                 1 - (embedding <=> $1::vector) AS score
          FROM vector_store_chunks
          WHERE store_id = $2
          ORDER BY embedding <=> $1::vector
          LIMIT $3
        SQL
        "VectorStore::Pgvector Search",
        [
          bind_param("embedding", vector_literal, ActiveRecord::Type::String.new),
          bind_param("store_id", store_id),
          bind_param("limit", max_results)
        ]
      )

      results.map do |row|
        {
          content: row["content"],
          filename: row["filename"],
          score: row["score"].to_f,
          file_id: row["file_id"]
        }
      end
    end
  end

  private

    # Raw connection; all queries use parameterized binds, never interpolation.
    def connection
      ActiveRecord::Base.connection
    end

    # Wrap a value as a bind parameter for exec_* calls.
    def bind_param(name, value, type = nil)
      type ||= ActiveModel::Type::Value.new
      ActiveRecord::Relation::QueryAttribute.new(name, value, type)
    end
end
|
||||
|
||||
@@ -69,6 +69,10 @@ x-rails-env: &rails_env
|
||||
OPENAI_ACCESS_TOKEN: token-can-be-any-value-for-ollama
|
||||
OPENAI_MODEL: llama3.1:8b # Note: Use tool-enabled model
|
||||
OPENAI_URI_BASE: http://ollama:11434/v1
|
||||
# Vector store — pgvector keeps all data local (requires pgvector/pgvector Docker image for db)
|
||||
VECTOR_STORE_PROVIDER: pgvector
|
||||
EMBEDDING_MODEL: nomic-embed-text
|
||||
EMBEDDING_DIMENSIONS: "1024"
|
||||
# NOTE: enabling OpenAI will incur costs when you use AI-related features in the app (chat, rules). Make sure you have set appropriate spend limits on your account before adding this.
|
||||
# OPENAI_ACCESS_TOKEN: ${OPENAI_ACCESS_TOKEN}
|
||||
# External AI Assistant — delegates chat to a remote AI agent (e.g., OpenClaw).
|
||||
@@ -128,7 +132,7 @@ services:
|
||||
- "11434:11434"
|
||||
environment:
|
||||
- OLLAMA_KEEP_ALIVE=1h
|
||||
- OLLAMA_MODELS=deepseek-r1:8b,llama3.1:8b # Pre-load model on startup, you can change this to your preferred model
|
||||
- OLLAMA_MODELS=deepseek-r1:8b,llama3.1:8b,nomic-embed-text # Pre-load model on startup, you can change this to your preferred model
|
||||
networks:
|
||||
- sure_net
|
||||
# Recommended: Enable GPU support
|
||||
@@ -213,7 +217,7 @@ services:
|
||||
- sure_net
|
||||
|
||||
db:
|
||||
image: postgres:16
|
||||
image: pgvector/pgvector:pg16
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- postgres-data:/var/lib/postgresql/data
|
||||
|
||||
43
db/migrate/20260316120000_create_vector_store_chunks.rb
Normal file
43
db/migrate/20260316120000_create_vector_store_chunks.rb
Normal file
@@ -0,0 +1,43 @@
|
||||
# Creates the pgvector-backed chunks table used by VectorStore::Pgvector.
# Runs harmlessly (no-op) on servers without the pgvector extension so CI
# and plain-Postgres dev environments are unaffected.
class CreateVectorStoreChunks < ActiveRecord::Migration[7.2]
  def up
    return unless pgvector_available?

    enable_extension "vector" unless extension_enabled?("vector")

    create_table :vector_store_chunks, id: :uuid do |t|
      t.string :store_id, null: false
      t.string :file_id, null: false
      t.string :filename
      t.integer :chunk_index, null: false, default: 0
      t.text :content, null: false
      # Integer() validates the env value before it is interpolated into DDL —
      # a malformed EMBEDDING_DIMENSIONS would otherwise inject arbitrary SQL
      # into the column definition. Must match the embedding model's dimensions.
      t.column :embedding, "vector(#{Integer(ENV.fetch('EMBEDDING_DIMENSIONS', '1024'))})", null: false
      t.jsonb :metadata, null: false, default: {}
      t.timestamps null: false
    end

    add_index :vector_store_chunks, :store_id
    add_index :vector_store_chunks, :file_id
    # One row per (store, file, chunk position)
    add_index :vector_store_chunks, [ :store_id, :file_id, :chunk_index ], unique: true,
      name: "index_vector_store_chunks_on_store_file_chunk"
  end

  def down
    drop_table :vector_store_chunks, if_exists: true
    # NOTE(review): this disables the extension even if other tables use it —
    # acceptable today since this is the only vector-typed table.
    disable_extension "vector" if extension_enabled?("vector")
  end

  private

    # Check if the pgvector extension is installed in the PostgreSQL server,
    # not just whether it is enabled in this database. This lets the migration
    # run harmlessly on plain Postgres (CI, dev without pgvector) while still
    # creating the table on pgvector-capable servers.
    def pgvector_available?
      result = ActiveRecord::Base.connection.execute(
        "SELECT 1 FROM pg_available_extensions WHERE name = 'vector' LIMIT 1"
      )
      result.any?
    rescue
      false
    end
end
|
||||
2
db/schema.rb
generated
2
db/schema.rb
generated
@@ -10,7 +10,7 @@
|
||||
#
|
||||
# It's strongly recommended that you check this file into your version control system.
|
||||
|
||||
ActiveRecord::Schema[7.2].define(version: 2026_03_14_131357) do
|
||||
ActiveRecord::Schema[7.2].define(version: 2026_03_16_120000) do
|
||||
# These are extensions that must be enabled in order to support this database
|
||||
enable_extension "pgcrypto"
|
||||
enable_extension "plpgsql"
|
||||
|
||||
@@ -1140,7 +1140,7 @@ Sure's AI assistant can search documents that have been uploaded to a family's v
|
||||
| Backend | Status | Best For | Requirements |
|
||||
|---------|--------|----------|--------------|
|
||||
| **OpenAI** (default) | ready | Cloud deployments, zero setup | `OPENAI_ACCESS_TOKEN` |
|
||||
| **Pgvector** | scaffolded | Self-hosted, full data privacy | PostgreSQL with `pgvector` extension |
|
||||
| **Pgvector** | ready | Self-hosted, full data privacy | PostgreSQL with `pgvector` extension + embedding model |
|
||||
| **Qdrant** | scaffolded | Self-hosted, dedicated vector DB | Running Qdrant instance |
|
||||
|
||||
#### Configuration
|
||||
@@ -1156,16 +1156,29 @@ OPENAI_ACCESS_TOKEN=sk-proj-...
|
||||
|
||||
##### Pgvector (Self-Hosted)
|
||||
|
||||
> [!CAUTION]
|
||||
> Only `OpenAI` has been implemented!
|
||||
Use PostgreSQL's pgvector extension for fully local document search. All data stays on your infrastructure.
|
||||
|
||||
Use PostgreSQL's pgvector extension for fully local document search:
|
||||
**Requirements:**
|
||||
- Use the `pgvector/pgvector:pg16` Docker image instead of `postgres:16` (drop-in replacement)
|
||||
- An embedding model served via an OpenAI-compatible `/v1/embeddings` endpoint (e.g. Ollama with `nomic-embed-text`)
|
||||
- Run the migration with `VECTOR_STORE_PROVIDER=pgvector` to create the `vector_store_chunks` table
|
||||
|
||||
```bash
|
||||
# Required
|
||||
VECTOR_STORE_PROVIDER=pgvector
|
||||
|
||||
# Embedding model configuration
|
||||
EMBEDDING_MODEL=nomic-embed-text # Default: nomic-embed-text
|
||||
EMBEDDING_DIMENSIONS=1024 # Default: 1024 (must match your model)
|
||||
EMBEDDING_URI_BASE=http://ollama:11434/v1 # Falls back to OPENAI_URI_BASE if not set
|
||||
EMBEDDING_ACCESS_TOKEN= # Falls back to OPENAI_ACCESS_TOKEN if not set
|
||||
```
|
||||
|
||||
> **Note:** The pgvector adapter is currently a skeleton. A future release will add full support including embedding model configuration.
|
||||
If you are using Ollama (as in `compose.example.ai.yml`), pull the embedding model:
|
||||
|
||||
```bash
|
||||
docker compose exec ollama ollama pull nomic-embed-text
|
||||
```
|
||||
|
||||
##### Qdrant (Self-Hosted)
|
||||
|
||||
|
||||
204
test/models/vector_store/embeddable_test.rb
Normal file
204
test/models/vector_store/embeddable_test.rb
Normal file
@@ -0,0 +1,204 @@
|
||||
require "test_helper"
|
||||
|
||||
# Unit tests for VectorStore::Embeddable: text extraction, chunking, and the
# embedding endpoint calls. All HTTP and PDF parsing is mocked via Mocha —
# no network or fixture files are used.
class VectorStore::EmbeddableTest < ActiveSupport::TestCase
  # Minimal host class so the concern's private helpers can be exercised
  # without a real adapter.
  class EmbeddableHost
    include VectorStore::Embeddable
    # Expose private methods for testing
    public :extract_text, :chunk_text, :embed, :embed_batch
  end

  setup do
    @host = EmbeddableHost.new
  end

  # --- extract_text ---

  test "extract_text returns plain text for .txt files" do
    result = @host.extract_text("Hello world", "notes.txt")
    assert_equal "Hello world", result
  end

  test "extract_text returns content for markdown files" do
    result = @host.extract_text("# Heading\n\nBody", "readme.md")
    assert_equal "# Heading\n\nBody", result
  end

  test "extract_text returns content for code files" do
    result = @host.extract_text("def foo; end", "app.rb")
    assert_equal "def foo; end", result
  end

  test "extract_text returns nil for unsupported binary formats" do
    assert_nil @host.extract_text("\x00\x01binary", "photo.png")
    assert_nil @host.extract_text("\x00\x01binary", "archive.zip")
  end

  test "extract_text handles PDF files" do
    pdf_content = "fake pdf bytes"
    mock_page = mock("page")
    mock_page.stubs(:text).returns("Page 1 content")

    mock_reader = mock("reader")
    mock_reader.stubs(:pages).returns([ mock_page ])

    # extract_pdf_text wraps the raw bytes in a StringIO before parsing
    PDF::Reader.expects(:new).with(instance_of(StringIO)).returns(mock_reader)

    result = @host.extract_text(pdf_content, "document.pdf")
    assert_equal "Page 1 content", result
  end

  test "extract_text returns nil when PDF extraction fails" do
    # Parse errors are rescued and logged, never raised to the caller
    PDF::Reader.expects(:new).raises(StandardError, "corrupt pdf")

    result = @host.extract_text("bad data", "broken.pdf")
    assert_nil result
  end

  # --- chunk_text ---

  test "chunk_text returns empty array for blank text" do
    assert_equal [], @host.chunk_text("")
    assert_equal [], @host.chunk_text(nil)
  end

  test "chunk_text returns single chunk for short text" do
    text = "Short paragraph."
    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
    assert_equal "Short paragraph.", chunks.first
  end

  test "chunk_text splits on paragraph boundaries" do
    # Create text that exceeds CHUNK_SIZE when combined
    para1 = "A" * 1200
    para2 = "B" * 1200
    text = "#{para1}\n\n#{para2}"

    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    assert_includes chunks.first, "A" * 1200
    assert_includes chunks.last, "B" * 1200
  end

  test "chunk_text includes overlap between chunks" do
    para1 = "A" * 1500
    para2 = "B" * 1500
    text = "#{para1}\n\n#{para2}"

    chunks = @host.chunk_text(text)
    assert_equal 2, chunks.size
    # Second chunk should start with overlap from end of first chunk
    overlap = para1.last(VectorStore::Embeddable::CHUNK_OVERLAP)
    assert chunks.last.start_with?(overlap)
  end

  test "chunk_text keeps small paragraphs together" do
    paragraphs = Array.new(5) { |i| "Paragraph #{i} content." }
    text = paragraphs.join("\n\n")

    chunks = @host.chunk_text(text)
    assert_equal 1, chunks.size
  end

  test "chunk_text hard-splits oversized paragraphs" do
    # A single paragraph longer than CHUNK_SIZE with no paragraph breaks
    long_para = "X" * 5000
    chunks = @host.chunk_text(long_para)

    assert chunks.size > 1
    # Upper bound: a full slice plus carried overlap plus the "\n\n" joiner
    chunks.each do |chunk|
      assert chunk.length <= VectorStore::Embeddable::CHUNK_SIZE + VectorStore::Embeddable::CHUNK_OVERLAP + 2,
        "Chunk too large: #{chunk.length} chars"
    end
  end

  # --- embed ---

  test "embed calls embedding endpoint and returns vector" do
    expected_vector = [ 0.1, 0.2, 0.3 ]
    stub_response = { "data" => [ { "embedding" => expected_vector, "index" => 0 } ] }

    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: stub_response)
    )
    # Inject the mock in place of the memoized Faraday connection
    @host.instance_variable_set(:@embedding_client, mock_client)

    result = @host.embed("test text")
    assert_equal expected_vector, result
  end

  test "embed raises on failed response" do
    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: { "error" => "bad request" })
    )
    @host.instance_variable_set(:@embedding_client, mock_client)

    assert_raises(VectorStore::Error) { @host.embed("test text") }
  end

  # --- embed_batch ---

  test "embed_batch processes texts and returns ordered vectors" do
    texts = [ "first", "second", "third" ]
    vectors = [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ]
    stub_response = {
      "data" => [
        { "embedding" => vectors[0], "index" => 0 },
        { "embedding" => vectors[1], "index" => 1 },
        { "embedding" => vectors[2], "index" => 2 }
      ]
    }

    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").yields(mock_request).returns(
      OpenStruct.new(body: stub_response)
    )
    @host.instance_variable_set(:@embedding_client, mock_client)

    result = @host.embed_batch(texts)
    assert_equal vectors, result
  end

  test "embed_batch handles multiple batches" do
    # Override batch size constant for testing (restored in ensure);
    # remove_const first avoids the "already initialized constant" warning
    original = VectorStore::Embeddable::EMBED_BATCH_SIZE
    VectorStore::Embeddable.send(:remove_const, :EMBED_BATCH_SIZE)
    VectorStore::Embeddable.const_set(:EMBED_BATCH_SIZE, 2)

    texts = [ "a", "b", "c" ]

    batch1_response = {
      "data" => [
        { "embedding" => [ 0.1 ], "index" => 0 },
        { "embedding" => [ 0.2 ], "index" => 1 }
      ]
    }
    batch2_response = {
      "data" => [
        { "embedding" => [ 0.3 ], "index" => 0 }
      ]
    }

    mock_client = mock("faraday")
    mock_client.expects(:post).with("embeddings").twice
      .yields(mock_request)
      .returns(OpenStruct.new(body: batch1_response))
      .then.returns(OpenStruct.new(body: batch2_response))
    @host.instance_variable_set(:@embedding_client, mock_client)

    result = @host.embed_batch(texts)
    assert_equal [ [ 0.1 ], [ 0.2 ], [ 0.3 ] ], result
  ensure
    VectorStore::Embeddable.send(:remove_const, :EMBED_BATCH_SIZE)
    VectorStore::Embeddable.const_set(:EMBED_BATCH_SIZE, original)
  end

  private

    # Stand-in for the Faraday request object yielded to the post block;
    # only its writable #body is touched by the code under test.
    def mock_request
      request = OpenStruct.new(body: nil)
      request
    end
end
|
||||
141
test/models/vector_store/pgvector_test.rb
Normal file
141
test/models/vector_store/pgvector_test.rb
Normal file
@@ -0,0 +1,141 @@
|
||||
require "test_helper"
|
||||
|
||||
# Unit tests for the VectorStore::Pgvector adapter. The database connection
# is fully mocked (Mocha), so these verify SQL shape, transaction use, and
# response wrapping — not actual pgvector behavior.
class VectorStore::PgvectorTest < ActiveSupport::TestCase
  setup do
    @adapter = VectorStore::Pgvector.new
  end

  test "create_store returns a UUID" do
    response = @adapter.create_store(name: "Test Store")
    assert response.success?
    assert_match(/\A[0-9a-f-]{36}\z/, response.data[:id])
  end

  test "delete_store executes delete query" do
    mock_conn = mock("connection")
    mock_conn.expects(:exec_delete).with(
      "DELETE FROM vector_store_chunks WHERE store_id = $1",
      "VectorStore::Pgvector DeleteStore",
      anything
    ).returns(0)

    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.delete_store(store_id: "store-123")
    assert response.success?
  end

  test "upload_file extracts text, chunks, embeds, and inserts" do
    file_content = "Hello world"
    filename = "test.txt"
    store_id = "store-123"

    # Stub the pipeline stages; one chunk → one insert
    @adapter.expects(:extract_text).with(file_content, filename).returns("Hello world")
    @adapter.expects(:chunk_text).with("Hello world").returns([ "Hello world" ])
    @adapter.expects(:embed_batch).with([ "Hello world" ]).returns([ [ 0.1, 0.2, 0.3 ] ])

    mock_conn = mock("connection")
    mock_conn.expects(:transaction).yields
    mock_conn.expects(:exec_insert).once
    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.upload_file(store_id: store_id, file_content: file_content, filename: filename)
    assert response.success?
    assert_match(/\A[0-9a-f-]{36}\z/, response.data[:file_id])
  end

  test "upload_file fails when text extraction returns nil" do
    @adapter.expects(:extract_text).returns(nil)

    response = @adapter.upload_file(store_id: "store-123", file_content: "\x00binary", filename: "photo.png")
    assert_not response.success?
    assert_match(/Could not extract text/, response.error.message)
  end

  test "upload_file fails when no chunks produced" do
    @adapter.expects(:extract_text).returns("some text")
    @adapter.expects(:chunk_text).returns([])

    response = @adapter.upload_file(store_id: "store-123", file_content: "some text", filename: "empty.txt")
    assert_not response.success?
    assert_match(/No chunks produced/, response.error.message)
  end

  test "upload_file inserts multiple chunks in a transaction" do
    @adapter.expects(:extract_text).returns("chunk1\n\nchunk2")
    @adapter.expects(:chunk_text).returns([ "chunk1", "chunk2" ])
    @adapter.expects(:embed_batch).returns([ [ 0.1 ], [ 0.2 ] ])

    mock_conn = mock("connection")
    mock_conn.expects(:transaction).yields
    # Two chunks → two inserts, both inside the one transaction
    mock_conn.expects(:exec_insert).twice
    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.upload_file(store_id: "store-123", file_content: "chunk1\n\nchunk2", filename: "doc.txt")
    assert response.success?
  end

  test "remove_file executes delete with store_id and file_id" do
    mock_conn = mock("connection")
    mock_conn.expects(:exec_delete).with(
      "DELETE FROM vector_store_chunks WHERE store_id = $1 AND file_id = $2",
      "VectorStore::Pgvector RemoveFile",
      anything
    ).returns(1)

    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.remove_file(store_id: "store-123", file_id: "file-456")
    assert response.success?
  end

  test "search embeds query and returns scored results" do
    query_vector = [ 0.1, 0.2, 0.3 ]
    @adapter.expects(:embed).with("income").returns(query_vector)

    # exec_query result rows are string-keyed hashes
    mock_result = [
      { "content" => "Total income: $85,000", "filename" => "tax_return.pdf", "file_id" => "file-xyz", "score" => 0.95 }
    ]

    mock_conn = mock("connection")
    mock_conn.expects(:exec_query).returns(mock_result)
    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.search(store_id: "store-123", query: "income", max_results: 5)
    assert response.success?
    assert_equal 1, response.data.size
    assert_equal "Total income: $85,000", response.data.first[:content]
    assert_equal "tax_return.pdf", response.data.first[:filename]
    assert_equal 0.95, response.data.first[:score]
    assert_equal "file-xyz", response.data.first[:file_id]
  end

  test "search returns empty array when no results" do
    @adapter.expects(:embed).returns([ 0.1 ])

    mock_conn = mock("connection")
    mock_conn.expects(:exec_query).returns([])
    @adapter.stubs(:connection).returns(mock_conn)

    response = @adapter.search(store_id: "store-123", query: "nothing")
    assert response.success?
    assert_empty response.data
  end

  test "wraps errors in failure response" do
    # with_response should convert unexpected exceptions into a failure
    @adapter.expects(:extract_text).raises(StandardError, "unexpected error")

    response = @adapter.upload_file(store_id: "store-123", file_content: "data", filename: "test.txt")
    assert_not response.success?
    assert_equal "unexpected error", response.error.message
  end

  test "supported_extensions matches extractable formats only" do
    assert_includes @adapter.supported_extensions, ".pdf"
    assert_includes @adapter.supported_extensions, ".txt"
    assert_includes @adapter.supported_extensions, ".csv"
    assert_not_includes @adapter.supported_extensions, ".png"
    assert_not_includes @adapter.supported_extensions, ".zip"
    assert_not_includes @adapter.supported_extensions, ".docx"
  end
end
|
||||
@@ -43,6 +43,13 @@ class VectorStore::RegistryTest < ActiveSupport::TestCase
|
||||
end
|
||||
end
|
||||
|
||||
test "adapter returns VectorStore::Pgvector instance when pgvector configured" do
|
||||
ClimateControl.modify(VECTOR_STORE_PROVIDER: "pgvector") do
|
||||
adapter = VectorStore::Registry.adapter
|
||||
assert_instance_of VectorStore::Pgvector, adapter
|
||||
end
|
||||
end
|
||||
|
||||
test "configured? delegates to adapter presence" do
|
||||
VectorStore::Registry.stubs(:adapter).returns(nil)
|
||||
assert_not VectorStore.configured?
|
||||
|
||||
Reference in New Issue
Block a user