examples: refactor all-mini-lm-l6 for semantic similarity

2026-04-25 16:34:55 +02:00 · 2024-07-22 19:53:23 -05:00
parent 1a10b11b10
commit c5538f26d0
6 changed files with 65 additions and 45 deletions
--- a/examples/all-mini-lm-l6/examples/all-mini-lm-l6.rs
+++ b/examples/all-mini-lm-l6/examples/all-mini-lm-l6.rs
@@ -1,41 +0,0 @@
-use std::path::Path;
-
-use ndarray::{Array1, Axis};
-use ort::{CUDAExecutionProvider, GraphOptimizationLevel, Session};
-use tokenizers::Tokenizer;
-
-/// all-mini-lm-l6 embeddings generation
-///
-/// This is a sentence-transformers model: It maps sentences & paragraphs to a 384
-///
-/// dimensional dense vector space and can be used for tasks like clustering or semantic search.
-fn main() -> ort::Result<()> {
-	// Initialize tracing to receive debug messages from `ort`
-	tracing_subscriber::fmt::init();
-
-	// Create the ONNX Runtime environment, enabling CUDA execution providers for all sessions created in this process.
-	ort::init()
-		.with_name("all-Mini-LM-L6")
-		.with_execution_providers([CUDAExecutionProvider::default().build()])
-		.commit()?;
-
-	// Load our model
-	let session = Session::builder()?
-		.with_optimization_level(GraphOptimizationLevel::Level1)?
-		.with_intra_threads(1)?
-		.commit_from_url("https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/onnx/model.onnx")?;
-
-	// Load the tokenizer and encode the text.
-	let tokenizer = Tokenizer::from_file(Path::new(env!("CARGO_MANIFEST_DIR")).join("data").join("tokenizer.json")).unwrap();
-	let tokens = tokenizer.encode("test", false)?;
-	let mask = tokens.get_attention_mask().iter().map(|i| *i as i64).collect::<Vec<i64>>();
-	let ids = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<i64>>();
-	let a_ids = Array1::from_vec(ids);
-	let a_mask = Array1::from_vec(mask);
-	let input_ids = a_ids.view().insert_axis(Axis(0));
-	let input_mask = a_mask.view().insert_axis(Axis(0));
-	let outputs = session.run(ort::inputs![input_ids, input_mask]?)?;
-	let tensor = outputs[1].try_extract_tensor::<f32>();
-	println!("{:?}", tensor);
-	Ok(())
-}
--- a/examples/sentence-transformers/Cargo.toml
+++ b/examples/sentence-transformers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 publish = false
-name = "example-all-mini-lm-l6"
+name = "sentence-transformers"
 version = "0.0.0"
 edition = "2021"

--- a/examples/sentence-transformers/build.rs
+++ b/examples/sentence-transformers/build.rs
--- a/examples/sentence-transformers/data/tokenizer.json
+++ b/examples/sentence-transformers/data/tokenizer.json
--- a/examples/sentence-transformers/examples/semantic-similarity.rs
+++ b/examples/sentence-transformers/examples/semantic-similarity.rs
@@ -0,0 +1,61 @@
+use std::path::Path;
+
+use ndarray::{s, Array1, Array2, Axis, Ix2};
+use ort::{CUDAExecutionProvider, GraphOptimizationLevel, Session};
+use tokenizers::Tokenizer;
+
+/// Example usage of a text embedding model (Sentence Transformers' `all-MiniLM-L6-v2`) for semantic textual similarity.
+///
+/// Text embedding models map sentences & paragraphs to an n-dimensional dense vector space, which can then be used for
+/// tasks like clustering or semantic search. Here, three sentences are embedded in one batch and the last two are scored against the first.
+fn main() -> ort::Result<()> {
+	// Initialize tracing to receive debug messages from `ort`
+	tracing_subscriber::fmt::init();
+
+	// Create the ONNX Runtime environment, registering the CUDA execution provider for all sessions created in this process.
+	ort::init()
+		.with_name("sbert")
+		.with_execution_providers([CUDAExecutionProvider::default().build()])
+		.commit()?;
+
+	// Fetch the ONNX model from the URL below and build a session for it (optimization level 1, a single intra-op thread).
+	let session = Session::builder()?
+		.with_optimization_level(GraphOptimizationLevel::Level1)?
+		.with_intra_threads(1)?
+		.commit_from_url("https://parcel.pyke.io/v2/cdn/assetdelivery/ortrsv2/ex_models/all-MiniLM-L6-v2.onnx")?;
+
+	// Load the tokenizer from this example's `data/tokenizer.json`; the `unwrap` panics if the file can't be read or parsed.
+	let tokenizer = Tokenizer::from_file(Path::new(env!("CARGO_MANIFEST_DIR")).join("data").join("tokenizer.json")).unwrap();
+
+	let inputs = vec!["The weather outside is lovely.", "It's so sunny outside!", "She drove to the stadium."];
+
+	// Encode all inputs as one batch. NOTE(review): equal-length output presumably relies on padding being configured in `tokenizer.json`, and `false` presumably disables special tokens - confirm both are intended.
+	let encodings = tokenizer.encode_batch(inputs.clone(), false)?;
+
+	// Take the first encoding's length as the row length L; every other encoding is assumed to have the same (padded) length.
+	let padded_token_length = encodings[0].len();
+
+	// Flatten the token IDs & attention masks of all encodings, one input after another, converting each value to i64.
+	let ids: Vec<i64> = encodings.iter().flat_map(|e| e.get_ids().iter().map(|i| *i as i64)).collect();
+	let mask: Vec<i64> = encodings.iter().flat_map(|e| e.get_attention_mask().iter().map(|i| *i as i64)).collect();
+
+	// Reshape the flat vectors into 2-D arrays of shape [N, L] (N inputs, L tokens each); the `unwrap`s panic if the element count isn't N * L.
+	let a_ids = Array2::from_shape_vec([inputs.len(), padded_token_length], ids).unwrap();
+	let a_mask = Array2::from_shape_vec([inputs.len(), padded_token_length], mask).unwrap();
+
+	// Run the model, passing the token IDs and the attention mask as its two positional inputs (in that order).
+	let outputs = session.run(ort::inputs![a_ids, a_mask]?)?;
+
+	// Extract output #1 as an f32 tensor and require it to be 2-dimensional (panics otherwise); presumably [N, embedding_dim] - the output order comes from the model file.
+	let embeddings = outputs[1].try_extract_tensor::<f32>()?.into_dimensionality::<Ix2>().unwrap();
+
+	println!("Similarity for '{}'", inputs[0]);
+	let query = embeddings.index_axis(Axis(0), 0);
+	for (embeddings, sentence) in embeddings.axis_iter(Axis(0)).zip(inputs.iter()).skip(1) {
+		// Dot product against the 'query' row (row 0, skipped above). NOTE(review): this equals cosine similarity only if the embeddings are L2-normalized - confirm.
+		let dot_product: f32 = query.iter().zip(embeddings.iter()).map(|(a, b)| a * b).sum();
+		println!("\t'{}': {:.1}%", sentence, dot_product * 100.);
+	}
+
+	Ok(())
+}