refactor!: allow zero-copy from_array for array views with TensorRef

This has all sorts of fun breaking changes:
- `ort::inputs!` no longer yields an `ort::Result<...>` (thank God)
- `Tensor::from_array` now only accepts owned data.
- Introduce `TensorRef::from_array_view` and `TensorRefMut::from_array_view_mut`.
- `TryFrom<A>` is no longer implemented for `Tensor<T>` for any variants.

Beyond fixing a few unsoundness issues, this opens the door to new optimizations.

TODO: update docs
This commit is contained in:
Carson M.
2024-12-21 00:24:54 -06:00
parent d7d4493c3e
commit 9ea18d815b
23 changed files with 379 additions and 381 deletions

View File

@@ -18,7 +18,7 @@ tokio = { version = "1.36", features = [ "full" ] }
tokio-stream = "0.1"
tower-http = { version = "0.5", features = ["fs", "trace"] }
anyhow = "1.0"
async-stream = "0.3"
async-stream-lite = "0.2"
[features]
load-dynamic = [ "ort/load-dynamic" ]

View File

@@ -10,11 +10,10 @@ use axum::{
routing::post
};
use futures::Stream;
use ndarray::{Array1, ArrayViewD, Axis, array, concatenate, s};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
session::{Session, builder::GraphOptimizationLevel},
value::TensorRef
};
use rand::Rng;
use tokenizers::Tokenizer;
@@ -64,37 +63,31 @@ struct AppState {
tokenizer: Arc<Tokenizer>
}
fn generate_stream(tokenizer: Arc<Tokenizer>, session: Arc<Session>, tokens: Vec<i64>, gen_tokens: usize) -> impl Stream<Item = ort::Result<Event>> + Send {
async_stream::try_stream! {
let mut tokens = Array1::from_iter(tokens.iter().cloned());
fn generate_stream(tokenizer: Arc<Tokenizer>, session: Arc<Session>, mut tokens: Vec<i64>, gen_tokens: usize) -> impl Stream<Item = ort::Result<Event>> + Send {
async_stream_lite::try_async_stream(|yielder| async move {
for _ in 0..gen_tokens {
let array = tokens.view().insert_axis(Axis(0)).insert_axis(Axis(1));
let outputs = session.run_async(inputs![array]?)?.await?;
let generated_tokens: ArrayViewD<f32> = outputs["output1"].try_extract_tensor()?;
let input = TensorRef::from_array_view((vec![1, 1, tokens.len() as i64], tokens.as_slice()))?;
let outputs = session.run_async(ort::inputs![input])?.await?;
let (dim, probabilities) = outputs["output1"].try_extract_raw_tensor()?;
// Collect and sort logits
let probabilities = &mut generated_tokens
.slice(s![0, 0, -1, ..])
.insert_axis(Axis(0))
.to_owned()
.iter()
.cloned()
.enumerate()
.collect::<Vec<_>>();
let (seq_len, vocab_size) = (dim[2] as usize, dim[3] as usize);
let mut probabilities: Vec<(usize, f32)> = probabilities[(seq_len - 1) * vocab_size..].iter().copied().enumerate().collect();
probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
// Sample using top-k sampling
let token = {
let mut rng = rand::thread_rng();
probabilities[rng.gen_range(0..=5)].0
probabilities[rng.gen_range(0..=5)].0 as i64
};
tokens = concatenate![Axis(0), tokens, array![token.try_into().unwrap()]];
tokens.push(token);
let token_str = tokenizer.decode(&[token as _], true).unwrap();
yield Event::default().data(token_str);
yielder.r#yield(Event::default().data(token_str)).await;
}
}
Ok(())
})
}
impl FromRef<AppState> for Arc<Session> {

View File

@@ -45,7 +45,7 @@ fn main() -> anyhow::Result<()> {
)
.unwrap()
};
let outputs = model.run([tensor.into()])?;
let outputs = model.run(ort::inputs![tensor])?;
let output = outputs["output"].try_extract_tensor::<f32>()?;

View File

@@ -1,4 +1,3 @@
use ndarray::Array2;
use ort::{
operator::{
Operator, OperatorDomain,
@@ -6,7 +5,8 @@ use ort::{
kernel::{Kernel, KernelAttributes, KernelContext}
},
session::Session,
tensor::TensorElementType
tensor::TensorElementType,
value::Tensor
};
struct CustomOpOne;
@@ -78,7 +78,16 @@ fn main() -> ort::Result<()> {
.with_operators(OperatorDomain::new("test.customop")?.add(CustomOpOne)?.add(CustomOpTwo)?)?
.commit_from_file("tests/data/custom_op_test.onnx")?;
let values = session.run(ort::inputs![Array2::<f32>::zeros((3, 5)), Array2::<f32>::ones((3, 5))]?)?;
let allocator = session.allocator();
let value1 = Tensor::<f32>::new(allocator, [3, 5])?;
let mut value2 = Tensor::<f32>::new(allocator, [3, 5])?;
{
let (_, data) = value2.extract_raw_tensor_mut();
for datum in data {
*datum = 1.;
}
}
let values = session.run(ort::inputs![&value1, &value2])?;
println!("{:?}", values[0].try_extract_tensor::<i32>()?);
Ok(())

View File

@@ -1,13 +1,13 @@
use std::{
io::{self, Write},
path::Path,
sync::Arc
path::Path
};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
session::{Session, builder::GraphOptimizationLevel},
value::TensorRef
};
use rand::Rng;
use tokenizers::Tokenizer;
@@ -33,7 +33,7 @@ fn main() -> ort::Result<()> {
.with_execution_providers([CUDAExecutionProvider::default().build()])
.commit()?;
let mut stdout = io::stdout();
let mut stdout: io::Stdout = io::stdout();
let mut rng = rand::thread_rng();
// Load our model
@@ -45,7 +45,7 @@ fn main() -> ort::Result<()> {
// Load the tokenizer and encode the prompt into a sequence of tokens.
let tokenizer = Tokenizer::from_file(Path::new(env!("CARGO_MANIFEST_DIR")).join("data").join("tokenizer.json")).unwrap();
let tokens = tokenizer.encode(PROMPT, false).unwrap();
let mut tokens = Arc::new(tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>().into_boxed_slice());
let mut tokens = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>();
print!("{PROMPT}");
stdout.flush().unwrap();
@@ -53,8 +53,8 @@ fn main() -> ort::Result<()> {
for _ in 0..GEN_TOKENS {
// Raw tensor construction takes a tuple of (dimensions, data).
// The model expects our input to have shape [B, _, S]
let input = (vec![1, 1, tokens.len() as i64], Arc::clone(&tokens));
let outputs = session.run(inputs![input]?)?;
let input = TensorRef::from_array_view((vec![1, 1, tokens.len() as i64], tokens.as_slice()))?;
let outputs = session.run(inputs![input])?;
let (dim, mut probabilities) = outputs["output1"].try_extract_raw_tensor()?;
// The output tensor will have shape [B, _, S + 1, V]
@@ -70,9 +70,7 @@ fn main() -> ort::Result<()> {
let token = probabilities[rng.gen_range(0..=TOP_K)].0 as i64;
// Add our generated token to the input sequence
let mut vec = tokens.to_vec();
vec.push(token);
*Arc::make_mut(&mut tokens) = vec.into_boxed_slice();
tokens.push(token);
let token_str = tokenizer.decode(&[token as u32], true).unwrap();
print!("{}", token_str);

View File

@@ -7,7 +7,8 @@ use ndarray::{Array1, ArrayViewD, Axis, array, concatenate, s};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
session::{Session, builder::GraphOptimizationLevel},
value::TensorRef
};
use rand::Rng;
use tokenizers::Tokenizer;
@@ -54,7 +55,7 @@ fn main() -> ort::Result<()> {
for _ in 0..GEN_TOKENS {
let array = tokens.view().insert_axis(Axis(0)).insert_axis(Axis(1));
let outputs = session.run(inputs![array]?)?;
let outputs = session.run(inputs![TensorRef::from_array_view(array)?])?;
let generated_tokens: ArrayViewD<f32> = outputs["output1"].try_extract_tensor()?;
// Collect and sort logits

View File

@@ -4,7 +4,7 @@ use std::{ops::Mul, path::Path};
use image::{GenericImageView, ImageBuffer, Rgba, imageops::FilterType};
use ndarray::Array;
use ort::{execution_providers::CUDAExecutionProvider, inputs, session::Session};
use ort::{execution_providers::CUDAExecutionProvider, inputs, session::Session, value::TensorRef};
use show_image::{AsImageView, WindowOptions, event};
#[show_image::main]
@@ -31,7 +31,7 @@ fn main() -> ort::Result<()> {
input[[0, 2, y, x]] = (b as f32 - 127.5) / 127.5;
}
let outputs = model.run(inputs!["input" => input.view()]?)?;
let outputs = model.run(inputs!["input" => TensorRef::from_array_view(input.view())?])?;
let output = outputs["output"].try_extract_tensor::<f32>()?;

View File

@@ -4,7 +4,10 @@ use std::{path::Path, time::Instant};
use anyhow::Result;
use image::DynamicImage;
use ndarray::{Array, Array2, Array3, Array4, ArrayView, Ix3, Ix4, s};
use ort::{session::Session, value::Tensor};
use ort::{
session::Session,
value::{Tensor, TensorRef}
};
use tokenizers::Tokenizer;
const VISION_MODEL_NAME: &str = "phi-3-v-128k-instruct-vision.onnx";
@@ -31,11 +34,10 @@ fn get_image_embedding(vision_model: &Session, img: &Option<DynamicImage>) -> Re
pixel_values = result.pixel_values.shape(),
image_sizes = result.image_sizes.shape(),
);
let model_inputs = ort::inputs![
"pixel_values" => result.pixel_values,
"image_sizes" => result.image_sizes,
]?;
let outputs = vision_model.run(model_inputs)?;
let outputs = vision_model.run(ort::inputs![
"pixel_values" => Tensor::from_array(result.pixel_values)?,
"image_sizes" => Tensor::from_array(result.image_sizes)?,
])?;
let predictions_view: ArrayView<f32, _> = outputs["visual_features"].try_extract_tensor::<f32>()?;
predictions_view.into_dimensionality::<Ix3>()?.to_owned()
} else {
@@ -45,10 +47,9 @@ fn get_image_embedding(vision_model: &Session, img: &Option<DynamicImage>) -> Re
}
fn get_text_embedding(text_embedding_model: &Session, input_ids: &Array2<i64>) -> Result<Array3<f32>> {
let model_inputs = ort::inputs![
"input_ids" => input_ids.to_owned(),
]?;
let outputs = text_embedding_model.run(model_inputs)?;
let outputs = text_embedding_model.run(ort::inputs![
"input_ids" => TensorRef::from_array_view(input_ids)?,
])?;
let inputs_embeds_view: ArrayView<f32, _> = outputs["inputs_embeds"].try_extract_tensor::<f32>()?;
let inputs_embeds = inputs_embeds_view.into_dimensionality::<Ix3>()?.to_owned();
Ok(inputs_embeds)
@@ -144,12 +145,12 @@ pub async fn generate_text(
// Prepare model inputs
let model_inputs = {
let mut model_inputs = ort::inputs![
"inputs_embeds" => next_inputs_embeds.clone(),
"attention_mask" => attention_mask.clone(),
]?;
"inputs_embeds" => TensorRef::from_array_view(&next_inputs_embeds)?,
"attention_mask" => TensorRef::from_array_view(&attention_mask)?,
];
for i in 0..32 {
model_inputs.push((format!("past_key_values.{}.key", i).into(), Tensor::from_array(past_key_values[i * 2].view())?.into()));
model_inputs.push((format!("past_key_values.{}.value", i).into(), Tensor::from_array(past_key_values[i * 2 + 1].view())?.into()));
model_inputs.push((format!("past_key_values.{}.key", i).into(), TensorRef::from_array_view(&past_key_values[i * 2])?.into()));
model_inputs.push((format!("past_key_values.{}.value", i).into(), TensorRef::from_array_view(&past_key_values[i * 2 + 1])?.into()));
}
model_inputs
};

View File

@@ -1,10 +1,11 @@
use std::path::Path;
use ndarray::{Array2, Axis, Ix2};
use ndarray::{Axis, Ix2};
use ort::{
Error,
execution_providers::CUDAExecutionProvider,
session::{Session, builder::GraphOptimizationLevel}
session::{Session, builder::GraphOptimizationLevel},
value::TensorRef
};
use tokenizers::Tokenizer;
@@ -45,11 +46,11 @@ fn main() -> ort::Result<()> {
let mask: Vec<i64> = encodings.iter().flat_map(|e| e.get_attention_mask().iter().map(|i| *i as i64)).collect();
// Convert our flattened arrays into 2-dimensional tensors of shape [N, L].
let a_ids = Array2::from_shape_vec([inputs.len(), padded_token_length], ids).unwrap();
let a_mask = Array2::from_shape_vec([inputs.len(), padded_token_length], mask).unwrap();
let a_ids = TensorRef::from_array_view(([inputs.len(), padded_token_length], &*ids))?;
let a_mask = TensorRef::from_array_view(([inputs.len(), padded_token_length], &*mask))?;
// Run the model.
let outputs = session.run(ort::inputs![a_ids, a_mask]?)?;
let outputs = session.run(ort::inputs![a_ids, a_mask])?;
// Extract our embeddings tensor and convert it to a strongly-typed 2-dimensional array.
let embeddings = outputs[1].try_extract_tensor::<f32>()?.into_dimensionality::<Ix2>().unwrap();

View File

@@ -5,12 +5,12 @@ use std::{
};
use kdam::BarExt;
use ndarray::{Array1, Array2, ArrayViewD, Axis, concatenate, s};
use ort::{
execution_providers::CUDAExecutionProvider,
memory::Allocator,
session::{Session, builder::SessionBuilder},
training::{CheckpointStrategy, Trainer, TrainerCallbacks, TrainerControl, TrainerState, TrainingArguments}
training::{CheckpointStrategy, Trainer, TrainerCallbacks, TrainerControl, TrainerState, TrainingArguments},
value::{Tensor, TensorRef}
};
use rand::RngCore;
use tokenizers::Tokenizer;
@@ -94,10 +94,10 @@ fn main() -> ort::Result<()> {
.unwrap();
}
Ok((
ort::inputs![Array2::<i64>::from_shape_vec([BATCH_SIZE, SEQUENCE_LENGTH], input_buffer.iter().map(|c| *c as i64).collect()).unwrap()]?,
ort::inputs![Array1::<i64>::from_shape_vec([BATCH_SIZE * SEQUENCE_LENGTH], label_buffer.iter().map(|c| *c as i64).collect()).unwrap()]?
))
let inputs = Tensor::from_array(([BATCH_SIZE, SEQUENCE_LENGTH], input_buffer.iter().map(|c| *c as i64).collect::<Vec<i64>>()))?;
let labels = Tensor::from_array(([BATCH_SIZE * SEQUENCE_LENGTH], label_buffer.iter().map(|c| *c as i64).collect::<Vec<i64>>()))?;
Ok((ort::inputs![inputs], ort::inputs![labels]))
};
trainer.train(
@@ -115,26 +115,19 @@ fn main() -> ort::Result<()> {
let mut stdout = std::io::stdout();
let tokens = tokenizer.encode("<|endoftext|>", false).unwrap();
let tokens = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>();
let mut tokens = Array1::from_iter(tokens.iter().cloned());
let mut tokens = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>();
for _ in 0..50 {
let array = tokens.view().insert_axis(Axis(0));
let outputs = session.run(ort::inputs![array]?)?;
let generated_tokens: ArrayViewD<f32> = outputs["probs"].try_extract_tensor()?;
let input = TensorRef::from_array_view((vec![1, 1, tokens.len() as i64], tokens.as_slice()))?;
let outputs = session.run(ort::inputs![input])?;
let (dim, probabilities) = outputs["probs"].try_extract_raw_tensor()?;
let probabilities = &mut generated_tokens
.slice(s![-1, ..])
.to_owned()
.iter()
.cloned()
.enumerate()
.collect::<Vec<_>>();
let (seq_len, vocab_size) = (dim[2] as usize, dim[3] as usize);
let mut probabilities: Vec<(usize, f32)> = probabilities[(seq_len - 1) * vocab_size..].iter().copied().enumerate().collect();
probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
let token = probabilities[0].0;
tokens = concatenate![Axis(0), tokens, ndarray::array![token.try_into().unwrap()]];
let token = probabilities[0].0 as i64;
tokens.push(token);
let token_str = tokenizer.decode(&[token as _], false).unwrap();
print!("{}", token_str);

View File

@@ -5,12 +5,12 @@ use std::{
};
use kdam::BarExt;
use ndarray::{Array1, Array2, ArrayViewD, Axis, concatenate, s};
use ort::{
execution_providers::CUDAExecutionProvider,
memory::Allocator,
session::{Session, builder::SessionBuilder},
training::{Checkpoint, Trainer}
training::{Checkpoint, Trainer},
value::{Tensor, TensorRef}
};
use rand::RngCore;
use tokenizers::Tokenizer;
@@ -83,10 +83,10 @@ fn main() -> ort::Result<()> {
.unwrap();
}
let inputs = Array2::<i64>::from_shape_vec([BATCH_SIZE, SEQUENCE_LENGTH], input_buffer.iter().map(|c| *c as i64).collect()).unwrap();
let labels = Array1::<i64>::from_shape_vec([BATCH_SIZE * SEQUENCE_LENGTH], label_buffer.iter().map(|c| *c as i64).collect()).unwrap();
let inputs = Tensor::from_array(([BATCH_SIZE, SEQUENCE_LENGTH], input_buffer.iter().map(|c| *c as i64).collect::<Vec<i64>>()))?;
let labels = Tensor::from_array(([BATCH_SIZE * SEQUENCE_LENGTH], label_buffer.iter().map(|c| *c as i64).collect::<Vec<i64>>()))?;
let outputs = trainer.step(ort::inputs![inputs.view()]?, ort::inputs![labels.view()]?)?;
let outputs = trainer.step(ort::inputs![inputs], ort::inputs![labels])?;
let loss = outputs[0].try_extract_scalar::<f32>()?;
pb.set_postfix(format!("loss={loss:.3}"));
pb.update(1).unwrap();
@@ -107,26 +107,19 @@ fn main() -> ort::Result<()> {
let mut stdout = std::io::stdout();
let tokens = tokenizer.encode("<|endoftext|>", false).unwrap();
let tokens = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>();
let mut tokens = Array1::from_iter(tokens.iter().cloned());
let mut tokens = tokens.get_ids().iter().map(|i| *i as i64).collect::<Vec<_>>();
for _ in 0..50 {
let array = tokens.view().insert_axis(Axis(0));
let outputs = session.run(ort::inputs![array]?)?;
let generated_tokens: ArrayViewD<f32> = outputs["probs"].try_extract_tensor()?;
let input = TensorRef::from_array_view((vec![1, 1, tokens.len() as i64], tokens.as_slice()))?;
let outputs = session.run(ort::inputs![input])?;
let (dim, probabilities) = outputs["probs"].try_extract_raw_tensor()?;
let probabilities = &mut generated_tokens
.slice(s![-1, ..])
.to_owned()
.iter()
.cloned()
.enumerate()
.collect::<Vec<_>>();
let (seq_len, vocab_size) = (dim[2] as usize, dim[3] as usize);
let mut probabilities: Vec<(usize, f32)> = probabilities[(seq_len - 1) * vocab_size..].iter().copied().enumerate().collect();
probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
let token = probabilities[0].0;
tokens = concatenate![Axis(0), tokens, ndarray::array![token.try_into().unwrap()]];
let token = probabilities[0].0 as i64;
tokens.push(token);
let token_str = tokenizer.decode(&[token as _], false).unwrap();
print!("{}", token_str);

View File

@@ -7,7 +7,8 @@ use ndarray::{Array, Axis, s};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, SessionOutputs}
session::{Session, SessionOutputs},
value::TensorRef
};
use raqote::{DrawOptions, DrawTarget, LineJoin, PathBuilder, SolidSource, Source, StrokeStyle};
use show_image::{AsImageView, WindowOptions, event};
@@ -66,7 +67,7 @@ fn main() -> ort::Result<()> {
let model = Session::builder()?.commit_from_url(YOLOV8M_URL)?;
// Run YOLOv8 inference
let outputs: SessionOutputs = model.run(inputs!["images" => input.view()]?)?;
let outputs: SessionOutputs = model.run(inputs!["images" => TensorRef::from_array_view(&input)?])?;
let output = outputs["output0"].try_extract_tensor::<f32>()?.t().into_owned();
let mut boxes = Vec::new();