refactor!: undo The Flattening

This commit is contained in:
Carson M.
2024-11-12 22:19:15 -06:00
parent 17fe990bdf
commit d4f82fc50e
67 changed files with 521 additions and 343 deletions

View File

@@ -11,7 +11,11 @@ use axum::{
};
use futures::Stream;
use ndarray::{Array1, ArrayViewD, Axis, array, concatenate, s};
use ort::{CUDAExecutionProvider, GraphOptimizationLevel, Session, inputs};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
};
use rand::Rng;
use tokenizers::Tokenizer;
use tokio::net::TcpListener;

View File

@@ -1,10 +1,15 @@
use std::{ops::Mul, path::Path};
use cudarc::driver::{sys::CUdeviceptr, CudaDevice, DevicePtr, DevicePtrMut};
use image::{imageops::FilterType, GenericImageView, ImageBuffer, Rgba};
use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut, sys::CUdeviceptr};
use image::{GenericImageView, ImageBuffer, Rgba, imageops::FilterType};
use ndarray::Array;
use ort::{AllocationDevice, AllocatorType, CUDAExecutionProvider, ExecutionProvider, MemoryInfo, MemoryType, Session, TensorRefMut};
use show_image::{event, AsImageView, WindowOptions};
use ort::{
execution_providers::{CUDAExecutionProvider, ExecutionProvider},
memory::{AllocationDevice, AllocatorType, MemoryInfo, MemoryType},
session::Session,
value::TensorRefMut
};
use show_image::{AsImageView, WindowOptions, event};
#[show_image::main]
fn main() -> anyhow::Result<()> {
@@ -66,13 +71,10 @@ fn main() -> anyhow::Result<()> {
let window = show_image::context()
.run_function_wait(move |context| -> Result<_, String> {
let mut window = context
.create_window(
"ort + modnet",
WindowOptions {
size: Some([img_width, img_height]),
..WindowOptions::default()
}
)
.create_window("ort + modnet", WindowOptions {
size: Some([img_width, img_height]),
..WindowOptions::default()
})
.map_err(|e| e.to_string())?;
window.set_image("photo", &output.as_image_view().map_err(|e| e.to_string())?);
Ok(window.proxy())

View File

@@ -1,5 +1,13 @@
use ndarray::Array2;
use ort::{Kernel, KernelAttributes, KernelContext, Operator, OperatorDomain, OperatorInput, OperatorOutput, Session, TensorElementType};
use ort::{
operator::{
Operator, OperatorDomain,
io::{OperatorInput, OperatorOutput},
kernel::{Kernel, KernelAttributes, KernelContext}
},
session::Session,
tensor::TensorElementType
};
struct CustomOpOne;
struct CustomOpOneKernel;

View File

@@ -4,7 +4,11 @@ use std::{
sync::Arc
};
use ort::{CUDAExecutionProvider, GraphOptimizationLevel, Session, inputs};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
};
use rand::Rng;
use tokenizers::Tokenizer;

View File

@@ -4,7 +4,11 @@ use std::{
};
use ndarray::{Array1, ArrayViewD, Axis, array, concatenate, s};
use ort::{CUDAExecutionProvider, GraphOptimizationLevel, Session, inputs};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, builder::GraphOptimizationLevel}
};
use rand::Rng;
use tokenizers::Tokenizer;

View File

@@ -1,6 +1,6 @@
use std::{env, process};
use ort::{Session, TensorElementType, ValueType};
use ort::{session::Session, tensor::TensorElementType, value::ValueType};
fn display_element_type(t: TensorElementType) -> &'static str {
match t {

View File

@@ -4,7 +4,7 @@ use std::{ops::Mul, path::Path};
use image::{GenericImageView, ImageBuffer, Rgba, imageops::FilterType};
use ndarray::Array;
use ort::{CUDAExecutionProvider, Session, inputs};
use ort::{execution_providers::CUDAExecutionProvider, inputs, session::Session};
use show_image::{AsImageView, WindowOptions, event};
#[show_image::main]

View File

@@ -18,7 +18,7 @@ pub const NUM_CROPS: usize = 1;
pub const _NUM_IMG_TOKENS: usize = 144;
const OPENAI_CLIP_MEAN: [f32; 3] = [0.48145466, 0.4578275, 0.40821073];
const OPENAI_CLIP_STD: [f32; 3] = [0.26862954, 0.26130258, 0.27577711];
const OPENAI_CLIP_STD: [f32; 3] = [0.26862954, 0.2613026, 0.2757771];
pub struct Phi3VImageProcessor {
num_crops: usize,

View File

@@ -4,12 +4,12 @@ use std::{path::Path, time::Instant};
use anyhow::Result;
use image::DynamicImage;
use ndarray::{Array, Array2, Array3, Array4, ArrayView, Ix3, Ix4, s};
use ort::{Session, Tensor};
use ort::{session::Session, value::Tensor};
use tokenizers::Tokenizer;
const VISION_MODEL_NAME: &'static str = "phi-3-v-128k-instruct-vision.onnx";
const TEXT_EMBEDDING_MODEL_NAME: &'static str = "phi-3-v-128k-instruct-text-embedding.onnx";
const GENERATION_MODEL_NAME: &'static str = "phi-3-v-128k-instruct-text.onnx";
const VISION_MODEL_NAME: &str = "phi-3-v-128k-instruct-vision.onnx";
const TEXT_EMBEDDING_MODEL_NAME: &str = "phi-3-v-128k-instruct-text-embedding.onnx";
const GENERATION_MODEL_NAME: &str = "phi-3-v-128k-instruct-text.onnx";
const MAX_LENGTH: usize = 1000; // max length of the generated text
const EOS_TOKEN_ID: i64 = 32007; // <|end|>
@@ -37,8 +37,7 @@ fn get_image_embedding(vision_model: &Session, img: &Option<DynamicImage>) -> Re
]?;
let outputs = vision_model.run(model_inputs)?;
let predictions_view: ArrayView<f32, _> = outputs["visual_features"].try_extract_tensor::<f32>()?;
let predictions = predictions_view.into_dimensionality::<Ix3>()?.to_owned();
predictions
predictions_view.into_dimensionality::<Ix3>()?.to_owned()
} else {
Array::zeros((1, 0, 0))
};
@@ -71,7 +70,7 @@ fn merge_text_and_image_embeddings(
// Insert visual features
combined_embeds
.slice_mut(s![.., image_token_position..(image_token_position + visual_features.shape()[1]), ..])
.assign(&visual_features);
.assign(visual_features);
// Copy the remaining text embeddings
combined_embeds
@@ -109,13 +108,13 @@ pub async fn generate_text(
text: &str
) -> Result<()> {
let (inputs_embeds, mut attention_mask) = {
let visual_features = get_image_embedding(&vision_model, &image)?;
let prompt = format_chat_template(&image, text);
let visual_features = get_image_embedding(vision_model, image)?;
let prompt = format_chat_template(image, text);
let encoding = tokenizer.encode(prompt, true).map_err(|e| anyhow::anyhow!("Error encoding: {:?}", e))?;
let input_ids: Vec<i64> = encoding.get_ids().iter().map(|&id| id as i64).collect();
let input_ids: Array2<i64> = Array2::from_shape_vec((1, input_ids.len()), input_ids)?;
let mut inputs_embeds: Array3<f32> = get_text_embedding(&text_embedding_model, &input_ids)?;
let mut inputs_embeds: Array3<f32> = get_text_embedding(text_embedding_model, &input_ids)?;
let attention_mask: Vec<i64> = encoding.get_attention_mask().iter().map(|&mask| mask as i64).collect();
let mut attention_mask: Array2<i64> = Array2::from_shape_vec((1, attention_mask.len()), attention_mask)?;
@@ -190,7 +189,7 @@ pub async fn generate_text(
// Update current_embeds, attention_mask, and past_key_values for the next iteration
let new_token_id = Array2::from_elem((1, 1), next_token_id);
next_inputs_embeds = get_text_embedding(&text_embedding_model, &new_token_id)?;
next_inputs_embeds = get_text_embedding(text_embedding_model, &new_token_id)?;
attention_mask = Array2::ones((1, attention_mask.shape()[1] + 1));
for i in 0..32 {
past_key_values[i * 2] = model_outputs[format!("present.{}.key", i)]
@@ -213,15 +212,9 @@ async fn main() -> Result<()> {
let data_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("data");
let tokenizer = Tokenizer::from_file(data_dir.join("tokenizer.json")).map_err(|e| anyhow::anyhow!("Error loading tokenizer: {:?}", e))?;
let vision_model = Session::builder()?
.with_execution_providers([ort::CPUExecutionProvider::default().build()])?
.commit_from_file(data_dir.join(VISION_MODEL_NAME))?;
let text_embedding_model = Session::builder()?
.with_execution_providers([ort::CPUExecutionProvider::default().build()])?
.commit_from_file(data_dir.join(TEXT_EMBEDDING_MODEL_NAME))?;
let generation_model = Session::builder()?
.with_execution_providers([ort::CPUExecutionProvider::default().build()])?
.commit_from_file(data_dir.join(GENERATION_MODEL_NAME))?;
let vision_model = Session::builder()?.commit_from_file(data_dir.join(VISION_MODEL_NAME))?;
let text_embedding_model = Session::builder()?.commit_from_file(data_dir.join(TEXT_EMBEDDING_MODEL_NAME))?;
let generation_model = Session::builder()?.commit_from_file(data_dir.join(GENERATION_MODEL_NAME))?;
// Generate text from text
let image: Option<DynamicImage> = None;

View File

@@ -1,7 +1,11 @@
use std::path::Path;
use ndarray::{Array2, Axis, Ix2};
use ort::{CUDAExecutionProvider, Error, GraphOptimizationLevel, Session};
use ort::{
Error,
execution_providers::CUDAExecutionProvider,
session::{Session, builder::GraphOptimizationLevel}
};
use tokenizers::Tokenizer;
/// Example usage of a text embedding model like Sentence Transformers' `all-mini-lm-l6` model for semantic textual

View File

@@ -6,7 +6,12 @@ use std::{
use kdam::BarExt;
use ndarray::{Array1, Array2, ArrayViewD, Axis, concatenate, s};
use ort::{Allocator, CUDAExecutionProvider, CheckpointStrategy, Session, SessionBuilder, Trainer, TrainerCallbacks, TrainingArguments};
use ort::{
execution_providers::CUDAExecutionProvider,
memory::Allocator,
session::{Session, builder::SessionBuilder},
training::{CheckpointStrategy, Trainer, TrainerCallbacks, TrainerControl, TrainerState, TrainingArguments}
};
use rand::RngCore;
use tokenizers::Tokenizer;
@@ -26,7 +31,7 @@ impl LoggerCallback {
}
impl TrainerCallbacks for LoggerCallback {
fn train_step(&mut self, train_loss: f32, state: &ort::TrainerState, _: &mut ort::TrainerControl<'_>) -> ort::Result<()> {
fn train_step(&mut self, train_loss: f32, state: &TrainerState, _: &mut TrainerControl<'_>) -> ort::Result<()> {
self.progress_bar.total = state.max_steps;
self.progress_bar.set_postfix(format!("loss={train_loss:.3}"));
let _ = self.progress_bar.update_to(state.iter_step);

View File

@@ -6,7 +6,12 @@ use std::{
use kdam::BarExt;
use ndarray::{Array1, Array2, ArrayViewD, Axis, concatenate, s};
use ort::{Allocator, CUDAExecutionProvider, Checkpoint, Session, SessionBuilder, Trainer};
use ort::{
execution_providers::CUDAExecutionProvider,
memory::Allocator,
session::{Session, builder::SessionBuilder},
training::{Checkpoint, Trainer}
};
use rand::RngCore;
use tokenizers::Tokenizer;

View File

@@ -4,7 +4,11 @@ use std::path::Path;
use image::{GenericImageView, imageops::FilterType};
use ndarray::{Array, Axis, s};
use ort::{CUDAExecutionProvider, Session, SessionOutputs, inputs};
use ort::{
execution_providers::CUDAExecutionProvider,
inputs,
session::{Session, SessionOutputs}
};
use raqote::{DrawOptions, DrawTarget, LineJoin, PathBuilder, SolidSource, Source, StrokeStyle};
use show_image::{AsImageView, WindowOptions, event};