Add tokenizer improvements via Singleton class and estimation (#3072)

* Add tokenizer improvements via Singleton class
linting

* dev build

* Estimation fallback when string exceeds a fixed byte size

* Add notice to tiktoken on backend
This commit is contained in:
Timothy Carambat
2025-01-30 17:55:03 -08:00
committed by GitHub
parent e1af72daa7
commit d1ca16f7f8
19 changed files with 125 additions and 29 deletions

View File

@@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
token_count_estimate: tokenizeString(doc.pageContent),
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`