Skip to content

Commit

Permalink
fix: Fix tokenizers
Browse files Browse the repository at this point in the history
  • Loading branch information
whiterabbit1983 committed May 3, 2024
1 parent acdf105 commit 70af41a
Showing 1 changed file with 14 additions and 15 deletions.
29 changes: 14 additions & 15 deletions agents-api/agents_api/embed_models_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
from typing import TypedDict, Any
from dataclasses import dataclass
from transformers import PreTrainedTokenizer
from tokenizers import Tokenizer
from agents_api.clients.model import openai_client
from agents_api.clients.embed import embed
from agents_api.exceptions import (
Expand All @@ -11,7 +11,6 @@
UnknownTokenizerError,
)


def normalize_l2(x):
x = np.array(x)
if x.ndim == 1:
Expand Down Expand Up @@ -73,9 +72,9 @@ def preprocess(self, inputs: list[EmbeddingInput]) -> list[str]:

async def embed(
self, inputs: list[EmbeddingInput]
) -> list[np.NDArray | list[float]]:
) -> list[np.ndarray | list[float]]:
input = self.preprocess(inputs)
embeddings: list[np.NDArray | list[float]] = []
embeddings: list[np.ndarray | list[float]] = []

if self.embedding_provider == "julep":
embeddings = await embed(input)
Expand All @@ -91,8 +90,8 @@ async def embed(
return self.normalize(embeddings)

def normalize(
self, embeddings: list[np.NDArray | list[float]]
) -> list[np.NDArray | list[float]]:
self, embeddings: list[np.ndarray | list[float]]
) -> list[np.ndarray | list[float]]:
return [
(
e
Expand All @@ -104,44 +103,44 @@ def normalize(


_embedding_model_registry = {
"text-embeddings-3-small": EmbeddingModel(
"text-embedding-3-small": EmbeddingModel(
embedding_provider="openai",
embedding_model_name="text-embeddings-3-small",
embedding_model_name="text-embedding-3-small",
original_embedding_dimensions=1024,
output_embedding_dimensions=1024,
context_window=8192,
tokenizer=tiktoken.encoding_for_model("text-embeddings-3-small"),
tokenizer=tiktoken.encoding_for_model("text-embedding-3-small"),
),
"text-embeddings-3-large": EmbeddingModel(
"text-embedding-3-large": EmbeddingModel(
embedding_provider="openai",
embedding_model_name="text-embeddings-3-large",
embedding_model_name="text-embedding-3-large",
original_embedding_dimensions=1024,
output_embedding_dimensions=1024,
context_window=8192,
tokenizer=tiktoken.encoding_for_model("text-embeddings-3-large"),
tokenizer=tiktoken.encoding_for_model("text-embedding-3-large"),
),
"Alibaba-NLP/gte-large-en-v1.5": EmbeddingModel(
embedding_provider="julep",
embedding_model_name="Alibaba-NLP/gte-large-en-v1.5",
original_embedding_dimensions=1024,
output_embedding_dimensions=1024,
context_window=8192,
tokenizer=PreTrainedTokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"),
tokenizer=Tokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"),
),
"BAAI/bge-m3": EmbeddingModel(
embedding_provider="julep",
embedding_model_name="BAAI/bge-m3",
original_embedding_dimensions=1024,
output_embedding_dimensions=1024,
context_window=8192,
tokenizer=PreTrainedTokenizer.from_pretrained("BAAI/bge-m3"),
tokenizer=Tokenizer.from_pretrained("BAAI/bge-m3"),
),
"BAAI/llm-embedder": EmbeddingModel(
embedding_provider="julep",
embedding_model_name="BAAI/llm-embedder",
original_embedding_dimensions=1024,
output_embedding_dimensions=1024,
context_window=8192,
tokenizer=PreTrainedTokenizer.from_pretrained("BAAI/llm-embedder"),
tokenizer=Tokenizer.from_pretrained("BAAI/llm-embedder"),
),
}

0 comments on commit 70af41a

Please sign in to comment.