Skip to content

Commit

Permalink
Added nomic embed v1.5 support with small dim for fast local ops
Browse files Browse the repository at this point in the history
  • Loading branch information
AndriyMulyar committed Feb 15, 2024
1 parent c09ae1c commit 3be1aab
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions dsp/modules/sentence_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import openai
import nomic


class BaseSentenceVectorizer(abc.ABC):
Expand Down Expand Up @@ -158,4 +159,38 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
embeddings_list.extend(cur_batch_embeddings)

embeddings = np.array(embeddings_list, dtype=np.float32)
return embeddings


class NomicVectorizer(BaseSentenceVectorizer):
    """Vectorizer backed by the Nomic Embedding API.

    Texts are extracted from the given examples and sent to
    ``nomic.embed.text``; the returned embeddings are packed into a
    ``float32`` NumPy array.

    Args:
        model: Name of the Nomic embedding model. Changing it is not
            recommended. More about variable sized output models at
            https://docs.nomic.ai/atlas/guides/embeddings
        dimensionality: Requested embedding size. Defaults to 256 for fast
            local ops.
        task_type: Task type forwarded to the embedding endpoint.
        api_key: Nomic API key. When given, it is used to log in via
            ``nomic.login``. When omitted, no login is performed here; the
            key is then expected to come from the ``NOMIC_API_KEY`` env
            variable (NOTE(review): that lookup is not done in this class --
            confirm the ``nomic`` client handles it).
    """

    def __init__(
        self,
        model: str = "nomic-embed-text-v1.5",
        dimensionality: int = 256,
        task_type: str = "search_document",
        api_key: Optional[str] = None,
    ):
        self.model = model
        self.dimensionality = dimensionality
        self.task_type = task_type

        # Only authenticate explicitly when the caller supplied a key.
        if api_key:
            nomic.login(api_key)

    def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
        """Embed ``inp_examples`` and return the vectors as a float32 array.

        The result presumably has one row per example and ``dimensionality``
        columns -- TODO confirm against the Nomic API response.
        """
        request_kwargs = {
            "texts": self._extract_text_from_examples(inp_examples),
            "task_type": self.task_type,
            "model": self.model,
            "dimensionality": self.dimensionality,
        }
        response = nomic.embed.text(**request_kwargs)

        # The API response carries the vectors under the 'embeddings' key.
        return np.array(response["embeddings"], dtype=np.float32)

0 comments on commit 3be1aab

Please sign in to comment.