Skip to content

Commit

Permalink
Merge pull request #4320 from vespa-engine/arnej/add-nomic-ai-modernbert
Browse files Browse the repository at this point in the history
add nomic-ai modernbert
  • Loading branch information
arnej27959 authored Jan 21, 2025
2 parents 7f617f0 + b194364 commit 2ec50c9
Show file tree
Hide file tree
Showing 7 changed files with 401 additions and 5 deletions.
2 changes: 1 addition & 1 deletion lib/app_generator/container.rb
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def to_xml(indent="")
end

def dump_xml(indent="")
XmlHelper.new(indent).tag(@tag, @attrs).to_xml(@value).to_s
XmlHelper.new(indent).tag_always(@tag, @attrs).to_xml(@value).to_s
end
end

Expand Down
12 changes: 12 additions & 0 deletions tests/search/embedding/10-docs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
{ "put": "id:x:doc::1", "fields": { "text": "Hello world" } },
{ "put": "id:x:doc::2", "fields": { "text": "To transport goods on water, use a boat" } },
{ "put": "id:x:doc::3", "fields": { "text": "Human interaction is often done by talking" } },
{ "put": "id:x:doc::4", "fields": { "text": "The galaxy is filled with stars" } },
{ "put": "id:x:doc::5", "fields": { "text": "TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten" } },
{ "put": "id:x:doc::6", "fields": { "text": "GELU (Gaussian Error Linear Unit) is an activation function that can be approximated using tanh" } },
{ "put": "id:x:doc::7", "fields": { "text": "Washtenaw Community College (WCC) is a public community college in Ann Arbor Charter Township, Michigan." } },
{ "put": "id:x:doc::8", "fields": { "text": "The Nintendo Entertainment System (NES) is an 8-bit home video game console produced by Nintendo. It was first released in Japan on July 15, 1983, as the Family Computer (Famicom)." } },
{ "put": "id:x:doc::9", "fields": { "text": "Written in 1787, ratified in 1788, and in operation since 1789, the United States Constitution is the world's longest surviving written charter of government. Its first three words – “We The People” – affirm that the government of the United States exists to serve its citizens." } },
{ "put": "id:x:doc::A", "fields": { "text": "When the Medical Research Council formed in Britain in 1913, it initially focused on tuberculosis research." } }
]
81 changes: 77 additions & 4 deletions tests/search/embedding/embedding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ def huggingface_embedder_binarization_component
param('pooling-strategy', 'cls')
end

# Embedder component 'nomicmb': Vespa's hugging-face-embedder wrapping the
# nomic-ai/modernbert-embed-base ONNX model and its HF tokenizer, both
# fetched from data.vespa-cloud.com. transformer-token-type-ids is set
# empty (the model presumably takes no token-type-ids input -- confirm
# against the exported ONNX graph), output is read from the
# 'token_embeddings' tensor, and the context window is 8192 tokens.
# The 'prepend' params add the task prefixes the nomic models expect for
# query vs. document text (cf. gen-qa-embeddings.py, which uses
# 'search_query: ' / 'search_document: ' when generating expectations).
def nomic_modernbert_component
Component.new('nomicmb').
type('hugging-face-embedder').
param('transformer-model', '', { 'url' => 'https://data.vespa-cloud.com/onnx_models/nomic-ai-modernbert-embed-base/model.onnx' }).
param('transformer-token-type-ids').
param('tokenizer-model', '', { 'url' => 'https://data.vespa-cloud.com/onnx_models/nomic-ai-modernbert-embed-base/tokenizer.json' }).
param('transformer-output', 'token_embeddings').
param('max-tokens', 8192).
param('prepend', [ ComponentParam::new('query', 'search_query:', {}),
ComponentParam::new('document', 'search_document:', {}) ] )
end

def colbert_embedder_component
Component.new('colbert').
type('colbert-embedder').
Expand Down Expand Up @@ -124,6 +136,20 @@ def test_huggingface_embedding
verify_huggingface_embedding
end

# End-to-end test of the nomic modernbert embedder: deploy an app with the
# 'nomicmb' component (3g fixed JVM heap -- presumably sized for in-container
# ONNX evaluation of the model), feed the ten sample documents, then compare
# query- and document-side embeddings against reference values precomputed
# with the original PyTorch model (nomic-ai/expect.json, generated by
# nomic-ai/gen-qa-embeddings.py).
def test_modernbert_embedding
deploy_app(
SearchApp.new.
container(
default_container_setup.
component(nomic_modernbert_component).
jvmoptions('-Xms3g -Xmx3g')).
sd(selfdir + 'nomic-ai/schemas/doc.sd').
indexing_cluster('default').indexing_chain('indexing'))
start_vespa
feed_and_wait_for_docs('doc', 10, :file => selfdir + '10-docs.json')
verify_embeddings_with('nomic-ai/expect.json', 'nomicmb')
end

def test_huggingface_embedding_binary_quantization
deploy_app(
SearchApp.new.
Expand Down Expand Up @@ -256,6 +282,53 @@ def verify_huggingface_tokens
end


# Assert that one embedding component matches within an absolute
# tolerance of 1e-5. Negative idx addresses from the end of the vectors.
def check_val_by_idx(expected_v, actual_v, idx)
expval, actval = expected_v[idx], actual_v[idx]
close_enough = (expval - actval).abs < 1e-5
assert(close_enough, "#{expval} != #{actval} at index #{idx}")
end

# Compare the first fixlen and last fixlen components of two vectors
# (indices 0..fixlen-1 and -1..-fixlen), element by element.
def check_prefix_suffix(expected_v, actual_v, fixlen)
fixlen.times do |offset|
check_val_by_idx(expected_v, actual_v, offset)
check_val_by_idx(expected_v, actual_v, -(offset + 1))
end
end

# Verify embeddings produced at query and indexing time by +embedder+
# against expectations stored in +savedFile+: a JSON array of records
# {kw, qtext, q_emb, d_emb} generated offline (see gen-qa-embeddings.py).
# q_emb/d_emb hold only the first 6 components (index 5 zeroed as a
# separator) and the last 5 components of each full 768-dim vector.
def verify_embeddings_with(savedFile, embedder = "modernbert")
wanted = JSON.parse(File.read(selfdir + savedFile))
wanted.each do |want|
keyword = '"' + want['kw'] + '"'
puts "Looking for #{keyword}"
qtext = want['qtext']
q_emb = want['q_emb']
d_emb = want['d_emb']
# Text search must yield exactly one hit so we know which document's
# stored embedding we are comparing against.
yql = "select+*+from+sources+*+where+text+contains+#{keyword}"
qi = "input.query(embedding)=embed(#{embedder},@myqtext)"
result = search("?yql=#{yql}&#{qi}&myqtext=#{qtext}").json
assert_equal(1, result['root']['children'].size)
hitfields = result['root']['children'][0]['fields']
queryFeature = hitfields['summaryfeatures']['query(embedding)']
documentFeature = hitfields['summaryfeatures']['attribute(embedding)']

# Both embeddings must have the model's full dimensionality.
expected_length = 768
assert_equal(expected_length, queryFeature['values'].length)
assert_equal(expected_length, documentFeature['values'].length)

# Compare 5 head + 5 tail components against the saved reference values.
dfv = documentFeature['values']
check_prefix_suffix(d_emb, dfv, 5)

qfv = queryFeature['values']
check_prefix_suffix(q_emb, qfv, 5)

# Smoke-test nearestNeighbor retrieval with the same query embedding;
# NOTE(review): the top hits are only printed, not asserted on.
yql = "select+*+from+sources+*+where+{targetHits:10}nearestNeighbor(embedding,embedding)"
result = search("?yql=#{yql}&#{qi}&myqtext=#{qtext}&ranking=less")
puts "Hit 1: #{result.hit[0]}"
puts "Hit 2: #{result.hit[1]}"
end
end

def verify_huggingface_embedding
expected_embedding = JSON.parse(File.read(selfdir + 'hf-expected-vector.json'))
result = search("?yql=select%20*%20from%20sources%20*%20where%20text%20contains%20%22hello%22%3B&ranking.features.query(embedding)=embed(huggingface, \"Hello%20world\")&format=json&format.tensors=short").json
Expand All @@ -277,10 +350,10 @@ def verify_huggingface_embedding_binary_quantization
result = search("?yql=select%20*%20from%20sources%20*%20where%20true&input.query(embedding)=embed(mixed, \"Hello%20world\")&input.query(binary_embedding)=embed(mixed, \"Hello%20world\")&format=json&format.tensors=short").json
queryFeature = result['root']['children'][0]['fields']['summaryfeatures']["query(embedding)"]
attributeFeatureShortFloat = result['root']['children'][0]['fields']['summaryfeatures']["attribute(shortened_embedding)"]

attributeFeature = result['root']['children'][0]['fields']['summaryfeatures']["attribute(binary_embedding)"]
queryBinaryFeature = result['root']['children'][0]['fields']['summaryfeatures']["query(binary_embedding)"]

attributeFeatureShort = result['root']['children'][0]['fields']['summaryfeatures']["attribute(binary_embedding_short)"]
attributeUnpackedFeature = result['root']['children'][0]['fields']['summaryfeatures']["unpacked"]

Expand All @@ -300,13 +373,13 @@ def verify_huggingface_embedding_binary_quantization
(0..511).each { |i|
assert((queryFeature['values'][i] - attributeFeatureShortFloat['values'][i]).abs < 1e-5, "#{queryFeature['values'][i]} != #{attributeFeatureShortFloat['values'][i]} at index #{i}")
}

expected_length = 128
assert_equal(expected_length, attributeFeature['values'].length)
assert_equal(expected_length, queryBinaryFeature['values'].length)
assert_equal(8*expected_length, queryFeature['values'].length)
assert_equal(8*expected_length, attributeUnpackedFeature['values'].length)

assert_equal(2, attributeFeatureShort['values'].length)

expected_embedding = JSON.parse(File.read(selfdir + 'hf-binarized-expected-vector.json'))
Expand Down
156 changes: 156 additions & 0 deletions tests/search/embedding/nomic-ai/expect.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
[
{
"kw": "hello",
"qtext": "Hello world",
"q_emb": [
-0.1517996, -0.6374424, 0.4247261, 0.3869069, -0.3653424,
0,
-1.6047241, -0.7826133, 1.3783929, -0.9292217, 0.8447577
],
"d_emb": [
-1.2073024, -1.0536656, 0.6317429, 1.0615695, -0.3480913,
0,
-1.7890979, -1.0321823, 0.3601977, -0.3549922, 0.4872490
]
},
{
"kw": "boat",
"qtext": "How should we ship containers overseas?",
"q_emb": [
-0.9009371, -0.9360512, -0.4412966, -0.2381173, 0.3677789,
0,
-0.8887857, -2.1327288, 0.1672105, 0.7446382, -0.5363208
],
"d_emb": [
-0.5977033, -0.2346260, 0.2736411, 0.5059149, 0.4526244,
0,
0.9618395, 0.3053280, 0.0895867, -0.9108645, -0.0150846
]
},
{
"kw": "talking",
"qtext": "How do people communicate?",
"q_emb": [
0.5898885, 0.2406307, 1.4363319, -0.6214936, 0.4670902,
0,
0.8544088, -0.4659769, 0.4571314, -0.6716288, 1.3323010
],
"d_emb": [
-0.0539239, -1.1474287, 0.6985879, 0.7136049, 0.1865370,
0,
0.4620113, -0.6710358, 0.8524743, -2.0078239, -0.2919319
]
},
{
"kw": "galaxy",
"qtext": "What can we find in outer space?",
"q_emb": [
0.0731505, -0.1972794, 0.3282010, -0.8390856, 0.5357113,
0,
-0.5428911, 0.5391271, -1.7250355, 0.1367350, 0.0116808
],
"d_emb": [
-0.6224923, -1.8438429, -1.0246146, 0.0386446, 1.7245310,
0,
-0.8285973, 0.6575807, -1.4614897, 1.3965569, 1.0075078
]
},
{
"kw": "TSNE",
"qtext": "What is TSNE?",
"q_emb": [
-1.1086491, -0.4933267, -1.2220038, -0.0333620, 1.4924712,
0,
1.0930823, 0.8708690, -2.2517716, -1.2630360, -1.7850674
],
"d_emb": [
-0.5343507, -0.2965835, -0.9271476, 0.4835486, 1.6937600,
0,
0.8845421, 0.9818263, -2.3405263, -2.0586404, -1.1525191
]
},
{
"kw": "TSNE",
"qtext": "Who is Laurens van der Maaten?",
"q_emb": [
2.4408578, -0.1772350, -0.8954426, -1.0389008, 0.3895829,
0,
0.5072003, 1.1779983, -1.6226364, -0.6145927, -1.4175989
],
"d_emb": [
-0.5343507, -0.2965835, -0.9271476, 0.4835486, 1.6937600,
0,
0.8845421, 0.9818263, -2.3405263, -2.0586404, -1.1525191
]
},
{
"kw": "gaussian",
"qtext": "Are there fast RELU alternatives?",
"q_emb": [
-2.1928505, -0.7935686, -0.4195246, -0.9163905, 0.3253127,
0,
-0.8234660, -0.2149347, -1.7023215, -0.2171830, -0.3060097
],
"d_emb": [
-0.5021159, -0.7608940, 0.3736764, -1.0960837, 0.3534286,
0,
1.9716295, 0.4452886, 1.0279579, 0.2335866, 0.3022618
]
},
{
"kw": "college",
"qtext": "What community college is located in Ann Arbor?",
"q_emb": [
0.8431194, 0.5613796, 0.5770338, -0.9051319, 0.4925844,
0,
0.1592770, 0.2980305, -0.6384548, 1.1294649, -0.3964248
],
"d_emb": [
0.4650825, 0.1456752, -0.3070188, -0.1909874, 0.8432318,
0,
0.7797915, 0.1910115, -0.1199950, 1.3307210, 0.1070940
]
},
{
"kw": "nintendo",
"qtext": "When was the nationwide release of the NES?",
"q_emb": [
0.2872243, 1.2303341, 1.5656623, 0.0331985, -0.2138445,
0,
0.6698879, 0.7841985, 0.5323055, 0.4517521, -0.1455747
],
"d_emb": [
0.9628002, 1.3298000, 1.0267871, -0.8570239, 0.2695003,
0,
1.7462596, -0.3236981, -0.4424495, 0.9941528, -0.8473317
]
},
{
"kw": "constitution",
"qtext": "What is the foundation of the U.S. federal government?",
"q_emb": [
2.2936303, -1.1732379, -0.8962960, -1.5333929, -0.9628574,
0,
1.4127426, 0.5350651, 0.3293975, -0.0545545, -0.3093640
],
"d_emb": [
0.8969953, -0.7017827, 0.3892760, -0.1460893, 1.2133235,
0,
0.6898801, 0.0570064, 1.3232383, 0.5762012, 0.8841960
]
},
{
"kw": "tuberculosis",
"qtext": "What British health organization made tuberculosis its top priority at its start?",
"q_emb": [
-0.6363328, 0.0660632, 0.6954114, -0.9171738, 1.6874761,
0,
-0.2744052, -0.0265377, -0.1333801, -0.5362182, -1.0908823
],
"d_emb": [
0.0900357, -0.5186007, 0.3904161, -0.4314490, 1.5636123,
0,
0.1861010, 0.0789120, 0.3013286, 0.0102469, -1.1406360
]
}
]
54 changes: 54 additions & 0 deletions tests/search/embedding/nomic-ai/gen-qa-embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/python3.11

import sys
import json
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """Mask-aware mean over the sequence dimension.

    model_output[0] is taken as the token embeddings (batch, seq, dim);
    attention_mask (batch, seq) selects which tokens contribute. The
    divisor is clamped to 1e-9 so an all-masked row cannot divide by zero.
    """
    tokens = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(tokens.size()).float()
    summed = torch.sum(tokens * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

# Generate reference embeddings for q-and-a.json with the original PyTorch
# model, keeping only the first 6 (index 5 zeroed as a separator) and last
# 5 components of each vector, and dump the augmented records to stdout.
records = json.load(open('q-and-a.json'))
# Nomic task prefixes: queries and documents are embedded differently.
queries = ['search_query: ' + rec['qtext'] for rec in records]
documents = ['search_document: ' + rec['dtext'] for rec in records]

tokenizer = AutoTokenizer.from_pretrained("nomic-ai/modernbert-embed-base")
model = AutoModel.from_pretrained("nomic-ai/modernbert-embed-base")

encoded_queries = tokenizer(queries, padding=True, truncation=True, return_tensors="pt")
encoded_documents = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    queries_outputs = model(**encoded_queries)
    documents_outputs = model(**encoded_documents)

query_embeddings = mean_pooling(queries_outputs, encoded_queries["attention_mask"])
doc_embeddings = mean_pooling(documents_outputs, encoded_documents["attention_mask"])

# Print settings for interactive inspection of the tensors; they do not
# affect the JSON emitted below.
torch.set_printoptions(precision=6)
torch.set_printoptions(threshold=25)
torch.set_printoptions(edgeitems=5)
torch.set_printoptions(linewidth=120)
torch.set_printoptions(sci_mode=False)

q = query_embeddings
d = doc_embeddings
for i, rec in enumerate(records):
    head_tail_q = torch.cat((q[i][:6], q[i][-5:])).tolist()
    head_tail_q[5] = 0
    rec['q_emb'] = head_tail_q
    head_tail_d = torch.cat((d[i][:6], d[i][-5:])).tolist()
    head_tail_d[5] = 0
    rec['d_emb'] = head_tail_d

json.dump(records, sys.stdout, indent=4)
print("")
Loading

0 comments on commit 2ec50c9

Please sign in to comment.