Skip to content

Commit

Permalink
Merge pull request #4320 from vespa-engine/arnej/add-nomic-ai-modernbert
Browse files Browse the repository at this point in the history
add nomic-ai modernbert
  • Loading branch information
arnej27959 authored Jan 21, 2025
2 parents 7f617f0 + b194364 commit 2ec50c9
Show file tree
Hide file tree
Showing 7 changed files with 401 additions and 5 deletions.
2 changes: 1 addition & 1 deletion lib/app_generator/container.rb
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def to_xml(indent="")
end

def dump_xml(indent="")
XmlHelper.new(indent).tag(@tag, @attrs).to_xml(@value).to_s
XmlHelper.new(indent).tag_always(@tag, @attrs).to_xml(@value).to_s
end
end

Expand Down
12 changes: 12 additions & 0 deletions tests/search/embedding/10-docs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
{ "put": "id:x:doc::1", "fields": { "text": "Hello world" } },
{ "put": "id:x:doc::2", "fields": { "text": "To transport goods on water, use a boat" } },
{ "put": "id:x:doc::3", "fields": { "text": "Human interaction is often done by talking" } },
{ "put": "id:x:doc::4", "fields": { "text": "The galaxy is filled with stars" } },
{ "put": "id:x:doc::5", "fields": { "text": "TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten" } },
{ "put": "id:x:doc::6", "fields": { "text": "GELU (Gaussian Error Linear Unit) is an activation function that can be approximated using tanh" } },
{ "put": "id:x:doc::7", "fields": { "text": "Washtenaw Community College (WCC) is a public community college in Ann Arbor Charter Township, Michigan." } },
{ "put": "id:x:doc::8", "fields": { "text": "The Nintendo Entertainment System (NES) is an 8-bit home video game console produced by Nintendo. It was first released in Japan on July 15, 1983, as the Family Computer (Famicom)." } },
{ "put": "id:x:doc::9", "fields": { "text": "Written in 1787, ratified in 1788, and in operation since 1789, the United States Constitution is the world's longest surviving written charter of government. Its first three words – “We The People” – affirm that the government of the United States exists to serve its citizens." } },
{ "put": "id:x:doc::A", "fields": { "text": "When the Medical Research Council formed in Britain in 1913, it initially focused on tuberculosis research." } }
]
81 changes: 77 additions & 4 deletions tests/search/embedding/embedding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ def huggingface_embedder_binarization_component
param('pooling-strategy', 'cls')
end

# Embedder component 'nomicmb': Vespa's hugging-face-embedder wrapping the
# nomic-ai/modernbert-embed-base ONNX model and its HF tokenizer, both
# fetched from data.vespa-cloud.com. transformer-token-type-ids is set
# empty (the model presumably takes no token-type-ids input -- confirm
# against the exported ONNX graph), output is read from the
# 'token_embeddings' tensor, and the context window is 8192 tokens.
# The 'prepend' params add the task prefixes the nomic models expect for
# query vs. document text (cf. gen-qa-embeddings.py, which uses
# 'search_query: ' / 'search_document: ' when generating expectations).
def nomic_modernbert_component
Component.new('nomicmb').
type('hugging-face-embedder').
param('transformer-model', '', { 'url' => 'https://data.vespa-cloud.com/onnx_models/nomic-ai-modernbert-embed-base/model.onnx' }).
param('transformer-token-type-ids').
param('tokenizer-model', '', { 'url' => 'https://data.vespa-cloud.com/onnx_models/nomic-ai-modernbert-embed-base/tokenizer.json' }).
param('transformer-output', 'token_embeddings').
param('max-tokens', 8192).
param('prepend', [ ComponentParam::new('query', 'search_query:', {}),
ComponentParam::new('document', 'search_document:', {}) ] )
end

def colbert_embedder_component
Component.new('colbert').
type('colbert-embedder').
Expand Down Expand Up @@ -124,6 +136,20 @@ def test_huggingface_embedding
verify_huggingface_embedding
end

# End-to-end test of the nomic modernbert embedder: deploy an app with the
# 'nomicmb' component (3g fixed JVM heap -- presumably sized for in-container
# ONNX evaluation of the model), feed the ten sample documents, then compare
# query- and document-side embeddings against reference values precomputed
# with the original PyTorch model (nomic-ai/expect.json, generated by
# nomic-ai/gen-qa-embeddings.py).
def test_modernbert_embedding
deploy_app(
SearchApp.new.
container(
default_container_setup.
component(nomic_modernbert_component).
jvmoptions('-Xms3g -Xmx3g')).
sd(selfdir + 'nomic-ai/schemas/doc.sd').
indexing_cluster('default').indexing_chain('indexing'))
start_vespa
feed_and_wait_for_docs('doc', 10, :file => selfdir + '10-docs.json')
verify_embeddings_with('nomic-ai/expect.json', 'nomicmb')
end

def test_huggingface_embedding_binary_quantization
deploy_app(
SearchApp.new.
Expand Down Expand Up @@ -256,6 +282,53 @@ def verify_huggingface_tokens
end


# Assert that one embedding component matches within an absolute
# tolerance of 1e-5. Negative idx addresses from the end of the vectors.
def check_val_by_idx(expected_v, actual_v, idx)
expval, actval = expected_v[idx], actual_v[idx]
close_enough = (expval - actval).abs < 1e-5
assert(close_enough, "#{expval} != #{actval} at index #{idx}")
end

# Compare the first fixlen and last fixlen components of two vectors
# (indices 0..fixlen-1 and -1..-fixlen), element by element.
def check_prefix_suffix(expected_v, actual_v, fixlen)
fixlen.times do |offset|
check_val_by_idx(expected_v, actual_v, offset)
check_val_by_idx(expected_v, actual_v, -(offset + 1))
end
end

# Verify embeddings produced at query and indexing time by +embedder+
# against expectations stored in +savedFile+: a JSON array of records
# {kw, qtext, q_emb, d_emb} generated offline (see gen-qa-embeddings.py).
# q_emb/d_emb hold only the first 6 components (index 5 zeroed as a
# separator) and the last 5 components of each full 768-dim vector.
def verify_embeddings_with(savedFile, embedder = "modernbert")
wanted = JSON.parse(File.read(selfdir + savedFile))
wanted.each do |want|
keyword = '"' + want['kw'] + '"'
puts "Looking for #{keyword}"
qtext = want['qtext']
q_emb = want['q_emb']
d_emb = want['d_emb']
# Text search must yield exactly one hit so we know which document's
# stored embedding we are comparing against.
yql = "select+*+from+sources+*+where+text+contains+#{keyword}"
qi = "input.query(embedding)=embed(#{embedder},@myqtext)"
result = search("?yql=#{yql}&#{qi}&myqtext=#{qtext}").json
assert_equal(1, result['root']['children'].size)
hitfields = result['root']['children'][0]['fields']
queryFeature = hitfields['summaryfeatures']['query(embedding)']
documentFeature = hitfields['summaryfeatures']['attribute(embedding)']

# Both embeddings must have the model's full dimensionality.
expected_length = 768
assert_equal(expected_length, queryFeature['values'].length)
assert_equal(expected_length, documentFeature['values'].length)

# Compare 5 head + 5 tail components against the saved reference values.
dfv = documentFeature['values']
check_prefix_suffix(d_emb, dfv, 5)

qfv = queryFeature['values']
check_prefix_suffix(q_emb, qfv, 5)

# Smoke-test nearestNeighbor retrieval with the same query embedding;
# NOTE(review): the top hits are only printed, not asserted on.
yql = "select+*+from+sources+*+where+{targetHits:10}nearestNeighbor(embedding,embedding)"
result = search("?yql=#{yql}&#{qi}&myqtext=#{qtext}&ranking=less")
puts "Hit 1: #{result.hit[0]}"
puts "Hit 2: #{result.hit[1]}"
end
end

def verify_huggingface_embedding
expected_embedding = JSON.parse(File.read(selfdir + 'hf-expected-vector.json'))
result = search("?yql=select%20*%20from%20sources%20*%20where%20text%20contains%20%22hello%22%3B&ranking.features.query(embedding)=embed(huggingface, \"Hello%20world\")&format=json&format.tensors=short").json
Expand All @@ -277,10 +350,10 @@ def verify_huggingface_embedding_binary_quantization
result = search("?yql=select%20*%20from%20sources%20*%20where%20true&input.query(embedding)=embed(mixed, \"Hello%20world\")&input.query(binary_embedding)=embed(mixed, \"Hello%20world\")&format=json&format.tensors=short").json
queryFeature = result['root']['children'][0]['fields']['summaryfeatures']["query(embedding)"]
attributeFeatureShortFloat = result['root']['children'][0]['fields']['summaryfeatures']["attribute(shortened_embedding)"]

attributeFeature = result['root']['children'][0]['fields']['summaryfeatures']["attribute(binary_embedding)"]
queryBinaryFeature = result['root']['children'][0]['fields']['summaryfeatures']["query(binary_embedding)"]

attributeFeatureShort = result['root']['children'][0]['fields']['summaryfeatures']["attribute(binary_embedding_short)"]
attributeUnpackedFeature = result['root']['children'][0]['fields']['summaryfeatures']["unpacked"]

Expand All @@ -300,13 +373,13 @@ def verify_huggingface_embedding_binary_quantization
(0..511).each { |i|
assert((queryFeature['values'][i] - attributeFeatureShortFloat['values'][i]).abs < 1e-5, "#{queryFeature['values'][i]} != #{attributeFeatureShortFloat['values'][i]} at index #{i}")
}

expected_length = 128
assert_equal(expected_length, attributeFeature['values'].length)
assert_equal(expected_length, queryBinaryFeature['values'].length)
assert_equal(8*expected_length, queryFeature['values'].length)
assert_equal(8*expected_length, attributeUnpackedFeature['values'].length)

assert_equal(2, attributeFeatureShort['values'].length)

expected_embedding = JSON.parse(File.read(selfdir + 'hf-binarized-expected-vector.json'))
Expand Down
156 changes: 156 additions & 0 deletions tests/search/embedding/nomic-ai/expect.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
[
{
"kw": "hello",
"qtext": "Hello world",
"q_emb": [
-0.1517996, -0.6374424, 0.4247261, 0.3869069, -0.3653424,
0,
-1.6047241, -0.7826133, 1.3783929, -0.9292217, 0.8447577
],
"d_emb": [
-1.2073024, -1.0536656, 0.6317429, 1.0615695, -0.3480913,
0,
-1.7890979, -1.0321823, 0.3601977, -0.3549922, 0.4872490
]
},
{
"kw": "boat",
"qtext": "How should we ship containers overseas?",
"q_emb": [
-0.9009371, -0.9360512, -0.4412966, -0.2381173, 0.3677789,
0,
-0.8887857, -2.1327288, 0.1672105, 0.7446382, -0.5363208
],
"d_emb": [
-0.5977033, -0.2346260, 0.2736411, 0.5059149, 0.4526244,
0,
0.9618395, 0.3053280, 0.0895867, -0.9108645, -0.0150846
]
},
{
"kw": "talking",
"qtext": "How do people communicate?",
"q_emb": [
0.5898885, 0.2406307, 1.4363319, -0.6214936, 0.4670902,
0,
0.8544088, -0.4659769, 0.4571314, -0.6716288, 1.3323010
],
"d_emb": [
-0.0539239, -1.1474287, 0.6985879, 0.7136049, 0.1865370,
0,
0.4620113, -0.6710358, 0.8524743, -2.0078239, -0.2919319
]
},
{
"kw": "galaxy",
"qtext": "What can we find in outer space?",
"q_emb": [
0.0731505, -0.1972794, 0.3282010, -0.8390856, 0.5357113,
0,
-0.5428911, 0.5391271, -1.7250355, 0.1367350, 0.0116808
],
"d_emb": [
-0.6224923, -1.8438429, -1.0246146, 0.0386446, 1.7245310,
0,
-0.8285973, 0.6575807, -1.4614897, 1.3965569, 1.0075078
]
},
{
"kw": "TSNE",
"qtext": "What is TSNE?",
"q_emb": [
-1.1086491, -0.4933267, -1.2220038, -0.0333620, 1.4924712,
0,
1.0930823, 0.8708690, -2.2517716, -1.2630360, -1.7850674
],
"d_emb": [
-0.5343507, -0.2965835, -0.9271476, 0.4835486, 1.6937600,
0,
0.8845421, 0.9818263, -2.3405263, -2.0586404, -1.1525191
]
},
{
"kw": "TSNE",
"qtext": "Who is Laurens van der Maaten?",
"q_emb": [
2.4408578, -0.1772350, -0.8954426, -1.0389008, 0.3895829,
0,
0.5072003, 1.1779983, -1.6226364, -0.6145927, -1.4175989
],
"d_emb": [
-0.5343507, -0.2965835, -0.9271476, 0.4835486, 1.6937600,
0,
0.8845421, 0.9818263, -2.3405263, -2.0586404, -1.1525191
]
},
{
"kw": "gaussian",
"qtext": "Are there fast RELU alternatives?",
"q_emb": [
-2.1928505, -0.7935686, -0.4195246, -0.9163905, 0.3253127,
0,
-0.8234660, -0.2149347, -1.7023215, -0.2171830, -0.3060097
],
"d_emb": [
-0.5021159, -0.7608940, 0.3736764, -1.0960837, 0.3534286,
0,
1.9716295, 0.4452886, 1.0279579, 0.2335866, 0.3022618
]
},
{
"kw": "college",
"qtext": "What community college is located in Ann Arbor?",
"q_emb": [
0.8431194, 0.5613796, 0.5770338, -0.9051319, 0.4925844,
0,
0.1592770, 0.2980305, -0.6384548, 1.1294649, -0.3964248
],
"d_emb": [
0.4650825, 0.1456752, -0.3070188, -0.1909874, 0.8432318,
0,
0.7797915, 0.1910115, -0.1199950, 1.3307210, 0.1070940
]
},
{
"kw": "nintendo",
"qtext": "When was the nationwide release of the NES?",
"q_emb": [
0.2872243, 1.2303341, 1.5656623, 0.0331985, -0.2138445,
0,
0.6698879, 0.7841985, 0.5323055, 0.4517521, -0.1455747
],
"d_emb": [
0.9628002, 1.3298000, 1.0267871, -0.8570239, 0.2695003,
0,
1.7462596, -0.3236981, -0.4424495, 0.9941528, -0.8473317
]
},
{
"kw": "constitution",
"qtext": "What is the foundation of the U.S. federal government?",
"q_emb": [
2.2936303, -1.1732379, -0.8962960, -1.5333929, -0.9628574,
0,
1.4127426, 0.5350651, 0.3293975, -0.0545545, -0.3093640
],
"d_emb": [
0.8969953, -0.7017827, 0.3892760, -0.1460893, 1.2133235,
0,
0.6898801, 0.0570064, 1.3232383, 0.5762012, 0.8841960
]
},
{
"kw": "tuberculosis",
"qtext": "What British health organization made tuberculosis its top priority at its start?",
"q_emb": [
-0.6363328, 0.0660632, 0.6954114, -0.9171738, 1.6874761,
0,
-0.2744052, -0.0265377, -0.1333801, -0.5362182, -1.0908823
],
"d_emb": [
0.0900357, -0.5186007, 0.3904161, -0.4314490, 1.5636123,
0,
0.1861010, 0.0789120, 0.3013286, 0.0102469, -1.1406360
]
}
]
54 changes: 54 additions & 0 deletions tests/search/embedding/nomic-ai/gen-qa-embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/python3.11

import sys
import json
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """Mask-aware mean over the sequence dimension.

    model_output[0] is taken as the token embeddings (batch, seq, dim);
    attention_mask (batch, seq) selects which tokens contribute. The
    divisor is clamped to 1e-9 so an all-masked row cannot divide by zero.
    """
    tokens = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(tokens.size()).float()
    summed = torch.sum(tokens * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

# Generate reference embeddings for q-and-a.json with the original PyTorch
# model, keeping only the first 6 (index 5 zeroed as a separator) and last
# 5 components of each vector, and dump the augmented records to stdout.
records = json.load(open('q-and-a.json'))
# Nomic task prefixes: queries and documents are embedded differently.
queries = ['search_query: ' + rec['qtext'] for rec in records]
documents = ['search_document: ' + rec['dtext'] for rec in records]

tokenizer = AutoTokenizer.from_pretrained("nomic-ai/modernbert-embed-base")
model = AutoModel.from_pretrained("nomic-ai/modernbert-embed-base")

encoded_queries = tokenizer(queries, padding=True, truncation=True, return_tensors="pt")
encoded_documents = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    queries_outputs = model(**encoded_queries)
    documents_outputs = model(**encoded_documents)

query_embeddings = mean_pooling(queries_outputs, encoded_queries["attention_mask"])
doc_embeddings = mean_pooling(documents_outputs, encoded_documents["attention_mask"])

# Print settings for interactive inspection of the tensors; they do not
# affect the JSON emitted below.
torch.set_printoptions(precision=6)
torch.set_printoptions(threshold=25)
torch.set_printoptions(edgeitems=5)
torch.set_printoptions(linewidth=120)
torch.set_printoptions(sci_mode=False)

q = query_embeddings
d = doc_embeddings
for i, rec in enumerate(records):
    head_tail_q = torch.cat((q[i][:6], q[i][-5:])).tolist()
    head_tail_q[5] = 0
    rec['q_emb'] = head_tail_q
    head_tail_d = torch.cat((d[i][:6], d[i][-5:])).tolist()
    head_tail_d[5] = 0
    rec['d_emb'] = head_tail_d

json.dump(records, sys.stdout, indent=4)
print("")
Loading

0 comments on commit 2ec50c9

Please sign in to comment.