license
shitao committed Aug 3, 2023
1 parent 1287b57 commit c086741
Showing 24 changed files with 132 additions and 159 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 staoxiao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
4 changes: 2 additions & 2 deletions README.md
@@ -169,6 +169,6 @@ You can easily finetune your model with it.



## Citing & Authors

<!--- Describe where people can find more information -->


4 changes: 2 additions & 2 deletions benchmark/C_MTEB/Classification.py
@@ -1,5 +1,6 @@
from mteb import AbsTaskClassification


class TNews(AbsTaskClassification):
@property
def description(self):
@@ -52,7 +53,6 @@ def description(self):
}



class JDReview(AbsTaskClassification):
@property
def description(self):
@@ -98,4 +98,4 @@ def description(self):
'eval_langs': ['zh'],
'main_score': 'accuracy',
'samples_per_label': 32,
}
}
2 changes: 0 additions & 2 deletions benchmark/C_MTEB/Clustering.py
@@ -19,7 +19,6 @@ def description(self):
}



class CLSClusteringP2P(AbsTaskClustering):
@property
def description(self):
@@ -38,7 +37,6 @@ def description(self):
}



class ThuNewsClusteringS2S(AbsTaskClustering):
@property
def description(self):
11 changes: 6 additions & 5 deletions benchmark/C_MTEB/Reranking.py
@@ -1,8 +1,7 @@
from mteb import AbsTask, RerankingEvaluator, AbsTaskReranking
import logging

import numpy as np

from mteb import RerankingEvaluator, AbsTaskReranking

logger = logging.getLogger(__name__)

@@ -45,7 +44,8 @@ def compute_metrics_batched(self, model):
# In case the query is a list of strings, we get the most similar embedding to any of the queries
all_query_flattened = [q for sample in self.samples for q in sample["query"]]
if hasattr(model, 'encode_queries'):
all_query_embs = model.encode_queries(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
all_query_embs = model.encode_queries(all_query_flattened, convert_to_tensor=True,
batch_size=self.batch_size)
else:
all_query_embs = model.encode(all_query_flattened, convert_to_tensor=True, batch_size=self.batch_size)
else:
@@ -64,12 +64,12 @@ def compute_metrics_batched(self, model):
query_idx, docs_idx = 0, 0
for instance in self.samples:
num_subqueries = len(instance["query"]) if isinstance(instance["query"], list) else 1
query_emb = all_query_embs[query_idx : query_idx + num_subqueries]
query_emb = all_query_embs[query_idx: query_idx + num_subqueries]
query_idx += num_subqueries

num_pos = len(instance["positive"])
num_neg = len(instance["negative"])
docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg]
docs_emb = all_docs_embs[docs_idx: docs_idx + num_pos + num_neg]
docs_idx += num_pos + num_neg

if num_pos == 0 or num_neg == 0:
@@ -98,6 +98,7 @@ def evaluate(self, model, split="test", **kwargs):

return dict(scores)


AbsTaskReranking.evaluate = evaluate


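A minimal, self-contained sketch of the slicing bookkeeping the hunk above reformats. The sample contents and embedding shapes are fabricated for illustration; this is not the repository's code:

```python
import numpy as np

# Two toy samples: a multi-subquery sample and a single-query sample.
samples = [
    {"query": ["q1a", "q1b"], "positive": ["p1"], "negative": ["n1", "n2"]},
    {"query": "q2", "positive": ["p2"], "negative": ["n3"]},
]

# Stand-ins for the flattened query/doc embeddings (3 queries, 5 docs).
all_query_embs = np.random.rand(3, 4)
all_docs_embs = np.random.rand(5, 4)

query_idx, docs_idx = 0, 0
for instance in samples:
    # A sample may carry several subqueries; slice them out as one block.
    num_subqueries = len(instance["query"]) if isinstance(instance["query"], list) else 1
    query_emb = all_query_embs[query_idx: query_idx + num_subqueries]
    query_idx += num_subqueries

    # Positives and negatives were encoded contiguously for this sample.
    num_pos, num_neg = len(instance["positive"]), len(instance["negative"])
    docs_emb = all_docs_embs[docs_idx: docs_idx + num_pos + num_neg]
    docs_idx += num_pos + num_neg

    print(query_emb.shape, docs_emb.shape)
```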
12 changes: 6 additions & 6 deletions benchmark/C_MTEB/Retrieval.py
@@ -1,4 +1,5 @@
from collections import defaultdict

from datasets import load_dataset, DatasetDict
from mteb import AbsTaskRetrieval

@@ -14,9 +15,9 @@ def load_retrieval_data(hf_hub_name, eval_splits):
for e in qrels:
relevant_docs[e['qid']][e['pid']] = e['score']

corpus = DatasetDict({eval_split:corpus})
queries = DatasetDict({eval_split:queries})
relevant_docs = DatasetDict({eval_split:relevant_docs})
corpus = DatasetDict({eval_split: corpus})
queries = DatasetDict({eval_split: queries})
relevant_docs = DatasetDict({eval_split: relevant_docs})
return corpus, queries, relevant_docs


@@ -116,7 +117,6 @@ def load_data(self, **kwargs):
self.data_loaded = True



class CmedqaRetrieval(AbsTaskRetrieval):
@property
def description(self):
@@ -208,6 +208,6 @@ def load_data(self, **kwargs):
if self.data_loaded:
return

self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'], self.description['eval_splits'])
self.corpus, self.queries, self.relevant_docs = load_retrieval_data(self.description['hf_hub_name'],
self.description['eval_splits'])
self.data_loaded = True
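For context, a toy version of the qrels-to-nested-dict conversion inside `load_retrieval_data`, as visible in the hunk above. The qrels rows here are fabricated; in the real function they come from the Hugging Face dataset:

```python
from collections import defaultdict

# Fabricated qrels rows standing in for the HF dataset split.
qrels = [
    {'qid': 'q1', 'pid': 'd3', 'score': 1},
    {'qid': 'q1', 'pid': 'd7', 'score': 1},
    {'qid': 'q2', 'pid': 'd1', 'score': 1},
]

relevant_docs = defaultdict(dict)
for e in qrels:
    relevant_docs[e['qid']][e['pid']] = e['score']

print(dict(relevant_docs))
# {'q1': {'d3': 1, 'd7': 1}, 'q2': {'d1': 1}}
```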

3 changes: 0 additions & 3 deletions benchmark/C_MTEB/STS.py
@@ -17,7 +17,6 @@ def description(self):
}



class BQ(AbsTaskSTS):
@property
def description(self):
@@ -50,7 +49,6 @@ def description(self):
}



class PAWSX(AbsTaskSTS):
@property
def description(self):
@@ -99,7 +97,6 @@ def description(self):
}



class QBQTC(AbsTaskSTS):
@property
def description(self):
9 changes: 4 additions & 5 deletions benchmark/C_MTEB/__init__.py
@@ -1,15 +1,14 @@
from .Classification import *
from .Clustering import *
from .Reranking import *
from .PairClassification import *
from .Reranking import *
from .Retrieval import *
from .STS import *
from .Classification import *

ChineseTaskList = ['TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',
'CLSClusteringS2S', 'CLSClusteringP2P', 'ThuNewsClusteringS2S', 'ThuNewsClusteringP2P',
'Ocnli', 'Cmnli',
'T2Reranking', 'MmarcoReranking', 'CMedQAv1', 'CMedQAv2',
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval',
'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC']


2 changes: 0 additions & 2 deletions benchmark/README.md
@@ -192,6 +192,4 @@ In retrieval task, we sample 100,000 candidates (including the ground truths) fr
This work is inspired by [Massive Text Embedding Benchmark](https://github.com/embeddings-benchmark/mteb),
which lacks evaluation for Chinese text.

## Citing & Authors

<!--- Describe where people can find more information -->
10 changes: 2 additions & 8 deletions benchmark/eval_C-MTEB.py
@@ -1,11 +1,9 @@
import argparse

from mteb import MTEB
from models import UniversalModel
from C_MTEB import *
from C_MTEB import ChineseTaskList


from models import UniversalModel
from mteb import MTEB

query_instruction_for_retrieval_dict = {
"BAAI/baai-general-embedding-large-zh-instruction": "为这个句子生成表示以用于检索相关文章:",
@@ -20,7 +18,6 @@ def get_args():
return parser.parse_args()



if __name__ == '__main__':
args = get_args()

@@ -44,6 +41,3 @@ def get_args():

evaluation = MTEB(tasks=[task], task_langs=['zh'])
evaluation.run(model, output_folder=f"zh_results/{args.model_name_or_path.split('/')[-1]}")



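A hedged sketch of the loop this script drives, built only from calls visible in this diff. `UniversalModel`'s constructor arguments beyond the model path are an assumption, as is the task slice:

```python
from mteb import MTEB

from C_MTEB import ChineseTaskList
from models import UniversalModel

# Assumed constructor signature: the diff shows only the class name.
model = UniversalModel('BAAI/baai-general-embedding-large-zh-instruction')

for task in ChineseTaskList[:2]:  # e.g. 'TNews', 'IFlyTek'
    evaluation = MTEB(tasks=[task], task_langs=['zh'])
    evaluation.run(model, output_folder='zh_results/baai-general-embedding-large-zh-instruction')
```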
6 changes: 1 addition & 5 deletions benchmark/eval_MTEB.py
@@ -1,8 +1,7 @@
import argparse

from mteb import MTEB
from models import UniversalModel

from mteb import MTEB

query_instruction_for_retrieval_dict = {
"BAAI/baai-general-embedding-large-en-instruction": "Represent this sentence for searching relevant passages: ",
@@ -39,6 +38,3 @@ def get_args():

evaluation = MTEB(tasks=[task], task_langs=['zh'])
evaluation.run(model, output_folder=f"en_results/{args.model_name_or_path.split('/')[-1]}")



17 changes: 4 additions & 13 deletions benchmark/models.py
@@ -1,8 +1,9 @@
import numpy as np
from typing import cast, List, Dict

import numpy as np
import torch
from tqdm import tqdm
from mteb import DRESModel
from tqdm import tqdm


class UniversalModel(DRESModel):
@@ -33,7 +34,6 @@ def __init__(
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model)


def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
'''
encode queries for retrieval task
@@ -45,7 +45,6 @@ def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
input_texts = queries
return self.encode(input_texts)


def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs) -> np.ndarray:
'''
encode corpus for retrieval task
@@ -54,15 +53,14 @@ def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs) -> np.ndarray:
input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
return self.encode(input_texts)


@torch.no_grad()
def encode(self, sentences: List[str], batch_size: int = 256, **kwargs) -> np.ndarray:

batch_size = min(batch_size, 256) * self.num_gpus
self.model.eval()

all_embeddings = []
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Batches", disable=len(sentences)<256):
for start_index in tqdm(range(0, len(sentences), batch_size), desc="Batches", disable=len(sentences) < 256):
sentences_batch = sentences[start_index:start_index + batch_size]
inputs = self.tokenizer(
sentences_batch,
@@ -79,10 +77,3 @@ def encode(self, sentences: List[str], batch_size: int = 256, **kwargs) -> np.nd
all_embeddings.append(embeddings.cpu().numpy())

return np.concatenate(all_embeddings, axis=0)







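A standalone sketch of the batched `encode` pattern shown above. The CLS pooling and L2 normalization are assumptions, since the hunk does not show how `embeddings` is derived from the model outputs, and `bert-base-chinese` is only a placeholder checkpoint:

```python
from typing import List

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


@torch.no_grad()
def encode(sentences: List[str], model_name: str = 'bert-base-chinese',
           batch_size: int = 256) -> np.ndarray:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).eval()

    all_embeddings = []
    # Mirror the diff: tqdm over batch starts, hidden for short inputs.
    for start in tqdm(range(0, len(sentences), batch_size),
                      desc="Batches", disable=len(sentences) < 256):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           max_length=512, return_tensors='pt')
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0]  # CLS pooling (assumed)
        embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)
```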
29 changes: 12 additions & 17 deletions benchmark/summarize_results.py
@@ -1,10 +1,10 @@
import argparse
from collections import defaultdict
import os
import json
import os
from collections import defaultdict

from mteb import MTEB
from C_MTEB import *
from mteb import MTEB


def read_results(task_types, except_tasks, args):
@@ -58,8 +58,8 @@ def output_markdown(tasks_results, model_names, save_file):
for task_name in type_results.keys():
first_line += f" {task_name} |"
second_line += ":--------:|"
f.write(first_line+' Avg | \n')
f.write(second_line+':--------:| \n')
f.write(first_line + ' Avg | \n')
f.write(second_line + ':--------:| \n')

for model in model_names:
write_line = f"| {model} |"
@@ -72,12 +72,11 @@ def output_markdown(tasks_results, model_names, save_file):
write_line += f" |"

if len(all_res) == len(type_results.keys()):
write_line += f" {round(sum(all_res)/len(all_res), 2)} |"
write_line += f" {round(sum(all_res) / len(all_res), 2)} |"
task_type_res[t_type][model] = all_res
else:
write_line += f" |"
f.write(write_line+' \n')

f.write(write_line + ' \n')

f.write(f'Overall \n')
first_line = "| Model |"
@@ -93,7 +92,7 @@ def output_markdown(tasks_results, model_names, save_file):
all_res = []
for type_name, results in task_type_res.items():
if model in results:
write_line += f" {round(sum(results[model])/len(results[model]), 2)} |"
write_line += f" {round(sum(results[model]) / len(results[model]), 2)} |"
all_res.extend(results[model])
else:
write_line += f" |"
@@ -104,8 +103,6 @@ def output_markdown(tasks_results, model_names, save_file):
f.write(write_line + ' \n')




def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--results_dir', default="./zh_results", type=str)
@@ -120,14 +117,12 @@ def get_args():
task_types = ["Retrieval", "STS", "PairClassification", "Classification", "Reranking", "Clustering"]
except_tasks = ['AmazonReviewsClassification', 'STS22']
elif args.lang == 'en':
task_types = ["Retrieval", "STS", "Summarization", "PairClassification", "Classification", "Reranking", "Clustering"]
task_types = ["Retrieval", "STS", "Summarization", "PairClassification", "Classification", "Reranking",
"Clustering"]
except_tasks = []
else:
raise NotImplementedError(f"args.lang must be zh or en, but got {args.lang}")


task_results, model_dirs = read_results(task_types, except_tasks, args=args)
output_markdown(task_results, model_dirs.keys(), save_file=os.path.join(args.results_dir, f'{args.lang}_results.md'))



output_markdown(task_results, model_dirs.keys(),
save_file=os.path.join(args.results_dir, f'{args.lang}_results.md'))
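A toy rendering of the per-row averaging in `output_markdown`; the scores and model name are fabricated:

```python
# Fabricated per-task scores for one model within one task type.
type_results = {'T2Retrieval': 71.2, 'DuRetrieval': 84.6}

all_res = list(type_results.values())
row = "| my-model |" + "".join(f" {v} |" for v in all_res)
row += f" {round(sum(all_res) / len(all_res), 2)} |"
print(row)  # | my-model | 71.2 | 84.6 | 77.9 |
```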
5 changes: 3 additions & 2 deletions examples/search_demo/arguments.py
@@ -4,12 +4,13 @@
@dataclass
class ModelArguments:
model_name_or_path: str = field(
default='BAAI/baai-general-embedding-large-zh', metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
default='BAAI/baai-general-embedding-large-zh',
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)


@dataclass
class DataArguments:
data_path: str = field(
default='./data', metadata={"help": "Path to wikipedia-22-12"}
)
)