Skip to content

Commit

Permalink
fetch more results
Browse files Browse the repository at this point in the history
  • Loading branch information
Samoed committed Jan 22, 2025
1 parent f0b35fd commit e4795a6
Show file tree
Hide file tree
Showing 16,104 changed files with 871,367 additions and 12,938 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
40 changes: 25 additions & 15 deletions load_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,17 @@ def get_model_parameters_memory(model_info: ModelInfo) -> tuple[int| None, float
return None, None


def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, float]:
def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, float, str | None]:
siblings = model.siblings or []
filenames = [sib.rfilename for sib in siblings]
dim, seq = None, None
similarity_fn_name = None
for filename in filenames:
if re.match(r"\d+_Pooling/config.json", filename):
st_config_path = hf_hub_download(model.id, filename=filename)
dim = json.load(open(st_config_path)).get("word_embedding_dimension", None)
with open(st_config_path) as f:
pooling_config = json.load(f)
dim = pooling_config.get("word_embedding_dimension", None)
break
for filename in filenames:
if re.match(r"\d+_Dense/config.json", filename):
Expand All @@ -87,17 +90,21 @@ def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, flo
if not dim:
dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", None)))
seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", None))))

if "config_sentence_transformers.json" in filenames:
st_config_path = hf_hub_download(model.id, filename="config_sentence_transformers.json")
with open(st_config_path) as f:
st_config = json.load(f)
similarity_fn_name = st_config.get("similarity_fn_name", None)
parameters, memory = get_model_parameters_memory(model)
return dim, seq, parameters, memory
return dim, seq, parameters, memory, similarity_fn_name


def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
readme_path = hf_hub_download(model_info.id, filename="README.md", etag_timeout=30)
meta = metadata_load(readme_path)
dim, seq, parameters, memory = None, None, None, None
dim, seq, parameters, memory, similarity_fn_name = None, None, None, None, None
try:
dim, seq, parameters, memory = get_dim_seq_size(model_info)
dim, seq, parameters, memory, similarity_fn_name = get_dim_seq_size(model_info)
except Exception as e:
logger.error(f"Error getting model parameters for {model_info.id}, {e}")

Expand All @@ -110,7 +117,12 @@ def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
for i in range(len(languages)):
if languages[i] is False:
languages[i] = "no"

datasets = meta.get("datasets", None)
if datasets is not None:
datasets = {
d: []
for d in datasets
}
model_meta = ModelMeta(
name=model_info.id,
revision=model_info.sha,
Expand All @@ -122,6 +134,11 @@ def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
max_tokens=seq,
n_parameters=parameters,
languages=languages,
public_training_code=None,
public_training_data=None,
similarity_fn_name=similarity_fn_name,
use_instructions=None,
training_datasets=datasets,
)
return model_meta

Expand All @@ -139,14 +156,7 @@ def parse_readme(model_info: ModelInfo) -> dict[str, dict[str, Any]] | None:
return
model_index = meta["model-index"][0]
model_name_from_readme = model_index.get("name", None)
orgs = ["Alibaba-NLP", "HIT-TMG", "McGill-NLP", "Snowflake", "facebook", "jinaai", "nomic-ai"]
is_org = any([model_id.startswith(org) for org in orgs])
# There a lot of reuploads with tunes, quantization, etc. We only want the original model
# to prevent this most of the time we can check if the model name from the readme is the same as the model id
# but some orgs have a different naming in their readme
if model_name_from_readme and not model_info.id.endswith(model_name_from_readme) and not is_org:
logger.warning(f"Model name mismatch: {model_info.id} vs {model_name_from_readme}")
return

results = model_index.get("results", [])
model_results = {}
for result in results:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@
"languages": [],
"loader": null,
"n_parameters": 135193344,
"memory_usage": null,
"max_tokens": 512,
"max_tokens": 512.0,
"embed_dim": 768,
"license": null,
"open_weights": true,
"public_training_data": null,
"public_training_code": null,
"public_training_data": null,
"framework": [
"Sentence Transformers"
],
"reference": null,
"similarity_fn_name": null,
"use_instructions": null,
"zero_shot_benchmarks": null
"training_datasets": {},
"adapted_from": null,
"superseded_by": null
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@
"languages": [],
"loader": null,
"n_parameters": 135193344,
"memory_usage": null,
"max_tokens": 512,
"max_tokens": 512.0,
"embed_dim": 768,
"license": null,
"open_weights": true,
"public_training_data": null,
"public_training_code": null,
"public_training_data": null,
"framework": [
"Sentence Transformers"
],
"reference": null,
"similarity_fn_name": null,
"use_instructions": null,
"zero_shot_benchmarks": null
"training_datasets": {},
"adapted_from": null,
"superseded_by": null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"dataset_revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"task_name": "AlloProfClusteringP2P",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.6234594305243399,
"main_score": 0.6234594305243399
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"dataset_revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"task_name": "AlloProfClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.2572945498452115,
"main_score": 0.2572945498452115
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"dataset_revision": "65393d0d7a08a10b4e348135e824f385d420b0fd",
"task_name": "AlloprofReranking",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map": 0.26596323297349184,
"mrr": 0.26091629657044163,
"main_score": 0.26596323297349184
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"dataset_revision": "fcf295ea64c750f41fadbaa37b9b861558e1bfbd",
"task_name": "AlloprofRetrieval",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map_at_1": 0.00345,
"map_at_10": 0.00934,
"map_at_100": 0.01191,
"map_at_1000": 0.013419999999999998,
"map_at_20": 0.0102,
"map_at_3": 0.006689999999999999,
"map_at_5": 0.00753,
"mrr_at_1": 0.00345,
"mrr_at_10": 0.00934,
"mrr_at_100": 0.01191,
"mrr_at_1000": 0.013419999999999998,
"mrr_at_20": 0.0102,
"mrr_at_3": 0.006689999999999999,
"mrr_at_5": 0.00753,
"ndcg_at_1": 0.00345,
"ndcg_at_10": 0.013839999999999998,
"ndcg_at_100": 0.03151,
"ndcg_at_1000": 0.09014,
"ndcg_at_20": 0.01692,
"ndcg_at_3": 0.00785,
"ndcg_at_5": 0.00941,
"precision_at_1": 0.00345,
"precision_at_10": 0.00289,
"precision_at_100": 0.00124,
"precision_at_1000": 0.00063,
"precision_at_20": 0.00205,
"precision_at_3": 0.00374,
"precision_at_5": 0.00302,
"recall_at_1": 0.00345,
"recall_at_10": 0.02893,
"recall_at_100": 0.12435,
"recall_at_1000": 0.62867,
"recall_at_20": 0.04102,
"recall_at_3": 0.01123,
"recall_at_5": 0.015110000000000002,
"main_score": 0.013839999999999998
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
"task_name": "AmazonReviewsClassification",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"accuracy": 0.32661999999999997,
"f1": 0.32443152253731844,
"main_score": 0.32661999999999997
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"dataset_revision": "5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59",
"task_name": "BSARDRetrieval",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map_at_1": 0.0,
"map_at_10": 0.0,
"map_at_100": 0.00062,
"map_at_1000": 0.00077,
"map_at_20": 0.0,
"map_at_3": 0.0,
"map_at_5": 0.0,
"mrr_at_1": 0.0,
"mrr_at_10": 0.0,
"mrr_at_100": 0.00062,
"mrr_at_1000": 0.00077,
"mrr_at_20": 0.0,
"mrr_at_3": 0.0,
"mrr_at_5": 0.0,
"ndcg_at_1": 0.0,
"ndcg_at_10": 0.0,
"ndcg_at_100": 0.00484,
"ndcg_at_1000": 0.01054,
"ndcg_at_20": 0.0,
"ndcg_at_3": 0.0,
"ndcg_at_5": 0.0,
"precision_at_1": 0.0,
"precision_at_10": 0.0,
"precision_at_100": 0.00027,
"precision_at_1000": 8e-05,
"precision_at_20": 0.0,
"precision_at_3": 0.0,
"precision_at_5": 0.0,
"recall_at_1": 0.0,
"recall_at_10": 0.0,
"recall_at_100": 0.02703,
"recall_at_1000": 0.07658,
"recall_at_20": 0.0,
"recall_at_3": 0.0,
"recall_at_5": 0.0,
"main_score": 0.02703
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"dataset_revision": "e06ebbbb123f8144bef1a5d18796f3dec9ae2915",
"task_name": "HALClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.1377084465510841,
"main_score": 0.1377084465510841
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"dataset_revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"task_name": "MLSUMClusteringP2P",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"v_measure": 0.4543375637260015,
"main_score": 0.4543375637260015
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"dataset_revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"task_name": "MLSUMClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"v_measure": 0.45205646487969753,
"main_score": 0.45205646487969753
}
]
}
}
Loading

0 comments on commit e4795a6

Please sign in to comment.