New to old data converter #64

Open
wants to merge 32 commits into base: main
Commits (32)
759c92d
first draft of new to old data converter
comane Apr 16, 2024
db610a8
added convertion from new data to old DATA_... format
comane Apr 27, 2024
302437b
kinematics written as numbers
comane Apr 29, 2024
ccd2d9c
fix typo
May 8, 2024
08c8894
add .pdf PBSP logos
May 13, 2024
18308d5
added load_commondata to core, level0_commondata_wc and make_level1_d…
May 14, 2024
db2124a
added parse_fakepdf to config.py
May 16, 2024
f5ba638
add chi2 provider functions
May 20, 2024
f3a6dd9
added usage write_chi2
May 20, 2024
3513ff1
fixed repo
May 20, 2024
dd605f9
moved function in simunet_analysis & changed their name
May 26, 2024
c9cb4cb
changed cuts to commondata_table_indices
May 30, 2024
2caf4cb
changed cuts to commondata_table_indices
May 30, 2024
664a69b
added rules classes, static KIN_LABEL dict, and replaced cpp Export m…
Jun 12, 2024
2deae97
added commondatawriter.py & export method for CommonData python objects
Jun 12, 2024
b34de76
added xq2 map for hadronic MQQ processes ref. [2303.06159]
Jun 12, 2024
98f9c77
Revert "added xq2 map for hadronic MQQ processes ref. [2303.06159]"
Jun 18, 2024
cd67771
Revert "added commondatawriter.py & export method for CommonData pyth…
Jun 18, 2024
9c32bea
Revert "added rules classes, static KIN_LABEL dict, and replaced cpp …
Jun 18, 2024
387a866
debug convert_new_data_to_old
Jun 25, 2024
b65ceb6
added test_utils to .gitignore
Jun 25, 2024
069d827
tested writer
Jul 4, 2024
ce8d085
changes in utils are useful for data converter, changes in dataplots …
Jul 11, 2024
b55a5e2
scripts to convert and test the conversion
Oct 24, 2024
d1b60f4
make plotting files script (works on dis datasets)
Oct 25, 2024
b4eaf59
corrected bug
Oct 25, 2024
3d610ab
added cuts to fixed sm predictions
Nov 4, 2024
ed8fb57
update of commodata converter picking the right files
Nov 14, 2024
02a623b
added matshow
Jan 25, 2025
77a6335
Merge branch 'main' into commondata_converter_new_to_old
Jan 25, 2025
1360caf
updates
Jan 25, 2025
9cf70a0
update for DIS experimentes
Nov 25, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -433,5 +433,6 @@ Session.vim
# auto-generated tag files
tags

+validphys2/src/validphys/test_utils

# End of https://www.gitignore.io/api/c++,latex,cmake,python,jupyternotebook,qtcreator,vim
108 changes: 108 additions & 0 deletions validphys2/src/validphys/commondata_new_to_old.py
@@ -0,0 +1,108 @@
"""
Commondata converter script from new to old format:
it must be run in an up-to-date simunet environment, on the `commondata_converter_new_to_old` branch.
"""

import os
import sys
import yaml
from validphys.utils import uncertainty_yaml_to_systype, convert_new_data_to_old

# test whether the runcard is passed
if len(sys.argv) != 2:
    raise Exception("No runcard is passed!")
card_name = sys.argv[1]
if not os.path.isfile(card_name):
    raise Exception("Runcard does not exist!")
# load runcard
with open(card_name, "rb") as stream:
    runcard = yaml.safe_load(stream)
# load datasets to convert
datasets = runcard["dataset_inputs"]

# create test directory if it does not already exist
test_dir = "test_utils"
if not os.path.isdir(test_dir):
    os.mkdir(test_dir)

# path to the local nnpdf repository (to be changed by the user)
nnpdf_path = "/home/ubunteto/Software/nnpdf"
# new commondata path
new_commondata = f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata"
# open conversion dictionary
with open(f"{new_commondata}/dataset_names.yml", "rb") as stream:
    conversion = yaml.safe_load(stream)

# old format
old_format_names = list(conversion.keys())
# new format
new_format_names = []
for c in conversion:
    try:
        new_format_names.append(conversion[c]["dataset"])
    except TypeError:
        new_format_names.append(conversion[c])

# prepare list of the datasets to be converted
conversion_ds = []
for ds in datasets:
    if ds["dataset"] in old_format_names:
        d = conversion[ds["dataset"]]
        d["name"] = ds["dataset"]
        conversion_ds.append(d)
    elif ds["dataset"] in new_format_names:
        conversion_ds.append({"dataset": ds["dataset"], "variant": "legacy", "name": ds["dataset"]})
    else:
        conversion_ds.append({"dataset": ds["dataset"], "variant": None, "name": ds["dataset"]})

# separate the dataset & the observable names
for ds in conversion_ds:
    s = ds["dataset"]
    ds["dataset"] = s[:s.rfind("_")]
    ds["obs"] = s[s.rfind("_")+1:]
    n = ds["name"]
    ds["name"] = n[:n.rfind("_")]

# convert
for i, ds in enumerate(conversion_ds):
    var_ind, obs_ind = "variant", "obs"
    # load metadata file
    path_metadata = new_commondata+"/"+ds["dataset"]+"/metadata.yaml"
    with open(path_metadata, "r") as stream:
        metadata = yaml.safe_load(stream)
    for o in metadata["implemented_observables"]:
        if o["observable_name"] == ds[obs_ind]:
            data_file_name, unc_file_names, kin_file_name = o["data_central"], o["data_uncertainties"], o["kinematics"]["file"]
    # if only in the new format
    if not ds[var_ind]:
        path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/{data_file_name}"
        path_kin = new_commondata+"/"+ds["dataset"]+f"/{kin_file_name}"
        path_unc_files = [new_commondata+"/"+ds["dataset"]+f"/{unc_file_name}" for unc_file_name in unc_file_names]
    # if also in the old format (legacy variants)
    else:
        if os.path.isfile(new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_ind]}_{ds[obs_ind]}.yaml"):
            path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_ind]}_{ds[obs_ind]}.yaml"
        else:
            path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_legacy_{ds[obs_ind]}.yaml"
        # the uncertainty path must be a list, matching the writer calls below
        path_unc_files = [new_commondata+"/"+ds["dataset"]+f"/uncertainties_{ds[var_ind]}_{ds[obs_ind]}.yaml"]
        path_kin = new_commondata+"/"+ds["dataset"]+f"/kinematics_{ds[obs_ind]}.yaml"
    # write uncertainty files
    uncertainty_yaml_to_systype(path_unc_files,
                                name_dataset=ds["name"],
                                observable=ds["obs"],
                                path_systype=test_dir)
    # write commondata files
    convert_new_data_to_old(path_data_yaml,
                            path_unc_files,
                            path_kin,
                            path_metadata,
                            name_dataset=ds["name"],
                            observable=ds["obs"],
                            path_DATA=test_dir)
    # output
    name = ds["name"]+"_"+ds["obs"]
    print(f"{i+1:>2}. {name:>40} converted!")

# write check runcard
with open("test_utils/check_commondata_new_to_old.yaml", "w") as stream:
yaml.safe_dump(conversion_ds, stream)
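For reference, a minimal way to drive the converter (a hedged sketch: the runcard file name and dataset entry are illustrative, not taken from this PR; only the command-line interface above is assumed):

# Hypothetical usage sketch: write a minimal runcard and invoke the converter script.
import subprocess
import yaml

runcard = {"dataset_inputs": [{"dataset": "ATLASWZRAP36PB"}]}  # illustrative dataset name
with open("convert_runcard.yaml", "w") as f:
    yaml.safe_dump(runcard, f)

# The script reads the runcard path from sys.argv[1] and writes the converted
# DATA_/SYSTYPE_ files plus check_commondata_new_to_old.yaml into ./test_utils.
subprocess.run(
    ["python", "validphys2/src/validphys/commondata_new_to_old.py", "convert_runcard.yaml"],
    check=True,
)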
22 changes: 11 additions & 11 deletions validphys2/src/validphys/convolution.py
@@ -119,17 +119,17 @@ def _predictions(dataset, pdf, fkfunc):
    # predictions instead.
    all_predictions = []
    for fk in dataset.fkspecs:
-        if not fk.use_fixed_predictions:
-            all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
-        else:
-            with open(fk.fixed_predictions_path, 'rb') as f:
-                fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])
-            # Now need to reshape it according it to the expected number of predictions
-            if fkfunc == central_fk_predictions:
-                all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
-            elif fkfunc == fk_predictions:
-                fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
-                all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))
+        if not fk.use_fixed_predictions:
+            all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
+        else:
+            with open(fk.fixed_predictions_path, 'rb') as f:
+                fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])[cuts]
+            # Now need to reshape it according it to the expected number of predictions
+            if fkfunc == central_fk_predictions:
+                all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
+            elif fkfunc == fk_predictions:
+                fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
+                all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))

    return opfunc(*all_predictions)

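A small sketch of the reshaping the patched block performs on the fixed SM predictions, assuming the yaml file stores a flat `SM_fixed` list and `cuts` is an index array (the numbers below are made up):

# Hedged illustration of the fixed-prediction handling after the [cuts] fix.
import numpy as np
import pandas as pd

sm_fixed = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # hypothetical 'SM_fixed' values
cuts = [0, 2, 4]                                # indices of the data points kept after cuts
n_members = 3                                   # stand-in for pdf.get_members()

kept = sm_fixed[cuts]                                    # length now matches the cut FK predictions
central = pd.DataFrame(kept, columns=["data"])           # central_fk_predictions branch
members = pd.DataFrame(np.tile(kept, (n_members, 1)).T,  # fk_predictions branch
                       columns=list(range(n_members)))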
20 changes: 15 additions & 5 deletions validphys2/src/validphys/dataplots.py
@@ -866,7 +866,11 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
    """
    info = get_info(dataset)

-    table = kitable(dataset, info)
+    try:
+        table = kitable(dataset, info)
+    except:
+        log.warning(f"Problems with kitable loading {dataset.name}")
+        table = kitable(dataset.commondata, info)
    figby = sane_groupby_iter(table, info.figure_by)

    basis = obs_pdf_correlations.basis
@@ -880,7 +884,9 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
    plotting_var = info.get_xcol(table)

    #TODO: vmin vmax should be global or by figure?
-    vmin,vmax = min(plotting_var), max(plotting_var)
+    vmin, vmax = min(plotting_var), max(plotting_var)
+    if type(vmin) == str or type(vmax) == str:
+        vmin, vmax = 0, 1
    if info.x_scale == 'log':
        norm = mcolors.LogNorm(vmin, vmax)
    else:
@@ -889,7 +895,7 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
    sm = cm.ScalarMappable(cmap=cm.viridis, norm=norm)

    for same_vals, fb in figby:
-        grid = fullgrid[ np.asarray(fb.index),...]
+        grid = fullgrid[np.arange(len(fb.index)), ...]


        #Use the maximum absolute correlation for plotting purposes
@@ -906,9 +912,13 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
        h*=2.5
        fig,axes = plt.subplots(nrows=nf ,sharex=True, figsize=(w,h), sharey=True)
        fig.suptitle(title)
-        colors = sm.to_rgba(info.get_xcol(fb))
+        if np.vectorize(isinstance)(info.get_xcol(fb), str).any():
+            temp = np.linspace(start=0, stop=1, num=len(info.get_xcol(fb)))
+            colors = sm.to_rgba(temp)
+        else:
+            colors = sm.to_rgba(info.get_xcol(fb))
        for flindex, (ax, fl) in enumerate(zip(axes, fls)):
-            for i,color in enumerate(colors):
+            for i, color in enumerate(colors):
                ax.plot(x, grid[i,flindex,:].T, color=color)


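A sketch of the colour fallback introduced above for non-numeric x-columns (the column values are invented; only the linspace mapping and the forced [0, 1] range are taken from the patch):

# Hedged sketch: when the plotting variable is a string column, colours come from
# an evenly spaced [0, 1] grid instead of the (non-numeric) values themselves.
import numpy as np
from matplotlib import cm
import matplotlib.colors as mcolors

xcol = ["low", "mid", "high"]   # hypothetical string-valued kinematic column
vmin, vmax = 0, 1               # forced range, as in the patched code
sm = cm.ScalarMappable(cmap=cm.viridis, norm=mcolors.Normalize(vmin, vmax))
colors = sm.to_rgba(np.linspace(0, 1, num=len(xcol)))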
72 changes: 72 additions & 0 deletions validphys2/src/validphys/make_plotting_files.py
@@ -0,0 +1,72 @@
import os
import sys
import yaml
import shutil
import filecmp

# simunet environment commondata path
old_commondata = "/Users/teto/miniconda3/envs/simunet_release/share/NNPDF/data/commondata"
# nnpdf commondata path
new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/commondata"
# test whether the runcard is passed
if len(sys.argv) != 2:
    raise Exception("No runcard is passed!")
card_name = sys.argv[1]
if not os.path.isfile(card_name):
    raise Exception("Runcard does not exist!")
# load runcard
with open(card_name, "rb") as stream:
    card = yaml.safe_load(stream)
# load conversion dictionary
with open(new_commondata+"/dataset_names.yml", "rb") as stream:
    conv = yaml.safe_load(stream)
# load datasets to convert
datasets = card["dataset_inputs"]
# temporary list
temp = []
# back conversion map
back_conv = {}
# loop over datasets to convert
for ds in datasets:
    ds_name = ds["dataset"]
    if ds_name in list(conv.keys()) and "-" in ds_name:
        # save the datasets to map
        temp.append(conv[ds_name])
        # print(f"{ds_name} is in the old format with a new name! (Do it manually)")
    else:
        for cds in conv:
            try:
                flag = ds_name == conv[cds]["dataset"]
            except TypeError:
                flag = ds_name == conv[cds]
            if flag:
                back_conv[ds_name] = cds
# loop over the datasets that we still have to convert
for ds in temp:
    ds_name, ds_var = ds["dataset"], ds["variant"]
    back_conv[ds_name] = []
    for cds in conv:
        try:
            flag = (ds_name == conv[cds]["dataset"]) and (ds_var == conv[cds]["variant"] and "-" not in cds)
        except TypeError:
            flag = ds_name == conv[cds]
        if flag:
            back_conv[ds_name] = cds
# copy
for i, bc in enumerate(back_conv):
    # new file name
    filename_new = f"test_utils/PLOTTING_{bc}.yml"
    # old file name
    if os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yml"):
        filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yml"
    elif os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml"):
        filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml"
    else:
        # skip datasets without an old PLOTTING file instead of reusing a stale path
        print(f"Missing PLOTTING file for {back_conv[bc]}!")
        continue
    # copy
    shutil.copy(filename_old, filename_new)
    # test the copies
    if filecmp.cmp(filename_old, filename_new):
        print(f"{i+1:>2}. Copied plotting file {back_conv[bc]:>40} -> {bc:>40}!")
    else:
        print(f"{i+1:>2}. Error during copy of plotting file {back_conv[bc]:>40} -> {bc:>40}!")
58 changes: 58 additions & 0 deletions validphys2/src/validphys/test_commondata_new_to_old.py
@@ -0,0 +1,58 @@
"""
Test the commondata converter from new to old format:
it must be run in an up-to-date nnpdf environment.
"""

import yaml
from numpy import allclose
from validphys.commondataparser import parse_set_metadata, load_commondata_new, load_commondata_old
from validphys.covmats import covmat_from_systematics
from matplotlib.pyplot import matshow, show

# nnpdf path
nnpdf_path = "/home/ubunteto/Software/nnpdf"
# open the yaml file created by commondata_new_to_old script
with open("test_utils/check_commondata_new_to_old.yaml", "rb") as stream:
datasets = yaml.safe_load(stream)
# silly dictionary to output if the feature is sound or not
ok = {1: "OK :D", 0: "NOT OK :C"}
# fake dataset input for covmat_from_systematics
inp = None
# list to store the implementation errors, useful for IPython debug
cd_errors, cm_errors = [], []
# loop over the selected datasets
for i, ds in enumerate(datasets):
    # dataset name, observable name, and dataset variant
    setname, name, observable, variant = ds["dataset"], ds["name"], ds["obs"], ds["variant"]
    # old commondata
    cd_old = load_commondata_old(commondatafile=f"test_utils/DATA_{name}_{observable}.dat",
                                 systypefile=f"test_utils/SYSTYPE_{name}_{observable}_DEFAULT.dat",
                                 setname=setname)
    # load metadata of the new commondata
    metadata = parse_set_metadata(metadata_file=f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata/{setname}/metadata.yaml")
    # new commondata
    if variant:
        cd_new = load_commondata_new(metadata=metadata.select_observable(observable).apply_variant(variant))
    else:
        cd_new = load_commondata_new(metadata=metadata.select_observable(observable))
    # load covariance matrices
    covmat_old = covmat_from_systematics(loaded_commondata_with_cuts=cd_old,
                                         dataset_input=inp,
                                         use_weights_in_covmat=False)
    covmat_new = covmat_from_systematics(loaded_commondata_with_cuts=cd_new,
                                         dataset_input=inp,
                                         use_weights_in_covmat=False)
    # matshow(covmat_new - covmat_old)
    # show()
    # test central values
    ds["commondata"] = allclose(cd_old.central_values, cd_new.central_values)
    if not ds["commondata"]:
        cd_errors.append({"old": cd_old, "new": cd_new})
    # test covariance matrix
    ds["covmat"] = allclose(covmat_old, covmat_new)
    if not ds["covmat"]:
        cm_errors.append({"old": covmat_old, "new": covmat_new})
    # output
    cd, cm = ds["commondata"], ds["covmat"]
    name = f"{setname}_{observable}"
    print(f"{i+1:2}. {name:>40} -> commondata is {ok[cd]:>9} & covariance matrix is {ok[cm]:>9}")