forked from vepadulano/mkShapePyRDF
-
Notifications
You must be signed in to change notification settings - Fork 3
/
latinos_rdf.py
233 lines (192 loc) · 6.8 KB
/
latinos_rdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import os
import json
import copy
from pprint import pprint
import ROOT as R
class Node:
    """One named cut of the cut tree together with its dataframe node history.

    Every object assigned to ``rdf_node`` is appended to an internal list;
    reading ``rdf_node`` returns the most recently assigned one.  Attributes
    that are not found on the ``Node`` itself are looked up on that latest
    object, so a ``Node`` can be used as a thin proxy for it.
    """

    def __init__(self, name, obj):
        """Create a node from its cut configuration.

        Args:
            name: Identifier of the cut.
            obj: Mapping with the mandatory keys ``"expr"`` and ``"parent"``
                and the optional key ``"doVars"`` (defaults to ``False``).

        Raises:
            KeyError: If ``"expr"`` or ``"parent"`` is missing from ``obj``.
        """
        self.name = name
        self.expr = obj["expr"]
        self.vars = []
        self.aliases = []
        self.weight = None
        self.parent = obj["parent"]
        self.doVars = obj.get("doVars", False)
        # History of every object assigned to ``rdf_node``; last one is current.
        self._rdf_cache = []

    @property
    def rdf_node(self):
        """Return the most recently assigned dataframe node.

        Raises:
            IndexError: If nothing has been assigned yet.
        """
        return self._rdf_cache[-1]

    @rdf_node.setter
    def rdf_node(self, node):
        # Assignment never overwrites: earlier nodes stay in the history.
        self._rdf_cache.append(node)

    def __getattr__(self, key):
        """Forward unknown attribute lookups to the current dataframe node.

        ``__getattr__`` is only invoked after normal lookup has failed.  The
        history list is read straight from the instance ``__dict__`` so that
        a missing ``_rdf_cache`` (instance created without ``__init__``, as
        ``copy``/``pickle`` do) cannot trigger unbounded recursion, and an
        empty history yields ``AttributeError`` so ``hasattr`` keeps working.

        Raises:
            AttributeError: If there is no current node or it lacks ``key``.
        """
        history = self.__dict__.get("_rdf_cache")
        if not history:
            raise AttributeError(
                "'{}' object has no attribute '{}'".format(type(self).__name__, key)
            )
        return getattr(history[-1], key)

    def __str__(self):
        """Return a multi-line, human-readable summary of the node."""
        out = [
            "name: {}".format(self.name),
            "parent: {}".format(self.parent),
            "cut: {}".format(self.expr),
            "vars: {}".format(",".join(self.vars)),
            "aliases: {}".format(",".join(self.aliases)),
            "weight: {}".format(self.weight),
        ]
        return "\n".join(out)

    def __repr__(self):
        return str(self)
#########################################################################
class Tree:
    """Collection of :class:`Node` objects keyed by cut name.

    The tree structure is implicit: each node stores the name of its parent
    cut, and the node whose parent is ``None`` is treated as the root.
    """

    def __init__(self, name, cuts, keep_negative_weights=True):
        """Build one ``Node`` per entry of ``cuts``.

        Args:
            name: Label of the tree (used by ``__str__``).
            cuts: Mapping ``cut name -> cut configuration`` passed to ``Node``.
            keep_negative_weights: Selects the weight filter applied by
                ``define_weight`` (``!= 0`` when true, ``> 0`` otherwise).
        """
        self.name = name
        self.keep_negative_weights = keep_negative_weights
        self.tree = {}
        self.variables = []
        for key, obj in cuts.items():
            self.tree[key] = Node(key, obj)

    def define_aliases(self, node, aliases):
        """Define one dataframe column per alias on the given node.

        Args:
            node: Name of the cut to attach the aliases to.
            aliases: Mapping ``alias name -> {"expr": expression, ...}``.

        Returns:
            ``True`` on success, ``False`` if ``node`` is not in the tree.
        """
        if node not in self.tree:
            print("Node not found in tree")
            return False
        target = self.tree[node]
        for key, alias in aliases.items():
            target.rdf_node = target.rdf_node.Define(key, alias["expr"])
            target.aliases.append(key)
        return True

    def define_cuts(self, cut):
        """Apply the filter of ``cut`` and, recursively, of all its children.

        A node without parent filters its own current dataframe node; any
        other node filters the current dataframe node of its parent.

        Returns:
            ``True`` on success, ``False`` if ``cut`` is not in the tree.
        """
        if cut not in self.tree:
            print("Cut not found")
            return False
        node = self.tree[cut]
        # The root cut has no parent and therefore filters itself.
        source = node if node.parent is None else self.tree[node.parent]
        node.rdf_node = source.rdf_node.Filter(node.expr, cut)
        # Descend into every node that names this cut as its parent.
        for child_node in self.tree.values():
            if child_node.parent == node.name:
                self.define_cuts(child_node.name)
        return True

    def define_variables(self, variables):
        """Define ``<cut>_<variable>`` columns on every node flagged ``doVars``.

        Args:
            variables: Mapping ``variable key -> {"name": expression, ...}``.
        """
        for name, node in self.tree.items():
            if not node.doVars:
                continue
            for varkey, varvalue in variables.items():
                column = name + "_" + varkey
                node.rdf_node = node.rdf_node.Define(column, varvalue["name"])
                node.vars.append(column)

    def define_weight(self, node, weight):
        """Define the ``weight_`` column on a node and filter on it.

        Args:
            node: Name of the cut to attach the weight to.
            weight: Expression used to define the ``weight_`` column.

        Returns:
            ``True`` on success, ``False`` if ``node`` is not in the tree.
        """
        if node not in self.tree:
            print("Cut not found")
            return False
        target = self.tree[node]
        target.weight = weight
        target.rdf_node = target.rdf_node.Define("weight_", weight)
        if self.keep_negative_weights:
            # The weight expression may embed a cut, so drop zero-weight events.
            target.rdf_node = target.rdf_node.Filter("weight_ != 0.")
        else:
            target.rdf_node = target.rdf_node.Filter("weight_ > 0.")
        return True

    def __getattr__(self, key):
        """Return the node called ``key``, or ``None`` if there is none.

        Only reached when normal attribute lookup fails.  Dunder names and
        instances without a ``tree`` attribute raise ``AttributeError`` so
        that protocol probes (``hasattr(obj, "__setstate__")`` in ``copy`` /
        ``pickle``) behave correctly and lookup cannot recurse forever.
        """
        nodes = self.__dict__.get("tree")
        if nodes is None or (key.startswith("__") and key.endswith("__")):
            raise AttributeError(key)
        return nodes.get(key)

    def __getitem__(self, key):
        """Return the node called ``key``, or ``None`` if there is none."""
        return self.tree.get(key)

    def __str__(self):
        """Return the summaries of all nodes separated by dashed lines."""
        out = ["Tree: " + self.name]
        for node in self.tree.values():
            out.append(str(node))
            out.append("--------------------------------------------------------------------------------")
        return "\n".join(out)

    def __repr__(self):
        return str(self)
#######################################################################################################
def _build_file_list(names, rdf_type):
    """Return the file collection handed to ``RDataFrame`` for ``names``.

    The first three characters of every entry are dropped.
    NOTE(review): presumably a fixed 3-character prefix in the samples JSON
    (e.g. a marker such as ``###``) -- confirm against the sample files.

    For ``rdf_type == "root"`` a ``std::vector<string>`` is returned,
    otherwise a plain Python list.
    """
    paths = [name[3:] for name in names]
    if rdf_type != "root":
        return paths
    files = R.std.vector("string")()
    for path in paths:
        files.push_back(path)
    return files


def build_dataframe(conf_dir, version_tag, sample, rdf_class, rdf_type, keep_negative_weights=True):
    """Build one configured cut ``Tree`` per dataframe of ``sample``.

    The configuration is read through ``ConfigReader``.  If the sample entry
    has a ``"weights"`` list, its files are grouped by weight and one
    dataframe is created per distinct weight; otherwise a single dataframe
    holds all files.  For every dataframe a ``Tree`` is created and aliases,
    weight, cuts and variables are defined on it, in that order.

    Args:
        conf_dir: Directory containing the configuration files.
        version_tag: Tag selecting the samples JSON file.
        sample: Name of the sample to load.
        rdf_class: Object exposing ``RDataFrame(tree_name, files)``.
        rdf_type: ``"root"`` to pass files as a ``std::vector<string>``;
            any other value passes a Python list.
        keep_negative_weights: Forwarded to ``Tree``.

    Returns:
        ``None`` if ``sample`` is not configured, otherwise a tuple
        ``(chains, nfiles)`` where ``chains`` is the list of ``Tree`` objects
        and ``nfiles`` the number of files behind each of them.
    """
    conf_r = ConfigReader(conf_dir, version_tag)
    if sample not in conf_r.samples:
        print("Requested sample not exists!")
        return None
    sample_data = conf_r.samples[sample]

    # Split the file list into groups sharing the same per-file weight.
    # ``weights_group`` records the distinct weights in first-seen order and
    # is the single source of ordering for the dataframes created below, so
    # ``weights_group[i]`` always belongs to ``dfs[i]``.
    weights_group = []
    if "weights" in sample_data:
        files_groups = {}
        for iw, w in enumerate(sample_data["weights"]):
            if w not in files_groups:
                weights_group.append(w)
                files_groups[w] = []
            files_groups[w].append(sample_data["name"][iw])
        file_sets = [files_groups[w] for w in weights_group]
    else:
        file_sets = [sample_data["name"]]

    dfs = []
    nfiles = []
    for names in file_sets:
        dfs.append(rdf_class.RDataFrame("Events", _build_file_list(names, rdf_type)))
        nfiles.append(len(names))

    # Keep only the aliases that are unrestricted or enabled for this sample.
    # The result does not depend on the dataframe, so compute it once.
    aliases = {key: obj for key, obj in conf_r.aliases.items()
               if "samples" not in obj or sample in obj["samples"]}

    chains = []
    for idf, df in enumerate(dfs):
        # The cut tree is the base structure; the root cut wraps the raw df.
        tree = Tree(sample, conf_r.cuts, keep_negative_weights)
        tree['supercut'].rdf_node = df
        tree.define_aliases("supercut", aliases)
        # Global sample weight, times the group weight when files were split.
        weight = "(" + sample_data["weight"] + ")"
        if weights_group:
            weight += "*(" + weights_group[idf] + ")"
        # The weight goes on the root cut after the aliases (so it may use
        # them) and before the cuts, because it also acts as a filter.
        tree.define_weight("supercut", weight)
        tree.define_cuts("supercut")
        tree.define_variables(conf_r.variables)
        chains.append(tree)
    # The number of files per dataframe is returned alongside the trees.
    return chains, nfiles
class ConfigReader:
    """Load the samples JSON and the Python configuration files of a directory.

    After construction the instance exposes four attributes: ``samples``
    (parsed from ``samples_<version_tag>.json``) and ``variables``, ``cuts``
    and ``aliases`` (filled by executing ``variables.py``, ``cuts.py`` and
    ``aliases.py`` in that order).
    """

    def __init__(self, conf_dir, version_tag):
        """Read all configuration files found in ``conf_dir``.

        Args:
            conf_dir: Directory containing the configuration files.
            version_tag: Suffix selecting ``samples_<version_tag>.json``.
        """
        samples_path = os.path.join(conf_dir, "samples_" + version_tag + ".json")
        with open(samples_path) as handle:
            self.samples = json.load(handle)

        # One explicit namespace shared by all configuration files.  It starts
        # from the module globals and adds the names the files could already
        # see as locals.  Using a single dict as exec globals means that
        # nested scopes inside a config file (comprehensions, functions) can
        # see ``samples`` and friends, and that a file rebinding e.g.
        # ``cuts = {...}`` instead of mutating it is honoured.
        namespace = dict(globals())
        namespace.update(
            self=self,
            conf_dir=conf_dir,
            version_tag=version_tag,
            samples=self.samples,
        )
        self.variables = self._run_config(conf_dir, "variables", namespace)
        self.cuts = self._run_config(conf_dir, "cuts", namespace)
        self.aliases = self._run_config(conf_dir, "aliases", namespace)

    @staticmethod
    def _run_config(conf_dir, name, namespace):
        """Execute ``<conf_dir>/<name>.py`` and return the resulting ``name``.

        ``namespace[name]`` is initialised to an empty dict before the file
        runs, so the file can either fill it in place or replace it.
        """
        namespace[name] = {}
        path = os.path.join(conf_dir, name + ".py")
        with open(path) as handle:
            source = handle.read()
        # NOTE(security): the file is executed as arbitrary Python code; only
        # point ``conf_dir`` at trusted configuration directories.
        # Compiling with the real path makes tracebacks name the config file.
        exec(compile(source, path, "exec"), namespace)
        return namespace[name]