"""
functions for computing main intrinsic properties of a dataset
"""
from tqdm import tqdm
from random import sample
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
from src.dimensionality import estimate_intrinsic_dim
from src.nnutils import LayerActivationsDataset
def makelogdir():
    """Create the logs/ directory if it does not already exist."""
    if not os.path.exists("logs"):
        os.makedirs("logs")

def compute_labelsharpness(
    dataset,
    batchsize=500,
    M=1000,
    device="cuda"
):
    r"""
    Compute the label sharpness ($\hat{K}_\mathcal{F}$) of a dataset, as defined in Sec. 3.2 of the paper.

    Args:
        dataset: torch.utils.data.Dataset object
        batchsize: batch size to use for computing the Lipschitz constant estimate
        M: number of samples to use for computing the Lipschitz constant estimate (see Sec. 3.2 of the paper)
        device: device to use for computing the Lipschitz constant estimate
    """
    makelogdir()
    # detect the multiclass case (any label outside {0, 1}) vs. the default binary case
    multiclass = not all(s[1] in [0, 1] for s in dataset)
    dataset_sampled = Subset(dataset, sample(list(range(len(dataset))), M))
    shuffle = True
    dataloader1 = DataLoader(dataset_sampled, batch_size=batchsize, shuffle=shuffle)
    dataloader2 = DataLoader(dataset_sampled, batch_size=batchsize, shuffle=shuffle)
    Ks = []
    for bi1, s1 in tqdm(enumerate(dataloader1), desc='computing dataset label sharpness', total=M // batchsize):
        for bi2, s2 in enumerate(dataloader2):
            x1, y1 = s1
            x2, y2 = s2
            x1 = x1.to(device)
            x2 = x2.to(device)
            y1 = y1.to(device)
            y2 = y2.to(device)
            if multiclass:
                # multiclass version: 1 where the pair's labels differ, 0 otherwise
                numerator = (y1 != y2).float()
            else:
                # default binary version
                numerator = torch.abs(y1.float() - y2.float())
            # pairwise label-change / input-distance ratio; the norm over dim=(1, 2, 3) assumes (N, C, H, W) inputs
            K = numerator / torch.norm(x1.float() - x2.float(), p=2, dim=(1, 2, 3))
            # filter out pairs where x1 = x2 (division by zero)
            K = K[~torch.isinf(K) & ~torch.isnan(K)]
            Ks += K.tolist()
    Kmax = np.max(Ks)
    return Kmax

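# Usage sketch (illustrative, not from the original module): `my_dataset` is a
# hypothetical torch Dataset yielding (image_tensor, label) pairs with (C, H, W)
# images, which the dim=(1, 2, 3) norm above requires.
#
#     khat = compute_labelsharpness(my_dataset, batchsize=500, M=1000, device="cuda")
#     print(f"estimated label sharpness: {khat:.4g}")
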
def compute_intrinsic_datadim(
    dataset,
    estimator='mle',
    estimator_batchsize=1000,
    hyperparam_k=20,
    dataset_name="dataset"
):
    """
    Compute the intrinsic dimension of a dataset, as defined in Sec. 3.1 of the paper.

    Args:
        dataset: torch.utils.data.Dataset object
        estimator: estimator to use for computing the intrinsic dimension; must be one of 'mle', 'geomle', 'twonn'
        estimator_batchsize: batch size to use for computing the intrinsic dimension estimate
        hyperparam_k: hyperparameter k to use for computing the intrinsic dimension (see Sec. 3.1 of the paper)
        dataset_name: name of the dataset, used for saving the intrinsic dimension estimate
    """
    assert estimator in ['mle', 'geomle', 'twonn'], "estimator must be one of 'mle', 'geomle', 'twonn'"
    makelogdir()
    return estimate_intrinsic_dim(dataset, dataset_name, estimator,
                                  batchsize=estimator_batchsize, hyperparam_k=hyperparam_k)

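# Usage sketch (illustrative, not from the original module): estimating the
# intrinsic dimension of a raw dataset with the MLE estimator and neighborhood
# size k; `my_dataset` is a hypothetical torch Dataset, and `dataset_name`
# labels the saved estimate.
#
#     data_id = compute_intrinsic_datadim(my_dataset, estimator='mle',
#                                         hyperparam_k=20, dataset_name="my_dataset")
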
def compute_intrinsic_reprdim(
    dataset,
    model,
    layer,
    estimator='mle',
    batchsize=256,
    estimator_batchsize=1000,
    hyperparam_k=20,
    dataset_name="dataset",
    device="cuda"
):
    """
    Compute the intrinsic dimension of a neural network's learned representations of a dataset, in one of its layers.

    Args:
        dataset: torch.utils.data.Dataset object
        model: torch.nn.Module object; the neural network to use for computing the intrinsic dimension
        layer: torch.nn.Module of the layer to use for computing the intrinsic dimension, an attribute of model (e.g. model.layer1 for resnet18)
        estimator: estimator to use for computing the intrinsic dimension; must be one of 'mle', 'geomle', 'twonn'
        batchsize: batch size to use for computing the neural network's activations given input data
        estimator_batchsize: batch size to use for computing the intrinsic dimension estimate
        hyperparam_k: hyperparameter k to use for computing the intrinsic dimension (see Sec. 3.1 of the paper)
        dataset_name: name of the dataset, used for saving the intrinsic dimension estimate
        device: device to use for computing the neural network's activations given input data
    """
    assert estimator in ['mle', 'geomle', 'twonn'], "estimator must be one of 'mle', 'geomle', 'twonn'"
    makelogdir()
    input_dataloader = DataLoader(dataset, batch_size=batchsize)

    # register a forward hook on the target layer to save its activations
    activations = []

    def hook(model, input, output):
        activations.append(output.detach().cpu())

    handle = layer.register_forward_hook(hook)

    # compute activations by passing the data through the network
    for batch_idx, (x_in, _) in tqdm(enumerate(input_dataloader),
                                     desc='completing forward passes...',
                                     total=len(dataset) // batchsize
                                     ):
        x_in = x_in.to(device)
        output = model(x_in)
        # memory management / get things off the GPU
        del output

    activation_data = torch.cat(activations)
    handle.remove()  # remove the hook so later forward passes aren't tracked
    activation_data = activation_data.to('cpu')

    # load the activations (paired with the original labels) into a dataset
    lbls = [dataset[i][1] for i in range(len(dataset))]
    activation_dataset = LayerActivationsDataset(activation_data, lbls)

    # compute the intrinsic dimension of the layer activations
    try:
        layer_activations_intrinsic_dim = estimate_intrinsic_dim(activation_dataset,
                                                                 dataset_name, estimator,
                                                                 batchsize=estimator_batchsize,
                                                                 hyperparam_k=hyperparam_k)
        return layer_activations_intrinsic_dim
    except (ValueError, OverflowError) as e:
        # NaN or inf result for the intrinsic dimension estimate
        print(e)
        return np.nan

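# Minimal end-to-end sketch (not part of the original module): exercises the three
# property estimators on a small synthetic dataset so everything can run on CPU.
# The random data, sizes, and the torchvision resnet18 backbone are illustrative
# assumptions, not the paper's experimental setup.
if __name__ == "__main__":
    from torch.utils.data import TensorDataset

    # synthetic multiclass "image" dataset: 1200 random 3x32x32 tensors, 10 classes
    images = torch.randn(1200, 3, 32, 32)
    labels = torch.randint(0, 10, (1200,))
    toy_dataset = TensorDataset(images, labels)

    # label sharpness (Sec. 3.2): max label-change / input-distance ratio over sampled pairs
    khat = compute_labelsharpness(toy_dataset, batchsize=500, M=1000, device="cpu")
    print(f"label sharpness estimate: {khat:.4g}")

    # intrinsic dimension of the raw data (Sec. 3.1)
    data_id = compute_intrinsic_datadim(toy_dataset, estimator='mle',
                                        estimator_batchsize=1000, hyperparam_k=20,
                                        dataset_name="toy_dataset")
    print(f"data intrinsic dimension estimate: {data_id}")

    # intrinsic dimension of a network's layer-1 representations (assumes torchvision is installed)
    from torchvision.models import resnet18
    model = resnet18(weights=None).eval()
    repr_id = compute_intrinsic_reprdim(toy_dataset, model, model.layer1,
                                        estimator='mle', batchsize=64,
                                        estimator_batchsize=1000, hyperparam_k=20,
                                        dataset_name="toy_dataset", device="cpu")
    print(f"layer-1 representation intrinsic dimension estimate: {repr_id}")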