forked from fedbiomed/fedbiomed
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_sklearn_data_manager.py
153 lines (115 loc) · 6.11 KB
/
test_sklearn_data_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import math
import unittest
import numpy as np
import pandas as pd
from fedbiomed.common.data import SkLearnDataManager
from fedbiomed.common.exceptions import FedbiomedTypeError
class TestSkLearnDataManager(unittest.TestCase):
    """Unit tests for `SkLearnDataManager`.

    Covers construction from pandas/numpy inputs, the `dataset()` getter,
    `split()` and the subset getters, `_subset_loader()` argument validation,
    and iteration over the loaders returned by `split()`.
    """

    def setUp(self):
        """Build a 4-sample fixture and a `SkLearnDataManager` wrapping it."""
        self.inputs = np.array([[1, 4, 3, 7],
                                [4, 6, 3, 1],
                                [1, 5, 3, 7],
                                [8, 2, 6, 9]
                                ])
        self.target = np.array([5, 5, 1, 4])
        self.sklearn_data_manager = SkLearnDataManager(inputs=self.inputs,
                                                       target=self.target)

    def assertIterableEqual(self, it1, it2):
        """Assert that two iterables yield equal items in the same order."""
        self.assertListEqual(list(it1), list(it2))

    def assertNPArrayEqual(self, arr1, arr2):
        """Assert that two arrays hold the same values once flattened.

        Shapes are deliberately not compared: only the flattened contents are.
        """
        self.assertIterableEqual(arr1.flatten(), arr2.flatten())

    def test_sklearn_data_manager_01_init(self):
        """Test that `pd.DataFrame` arguments are stored as `np.ndarray`."""
        inputs = pd.DataFrame(self.inputs)
        target = pd.DataFrame(self.target)
        self.sklearn_data_manager = SkLearnDataManager(inputs=inputs,
                                                       target=target)
        self.assertIsInstance(self.sklearn_data_manager._inputs, np.ndarray)
        self.assertIsInstance(self.sklearn_data_manager._target, np.ndarray)

    def test_sklearn_data_manager_02_getter_dataset(self):
        """Test that `dataset()` returns the `(inputs, target)` pair."""
        result = self.sklearn_data_manager.dataset()

        # Compare element-wise instead of with `assertTupleEqual`: tuple
        # equality over ndarrays only works through the identity short-cut and
        # raises ValueError (ambiguous truth value) for any non-identical array,
        # which would hide the real failure.
        self.assertIsInstance(result, tuple)
        self.assertEqual(len(result), 2)
        np.testing.assert_array_equal(result[0], self.inputs)
        np.testing.assert_array_equal(result[1], self.target)

    def test_sklearn_data_manager_03_split(self):
        """Test `split()`: invalid ratios are rejected, valid ones size the loaders."""
        # Out-of-range and non-float ratios must all raise FedbiomedTypeError
        for invalid_ratio in (-1., 2., 'not-float'):
            with self.subTest(test_ratio=invalid_ratio):
                with self.assertRaises(FedbiomedTypeError):
                    self.sklearn_data_manager.split(test_ratio=invalid_ratio)

        # Get number of samples
        n_samples = len(self.sklearn_data_manager.dataset()[0])

        ratio = 0.5
        n_test = math.floor(n_samples * ratio)
        n_train = n_samples - n_test

        loader_train, loader_test = self.sklearn_data_manager.split(test_ratio=ratio)

        msg_test = 'Number of samples of test loader is not as expected'
        msg_train = 'Number of samples of train loader is not as expected'
        self.assertEqual(len(loader_test.dataset), n_test, msg_test)
        self.assertEqual(len(loader_train.dataset), n_train, msg_train)

        # Test if test ratio is 1: everything goes to the test loader
        ratio = 1.
        loader_train, loader_test = self.sklearn_data_manager.split(test_ratio=ratio)
        self.assertEqual(len(loader_test.dataset), n_samples, msg_test)
        self.assertEqual(len(loader_train.dataset), 0, msg_train)

        # Test if test ratio is 0: everything goes to the train loader
        ratio = 0.
        loader_train, loader_test = self.sklearn_data_manager.split(test_ratio=ratio)
        self.assertEqual(len(loader_test.dataset), 0, msg_test)
        self.assertEqual(len(loader_train.dataset), n_samples, msg_train)

    def test_sklearn_data_manager_03_getter_subsets(self):
        """Test the `subset_train()` / `subset_test()` getters after a split."""
        ratio = 0.5
        n_samples = len(self.sklearn_data_manager.dataset()[0])
        n_test = math.floor(n_samples * ratio)
        n_train = n_samples - n_test

        self.sklearn_data_manager.split(test_ratio=ratio)

        subset_test = self.sklearn_data_manager.subset_test()
        subset_train = self.sklearn_data_manager.subset_train()

        # Each subset is indexed as (inputs, target); both parts must match in size
        self.assertEqual(len(subset_test[0]), n_test)
        self.assertEqual(len(subset_test[1]), n_test)
        self.assertEqual(len(subset_train[0]), n_train)
        self.assertEqual(len(subset_train[1]), n_train)

    def test_sklearn_data_manager_04_subset_loader(self):
        """Test that `_subset_loader()` rejects malformed `subset` arguments."""
        # Invalid subset
        with self.assertRaises(FedbiomedTypeError):
            self.sklearn_data_manager._subset_loader(subset=np.array([1, 2, 3]))

        # Invalid nested subset
        with self.assertRaises(FedbiomedTypeError):
            self.sklearn_data_manager._subset_loader(subset=([1, 2, 3], [1, 2, 3]))

    def test_sklearn_data_manager_05_integration_with_npdataloader(self):
        """Test iterating the loaders returned by `split()` with loader arguments set."""
        test_ratio = 0.

        sklearn_data_manager = SkLearnDataManager(inputs=self.inputs,
                                                  target=self.target,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  drop_last=False)
        self.assertDictEqual({'batch_size': 1, 'shuffle': False, 'drop_last': False},
                             sklearn_data_manager._loader_arguments)

        loader_train, loader_test = sklearn_data_manager.split(test_ratio=test_ratio)
        self.assertEqual(len(loader_test), 0)

        # batch_size=1 without shuffling: batch i must be sample i
        n_batches = 0
        for i, (data, target) in enumerate(loader_train):
            self.assertNPArrayEqual(data, self.inputs[i, :])
            self.assertNPArrayEqual(target, self.target[i])
            n_batches = i + 1
        # Guard against the loop above passing vacuously on an empty loader
        self.assertEqual(n_batches, len(self.inputs))

        batch_size = 3
        sklearn_data_manager = SkLearnDataManager(inputs=self.inputs,
                                                  target=self.target,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  drop_last=True)
        loader_train, loader_test = sklearn_data_manager.split(test_ratio=test_ratio)
        self.assertEqual(len(loader_test), 0)

        count_iter = 0
        for count_iter, (data, target) in enumerate(loader_train, start=1):
            self.assertNPArrayEqual(data, self.inputs[:batch_size, :])
            self.assertNPArrayEqual(target, self.target[:batch_size])
        self.assertEqual(count_iter, 1)  # assert that only one iteration was made because of drop_last=True
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':  # pragma: no cover
    unittest.main()