-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_data.m
executable file
·133 lines (118 loc) · 5.05 KB
/
generate_data.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
function [X, labels, t] = generate_data(dataname, n, noise)
%GENERATE_DATA Generates an artificial dataset (manifold)
%
% [X, labels, t] = generate_data(dataname, n, noise)
%
% Generates an artificial dataset. Possible datasets are: 'swiss' for the Swiss roll
% dataset, 'helix' for the helix dataset, 'twinpeaks' for the twinpeaks dataset,
% '3d_clusters' for the 3D clusters dataset, and 'intersect' for the intersecting
% dataset. The variable n indicates the number of datapoints to generate
% (default = 1000). The variable noise indicates the amount of noise that
% is added to the data (default = 0.05). The function returns the
% high-dimensional dataset in X, and corresponding labels in labels. In
% addition, the function returns the coordinates of the datapoints on the
% underlying manifold in t.
%
%
% This file is part of the Matlab Toolbox for Dimensionality Reduction.
% The toolbox can be obtained from http://homepage.tudelft.nl/19j49
% You are free to use, change, or redistribute this code in any way you
% want for non-commercial purposes. However, it is appreciated if you
% maintain the name of the original author.
%
% (C) Laurens van der Maaten, Delft University of Technology
welcome;
if ~exist('n', 'var')
n = 1000;
end
if ~exist('noise', 'var')
noise = 0.05;
end
switch dataname
case 'swiss'
t = (3 * pi / 2) * (1 + 2 * rand(n, 1));
height = 30 * rand(n, 1);
X = [t .* cos(t) height t .* sin(t)] + noise * randn(n, 3);
%labels = uint8(t);
labels = rem(sum([round(t / 2) round(height / 12)], 2), 2);
t = [t height];
case 'brokenswiss'
t = [(3 * pi / 2) * (1 + 2 * rand(ceil(n / 2), 1) * .4); (3 * pi / 2) * (1 + 2 * (rand(floor(n / 2), 1) * .4 + .6))];
height = 30 * rand(n, 1);
X = [t .* cos(t) height t .* sin(t)] + noise * randn(n, 3);
labels = uint8(t);
%labels = rem(sum([round(t / 2) round(height / 12)], 2), 2);
t = [t height];
case 'changing_swiss'
r = zeros(1, n);
for i=1:n
pass = 0;
while ~pass
rr = rand(1);
if rand(1) > rr
r(i) = rr;
pass = 1;
end
end
end
t = (3 * pi / 2) * (1 + 2 * r);
height = 21 * rand(1, n);
X = [t .* cos(t); height; t .* sin(t)]' + noise * randn(n, 3);
%labels = uint8(t)';
labels = rem(sum([round(t / 2); round(height / 10)], 1), 2)';
case 'helix'
t = [1:n]' / n;
t = t .^ (1.0) * 2 * pi;
X = [(2 + cos(8 * t)) .* cos(t) (2 + cos(8 * t)) .* sin(t) sin(8 * t)] + noise * randn(n, 3);
%labels = uint8(t);
labels = rem(round(t * 1.5), 2);
case 'twinpeaks'
inc = 1.5 / sqrt(n);
[xx2, yy2] = meshgrid(-1:inc:1);
xy = 1 - 2 * rand(2, n);
X = [xy; sin(pi * xy(1,:)) .* tanh(3 * xy(2,:))]' + noise * randn(n, 3);
X(:,3) = X(:,3) * 10;
t = xy';
%labels = uint8(X(:,3));
labels = rem(sum(round((X + repmat(min(X, [], 1), [size(X, 1) 1])) ./ 10), 2), 2);
case '3d_clusters'
numClusters = 5;
centers = 10 * rand(numClusters, 3);
D = L2_distance(centers', centers');
minDistance = min(D(D > 0));
k = 1;
n2 = n - (numClusters - 1) * 9;
X = repmat(0, [n 3]);
labels = repmat(0, [n 1]);
for i=1:numClusters
for j=1:ceil(n2 / numClusters)
X(k, 1:3) = centers(i, 1:3) + (rand(1, 3) - 0.5) * minDistance / sqrt(12);
labels(k) = i;
k = k + 1;
end
end
X = X + noise * randn(size(X, 1), 3);
t = [];
case 'intersect'
t = [1:n]' ./ n .* (2 * pi);
x = cos(t);
y = sin(t);
height = rand(length(x), 1) * 5;
X = [x x .* y height] + noise * randn(n, 3);
%labels = uint8(5 * t);
labels = rem(sum([round(t / 2) round(height / 2)], 2), 2);
case 'difficult'
% Generate underlying manifold
no_dims = 5;
no_points_per_dim = round(n ^ (1 / no_dims));
l = linspace(0, 1, no_points_per_dim);
t = combn(l, no_dims);
% Generate high-dimensional dataset
X = [cos(t(:,1)) tanh(3 * t(:,2)) t(:,1) + t(:,3) t(:,4) .* sin(t(:,2)) sin(t(:,1) + t(:,5)) t(:,5) .* cos(t(:,2)) t(:,5) + t(:,4) t(:,2) t(:,3) .* t(:,4) t(:,1)];
X = X + noise * randn(size(X));
% Generate labels for dataset (2x2x2x2x2 checkerboard pattern)
tt = 1 + round(t);
labels = rem(sum(tt, 2), 2);
otherwise
error('Unknown dataset name.');
end