-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregLines.tcl
370 lines (370 loc) · 15.1 KB
/
regLines.tcl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#regLines
#regLines.tcl
##===================================================================
# Copyright (c) 2018 Yuji SODE <[email protected]>
#
# This software is released under the MIT License.
# See LICENSE or http://opensource.org/licenses/mit-license.php
##===================================================================
#Tool for generating random variables that follows given frequency distibution using linear regressions in given intervals.
#=== Synopsis ===
#::regLines::reglines X Y ?name?;
#
#it outputs tcl script file that defines additional math functions (lines(x), linesPDF(x) and linesVar()) and returns generated filename
# - $X and $Y: numerical lists for x-axis and y-axis
# - $name: a text used in order to generate filename of output file, and numbers are default value
# generated filename has a form of "${name}_regL.tcl"
#--------------------------------------------------------------------
#*** <namespace ::regLines> ***
#interface for linear regressions
# - proc getRange X: procedure that returns target data range
# - $X: a numerical list for x-axis
#
# - proc getLine X Y: procedure for linear regression that returns result as a list
# The returned list is {A B R2} representing model of "y=f(x)=Ax+B, and R2=Var(f)/Var(y)"
# - $X: a numerical list for x-axis
# - $Y: a numerical list for y-axis
#--------------------------------------------------------------------
#*** <namespace ::tcl::mathfunc> ***
#additional functions for regression analysis
#--- lSum.tcl (Yuji SODE, 2018): https://gist.github.com/YujiSODE/1f9a4e2729212691972b196a76ba9bd0 ---
# - lSum(list): function that returns sum of given list
# - $list: a numerical list
#--------------------------------------------------------------------
# - avg(list): function that estimates average using a given numerical list
# - $list: a numerical list
#
# - var(list): function that estimates variance using a numerical list and "list size -1"
# - $list: a numerical list
#
# - cov(X,Y): function that estimates covariance using two numerical lists and "list size -1"
# - $X: a numerical list for x-axis
# - $Y: a numerical list for y-axis
#--------------------------------------------------------------------
#+++ functions defined by output tcl script file +++
# - lines(x): function that returns estimated sample distribution
# - linesPDF(x): function that returns a value of probability density function estimated from the sample distribution
# - linesVar(): function that returns a random variable following PDF
# - $x: a numerical value
##===================================================================
set auto_noexec 1;
package require Tcl 8.6;
#*** <namespace ::tcl::mathfunc> ***
#additional functions for regression analysis
namespace eval ::tcl::mathfunc {
#=== lSum.tcl (Yuji SODE, 2018): https://gist.github.com/YujiSODE/1f9a4e2729212691972b196a76ba9bd0 ===
#it returns sum of given list
#Reference: Iri, M. and Fujino, Y. 1985. Suchi keisan no joshiki (Japanese). Kyoritsu Shuppan Co., Ltd. ISBN 978-4-320-01343-8
proc lSum {list} {
#=== lSum.tcl (Yuji SODE, 2018): https://gist.github.com/YujiSODE/1f9a4e2729212691972b196a76ba9bd0 ===
#Reference: Iri, M. and Fujino, Y. 1985. Suchi keisan no joshiki (Japanese). Kyoritsu Shuppan Co., Ltd. ISBN 978-4-320-01343-8
namespace path {::tcl::mathop};set S 0.0;set R 0.0;set T 0.0;foreach e $list {set R [+ $R [expr double($e)]];set T $S;set S [+ $S $R];set T [+ $S [expr {-$T}]];set R [+ $R [expr {-$T}]];};return $S;
};
#
#it estimates average using a given numerical list
proc avg {list} {
# - $list: a numerical list
set v {};
#n is list size
set n [expr {double([llength $list])}];
foreach e $list {
lappend v [expr {double($e)}];
};
return [expr {lSum($v)/$n}];
};
#it estimates variance using a numerical list and "list size -1"
proc var {list} {
# - $list: a numerical list
set v {};
#n is list size
set n [expr {double([llength $list])}];
#m is average of list
set m [expr {avg($list)}];
foreach e $list {
lappend v [expr {(double($e)-$m)**2}];
};
unset m;
return [expr {lSum($v)/($n-1)}];
};
#it estimates covariance using two numerical lists and "list size -1"
proc cov {X Y} {
# - $X: a numerical list for x-axis
# - $Y: a numerical list for y-axis
#set v 0.0;
set v {};
set i 0;set x0 0.0;set y0 0.0;
#nX is list size of $X
set nX [expr {double([llength $X])}];
#mX is average of $X
set mX [expr {avg($X)}];
#mY is average of $Y
set mY [expr {avg($Y)}];
while {$i<$nX} {
set x0 [expr {double([lindex $X $i])}];
set y0 [expr {double([lindex $Y $i])}];
lappend v [expr {($x0-$mX)*($y0-$mY)}];
incr i 1;
};
unset mX mY i x0 y0;
return [expr {lSum($v)/($nX-1)}];
};
};
#*** <namespace ::regLines> ***
#interface for linear regressions
namespace eval ::regLines {
#procedure that returns target data range
proc getRange {X} {
# - $X: a numerical list for x-axis
set v {};
set i 0.0;
#rgEx is regular expression that matches real number
set rgEx {^(?:[+-]?[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+(?:\.[0-9]+)?)?)$|^(?:\.[0-9]+)$};
puts stdout {Please input data range (dx or x0 x1 ... xn)};
set range [gets stdin];
#data range size<1 or not real number element is in data range
while {([llength $range]<1)||([lsearch -regexp $range $rgEx]<0)} {
puts stdout {Please input data range (dx or x0 x1 ... xn)};
set range [gets stdin];
};
#data range size>0
#to remove duplications
lsort -real -unique $range;
if {[llength $range]<2} {
#data range size=1
#error is returned when input dx is not positive
if {!($range>0)} {
return -code error "ERROR: input dx is not positive";
};
set xSort [lsort -real -increasing $X];
set i [format %.4f [lindex $xSort 0]];
set max [format %.4f [lindex $xSort end]];
while {$i<$max} {
lappend v $i;
set i [format %.4f [expr {$i+$range}]];
};
lappend v $max;
} else {
#data range size>1
set v $range;
};
return $v;
};
#procedure for linear regression that returns result as a list
#the returned list is {A B R2} representing model of "y=f(x)=Ax+B, and R2=Var(f)/Var(y)"
proc getLine {X Y} {
# - $X: a numerical list for x-axis
# - $Y: a numerical list for y-axis
if {([llength $X]<2)||([llength $Y]<2)} {return "0 0 0";};
set f {};
#=== linear model: y=Ax+B ===
set A 0.0;
set B 0.0;
#R2 is coefficient of determination
set R2 0.0;
#averages of given values
set avX [expr {avg($X)}];
set avY [expr {avg($Y)}];
#variances of given values
set s2X [expr {var($X)}];
set s2Y [expr {var($Y)}];
#covariance of given values
set sXY [expr {cov($X,$Y)}];
#=== regression ===
if {$s2X!=0} {
#== when variance of X is not 0 ==
set A [expr {$sXY/$s2X}];
set B [expr {$avY-$A*$avX}];
#coefficient of determination
set f [lmap e $X {list [expr {$A*double($e)+$B}];}];
set R2 [expr {($s2Y!=0)?var($f)/$s2Y:1}];
} else {
#== when variance of X is 0 ==
#f(x!=c)=0 and f(x=c)=Inf
set A "0@$avX";
set B Inf;
set R2 1;
};
unset avX avY s2X s2Y sXY f;
#model: y=Ax+B
return "$A $B $R2";
};
#it outputs tcl script file that defines additional math functions (lines(x), linesPDF(x) and linesVar()) and returns generated filename
proc reglines {X Y {name {}}} {
# - $X: a numerical list for x-axis
# - $Y: a numerical list for y-axis
# - $name: a text used in order to generate filename of output file, and numbers are default value
# generated filename has a form of "${name}_regL.tcl"
if {[llength $X]!=[llength $Y]} {return -code error "ERROR: list sizes are different";};
set fileName "[expr {[llength $name]>0?${name}:[clock seconds]}]_regL.tcl";
#++++++ [module]:_mdl is script of output module ++++++
set _mdl "\#$fileName\n";
append _mdl "\#This script was generated using regLines.tcl\n";
#area is area of estimated regression lines in given data range
set area 0.0;
#PDFMax is max value of estimated PDF in given data range
set PDFMax {};
#ch is channel to output
set ch {};
#dRange is data range list
set dRange [lsort -real -increasing [::regLines::getRange $X]];
set rgN [llength $dRange];
#++++++ [module]: comment for data range ++++++
append _mdl "\#data range: [lindex $dRange 0] to [lindex $dRange end]\n";
append _mdl "\#break points: [join $dRange ,]\n";
#++++++ [module]: function lSum ++++++
append _mdl "\#it returns sum of given list\n";
append _mdl "proc ::tcl::mathfunc::lSum \{list\} \{";
append _mdl [info body ::tcl::mathfunc::lSum];
#++++++ [module]: function lSum END +++++
append _mdl "\}\;\n";
set i 0;
#dClassesX and dClassesY are arrays of classified values
#numerical values are classified into data range size-1 classes and the others
array set dClassesX {};
array set dClassesY {};
foreach e [lrange $dRange 0 end-1] {
set dClassesX($e) {};
set dClassesY($e) {};
};
#Lines is an array of lists that have parameters of a regression
#Lines has data range size-1 elements
array set Lines {};
#=== x-value and y-value are classified based on data range ===
foreach eX $X eY $Y {
set eX [expr {double($eX)}];
if {($eX<[lindex $dRange 0])||($eX>[lindex $dRange end])} {continue;};
set i 0;
while {$i<($rgN-1)} {
if {($eX<[lindex $dRange $i+1])&&!($eX<[lindex $dRange $i])} {
lappend dClassesX([lindex $dRange $i]) $eX;
lappend dClassesY([lindex $dRange $i]) $eY;
};
incr i 1;
};
if {!($eX!=[lindex $dRange end])} {
lappend dClassesX([lindex $dRange end-1]) $eX;
lappend dClassesY([lindex $dRange end-1]) $eY;
};
};
foreach eX [lrange $dRange 0 end-1] eY [lrange $dRange 0 end-1] {
if {!([llength $dClassesX($eX)]>0)||!([llength $dClassesY($eY)]>0)} {
set dClassesX($eX) 0;
set dClassesY($eY) 0;
};
};
#=== linear regressions ===
#Lines is an array of lists that have parameters of a regression: {A B R2}
#Lines has data range size-1 elements
foreach e [lrange $dRange 0 end-1] {
set Lines($e) [::regLines::getLine $dClassesX($e) $dClassesY($e)];
};
#estimating total area, max value and min value of regression lines in the given range
set i 0;
set aSub {};
while {$i<($rgN-1)} {
set e1 [lindex $dRange $i];
set e1D [expr {double($e1)}];
set e2 [lindex $dRange $i+1];
set e2D [expr {double($e2)}];
#values of regression lines
set ax1 [expr {[lindex $Lines($e1) 1]!=Inf?double([lindex $Lines($e1) 0])*$e1D:Inf}];
set ax2 [expr {[lindex $Lines($e1) 1]!=Inf?double([lindex $Lines($e1) 0])*$e2D:Inf}];
if {$ax1!=Inf} {
lappend ax1 [expr {double([lindex $Lines($e1) 1])}];
lappend ax2 [expr {double([lindex $Lines($e1) 1])}];
lappend PDFMax [expr {lSum($ax1)}];
lappend PDFMax [expr {lSum($ax2)}];
};
if {[lindex $Lines($e1) 1]!=Inf} {
#(A/2)*(x2**2) cf. f(x)=Ax+B
lappend aSub [expr {(double([lindex $Lines($e1) 0])/2)*($e2D**2)}];
#B*x2 cf. f(x)=Ax+B
lappend aSub [expr {double([lindex $Lines($e1) 1])*$e2D}];
#(-A/2)*(x1**2) cf. f(x)=Ax+B
lappend aSub [expr {(-double([lindex $Lines($e1) 0])/2)*($e1D**2)}];
#-B*x1 cf. f(x)=Ax+B
lappend aSub [expr {-double([lindex $Lines($e1) 1])*$e1D}];
};
incr i 1;
};
set area [expr {lSum($aSub)}];
set PDFMax [expr {double([lindex [lsort -real -increasing $PDFMax] end])/$area}];
if {[llength $PDFMax]<1} {return -code error "ERROR: data ranges and/or regression lines are invalid";};
unset e1 e2 e1D e2D aSub ax1 ax2;
#++++++ [module]: function lines ++++++
append _mdl "\#it returns estimated sample distribution\n";
append _mdl "proc ::tcl::mathfunc::lines \{x\} \{\n";
append _mdl "\tset X \[expr \{double\(\$x\)\}\]\;\n";
append _mdl "\t\#R is data range\n";
append _mdl "\tset R \{[lsort -real -increasing $dRange]\}\;\n";
append _mdl "\t\#nR is data range size\n";
append _mdl "\tset nR [llength $dRange]\;\n";
append _mdl "\t\#l is an array of regression results: \{A B R2\} for y=Ax+B and coefficient of determination\n";
foreach e [lsort -real -increasing [array names Lines]] {
append _mdl "\tset l\($e\) \{$Lines($e)\}\;\n";
};
append _mdl "\tif \{\(\$X<[lindex $dRange 0]\)||\(\$X>[lindex $dRange end]\)\} \{return 0\;\} else \{\n";
append _mdl "\t\tset i 0;while \{\$i<\(\$nR-1\)\} \{\n";
append _mdl "\t\t\t\if \{\(\$X<\[lindex \$R \$i+1\]\)&&!\(\$X<\[lindex \$R \$i\]\)\} \{\n";
#++++++ [module]: when Rmin < $X < Rmax ++++++
#++++++ [module]: estimating value of f(x) +++++
append _mdl "\t\t\t\tset e \[lindex \$R \$i\]\;if \{!\(\[lindex \$l\(\$e\) 1\]!=Inf\)\} \{\n";
append _mdl "\t\t\t\t\treturn \[expr \{\$X!=\[string range \[lindex \$l\(\$e\) 0\] 2 end\]?0:Inf\}\]\;\n";
append _mdl "\t\t\t\t\} else \{\n";
append _mdl "\t\t\t\t\tset v \{\}\;\n";
append _mdl "\t\t\t\t\tlappend v \[expr \{\[lindex \$l\(\$e\) 0\]*\$X\}\]\;lappend v \[lindex \$l\(\$e\) 1\]\;\n";
append _mdl "\t\t\t\t\treturn \[expr \{lSum\(\$v\)\}\]\;\n";
append _mdl "\t\t\t\t\}\;\n";
#
append _mdl "\t\t\t\}\;\n";
append _mdl "\t\tincr i 1\;\}\;\n";
#++++++ [module]: when $X = Rmax ++++++
#++++++ [module]: estimating value of f(x) +++++
append _mdl "\t\tset e \[lindex \$R end-1\]\;if \{!\(\[lindex \$l\(\$e\) 1\]!=Inf\)\} \{\n";
append _mdl "\t\t\treturn \[expr \{\$X!=\[string range \[lindex \$l\(\$e\) 0\] 2 end\]?0:Inf\}\]\;\n";
append _mdl "\t\t\} else \{\n";
append _mdl "\t\t\tset v \{\}\;\n";
append _mdl "\t\t\tlappend v \[expr \{\[lindex \$l\(\$e\) 0\]*\$X\}\]\;lappend v \[lindex \$l\(\$e\) 1\]\;\n";
append _mdl "\t\t\treturn \[expr \{lSum\(\$v\)\}\]\;\n";
append _mdl "\t\t\}\;\n";
#
append _mdl "\t\}\;\n";
#++++++ [module]: function lines END +++++
append _mdl "\}\;\n";
#++++++ [module]: function linesPDF ++++++
append _mdl "\#it returns a value of probability density function estimated from the sample distribution\n";
append _mdl "proc ::tcl::mathfunc::linesPDF \{x\} \{\n";
append _mdl "\treturn \[expr \{lines\(\$x\)/$area\}\]\;";
#++++++ [module]: function linesPDF END +++++
append _mdl "\}\;\n";
#++++++ [module]: function linesVar ++++++
append _mdl "\#it returns a random variable that follows PDF\n";
append _mdl "proc ::tcl::mathfunc::linesVar \{\} \{\n";
set dRg {};
lappend dRg [expr -[lindex $dRange 0]];
lappend dRg [lindex $dRange end];
append _mdl "\tset Y \[expr \{double\([lindex $dRange 0]\)+double\([expr {lSum($dRg)}]\)*rand\(\)\}\]\;\n";
append _mdl "\tset U \[expr rand\(\)\]\;\n";
append _mdl "\tset v \[expr \{linesPDF\(\$Y\)/$PDFMax\}\]\;\n";
append _mdl "\twhile \{\$U>\$v\} \{\n";
append _mdl "\t\tset Y \[expr \{double\([lindex $dRange 0]\)+double\([expr {lSum($dRg)}]\)*rand\(\)\}\]\;\n";
append _mdl "\t\tset U \[expr rand\(\)\]\;\n";
append _mdl "\t\tset v \[expr \{linesPDF\(\$Y\)/$PDFMax\}\]\;\n";
append _mdl "\t\}\;\n";
append _mdl "\treturn \$Y\;\n";
#++++++ [module]: function linesVar END +++++
append _mdl "\}\;\n";
#=== text log ===
puts stdout "data range: [lindex $dRange 0] to [lindex $dRange end]";
puts stdout "regression results: \{A B R2\} for y=Ax+B and coefficient of determination";
parray Lines;
puts stdout "max value of PDF:$PDFMax";
#=== output of module file ===
set ch [open $fileName w];
fconfigure $ch -encoding utf-8;
puts -nonewline $ch $_mdl;
close $ch;unset ch _mdl;
source -encoding utf-8 $fileName;
return $fileName;
};
};