-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlithuanian.sbl
145 lines (117 loc) · 3.69 KB
/
lithuanian.sbl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// stem is the only routine exported to callers of the generated stemmer.
externals ( stem )
//Stemmer for test1.txt data
/* Lithuanian letters outside ASCII, defined by Unicode code point.
   All of the values below (261..382) lie in the Latin Extended-A block.
   Each stringdef name is the base letter followed by a mnemonic for its
   diacritic; inside string literals it is written as {name}. */
// ' = ogonek
stringdef a' decimal '261' // ą a + ogonek
stringdef e' decimal '281' // ę e + ogonek
stringdef i' decimal '303' // į i + ogonek
stringdef u' decimal '371' // ų u + ogonek
// . = dot above
stringdef e. decimal '279' // ė e + dot
// - = macron (long vowel)
stringdef u- decimal '363' // ū u + macron
// * = caron
stringdef c* decimal '269' // č c + caron (haček)
stringdef s* decimal '353' // š s + caron (haček)
stringdef z* decimal '382' // ž z + caron (haček)
// Word shape assumed by the region marks: [C](VC)^m[V|C]
// Marks set by stem (both default to the end of the word):
// p1 - position just past the first non-vowel that follows a vowel
// p2 - the same search repeated starting from p1
integers ( p1 p2 )
// Boolean flags.
// PRE    - not referenced anywhere else in the visible file
// FOUND  - not referenced anywhere else in the visible file
// CHANGE - set by fix_chdz and fix_gd when they rewrite an ending;
//          never tested in the visible file
booleans ( PRE FOUND CHANGE )
// { and } delimit stringdef names inside string literals, e.g. '{e.}'
stringescapes { }
// groupings
// v - Lithuanian vowels
groupings ( v )
// v - plain vowels (including y) plus the ogonek, dotted and macron vowels
define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'
// Internal routines of the stemmer.
// NOTE(review): R2 is declared here but no definition of it is visible
// in this file.
routines (
step2 R1 R2 step1 fix_chdz fix_gd
)
backwardmode(
// R1: succeeds only when the cursor is at or beyond mark p1.
define R1 as (
    $p1 <= cursor
)
// step1: repeatedly strip one of the listed endings from the end of the
// word, stopping as soon as no listed ending matches.
define step1 as repeat (
    // Find the longest listed ending; the search may not reach left of p1.
    setlimit tomark p1 for (
        [substring]
    )
    // The matched ending must begin at or after p1.
    R1
    among (
        'a' 'ai' 'ain' 'aut' 'ais' 'al' 'ams' 'as'
        'atv' 'au' 'auj' 'aus' 'avim'

        'e' 'ei' 'eiv' 'um' 'ekl' 'el' 'enie' 'enims'
        'enis' 'eniu' 'ens' 'enyb' 'ers' 'es'

        'i' 'ia' 'iai' 'iais' 'iams' 'ias' 'iau' 'iaus'
        'id' 'ie' 'ien' 'ienoj' 'ies' 'iet' 'ij' 'ik' 'ikl'
        'il' 'im' 'imas' 'imi' 'imis' 'ims'
        'io' 'ioje' 'iomis' 'ioms' 'ios' 'iose' 'is' 'iu'
        'iui' 'iuje' 'iuk' 'iumi' 'iuose' 'ius'
        'i{a'}' 'i{s*}k' 'i{u'}' 'j{u'}' 'i{u-}k{s*}'
        'in{e'}' 'in{e.}' 'ij{a'}' 'iuo' 'iaud'
        'ij{u'}'

        'jim'
        'kl'
        's'
        'm'

        'o' 'oj' 'oje' 'ojant' 'iniai' 'ojim' 'ok' 'ok{s*}n'
        'iant' 'omis' 'odam' 'ni{a'}j' 'oms' 'os' 'ose' 'ov' 'o{c*}'

        'sen' 'sm' 'sn'

        '{z*}ti' 'ti' 'tin' 'toj' 'ing' 'tu' 'tuk' 'tuv'
        'tyn' 'tyv' 'tydam'

        'u' 'ui' 'uje' 'uk' 'ul' 'uly' 'umi' 'umis'
        'uol' 'uomen' 'uose' 'us' 'ut' 'u{z*}'

        'y' 'yb' 'yje' 'amet' 'yk' 'ykl' 'ym' 'yn'
        'ys' 'yse' 'yst' 'yt'

        '{a'}'
        '{e'}'
        '{e.}' '{e.}j' '{e.}je' '{e.}mis' '{e.}ms'
        '{e.}n' '{e.}s' '{e.}se' '{e.}z' '{e.}l'
        '{i'}'
        '{s*}{e.}'
        '{u'}'
        '{u-}n'
        '{u-}s'
        'an'
    )
    // Remove the matched ending, then loop to try again on what is left.
    delete
)
// step2: repeatedly strip one of the listed endings from the end of the
// word, stopping as soon as no listed ending matches. The search for an
// ending is limited to the part of the word at or after mark p1.
//
// Every non-ASCII letter is written with its stringdef escape ({a'}, {e.},
// ...) so that matching does not depend on how the compiler decodes the
// bytes of this source file.
define step2 as repeat (
    setlimit tomark p1 for ([substring]) among (
        'tydam' 'os' 'uoda' 'uodam' 'audam'
        'tuodav' 'tuoj' 'uodav'
        'nam{e.}s' 'nasi' 'nat{e.}s' 'niesi' 'nuosi' '{z*}dam'
        'indav' 'audav' 'indam' 'ind' 'odav' 'voj' 'iuot'
        'uot' 'sies' 'sim{e.}s' 'sit{e.}s' 'siuos' 'uojat' 'uod' 'iej' 'int'
        'd{e.}k'
        // 'i{a'}' and 'davot{e.}s' were previously spelled with raw
        // non-ASCII characters instead of escapes.
        '{e.}dam' '{e.}dav' 'gdav' 'gd' 'iuodam' 'aut' 'i{a'}' 'iav' 'um'
        'dav' 'davaisi' 'davausi' 'davom{e.}s' 'davosi' 'davot{e.}s' 'kis'
        'auj' 'aisi' 'ausi' 'esn' 'es'
        'tumei' 'tum{e.}m{e.}s' 'tum{e.}m{e.}' 'tum{e.}s' 'tum{e.}'
        'tut{e.}s' 'tut{e.}' 't{u'}si'
        'ot'
        '{e.}tum' '{e.}m' '{e.}t' '{e.}s' '{e.}si' 'in' 'inink'
        'ytum' 'ydav' 'ytume' 'ytum{e.}me' 'ytum{e.}te' 'ytute' 'yt{u-}'
        'y{c*}iau'
        'uriuo'
        'si' 'sis' 'tis' 'sime' 'site' 'odam'
        '{z*}iau'
        '{c*}iau'
        'av'
        '{a'}j'
    )
    // Remove the matched ending, then loop to try again on what is left.
    delete
)
// fix_chdz: if the word ends in č, replace that ending with t; if it ends
// in dž, replace that ending with d. Sets CHANGE when a replacement was
// made and fails (changing nothing) when neither ending is present.
define fix_chdz as (
    [substring] among (
        '{c*}'  (<- 't')
        'd{z*}' (<- 'd')
    )
    // Only reached after one of the two replacements above has happened.
    set CHANGE
)
// fix_gd: if the word ends in 'gd', replace that ending with 'g' and set
// CHANGE; otherwise fail without changing anything.
define fix_gd as (
    ['gd'] <- 'g'
    set CHANGE
)
)
// stem: entry point of the stemmer.
//
// 1. Set marks p1 and p2. Both default to the end of the word; p1 is moved
//    to just past the first non-vowel that follows a vowel, and p2 to the
//    result of the same search repeated from p1. When the word is too
//    short for a search to succeed, the affected mark keeps its default.
// 2. Working backwards from the end of the word, run step1, fix_chdz,
//    step2 and fix_gd in that order.
//
// Every step is wrapped in 'do', so a step that finds nothing to change
// does not make stem itself signal failure.
define stem as (
    $p1 = limit
    $p2 = limit
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
    backwards (
        do step1
        do fix_chdz
        do step2
        // Wrapped in 'do' like the other steps: without it, stem failed
        // for every word that does not end in 'gd'.
        do fix_gd
    )
)