-
Notifications
You must be signed in to change notification settings - Fork 1
/
spec.ctr
96 lines (81 loc) · 2.82 KB
/
spec.ctr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Raise Broom's memory limit before the specs run.
# The argument is 1024 * 1024 * 1024 * 4 (4 GiB if the unit is bytes;
# presumably Broom is the runtime's memory manager -- TODO confirm).
Broom memoryLimit: 1024 * 1024 * 1024 * 4.
import main: \*.
import
Library/Test: 'describe'
.
# Spec for Lexer: checks that the object exists, that it is not Nil, that it
# applies a collation table, and that `split:` tokenises Persian text.
# `it` and `let` are the helpers handed in by `describe` (from Library/Test).
describe Lexer do: {:it:let
# Merely referencing Lexer must not raise a String error.
it exists do: {
{ Lexer. } should not raiseError: String.
}.
# Fixture `lexer`: its block evaluates to the Lexer object itself (no new
# instance is created here), so every test below talks to the same object.
let[Reflect thisContext] lexer { Lexer }.
it ~ 'is not Nil' do: {
lexer should not = Nil.
}.
# Fixture `collist`: reads the file 'collation', splits it on '\n', then splits
# each line on ' ' with `max: 1`.
# Presumably this yields one [source, replacement] pair per line -- TODO confirm
# against the collation file format.
let[Reflect thisContext] collist {
File new: 'collation', read split: '\n',
fmap: {:x ^x split: ' ' max: 1.}
}.
# Hands the collation list to the lexer, then expects the input letters to come
# back as the single token given below.
# NOTE(review): `collate:` is sent to the shared `lexer` fixture; if it mutates
# state, the later tests run with the collation applied -- confirm this is
# intended.
it can normalise text do: {
lexer collate: collist.
(lexer split: 'ٯٷڪګ')
should = ['قوکک'].
}.
# Two space-separated words must come back as two tokens.
it can lex simple persian sentences do: {
(lexer split: 'کلمات فارسی')
should = ['کلمات', 'فارسی'].
}.
# Parentheses and each '!' are expected as separate tokens.
# NOTE(review): the description mentions half-width spaces; verify that the
# input literal really contains one, since it cannot be seen when reading
# the source.
it can lex persian sentences with half-width spaces do: {
(lexer split: 'این جمله (خیلی) پیچیده نیست!!!')
should = (['این' ,'جمله' ,'(' ,'خیلی' ,')' ,'پیچیده' ,'نیست', '!', '!', '!']).
}.
# Longer free-form sentence; the '؟' is expected as its own token.
# The two commented-out lines below are a disabled `Pen writeln:` wrapper that
# would print the result instead of only asserting on it.
it can lex arbitrary persian text do: {
# Pen writeln:
(lexer split: 'میشدید اون پارسر ترم رو دیدی؟ یه تیکه هست مچ رو درست پیدا میکنه ولی توشو خالی میده')
should = [
'میشدید', 'اون', 'پارسر',
'ترم', 'رو', 'دیدی', '؟', 'یه',
'تیکه', 'هست', 'مچ', 'رو', 'درست',
'پیدا', 'میکنه', 'ولی', 'توشو',
'خالی', 'میده'
].
# .
}.
}.
#:language XFrozen
# Spec for TokenAnalyser: checks that the object exists, then runs tagging and
# phrase tagging over the sample files and writes the result to 'outfile'.
describe TokenAnalyser do: {:it:let
# Merely referencing TokenAnalyser must not raise a String error.
it exists do: {
{ TokenAnalyser. } should not raiseError: String.
}.
# Fixtures:
#   lexer    - the Lexer object itself
#   analyser - TokenAnalyser built with that lexer
#   glblctx  - the context returned by `Reflect thisContext` (not referenced
#              anywhere else in this spec as shown)
#   outfile  - File 'outfile' opened with mode 'w+'
let[Reflect thisContext] lexer { Lexer }.
let[Reflect thisContext] analyser { TokenAnalyser with: lexer }.
let[Reflect thisContext] glblctx { Reflect thisContext }.
let[Reflect thisContext] outfile { File new: 'outfile', open: 'w+' }.
# let[Reflect thisContext] testfile { File new: 'testfile', open: 'r' }.
it can tokenise and tag entities do: {:thisTest
# Collect the input texts: list the 'samples' directory, keep entries whose
# 'type' is 'file' and whose 'file' name starts with 'sample', and read each
# one from 'samples/<file>'.
var allTargetTexts is (
File
list: 'samples',
filter: (\:_:descr (descr @ 'type' = 'file') & (descr @ 'file' startsWith: 'sample')),
fmap: \:descr (File new: 'samples/' + (descr @ 'file'), read)
).
thisTest tests: 'Token Tagging'.
# Tag every text. The `pre:` callback reports progress as i + 1 and the
# `post:` callback calls `thisTest show`.
var consolidated is allTargetTexts fmap: {:targetText
^analyser
tag: targetText
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
}.
# Second pass: phrase tagging over the tagged results, with the same progress
# callbacks; the result replaces `consolidated`.
consolidated is analyser
tagPhrases: consolidated
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
# The only live assertion in this test.
consolidated should not = Nil.
# Pen writeln: consolidated.
# The `#{` / `#} should output: ...` lines are a disabled comparison of the
# written output against 'testfile'; only the write itself is active.
# Each element of `consolidated` is joined with '\t\n' and the elements are
# joined with '\n'.
# NOTE(review): `outfile` is opened with 'w+' but never closed in the visible
# code -- confirm whether `write:` manages the handle itself.
#{
outfile write: (
#analyser prettyPrint: consolidated tokens: allTokens toArray
consolidated fmap: (\:x x join: '\t\n'), join: '\n'
).
#} should output: testfile read to: outfile.
}.
}.