-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathregex.c
577 lines (477 loc) · 14 KB
/
regex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
#include <ctype.h>
#include <limits.h>
#define AEM_INTERNAL
#include <aem/log.h>
#include <aem/nfa-compile.h>
#include <aem/stack.h>
#include <aem/stringbuf.h>
#include <aem/translate.h>
#include <aem/utf8.h>
#include "regex.h"
/// AST construction
/*
 * Read the next rune from the front of ctx->in.
 *
 * Thin wrapper: hands the context's input slice, c_p and esc_p straight to
 * aem_string_unescape_rune() and returns whatever it returns. Callers in this
 * file treat a zero return as "nothing matched", and interpret *esc_p as
 * 0 (unescaped), 1 (substituted escape) or 2 (unsubstituted escape).
 */
static int match_escape(struct aem_nfa_compile_ctx *ctx, uint32_t *c_p, int *esc_p)
{
	aem_assert(ctx);

	struct aem_stringslice *input = &ctx->in;
	return aem_string_unescape_rune(input, c_p, esc_p);
}
/// RE => AST
static struct aem_nfa_node *re_parse_named_class(struct aem_nfa_compile_ctx *ctx)
{
aem_assert(ctx);
struct aem_stringslice in = ctx->in;
if (!aem_stringslice_match(&in, "[:"))
return NULL;
int neg = aem_stringslice_match(&in, "^");
struct aem_stringslice classname = aem_stringslice_match_alnum(&in);
if (!aem_stringslice_ok(classname))
return NULL;
if (!aem_stringslice_match(&in, ":]"))
return NULL;
enum aem_nfa_cclass cclass;
for (cclass = 0; cclass < AEM_NFA_CCLASS_MAX; cclass++)
if (aem_stringslice_eq(classname, aem_nfa_cclass_name(cclass)))
break;
if (cclass >= AEM_NFA_CCLASS_MAX)
return NULL;
struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_CLASS);
if (!node)
return NULL;
node->text = aem_stringslice_new(ctx->in.start, in.start);
ctx->in = in;
node->args.cclass = (struct aem_nfa_node_class){.neg = neg, .cclass = cclass};
return node;
}
static struct aem_nfa_node *re_parse_range(struct aem_nfa_compile_ctx *ctx)
{
aem_assert(ctx);
{
struct aem_nfa_node *node = re_parse_named_class(ctx);
if (node)
return node;
}
struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_RANGE);
if (!node)
return NULL;
node->text = ctx->in;
struct aem_stringslice orig = ctx->in;
uint32_t lo;
if (!match_escape(ctx, &lo, NULL))
goto fail;
uint32_t hi = lo;
if (aem_stringslice_match(&ctx->in, "-")) {
if (!match_escape(ctx, &hi, NULL))
goto fail;
}
node->args.range.min = lo;
node->args.range.max = hi;
node->text.end = ctx->in.start;
return node;
fail:
aem_nfa_node_free(node);
ctx->in = orig;
return NULL;
}
static int aem_nfa_brackets_compar(const void *p1, const void *p2)
{
aem_assert(p1);
aem_assert(p2);
struct aem_nfa_node *n1 = *(struct aem_nfa_node **)p1;
struct aem_nfa_node *n2 = *(struct aem_nfa_node **)p2;
aem_assert(n1);
aem_assert(n2);
if (n1->type != AEM_NFA_NODE_RANGE || n2->type != AEM_NFA_NODE_RANGE) {
// If they somehow aren't both ranges, sort
// in input order (i.e. the original order).
return n1->text.start - n2->text.start;
}
struct aem_nfa_node_range r1 = n1->args.range;
struct aem_nfa_node_range r2 = n2->args.range;
return r1.min - r2.min;
}
static struct aem_nfa_node *re_parse_brackets(struct aem_nfa_compile_ctx *ctx)
{
aem_assert(ctx);
struct aem_stringslice orig = ctx->in;
if (!aem_stringslice_match(&ctx->in, "["))
goto fail_nofree;
struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_ALTERNATION);
if (!node)
goto fail_nofree;
node->text = orig;
int negate = aem_stringslice_match(&ctx->in, "^");
while (aem_stringslice_ok(ctx->in)) {
struct aem_nfa_node *range = re_parse_range(ctx);
if (!range)
goto fail;
aem_nfa_node_push(node, range);
if (aem_stringslice_match(&ctx->in, "]"))
break;
}
node->text.end = ctx->in.start;
// Sort ranges
aem_stack_qsort(&node->children, aem_nfa_brackets_compar);
if (negate) {
// Complement ranges
// Move old node->children into temporary
struct aem_stack stk = node->children;
// Make new node->children
aem_stack_init_prealloc(&node->children, stk.n+1);
struct aem_nfa_node_range range_prev = {.min = 0, .max = UINT_MAX};
AEM_STACK_FOREACH(i, &stk) {
struct aem_nfa_node *child = stk.s[i];
if (!child)
continue;
if (child->type != AEM_NFA_NODE_RANGE) {
aem_logf_ctx(AEM_LOG_ERROR, "Can't complement non-range inside [^...]!");
aem_stack_dtor(&stk);
goto fail;
}
const struct aem_nfa_node_range range = child->args.range;
// TODO BUG: range.min == 0
// TODO HACK: UINT_MAX + 1 == 0, so skip if first range starts at 0
// TODO: This would all be a lot simpler if ranges were [min, max).
struct aem_nfa_node_range range_new = {.min = range_prev.max+1, .max = range.min-1};
if (range_new.max != UINT_MAX && range_new.min <= range_new.max) {
// Reuse this range to represent the
// characters between it and the
// previous one.
child->args.range = range_new;
aem_nfa_node_push(node, child);
} else {
// Overlapping/null ranges
aem_nfa_node_free(child);
}
range_prev = range;
}
struct aem_nfa_node_range range_last = {.min = range_prev.max+1, .max = UINT_MAX};
// TODO HACK: UINT_MAX + 1 == 0, so skip if final range ends at UINT_MAX
if (range_last.min && range_last.min <= range_last.max) {
struct aem_nfa_node *child = aem_nfa_node_new(AEM_NFA_NODE_RANGE);
if (!child) {
aem_stack_dtor(&stk);
goto fail;
}
child->args.range = range_last;
aem_nfa_node_push(node, child);
}
// Destroy old node->children
aem_stack_dtor(&stk);
} else {
// TODO: Else merge adjacent or overlapping ranges
}
if (!(ctx->flags & AEM_REGEX_FLAG_BINARY)) {
aem_logf_ctx(AEM_LOG_NYI, "NYI: expand UTF-8 ranges");
}
return node;
fail:
aem_nfa_node_free(node);
fail_nofree:
ctx->in = orig;
return NULL;
}
static struct aem_nfa_node *re_parse_pattern(struct aem_nfa_compile_ctx *ctx);
static struct aem_nfa_node *re_parse_atom(struct aem_nfa_compile_ctx *ctx)
{
aem_assert(ctx);
struct aem_stringslice orig = ctx->in;
struct aem_stringslice out = ctx->in;
if (aem_stringslice_match(&ctx->in, "[")) {
ctx->in = orig;
struct aem_nfa_node *brackets = re_parse_brackets(ctx);
return brackets;
} else if (aem_stringslice_match(&ctx->in, "(")) {
int do_capture = 1;
enum aem_regex_flags flags = ctx->flags;
if (aem_stringslice_match(&ctx->in, "?")) {
ctx->flags = aem_regex_flags_adj(&ctx->in, ctx->flags, 1);
AEM_LOG_MULTI(out, AEM_LOG_DEBUG) {
aem_stringbuf_puts(out, "Change flags from ");
aem_regex_flags_describe(out, flags, 0);
aem_stringbuf_puts(out, " to ");
aem_regex_flags_describe(out, ctx->flags, 0);
}
if (aem_stringslice_match(&ctx->in, ":")) {
do_capture = 0;
} else {
aem_logf_ctx(AEM_LOG_NYI, "NYI: set flags for current group (?flags)");
}
}
size_t i = ctx->n_captures;
if (do_capture)
ctx->n_captures++; // Count captures in lexical order
struct aem_nfa_node *pattern = re_parse_pattern(ctx);
ctx->flags = flags; // Restore flags
if (!aem_stringslice_match(&ctx->in, ")")) {
aem_nfa_node_free(pattern);
ctx->n_captures = i;
goto fail;
}
out.end = ctx->in.start;
if (!do_capture)
return pattern;
if ((ctx->flags & AEM_REGEX_FLAG_EXPLICIT_CAPTURES) && pattern->type == AEM_NFA_NODE_ALTERNATION)
return pattern;
struct aem_nfa_node *capture = aem_nfa_node_new(AEM_NFA_NODE_CAPTURE);
if (!capture) {
aem_nfa_node_free(pattern);
ctx->n_captures = i;
goto fail;
}
capture->text = out;
capture->args.capture.capture = i;
aem_nfa_node_push(capture, pattern);
return capture;
} else {
uint32_t c;
int esc;
if (!match_escape(ctx, &c, &esc))
goto fail;
enum aem_nfa_node_type type = AEM_NFA_NODE_ATOM;
union aem_nfa_node_args args = {.atom = {.c = c}};
switch (esc) {
case 0: // Unescaped
switch (c) {
case '.':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 0, .frontier = 0, .cclass = (ctx->flags & AEM_REGEX_FLAG_BINARY) ? AEM_NFA_CCLASS_ANY : AEM_NFA_CCLASS_LINE};
break;
case '^':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 0, .frontier = 1, .cclass = AEM_NFA_CCLASS_LINE};
break;
case '$':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 1, .frontier = 1, .cclass = AEM_NFA_CCLASS_LINE};
break;
case ')':
case '?':
case '*':
case '+':
case '|':
case '\\':
// Not an atom
goto fail;
default:
// Plain character
break;
}
break;
case 1: // Substituted escape: do nothing else
break;
case 2: {
// Unsubstitued escape
int neg = isupper(c) != 0;
switch (c) {
case '<':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 0, .frontier = 1, .cclass = AEM_NFA_CCLASS_ALNUM};
break;
case '>':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 1, .frontier = 1, .cclass = AEM_NFA_CCLASS_ALNUM};
break;
case 'A':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 0, .frontier = 1, .cclass = AEM_NFA_CCLASS_ANY};
break;
case 'z':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 1, .frontier = 1, .cclass = AEM_NFA_CCLASS_ANY};
break;
case 'w':
case 'W':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = 0, .frontier = 0, .cclass = AEM_NFA_CCLASS_ALNUM};
break;
case 'd':
case 'D':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = neg, .frontier = 0, .cclass = AEM_NFA_CCLASS_DIGIT};
break;
case 's':
case 'S':
type = AEM_NFA_NODE_CLASS;
args.cclass = (struct aem_nfa_node_class){.neg = neg, .frontier = 0, .cclass = AEM_NFA_CCLASS_SPACE};
break;
case '(':
case ')':
case '[':
case '?':
case '*':
case '+':
case '|':
case '\\':
break;
default:
aem_logf_ctx(AEM_LOG_WARN, "Unnecessary escape: \\%c", c);
}
break;
}
default:
aem_logf_ctx(AEM_LOG_BUG, "Invalid esc: %d (char %08x)", esc, c);
break;
}
struct aem_nfa_node *node = aem_nfa_node_new(type);
if (!node)
goto fail;
out.end = ctx->in.start;
node->text = out;
node->args = args;
return node;
}
fail:
ctx->in = orig;
return NULL;
}
/*
 * Atom, possibly followed by a postfix repetition operator.
 *
 * Recognized operators: "?", "*", "+", and the counted forms "{m}", "{m,}",
 * "{,n}" and "{m,n}". Any of them may be followed by "?" to mark the
 * repetition as reluctant. With an operator, the atom is wrapped in an
 * AEM_NFA_NODE_REPEAT node; without one, the atom is returned unchanged.
 *
 * A "{" that is followed by neither a number nor a comma is not a repetition
 * operator: it is left in the input so that it is parsed as the next atom.
 *
 * Under AEM_REGEX_FLAG_EXPLICIT_CAPTURES, a capture group that is directly
 * repeated is unwrapped so that only its contents are repeated.
 *
 * Returns the resulting node, or NULL if no atom could be parsed, if a
 * counted repetition is malformed (missing "}" or min > max), or if
 * allocation fails. On the malformed-repetition path the atom is freed and
 * ctx->in is rewound to just after the atom.
 */
static struct aem_nfa_node *re_parse_postfix(struct aem_nfa_compile_ctx *ctx)
{
	aem_assert(ctx);

	struct aem_nfa_node *atom = re_parse_atom(ctx);
	if (!atom)
		return NULL;

	// Source text of the operator; also the rewind point on failure.
	struct aem_stringslice out = ctx->in;
	struct aem_nfa_node_repeat repeat = {.min = 0, .max = UINT_MAX};

	struct aem_stringslice orig = ctx->in;

	if (aem_stringslice_match(&ctx->in, "?")) {
		repeat.min = 0;
		repeat.max = 1;
	} else if (aem_stringslice_match(&ctx->in, "*")) {
		repeat.min = 0;
		repeat.max = UINT_MAX;
	} else if (aem_stringslice_match(&ctx->in, "+")) {
		repeat.min = 1;
		repeat.max = UINT_MAX;
	} else if (aem_stringslice_match(&ctx->in, "{")) {
		// Try to get a lower bound
		int lower = aem_stringslice_match_uint_base(&ctx->in, 10, &repeat.min);
		if (!lower) {
			repeat.min = 0;
		}
		// Try to get a comma
		int comma = aem_stringslice_match(&ctx->in, ",");
		if (!lower && !comma) {
			// Not a repetition after all: put the "{" back instead
			// of silently dropping it from the pattern.
			ctx->in = orig;
			return atom;
		}
		// Try to get a upper bound, but only if we got a comma
		int upper = comma && aem_stringslice_match_uint_base(&ctx->in, 10, &repeat.max);
		if (!upper) {
			repeat.max = UINT_MAX;
		}
		// "{m}" means exactly m
		if (lower && !comma)
			repeat.max = repeat.min;
		if (!aem_stringslice_match(&ctx->in, "}"))
			goto fail;
	} else {
		return atom;
	}

	repeat.reluctant = aem_stringslice_match(&ctx->in, "?");

	if (repeat.min > repeat.max) {
		aem_logf_ctx(AEM_LOG_ERROR, "Repetition min %d > max %d!", repeat.min, repeat.max);
		goto fail;
	}

	out.end = ctx->in.start;

	if (!aem_stringslice_ok(out))
		return atom;

	if ((ctx->flags & AEM_REGEX_FLAG_EXPLICIT_CAPTURES) && atom->type == AEM_NFA_NODE_CAPTURE) {
		const struct aem_nfa_node_capture capture = atom->args.capture;
		aem_logf_ctx(AEM_LOG_NOTICE, "Deleting capture %zd/%zd", capture.capture, ctx->n_captures);
		// The index can only be given back if it was the last one handed out.
		if (capture.capture == ctx->n_captures-1) {
			ctx->n_captures--;
		}
		// Replace the capture node by its only child.
		struct aem_nfa_node *child = aem_stack_pop(&atom->children);
		aem_assert(!atom->children.n);
		aem_nfa_node_free(atom);
		atom = child;
	}

	struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_REPEAT);
	if (!node) {
		aem_nfa_node_free(atom);
		return NULL;
	}
	node->text = out;
	node->args.repeat = repeat;
	aem_nfa_node_push(node, atom);
	return node;

fail:
	aem_nfa_node_free(atom);
	ctx->in = orig;
	return NULL;
}
/*
 * Zero or more postfix'd atoms.
 *
 * Keeps calling re_parse_postfix() while input remains and it keeps
 * succeeding, collecting the results under an AEM_NFA_NODE_BRANCH node.
 * If exactly one item was collected, that item is returned directly and the
 * wrapper branch node is discarded.
 *
 * Returns NULL only if the branch node itself cannot be allocated.
 */
static struct aem_nfa_node *re_parse_branch(struct aem_nfa_compile_ctx *ctx)
{
	aem_assert(ctx);

	struct aem_nfa_node *branch = aem_nfa_node_new(AEM_NFA_NODE_BRANCH);
	if (!branch)
		return NULL;

	// TODO: No more is indistinguishable from a real error.
	for (struct aem_nfa_node *item;
	     aem_stringslice_ok(ctx->in) && (item = re_parse_postfix(ctx)) != NULL; ) {
		aem_nfa_node_push(branch, item);
	}

	if (branch->children.n != 1)
		return branch;

	// A branch of one is just that one item.
	struct aem_nfa_node *only = aem_stack_pop(&branch->children);
	aem_nfa_node_free(branch);
	return only;
}
/*
 * Parse a pattern: one branch, or several branches separated by "|".
 *
 * A pattern without "|" is returned as the bare branch. Otherwise all
 * branches become children of a new AEM_NFA_NODE_ALTERNATION node, whose
 * text is set to the first "|".
 *
 * Returns NULL if any node cannot be built; in that case everything built so
 * far is freed and ctx->in is restored to its value on entry.
 */
static struct aem_nfa_node *re_parse_pattern(struct aem_nfa_compile_ctx *ctx)
{
	aem_assert(ctx);

	struct aem_stringslice orig = ctx->in;

	struct aem_nfa_node *branch = re_parse_branch(ctx);
	// re_parse_branch returns NULL when its node allocation fails; don't
	// go on to push a NULL child into an alternation.
	if (!branch) {
		ctx->in = orig;
		return NULL;
	}

	struct aem_stringslice out = ctx->in;

	if (!aem_stringslice_match(&ctx->in, "|"))
		return branch;

	out.end = ctx->in.start;

	struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_ALTERNATION);
	if (!node) {
		aem_nfa_node_free(branch);
		ctx->in = orig;
		return NULL;
	}
	node->text = out;

	aem_nfa_node_push(node, branch);

	// One more branch after every "|".
	do {
		struct aem_nfa_node *rest = re_parse_branch(ctx);
		if (!rest) {
			// node owns every branch pushed so far.
			aem_nfa_node_free(node);
			ctx->in = orig;
			return NULL;
		}
		aem_nfa_node_push(node, rest);
	} while (aem_stringslice_match(&ctx->in, "|"));

	return node;
}
/*
 * Entry point for compiling a regular expression into an AST.
 *
 * Logs a one-time "not yet implemented" notice when the pattern is not in
 * binary mode, then parses the whole of ctx->in with re_parse_pattern() and
 * returns its result.
 */
static struct aem_nfa_node *aem_regex_compile(struct aem_nfa_compile_ctx *ctx)
{
	aem_assert(ctx);

	const int binary = (ctx->flags & AEM_REGEX_FLAG_BINARY) != 0;
	if (!binary) {
		aem_logf_ctx_once(AEM_LOG_NYI, "NYI: new UTF-8 mode");
	}

	return re_parse_pattern(ctx);
}
AEM_NFA_ADD_DEFINE(regex)
static struct aem_nfa_node *aem_string_compile(struct aem_nfa_compile_ctx *ctx)
{
aem_assert(ctx);
struct aem_nfa_node *root = aem_nfa_node_new(AEM_NFA_NODE_BRANCH);
if (!root)
return NULL;
ctx->flags |= AEM_REGEX_FLAG_BINARY;
for (;;) {
struct aem_stringslice atom = ctx->in;
int c = aem_stringslice_getc(&ctx->in);
if (c < 0)
break;
atom.end = ctx->in.start;
struct aem_nfa_node *node = aem_nfa_node_new(AEM_NFA_NODE_ATOM);
if (!node) {
aem_nfa_node_free(root);
return NULL;
}
node->text = atom;
node->args.atom.c = c;
aem_nfa_node_push(root, node);
}
return root;
}
AEM_NFA_ADD_DEFINE(string)