Skip to content

Commit

Permalink
add (til) PEG special
Browse files Browse the repository at this point in the history
  • Loading branch information
ianthehenry committed Dec 4, 2024
1 parent 5d1bd8a commit 64b1d91
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 44 deletions.
121 changes: 78 additions & 43 deletions src/core/peg.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,17 +171,22 @@ static int64_t peg_convert_u64_s64(uint64_t from, int width) {
} while (0)
#define up1(s) ((s)->depth++)

/* Throwaway destination for peg_rule's capture_to_out argument. Call sites
 * that do not use the capture end pass &ignore_capture_to_out, so whatever a
 * rule stores through the pointer lands here and is never read back.
 * NOTE(review): this is a single file-scope object that callees write
 * through; if peg_rule can run concurrently on several threads those writes
 * are unsynchronized - confirm whether this should be thread-local or a
 * per-call local instead. */
static const uint8_t *ignore_capture_to_out;

/* Evaluate a peg rule
* Pre-conditions: s is in a valid state
* Post-conditions: If there is a match, returns a pointer to the next text.
* All captures on the capture stack are valid. If there is no match,
* returns NULL. Extra captures from successful child expressions can be
* left on the capture stack.
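* A rule may store a pointer in *capture_to_out to tell an enclosing capture
* to end its captured text there instead of at the returned (advance)
* position. Callers that use it initialize it to NULL beforehand and treat
* NULL as "capture up to the returned position".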
*/
static const uint8_t *peg_rule(
PegState *s,
const uint32_t *rule,
const uint8_t *text) {
const uint8_t *text,
const uint8_t **capture_to_out) {
tail:
switch (*rule) {
default:
Expand Down Expand Up @@ -227,7 +232,7 @@ static const uint8_t *peg_rule(
text += ((int32_t *)rule)[1];
if (text < s->text_start || text > s->text_end) return NULL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text, &ignore_capture_to_out);
up1(s);
text -= ((int32_t *)rule)[1];
return result ? text : NULL;
Expand All @@ -240,9 +245,11 @@ static const uint8_t *peg_rule(
down1(s);
CapState cs = cap_save(s);
for (uint32_t i = 0; i < len - 1; i++) {
const uint8_t *result = peg_rule(s, s->bytecode + args[i], text);
const uint8_t *capture_to = NULL;
const uint8_t *result = peg_rule(s, s->bytecode + args[i], text, &capture_to);
if (result) {
up1(s);
*capture_to_out = capture_to;
return result;
}
cap_load(s, cs);
Expand All @@ -258,7 +265,7 @@ static const uint8_t *peg_rule(
if (len == 0) return text;
down1(s);
for (uint32_t i = 0; text && i < len - 1; i++)
text = peg_rule(s, s->bytecode + args[i], text);
text = peg_rule(s, s->bytecode + args[i], text, &ignore_capture_to_out);
up1(s);
if (!text) return NULL;
rule = s->bytecode + args[len - 1];
Expand All @@ -269,7 +276,7 @@ static const uint8_t *peg_rule(
const uint32_t *rule_a = s->bytecode + rule[1];
const uint32_t *rule_b = s->bytecode + rule[2];
down1(s);
const uint8_t *result = peg_rule(s, rule_a, text);
const uint8_t *result = peg_rule(s, rule_a, text, &ignore_capture_to_out);
up1(s);
if (!result) return NULL;
rule = rule_b;
Expand All @@ -280,7 +287,7 @@ static const uint8_t *peg_rule(
const uint32_t *rule_b = s->bytecode + rule[2];
down1(s);
CapState cs = cap_save(s);
const uint8_t *result = peg_rule(s, rule_a, text);
const uint8_t *result = peg_rule(s, rule_a, text, &ignore_capture_to_out);
if (!!result) {
up1(s);
return NULL;
Expand All @@ -296,7 +303,7 @@ static const uint8_t *peg_rule(
const uint32_t *rule_a = s->bytecode + rule[1];
down1(s);
CapState cs = cap_save(s);
const uint8_t *result = peg_rule(s, rule_a, text);
const uint8_t *result = peg_rule(s, rule_a, text, &ignore_capture_to_out);
if (result) {
up1(s);
return NULL;
Expand All @@ -307,17 +314,18 @@ static const uint8_t *peg_rule(
}
}

case RULE_THRU:
case RULE_TO: {
case RULE_TO:
case RULE_TIL:
case RULE_THRU: {
const uint32_t *rule_a = s->bytecode + rule[1];
const uint8_t *next_text = NULL;
CapState cs = cap_save(s);
down1(s);
while (text <= s->text_end) {
CapState cs2 = cap_save(s);
next_text = peg_rule(s, rule_a, text);
next_text = peg_rule(s, rule_a, text, &ignore_capture_to_out);
if (next_text) {
if (rule[0] == RULE_TO) cap_load(s, cs2);
if (rule[0] != RULE_THRU) cap_load(s, cs2);
break;
}
cap_load(s, cs2);
Expand All @@ -328,6 +336,10 @@ static const uint8_t *peg_rule(
cap_load(s, cs);
return NULL;
}
if (rule[0] == RULE_TIL) {
*capture_to_out = text;
}

return rule[0] == RULE_TO ? text : next_text;
}

Expand All @@ -341,7 +353,9 @@ static const uint8_t *peg_rule(
down1(s);
while (captured < hi) {
CapState cs2 = cap_save(s);
next_text = peg_rule(s, rule_a, text);
const uint8_t *capture_to = NULL;
next_text = peg_rule(s, rule_a, text, &capture_to);
*capture_to_out = capture_to;
if (!next_text || next_text == text) {
cap_load(s, cs2);
break;
Expand Down Expand Up @@ -402,36 +416,44 @@ static const uint8_t *peg_rule(

case RULE_CAPTURE: {
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *capture_to = NULL;
const uint8_t *advance_to = peg_rule(s, s->bytecode + rule[1], text, &capture_to);
if (capture_to == NULL) {
capture_to = advance_to;
}
up1(s);
if (!result) return NULL;
if (!advance_to) return NULL;
/* Specialized pushcap - avoid intermediate string creation */
if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
janet_buffer_push_bytes(s->scratch, text, (int32_t)(capture_to - text));
} else {
uint32_t tag = rule[2];
pushcap(s, janet_stringv(text, (int32_t)(result - text)), tag);
pushcap(s, janet_stringv(text, (int32_t)(capture_to - text)), tag);
}
return result;
return advance_to;
}

case RULE_CAPTURE_NUM: {
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *capture_to = NULL;
const uint8_t *advance_to = peg_rule(s, s->bytecode + rule[1], text, &capture_to);
if (capture_to == NULL) {
capture_to = advance_to;
}
up1(s);
if (!result) return NULL;
if (!advance_to) return NULL;
/* check number parsing */
double x = 0.0;
int32_t base = (int32_t) rule[2];
if (janet_scan_number_base(text, (int32_t)(result - text), base, &x)) return NULL;
if (janet_scan_number_base(text, (int32_t)(capture_to - text), base, &x)) return NULL;
/* Specialized pushcap - avoid intermediate string creation */
if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
janet_buffer_push_bytes(s->scratch, text, (int32_t)(capture_to - text));
} else {
uint32_t tag = rule[3];
pushcap(s, janet_wrap_number(x), tag);
}
return result;
return advance_to;
}

case RULE_ACCUMULATE: {
Expand All @@ -444,7 +466,7 @@ static const uint8_t *peg_rule(
CapState cs = cap_save(s);
s->mode = PEG_MODE_ACCUMULATE;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Expand All @@ -458,7 +480,7 @@ static const uint8_t *peg_rule(
case RULE_DROP: {
CapState cs = cap_save(s);
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
if (!result) return NULL;
cap_load(s, cs);
Expand All @@ -468,7 +490,7 @@ static const uint8_t *peg_rule(
case RULE_ONLY_TAGS: {
CapState cs = cap_save(s);
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
if (!result) return NULL;
cap_load_keept(s, cs);
Expand All @@ -481,7 +503,7 @@ static const uint8_t *peg_rule(
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Expand All @@ -504,7 +526,7 @@ static const uint8_t *peg_rule(
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text, capture_to_out);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Expand All @@ -525,23 +547,27 @@ static const uint8_t *peg_rule(
const uint32_t *rule_window = s->bytecode + rule[1];
const uint32_t *rule_subpattern = s->bytecode + rule[2];
down1(s);
const uint8_t *window_end = peg_rule(s, rule_window, text);
const uint8_t *window_capture_to = NULL;
const uint8_t *window_advance_to = peg_rule(s, rule_window, text, &window_capture_to);
if (window_capture_to == NULL) {
window_capture_to = window_advance_to;
}
up1(s);
if (!window_end) {
if (!window_advance_to) {
return NULL;
}
const uint8_t *saved_end = s->text_end;
s->text_end = window_end;
s->text_end = window_capture_to;
down1(s);
const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start);
const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start, &ignore_capture_to_out);
up1(s);
s->text_end = saved_end;

if (!next_text) {
return NULL;
}

return window_end;
return window_advance_to;
}

case RULE_SPLIT: {
Expand All @@ -555,7 +581,7 @@ static const uint8_t *peg_rule(
CapState cs = cap_save(s);
down1(s);
while (text <= s->text_end) {
separator_end = peg_rule(s, rule_separator, text);
separator_end = peg_rule(s, rule_separator, text, &ignore_capture_to_out);
cap_load(s, cs);
if (separator_end) {
break;
Expand All @@ -570,7 +596,9 @@ static const uint8_t *peg_rule(
}

down1(s);
const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start);
const uint8_t *capture_to = NULL;
const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start, &capture_to);
*capture_to_out = capture_to;
up1(s);
s->text_end = saved_end;

Expand All @@ -589,7 +617,7 @@ static const uint8_t *peg_rule(
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Expand Down Expand Up @@ -633,7 +661,7 @@ static const uint8_t *peg_rule(
s->mode = PEG_MODE_NORMAL;
int32_t old_cap = s->captures->count;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Expand Down Expand Up @@ -672,7 +700,7 @@ static const uint8_t *peg_rule(
const uint8_t *next_text;
CapState cs = cap_save(s);
down1(s);
next_text = peg_rule(s, s->bytecode + rule[1], text);
next_text = peg_rule(s, s->bytecode + rule[1], text, &ignore_capture_to_out);
up1(s);
if (NULL == next_text) return NULL;
s->mode = oldmode;
Expand All @@ -688,7 +716,9 @@ static const uint8_t *peg_rule(
cap_load(s, cs);
for (int32_t i = 0; i < nrep; i++) {
down1(s);
next_text = peg_rule(s, s->bytecode + rule[2], next_text);
const uint8_t *capture_to = NULL;
next_text = peg_rule(s, s->bytecode + rule[2], next_text, &capture_to);
*capture_to_out = capture_to;
up1(s);
if (NULL == next_text) {
cap_load(s, cs);
Expand Down Expand Up @@ -742,7 +772,7 @@ static const uint8_t *peg_rule(
case RULE_UNREF: {
int32_t tcap = s->tags->count;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text, capture_to_out);
up1(s);
if (!result) return NULL;
int32_t final_tcap = s->tags->count;
Expand Down Expand Up @@ -1092,6 +1122,9 @@ static void spec_to(Builder *b, int32_t argc, const Janet *argv) {
/* Builder callback for the "thru" PEG special (see the peg_specials table).
 * b, argc and argv are passed through unchanged to spec_onerule, which is
 * asked to emit a RULE_THRU instruction (operand layout: [rule]). */
static void spec_thru(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_THRU);
}
/* Builder callback for the "til" PEG special (see the peg_specials table).
 * b, argc and argv are passed through unchanged to spec_onerule, which is
 * asked to emit a RULE_TIL instruction (operand layout: [rule]).
 * At match time peg_rule handles RULE_TIL by returning the position after the
 * sub-rule's match while storing the position where that match began in
 * *capture_to_out, and by rolling back the captures made by the sub-rule. */
static void spec_til(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_TIL);
}
/* Builder callback that forwards b, argc and argv unchanged to spec_onerule,
 * asking it to emit a RULE_DROP instruction (operand layout: [rule]).
 * NOTE(review): presumably registered as the "drop" special in peg_specials;
 * that table entry is not visible here - confirm. */
static void spec_drop(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_DROP);
}
Expand Down Expand Up @@ -1323,6 +1356,7 @@ static const SpecialPair peg_specials[] = {
{"split", spec_split},
{"sub", spec_sub},
{"thru", spec_thru},
{"til", spec_til},
{"to", spec_to},
{"uint", spec_uint_le},
{"uint-be", spec_uint_be},
Expand Down Expand Up @@ -1670,6 +1704,7 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) {
case RULE_ONLY_TAGS:
case RULE_NOT:
case RULE_TO:
case RULE_TIL:
case RULE_THRU:
/* [rule] */
if (rule[1] >= blen) goto bad;
Expand Down Expand Up @@ -1852,7 +1887,7 @@ JANET_CORE_FN(cfun_peg_match,
"Match a Parsing Expression Grammar to a byte string and return an array of captured values. "
"Returns nil if text does not match the language defined by peg. The syntax of PEGs is documented on the Janet website.") {
PegCall c = peg_cfun_init(argc, argv, 0);
const uint8_t *result = peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + c.start);
const uint8_t *result = peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + c.start, &ignore_capture_to_out);
return result ? janet_wrap_array(c.s.captures) : janet_wrap_nil();
}

Expand All @@ -1862,7 +1897,7 @@ JANET_CORE_FN(cfun_peg_find,
PegCall c = peg_cfun_init(argc, argv, 0);
for (int32_t i = c.start; i < c.bytes.len; i++) {
peg_call_reset(&c);
if (peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i))
if (peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i, &ignore_capture_to_out))
return janet_wrap_integer(i);
}
return janet_wrap_nil();
Expand All @@ -1875,7 +1910,7 @@ JANET_CORE_FN(cfun_peg_find_all,
JanetArray *ret = janet_array(0);
for (int32_t i = c.start; i < c.bytes.len; i++) {
peg_call_reset(&c);
if (peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i))
if (peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i, &ignore_capture_to_out))
janet_array_push(ret, janet_wrap_integer(i));
}
return janet_wrap_array(ret);
Expand All @@ -1887,7 +1922,7 @@ static Janet cfun_peg_replace_generic(int32_t argc, Janet *argv, int only_one) {
int32_t trail = 0;
for (int32_t i = c.start; i < c.bytes.len;) {
peg_call_reset(&c);
const uint8_t *result = peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i);
const uint8_t *result = peg_rule(&c.s, c.s.bytecode, c.bytes.bytes + i, &ignore_capture_to_out);
if (NULL != result) {
if (trail < i) {
janet_buffer_push_bytes(ret, c.bytes.bytes + trail, (i - trail));
Expand Down
1 change: 1 addition & 0 deletions src/include/janet.h
Original file line number Diff line number Diff line change
Expand Up @@ -2172,6 +2172,7 @@ typedef enum {
RULE_DROP, /* [rule] */
RULE_BACKMATCH, /* [tag] */
RULE_TO, /* [rule] */
RULE_TIL, /* [rule] */
RULE_THRU, /* [rule] */
RULE_LENPREFIX, /* [rule_a, rule_b (repeat rule_b rule_a times)] */
RULE_READINT, /* [(signedness << 4) | (endianness << 5) | bytewidth, tag] */
Expand Down
Loading

0 comments on commit 64b1d91

Please sign in to comment.