Skip to content

Commit

Permalink
chore: more progress on rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
jo3-l committed Aug 20, 2023
1 parent 43a337d commit 3cab174
Show file tree
Hide file tree
Showing 11 changed files with 58 additions and 139 deletions.
13 changes: 13 additions & 0 deletions src/detector.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
 * A single match reported by a {@link Pattern}.
 *
 * @typeParam T - Type of the metadata attached to the match.
 */
export interface Match<T = unknown> {
/**
 * Index at which the match begins.
 * NOTE(review): presumably an index into the `chars` array handed to the pattern - confirm.
 */
startIndex: number;
/**
 * Index at which the match ends.
 * NOTE(review): whether this bound is inclusive or exclusive is not visible here - confirm and document.
 */
endIndex: number;
/** Metadata associated with the match; read-only for consumers. */
meta: Readonly<T>;
}

/**
 * Contract for a pattern that can be run against a sequence of numeric character values.
 *
 * NOTE(review): `chars` is presumably an array of Unicode code points - confirm against callers.
 *
 * @typeParam T - Type of the metadata carried by produced matches.
 */
export interface Pattern<T = unknown> {
/**
 * Produces a match for the given input.
 * NOTE(review): the return type is non-nullable, so the behavior when nothing matches is not
 * expressed in this signature - confirm what implementations do in that case.
 */
matchFirst(chars: number[]): Match<T>;
/**
 * Reports matches for the given input through the caller-supplied `dst` array (nothing is returned).
 * NOTE(review): presumably appends to `dst` without clearing it - confirm.
 */
appendAllMatches(dst: Match<T>[], chars: number[]): void;
/**
 * Returns a boolean verdict for the given input.
 * NOTE(review): presumably `true` when the pattern matches somewhere in `chars` - confirm.
 */
test(chars: number[]): boolean;
}

/**
 * Placeholder for the detector; the class currently has no members.
 *
 * TODO: the type parameter `T` is declared but not yet used anywhere in the class body
 * (the linter reports it as unused); wire it up once the implementation lands.
 */
export class ObscenityDetector<T = unknown> {}

Check warning on line 13 in src/detector.ts

View workflow job for this annotation

GitHub Actions / Run ESLint

'T' is defined but never used
Empty file removed src/dsl/impl/bitap.ts
Empty file.
Empty file removed src/dsl/impl/vm.ts
Empty file.
16 changes: 5 additions & 11 deletions src/dsl/syntax/ast.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@ export enum SyntaxKind {
Wildcard,
Optional,
CharSet,
Repetition,
}

export type Node = Literal | Wildcard | Optional | CharSet | Repetition;
export type Node = Literal | Wildcard | Optional | CharSet;

export interface Literal {
kind: SyntaxKind.Literal;
chars: string[];
chars: number[];
}

export interface Wildcard {
Expand All @@ -31,7 +30,7 @@ export interface Wildcard {

export interface Optional {
kind: SyntaxKind.Optional;
inner: Node[];
children: Node[];
}

export interface CharSet {
Expand All @@ -43,11 +42,6 @@ export interface CharSet {
* A range of characters, including both endpoints.
*/
export interface CharRange {
lo: string;
hi: string;
}

export interface Repetition {
kind: SyntaxKind.Repetition;
char: string;
lo: number;
hi: number;
}
47 changes: 16 additions & 31 deletions src/dsl/syntax/parse.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { charValue } from '../../util/char';
import type { Ast, CharRange, CharSet, Node, Optional, Wildcard } from './ast';
import { BoundaryAssertion, SyntaxKind } from './ast';
import * as assert from 'node:assert';
Expand All @@ -17,7 +18,7 @@ export interface Position {
offset: number;
}

export const metachars = ['{', '}', '[', ']', '+', '?', '|'];
export const metachars = ['{', '}', '[', ']', '?', '|'];

export class Parser {
private chars: string[] = [];
Expand All @@ -37,7 +38,7 @@ export class Parser {
this.chars = chars;

const nodes: Node[] = [];
while (!this.done()) nodes.push(...this.parseAny());
while (!this.done()) nodes.push(this.parseAny());
return { source: pattern, nodes, boundaryAssertions };
}

Expand All @@ -54,10 +55,10 @@ export class Parser {
return assertions;
}

private *parseAny(): Generator<Node> {
if (this.at('?')) return yield this.parseWildcard();
if (this.at('[')) return yield this.parseOpt();
if (this.at('{')) return yield this.parseCharSet();
private parseAny(): Node {
if (this.at('?')) return this.parseWildcard();
if (this.at('[')) return this.parseOpt();
if (this.at('{')) return this.parseCharSet();

/* eslint-disable no-fallthrough */
switch (this.peek()) {
Expand All @@ -69,26 +70,10 @@ export class Parser {
this.error(
"boundary assertions are only permitted at start/end of pattern; use a backslash '\\' to escape if a literal '|' is desired",
);
case '+':
if (this.offset > 0) {
this.error(
"'+' can only be used after a character, not a wildcard/optional/char set; use a backslash '\\' to escape if a literal '+' is desired",
);
} else {
this.error(
"'+' is a special character denoting repetition; use a backslash '\\' to escape if a literal '+' is desired",
);
}
default: {
const chars: string[] = [];
while (!this.done() && !metachars.includes(this.peek())) chars.push(this.nextSkipEscape());
if (this.eat('+')) {
const last = chars.pop()!;
if (chars.length > 0) yield { kind: SyntaxKind.Literal, chars };
yield { kind: SyntaxKind.Repetition, char: last };
} else {
yield { kind: SyntaxKind.Literal, chars };
}
const chars: number[] = [];
while (!this.done() && !metachars.includes(this.peek())) chars.push(charValue(this.nextSkipEscape()));
return { kind: SyntaxKind.Literal, chars };
}
}
/* eslint-enable no-fallthrough */
Expand All @@ -112,15 +97,15 @@ export class Parser {
const openPos = this.pos();
assert.ok(this.eat('['));

const inner: Node[] = [];
const children: Node[] = [];
while (!this.done()) {
if (this.eat(']')) {
if (inner.length === 0) {
if (children.length === 0) {
this.error('empty optional expressions are not permitted', { col: this.col - 1, offset: this.offset - 1 });
}
return { kind: SyntaxKind.Optional, inner };
return { kind: SyntaxKind.Optional, children };
}
inner.push(...this.parseAny());
children.push(this.parseAny());
}
this.error('unclosed optional expression', openPos);
}
Expand All @@ -140,12 +125,12 @@ export class Parser {
}
if (needComma && !this.eat(',')) this.error('expected comma separating elements of character set');

const lo = this.next();
const lo = charValue(this.next());
let hi = lo;
if (this.eat('-')) {
// next char is upper bound
if (this.done()) this.error("expected character following '-' in character set");
hi = this.next();
hi = charValue(this.next());
}

ranges.push({ lo, hi });
Expand Down
21 changes: 9 additions & 12 deletions src/dsl/syntax/untrusted.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,15 @@ export enum SpecialPatternSyntax {
Wildcards = 1 << 0,
Optionals = 1 << 1,
CharSets = 1 << 2,
Repetitions = 1 << 3,
BoundaryAssertions = 1 << 4,
All = Optionals | Wildcards | CharSets | Repetitions | BoundaryAssertions,
BoundaryAssertions = 1 << 3,
All = Optionals | Wildcards | CharSets | BoundaryAssertions,
}

export function printSyntax(syntax: SpecialPatternSyntax) {
const items: string[] = [];
if (syntax & SpecialPatternSyntax.Wildcards) items.push('wildcards');
if (syntax & SpecialPatternSyntax.Optionals) items.push('optional expressions');
if (syntax & SpecialPatternSyntax.CharSets) items.push('character sets');
if (syntax & SpecialPatternSyntax.Repetitions) items.push('repetitions');
if (syntax & SpecialPatternSyntax.BoundaryAssertions) items.push('boundary assertions');
return items.join(', ');
}
Expand Down Expand Up @@ -59,24 +57,22 @@ export class PatternTooLongError extends Error {
}

export function detectUsedSyntax(ast: Ast) {
function visit(node: Node): SpecialPatternSyntax {
function usedSyntax(node: Node): SpecialPatternSyntax {
switch (node.kind) {
case SyntaxKind.Optional:
return node.inner.reduce((syntax, node) => syntax | visit(node), SpecialPatternSyntax.Optionals);
return node.children.reduce((syntax, node) => syntax | usedSyntax(node), SpecialPatternSyntax.Optionals);
case SyntaxKind.Wildcard:
return SpecialPatternSyntax.Wildcards;
case SyntaxKind.CharSet:
return SpecialPatternSyntax.CharSets;
case SyntaxKind.Repetition:
return SpecialPatternSyntax.Repetitions;
case SyntaxKind.Literal:
return SpecialPatternSyntax.None;
}
}

let syntax = SpecialPatternSyntax.None;
if (ast.boundaryAssertions !== BoundaryAssertion.None) syntax |= SpecialPatternSyntax.BoundaryAssertions;
return syntax | ast.nodes.reduce((syntax, node) => syntax | visit(node), SpecialPatternSyntax.None);
return syntax | ast.nodes.reduce((syntax, node) => syntax | usedSyntax(node), SpecialPatternSyntax.None);
}

export class DisallowedSyntaxError extends Error {
Expand All @@ -95,11 +91,12 @@ export class DisallowedSyntaxError extends Error {
}

export function determineOptionalNestingDepth(ast: Ast) {
function visit(node: Node): number {
if (node.kind === SyntaxKind.Optional) return node.inner.reduce((max, node) => Math.max(max, visit(node)), 0) + 1;
function nestingDepth(node: Node): number {
if (node.kind === SyntaxKind.Optional)
return node.children.reduce((max, node) => Math.max(max, nestingDepth(node)), 0) + 1;
return 0;
}
return ast.nodes.reduce((max, node) => Math.max(max, visit(node)), 0);
return ast.nodes.reduce((max, node) => Math.max(max, nestingDepth(node)), 0);
}

export class ExcessiveOptionalNestingError extends Error {
Expand Down
7 changes: 0 additions & 7 deletions src/pattern.ts

This file was deleted.

5 changes: 5 additions & 0 deletions src/util/char.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
 * Returns the Unicode code point found at the start of `c`.
 *
 * A leading surrogate pair is read as one code point, so characters outside the
 * Basic Multilingual Plane yield their full value rather than a lone UTF-16 unit.
 * Only the first character is considered; anything after it is ignored.
 *
 * @param c - The string to read from; expected to be non-empty.
 * @returns The code point at index 0.
 */
export function charValue(c: string) {
  const first = c.codePointAt(0);
  // The assertion only silences the type checker: for an empty string the
  // runtime value is still `undefined`.
  return first!;
}

/**
 * 128: the number of ASCII code points (0 through 127), i.e. one past the highest ASCII value.
 * NOTE(review): despite the name, this is presumably meant as an exclusive upper bound
 * (`value < maxAscii`) - confirm at the use sites.
 */
export const maxAscii = 128;
15 changes: 5 additions & 10 deletions test/dsl/syntax/__helpers__/ast.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import type {
Literal,
Node,
Optional,
Repetition,
Wildcard,
} from '../../../../src/dsl/syntax/ast';
import { SyntaxKind } from '../../../../src/dsl/syntax/ast';
Expand All @@ -19,26 +18,22 @@ export function ast(nodes: Node | Node[], source: string, boundaryAssertions: Bo
}

export function lit(content: string): Literal {
return { kind: SyntaxKind.Literal, chars: [...content] };
return { kind: SyntaxKind.Literal, chars: [...content].map((c) => c.codePointAt(0)!) };
}

export function wildcard(): Wildcard {
return { kind: SyntaxKind.Wildcard };
}

export function opt(inner: Node | Node[]): Optional {
return { kind: SyntaxKind.Optional, inner: Array.isArray(inner) ? inner : [inner] };
export function opt(children: Node | Node[]): Optional {
return { kind: SyntaxKind.Optional, children: Array.isArray(children) ? children : [children] };
}

export function charSet(elements: ([string, string] | string)[]): CharSet {
const ranges: CharRange[] = [];
for (const elem of elements) {
if (typeof elem === 'string') ranges.push({ lo: elem, hi: elem });
else ranges.push({ lo: elem[0], hi: elem[1] });
if (typeof elem === 'string') ranges.push({ lo: elem.codePointAt(0)!, hi: elem.codePointAt(0)! });
else ranges.push({ lo: elem[0].codePointAt(0)!, hi: elem[1].codePointAt(0)! });
}
return { kind: SyntaxKind.CharSet, ranges };
}

export function rep(char: string): Repetition {
return { kind: SyntaxKind.Repetition, char };
}
4 changes: 0 additions & 4 deletions test/dsl/syntax/escape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,4 @@ describe('patternEscape', () => {
it("should escape '|'", () => {
expect(patternEscape('|foo|bar|baz|')).toBe(r`\|foo\|bar\|baz\|`);
});

it("should escape '+'", () => {
expect(patternEscape('+abc+ d ++')).toBe(r`\+abc\+ d \+\+`);
});
});
69 changes: 5 additions & 64 deletions test/dsl/syntax/parse.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest';
import { Parser } from '../../../src/dsl/syntax/parse';
import { B, ast, charSet, lit, opt, rep, wildcard } from './__helpers__/ast';
import { B, ast, charSet, lit, opt, wildcard } from './__helpers__/ast';

const r = String.raw;

Expand Down Expand Up @@ -57,53 +57,6 @@ describe('literals', () => {
});
});

describe('repetitions', () => {
it('should parse a simple repetition', () => {
const pattern = 'f+';
const expected = ast(rep('f'), pattern, B.None);
expect(parse(pattern)).toStrictEqual(expected);
});

it('should parse repetition of an escaped metachar', () => {
const pattern = r`\]+`;
const expected = ast(rep(']'), pattern, B.None);
expect(parse(pattern)).toStrictEqual(expected);
});

it('should parse repetitions following literals', () => {
const pattern = 'abcdefg+';
const expected = ast([lit('abcdef'), rep('g')], pattern, B.None);
expect(parse(pattern)).toStrictEqual(expected);
});

describe('syntax errors', () => {
it("should reject a lone '+' at the start of a pattern", () => {
const pattern = '+bar';
expect(() => parse(pattern)).toThrow("1:0: '+'");
});

it("should reject '+' used after a wildcard", () => {
const pattern = '?+';
expect(() => parse(pattern)).toThrow("1:1: '+'");
});

it("should reject '+' used after an optional", () => {
const pattern = '[foo]+';
expect(() => parse(pattern)).toThrow("1:5: '+'");
});

it("should reject '+' used after a char set", () => {
const pattern = '{a,b}+';
expect(() => parse(pattern)).toThrow("1:5: '+'");
});

it('should reject multiple + in sequence', () => {
const pattern = 'a+++';
expect(() => parse(pattern)).toThrow("1:2: '+'");
});
});
});

describe('wildcards', () => {
it('should parse a single wildcard', () => {
const pattern = '?';
Expand Down Expand Up @@ -137,18 +90,6 @@ describe('optionals', () => {
expect(parse(pattern)).toStrictEqual(expected);
});

it('should parse an optional containing a repetition', () => {
const pattern = '[d+]';
const expected = ast(opt(rep('d')), pattern, B.None);
expect(parse(pattern)).toStrictEqual(expected);
});

it('should parse an optional containing a literal and repetition in sequence', () => {
const pattern = '[foobar+]';
const expected = ast(opt([lit('fooba'), rep('r')]), pattern, B.None);
expect(parse(pattern)).toStrictEqual(expected);
});

it('should parse an optional containing a literal and wildcard in sequence', () => {
const pattern = '[foo?bar]';
const expected = ast(opt([lit('foo'), wildcard(), lit('bar')]), pattern, B.None);
Expand Down Expand Up @@ -268,16 +209,16 @@ describe('boundary assertions', () => {
it('should support parsing multiple patterns with same Parser instance', () => {
const parser = new Parser();

const pattern0 = '|{h,i,𝌆} [[there+]] wor?d';
const pattern0 = '|{h,i,𝌆} [[there?]] wor?d';
const expected0 = ast(
[charSet(['h', 'i', '𝌆']), lit(' '), opt(opt([lit('ther'), rep('e')])), lit(' wor'), wildcard(), lit('d')],
[charSet(['h', 'i', '𝌆']), lit(' '), opt(opt([lit('ther'), opt(lit('e'))])), lit(' wor'), wildcard(), lit('d')],
pattern0,
B.Start,
);
expect(parser.parse(pattern0)).toStrictEqual(expected0);

const pattern1 = 'bar+|';
const expected1 = ast([lit('ba'), rep('r')], pattern1, B.End);
const pattern1 = 'bar?|';
const expected1 = ast([lit('ba'), opt(lit('r'))], pattern1, B.End);
expect(parser.parse(pattern1)).toStrictEqual(expected1);
});

Expand Down

0 comments on commit 3cab174

Please sign in to comment.