From 3cab1743e8b08b6afb7b8d0eae7448c5ee8b5e77 Mon Sep 17 00:00:00 2001 From: Joe Date: Sun, 20 Aug 2023 15:57:46 -0700 Subject: [PATCH] chore: more progress on rewrite --- src/detector.ts | 13 ++++++ src/dsl/impl/bitap.ts | 0 src/dsl/impl/vm.ts | 0 src/dsl/syntax/ast.ts | 16 +++---- src/dsl/syntax/parse.ts | 47 +++++++------------- src/dsl/syntax/untrusted.ts | 21 ++++----- src/pattern.ts | 7 --- src/util/char.ts | 5 +++ test/dsl/syntax/__helpers__/ast.ts | 15 +++---- test/dsl/syntax/escape.test.ts | 4 -- test/dsl/syntax/parse.test.ts | 69 +++--------------------------- 11 files changed, 58 insertions(+), 139 deletions(-) create mode 100644 src/detector.ts delete mode 100644 src/dsl/impl/bitap.ts delete mode 100644 src/dsl/impl/vm.ts delete mode 100644 src/pattern.ts create mode 100644 src/util/char.ts diff --git a/src/detector.ts b/src/detector.ts new file mode 100644 index 0000000..8ded236 --- /dev/null +++ b/src/detector.ts @@ -0,0 +1,13 @@ +export interface Match { + startIndex: number; + endIndex: number; + meta: Readonly; +} + +export interface Pattern { + matchFirst(chars: number[]): Match; + appendAllMatches(dst: Match[], chars: number[]): void; + test(chars: number[]): boolean; +} + +export class ObscenityDetector {} diff --git a/src/dsl/impl/bitap.ts b/src/dsl/impl/bitap.ts deleted file mode 100644 index e69de29..0000000 diff --git a/src/dsl/impl/vm.ts b/src/dsl/impl/vm.ts deleted file mode 100644 index e69de29..0000000 diff --git a/src/dsl/syntax/ast.ts b/src/dsl/syntax/ast.ts index 8bbf8cb..9f3a91c 100644 --- a/src/dsl/syntax/ast.ts +++ b/src/dsl/syntax/ast.ts @@ -15,14 +15,13 @@ export enum SyntaxKind { Wildcard, Optional, CharSet, - Repetition, } -export type Node = Literal | Wildcard | Optional | CharSet | Repetition; +export type Node = Literal | Wildcard | Optional | CharSet; export interface Literal { kind: SyntaxKind.Literal; - chars: string[]; + chars: number[]; } export interface Wildcard { @@ -31,7 +30,7 @@ export interface Wildcard { export interface Optional { kind: SyntaxKind.Optional; - inner: Node[]; + children: Node[]; } export interface CharSet { @@ -43,11 +42,6 @@ export interface CharSet { * A range of characters, including both endpoints. */ export interface CharRange { - lo: string; - hi: string; -} - -export interface Repetition { - kind: SyntaxKind.Repetition; - char: string; + lo: number; + hi: number; } diff --git a/src/dsl/syntax/parse.ts b/src/dsl/syntax/parse.ts index 85f9f36..12f99ac 100644 --- a/src/dsl/syntax/parse.ts +++ b/src/dsl/syntax/parse.ts @@ -1,3 +1,4 @@ +import { charValue } from '../../util/char'; import type { Ast, CharRange, CharSet, Node, Optional, Wildcard } from './ast'; import { BoundaryAssertion, SyntaxKind } from './ast'; import * as assert from 'node:assert'; @@ -17,7 +18,7 @@ export interface Position { offset: number; } -export const metachars = ['{', '}', '[', ']', '+', '?', '|']; +export const metachars = ['{', '}', '[', ']', '?', '|']; export class Parser { private chars: string[] = []; @@ -37,7 +38,7 @@ export class Parser { this.chars = chars; const nodes: Node[] = []; - while (!this.done()) nodes.push(...this.parseAny()); + while (!this.done()) nodes.push(this.parseAny()); return { source: pattern, nodes, boundaryAssertions }; } @@ -54,10 +55,10 @@ export class Parser { return assertions; } - private *parseAny(): Generator { - if (this.at('?')) return yield this.parseWildcard(); - if (this.at('[')) return yield this.parseOpt(); - if (this.at('{')) return yield this.parseCharSet(); + private parseAny(): Node { + if (this.at('?')) return this.parseWildcard(); + if (this.at('[')) return this.parseOpt(); + if (this.at('{')) return this.parseCharSet(); /* eslint-disable no-fallthrough */ switch (this.peek()) { @@ -69,26 +70,10 @@ export class Parser { this.error( "boundary assertions are only permitted at start/end of pattern; use a backslash '\\' to escape if a literal '|' is desired", ); - case '+': - if (this.offset > 0) { - this.error( - "'+' can only be used after a character, not a wildcard/optional/char set; use a backslash '\\' to escape if a literal '+' is desired", - ); - } else { - this.error( - "'+' is a special character denoting repetition; use a backslash '\\' to escape if a literal '+' is desired", - ); - } default: { - const chars: string[] = []; - while (!this.done() && !metachars.includes(this.peek())) chars.push(this.nextSkipEscape()); - if (this.eat('+')) { - const last = chars.pop()!; - if (chars.length > 0) yield { kind: SyntaxKind.Literal, chars }; - yield { kind: SyntaxKind.Repetition, char: last }; - } else { - yield { kind: SyntaxKind.Literal, chars }; - } + const chars: number[] = []; + while (!this.done() && !metachars.includes(this.peek())) chars.push(charValue(this.nextSkipEscape())); + return { kind: SyntaxKind.Literal, chars }; } } /* eslint-enable no-fallthrough */ @@ -112,15 +97,15 @@ export class Parser { const openPos = this.pos(); assert.ok(this.eat('[')); - const inner: Node[] = []; + const children: Node[] = []; while (!this.done()) { if (this.eat(']')) { - if (inner.length === 0) { + if (children.length === 0) { this.error('empty optional expressions are not permitted', { col: this.col - 1, offset: this.offset - 1 }); } - return { kind: SyntaxKind.Optional, inner }; + return { kind: SyntaxKind.Optional, children }; } - inner.push(...this.parseAny()); + children.push(this.parseAny()); } this.error('unclosed optional expression', openPos); } @@ -140,12 +125,12 @@ export class Parser { } if (needComma && !this.eat(',')) this.error('expected comma separating elements of character set'); - const lo = this.next(); + const lo = charValue(this.next()); let hi = lo; if (this.eat('-')) { // next char is upper bound if (this.done()) this.error("expected character following '-' in character set"); - hi = this.next(); + hi = charValue(this.next()); } ranges.push({ lo, hi }); diff --git a/src/dsl/syntax/untrusted.ts b/src/dsl/syntax/untrusted.ts index 198adfd..dd501d6 100644 --- a/src/dsl/syntax/untrusted.ts +++ b/src/dsl/syntax/untrusted.ts @@ -7,9 +7,8 @@ export enum SpecialPatternSyntax { Wildcards = 1 << 0, Optionals = 1 << 1, CharSets = 1 << 2, - Repetitions = 1 << 3, - BoundaryAssertions = 1 << 4, - All = Optionals | Wildcards | CharSets | Repetitions | BoundaryAssertions, + BoundaryAssertions = 1 << 3, + All = Optionals | Wildcards | CharSets | BoundaryAssertions, } export function printSyntax(syntax: SpecialPatternSyntax) { @@ -17,7 +16,6 @@ export function printSyntax(syntax: SpecialPatternSyntax) { if (syntax & SpecialPatternSyntax.Wildcards) items.push('wildcards'); if (syntax & SpecialPatternSyntax.Optionals) items.push('optional expressions'); if (syntax & SpecialPatternSyntax.CharSets) items.push('character sets'); - if (syntax & SpecialPatternSyntax.Repetitions) items.push('repetitions'); if (syntax & SpecialPatternSyntax.BoundaryAssertions) items.push('boundary assertions'); return items.join(', '); } @@ -59,16 +57,14 @@ export class PatternTooLongError extends Error { } export function detectUsedSyntax(ast: Ast) { - function visit(node: Node): SpecialPatternSyntax { + function usedSyntax(node: Node): SpecialPatternSyntax { switch (node.kind) { case SyntaxKind.Optional: - return node.inner.reduce((syntax, node) => syntax | visit(node), SpecialPatternSyntax.Optionals); + return node.children.reduce((syntax, node) => syntax | usedSyntax(node), SpecialPatternSyntax.Optionals); case SyntaxKind.Wildcard: return SpecialPatternSyntax.Wildcards; case SyntaxKind.CharSet: return SpecialPatternSyntax.CharSets; - case SyntaxKind.Repetition: - return SpecialPatternSyntax.Repetitions; case SyntaxKind.Literal: return SpecialPatternSyntax.None; } @@ -76,7 +72,7 @@ export function detectUsedSyntax(ast: Ast) { let syntax = SpecialPatternSyntax.None; if (ast.boundaryAssertions !== BoundaryAssertion.None) syntax |= SpecialPatternSyntax.BoundaryAssertions; - return syntax | ast.nodes.reduce((syntax, node) => syntax | visit(node), SpecialPatternSyntax.None); + return syntax | ast.nodes.reduce((syntax, node) => syntax | usedSyntax(node), SpecialPatternSyntax.None); } export class DisallowedSyntaxError extends Error { @@ -95,11 +91,12 @@ export class DisallowedSyntaxError extends Error { } export function determineOptionalNestingDepth(ast: Ast) { - function visit(node: Node): number { - if (node.kind === SyntaxKind.Optional) return node.inner.reduce((max, node) => Math.max(max, visit(node)), 0) + 1; + function nestingDepth(node: Node): number { + if (node.kind === SyntaxKind.Optional) + return node.children.reduce((max, node) => Math.max(max, nestingDepth(node)), 0) + 1; return 0; } - return ast.nodes.reduce((max, node) => Math.max(max, visit(node)), 0); + return ast.nodes.reduce((max, node) => Math.max(max, nestingDepth(node)), 0); } export class ExcessiveOptionalNestingError extends Error { diff --git a/src/pattern.ts b/src/pattern.ts deleted file mode 100644 index 780a1ac..0000000 --- a/src/pattern.ts +++ /dev/null @@ -1,7 +0,0 @@ -type TODO = unknown; - -export interface Pattern { - matchFirst(text: string): TODO; - matchAll(text: string): TODO[]; - test(text: string): boolean; -} diff --git a/src/util/char.ts b/src/util/char.ts new file mode 100644 index 0000000..453327e --- /dev/null +++ b/src/util/char.ts @@ -0,0 +1,5 @@ +export function charValue(c: string) { + return c.codePointAt(0)!; +} + +export const maxAscii = 128; diff --git a/test/dsl/syntax/__helpers__/ast.ts b/test/dsl/syntax/__helpers__/ast.ts index 0617fba..6dd98a1 100644 --- a/test/dsl/syntax/__helpers__/ast.ts +++ b/test/dsl/syntax/__helpers__/ast.ts @@ -7,7 +7,6 @@ import type { Literal, Node, Optional, - Repetition, Wildcard, } from '../../../../src/dsl/syntax/ast'; import { SyntaxKind } from '../../../../src/dsl/syntax/ast'; @@ -19,26 +18,22 @@ export function ast(nodes: Node | Node[], source: string, boundaryAssertions: Bo } export function lit(content: string): Literal { - return { kind: SyntaxKind.Literal, chars: [...content] }; + return { kind: SyntaxKind.Literal, chars: [...content].map((c) => c.codePointAt(0)!) }; } export function wildcard(): Wildcard { return { kind: SyntaxKind.Wildcard }; } -export function opt(inner: Node | Node[]): Optional { - return { kind: SyntaxKind.Optional, inner: Array.isArray(inner) ? inner : [inner] }; +export function opt(children: Node | Node[]): Optional { + return { kind: SyntaxKind.Optional, children: Array.isArray(children) ? children : [children] }; } export function charSet(elements: ([string, string] | string)[]): CharSet { const ranges: CharRange[] = []; for (const elem of elements) { - if (typeof elem === 'string') ranges.push({ lo: elem, hi: elem }); - else ranges.push({ lo: elem[0], hi: elem[1] }); + if (typeof elem === 'string') ranges.push({ lo: elem.codePointAt(0)!, hi: elem.codePointAt(0)! }); + else ranges.push({ lo: elem[0].codePointAt(0)!, hi: elem[1].codePointAt(0)! }); } return { kind: SyntaxKind.CharSet, ranges }; } - -export function rep(char: string): Repetition { - return { kind: SyntaxKind.Repetition, char }; -} diff --git a/test/dsl/syntax/escape.test.ts b/test/dsl/syntax/escape.test.ts index 7e77b62..01ebf38 100644 --- a/test/dsl/syntax/escape.test.ts +++ b/test/dsl/syntax/escape.test.ts @@ -28,8 +28,4 @@ describe('patternEscape', () => { it("should escape '|'", () => { expect(patternEscape('|foo|bar|baz|')).toBe(r`\|foo\|bar\|baz\|`); }); - - it("should escape '+'", () => { - expect(patternEscape('+abc+ d ++')).toBe(r`\+abc\+ d \+\+`); - }); }); diff --git a/test/dsl/syntax/parse.test.ts b/test/dsl/syntax/parse.test.ts index 4bfa612..e876746 100644 --- a/test/dsl/syntax/parse.test.ts +++ b/test/dsl/syntax/parse.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; import { Parser } from '../../../src/dsl/syntax/parse'; -import { B, ast, charSet, lit, opt, rep, wildcard } from './__helpers__/ast'; +import { B, ast, charSet, lit, opt, wildcard } from './__helpers__/ast'; const r = String.raw; @@ -57,53 +57,6 @@ describe('literals', () => { }); }); -describe('repetitions', () => { - it('should parse a simple repetition', () => { - const pattern = 'f+'; - const expected = ast(rep('f'), pattern, B.None); - expect(parse(pattern)).toStrictEqual(expected); - }); - - it('should parse repetition of an escaped metachar', () => { - const pattern = r`\]+`; - const expected = ast(rep(']'), pattern, B.None); - expect(parse(pattern)).toStrictEqual(expected); - }); - - it('should parse repetitions following literals', () => { - const pattern = 'abcdefg+'; - const expected = ast([lit('abcdef'), rep('g')], pattern, B.None); - expect(parse(pattern)).toStrictEqual(expected); - }); - - describe('syntax errors', () => { - it("should reject a lone '+' at the start of a pattern", () => { - const pattern = '+bar'; - expect(() => parse(pattern)).toThrow("1:0: '+'"); - }); - - it("should reject '+' used after a wildcard", () => { - const pattern = '?+'; - expect(() => parse(pattern)).toThrow("1:1: '+'"); - }); - - it("should reject '+' used after an optional", () => { - const pattern = '[foo]+'; - expect(() => parse(pattern)).toThrow("1:5: '+'"); - }); - - it("should reject '+' used after a char set", () => { - const pattern = '{a,b}+'; - expect(() => parse(pattern)).toThrow("1:5: '+'"); - }); - - it('should reject multiple + in sequence', () => { - const pattern = 'a+++'; - expect(() => parse(pattern)).toThrow("1:2: '+'"); - }); - }); -}); - describe('wildcards', () => { it('should parse a single wildcard', () => { const pattern = '?'; @@ -137,18 +90,6 @@ describe('optionals', () => { expect(parse(pattern)).toStrictEqual(expected); }); - it('should parse an optional containing a repetition', () => { - const pattern = '[d+]'; - const expected = ast(opt(rep('d')), pattern, B.None); - expect(parse(pattern)).toStrictEqual(expected); - }); - - it('should parse an optional containing a literal and repetition in sequence', () => { - const pattern = '[foobar+]'; - const expected = ast(opt([lit('fooba'), rep('r')]), pattern, B.None); - expect(parse(pattern)).toStrictEqual(expected); - }); - it('should parse an optional containing a literal and wildcard in sequence', () => { const pattern = '[foo?bar]'; const expected = ast(opt([lit('foo'), wildcard(), lit('bar')]), pattern, B.None); @@ -268,16 +209,16 @@ describe('boundary assertions', () => { it('should support parsing multiple patterns with same Parser instance', () => { const parser = new Parser(); - const pattern0 = '|{h,i,𝌆} [[there+]] wor?d'; + const pattern0 = '|{h,i,𝌆} [[there?]] wor?d'; const expected0 = ast( - [charSet(['h', 'i', '𝌆']), lit(' '), opt(opt([lit('ther'), rep('e')])), lit(' wor'), wildcard(), lit('d')], + [charSet(['h', 'i', '𝌆']), lit(' '), opt(opt([lit('ther'), opt(lit('e'))])), lit(' wor'), wildcard(), lit('d')], pattern0, B.Start, ); expect(parser.parse(pattern0)).toStrictEqual(expected0); - const pattern1 = 'bar+|'; - const expected1 = ast([lit('ba'), rep('r')], pattern1, B.End); + const pattern1 = 'bar?|'; + const expected1 = ast([lit('ba'), opt(lit('r'))], pattern1, B.End); expect(parser.parse(pattern1)).toStrictEqual(expected1); });