
Merge pull request #1630 from milachae/improve_tokenizer
Improve tokenizer performance
rien authored Oct 24, 2024
2 parents 63d9322 + 2b877e5 commit bb6841e
Showing 5 changed files with 161 additions and 56 deletions.
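The change replaces generator-based token streams (IterableIterator<Token>) with plain Token arrays throughout. That is the likely source of the speedup: each value pulled from a generator pays for a resumable function call plus an allocated { value, done } result object, while filling and scanning an array does not. A minimal benchmark sketch of that difference (illustrative only, not code from this repository):

function* generated(n: number): IterableIterator<number> {
  for (let i = 0; i < n; i++) yield i;
}

function pushed(n: number): number[] {
  const result: number[] = [];
  for (let i = 0; i < n; i++) result.push(i);
  return result;
}

// Sum the elements both ways and time each pass.
const n = 10_000_000;
let start = performance.now();
let sum = 0;
for (const x of generated(n)) sum += x;
console.log(`generator: ${(performance.now() - start).toFixed(0)} ms, sum ${sum}`);

start = performance.now();
sum = 0;
for (const x of pushed(n)) sum += x;
console.log(`array:     ${(performance.now() - start).toFixed(0)} ms, sum ${sum}`);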
23 changes: 11 additions & 12 deletions lib/src/lib/tokenizer/charTokenizer.ts
@@ -2,17 +2,16 @@ import { Region } from "@dodona/dolos-core";
 import { Token, Tokenizer } from "./tokenizer.js";
 
 export class CharTokenizer extends Tokenizer {
-  /**
-   * Runs the parser on a given string. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
-   * corresponding position.
-   *
-   * @param text The text string to parse
-   */
-  public *generateTokens(text: string): IterableIterator<Token> {
-    for (const [lineNumber, line] of text.split("\n").entries())
-      yield* line
-        .split("")
-        .map((char, col) => this.newToken(char, new Region(lineNumber, col, lineNumber, col + 1)));
+  generateTokens(text: string): Token[] {
+    const tokens: Token[] = [];
+    for (const [lineNumber, line] of text.split("\n").entries()) {
+      for (let col = 0; col < line.length; col++) {
+        tokens.push(this.newToken(line[col], new Region(lineNumber, col, lineNumber, col + 1)));
+      }
+    }
+
+    return tokens;
   }
 }
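For reference, a usage sketch of the new array-based API (construction details are elided, since the concrete Tokenizer classes may take constructor arguments; the expected regions follow from the newToken calls above):

// Each character becomes one token spanning a single column; the newline
// itself only advances the line counter and produces no token.
const tokens = charTokenizer.generateTokens("ab\nc");
// "a" -> Region(0, 0, 0, 1)
// "b" -> Region(0, 1, 0, 2)
// "c" -> Region(1, 0, 1, 1)
console.log(tokens.length); // 3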
64 changes: 30 additions & 34 deletions lib/src/lib/tokenizer/codeTokenizer.ts
@@ -1,5 +1,5 @@
 import { default as Parser, SyntaxNode } from "tree-sitter";
-import { Region, assert } from "@dodona/dolos-core";
+import { Region } from "@dodona/dolos-core";
 import { Token, Tokenizer } from "./tokenizer.js";
 import { ProgrammingLanguage } from "../language.js";
 
@@ -32,56 +32,52 @@ export class CodeTokenizer extends Tokenizer {
   }
 
   /**
-   * Runs the parser on a given string. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
+   * Runs the parser on a given string. Returns a list of Tokens
+   * containing the stringified version of the token and the
    * corresponding position.
    *
    * @param text The text string to parse
    */
-  public *generateTokens(text: string): IterableIterator<Token> {
+  public generateTokens(text: string): Token[] {
     const tree = this.parser.parse(text, undefined, { bufferSize: Math.max(32 * 1024, text.length * 2) });
-    yield* this.tokenizeNode(tree.rootNode);
+    const tokens: Token[] = [];
+    this.tokenizeNode(tree.rootNode, tokens);
+    return tokens;
   }
 
-  private *tokenizeNode(node: SyntaxNode): IterableIterator<Token> {
-    const fullSpan = new Region(
+  /**
+   * Tokenizes the given node and its child nodes. It will create a list of Tokens
+   * containing the stringified version of the token and the corresponding position.
+   *
+   * @param node The node (and child nodes) that will be tokenized.
+   * @param tokens A list of tokens that will be filled during the execution of the function
+   * @returns A tuple `(startRow, startCol)` representing the starting position
+   * of the given tokenized node.
+   */
+  private tokenizeNode(node: SyntaxNode, tokens: Token[]): [number, number] {
+    const location = new Region(
       node.startPosition.row,
       node.startPosition.column,
       node.endPosition.row,
       node.endPosition.column
     );
 
-    const location = Region.firstDiff(fullSpan, this.getChildrenRegions(node));
-    assert(location !== null, "There should be at least one diff'ed region");
-
-    yield this.newToken("(", location);
-
-    // "(node.type child1 child2 ...)"
-    yield this.newToken(node.type, location);
-
+    tokens.push(this.newToken("(", location));
+    tokens.push(this.newToken(node.type, location));
     for (const child of node.namedChildren) {
-      yield* this.tokenizeNode(child);
-    }
-    yield this.newToken(")", location);
-  }
+      const [childStartRow, childStartCol] = this.tokenizeNode(child, tokens);
 
-  private getChildrenRegions(node: SyntaxNode): Region[] {
-    const nodeToRegion = (node: SyntaxNode): Region => new Region(
-      node.startPosition.row,
-      node.startPosition.column,
-      node.endPosition.row,
-      node.endPosition.column
-    );
-
-    const getChildrenRegion =
-      (node: SyntaxNode): Region[] =>
-        node.children.reduce<Region[]>(
-          (list, child) =>
-            list.concat(getChildrenRegion(child))
-              .concat(nodeToRegion(node)),
-          []
-        );
+      // If the code is already captured in one of the children,
+      // the region of the current node can be shortened.
+      if ((childStartRow < location.endRow) || (childStartRow === location.endRow && childStartCol < location.endCol)) {
+        location.endRow = childStartRow;
+        location.endCol = childStartCol;
+      }
+    }
+    tokens.push(this.newToken(")", location));
 
-    return node.children.map(getChildrenRegion).flat();
+    // Also return the startRow and startCol, so the parent node can use them.
+    return [location.startRow, location.startCol];
   }
 }
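The rewrite also changes how a node's region is computed: instead of diffing the node's full span against every descendant region (Region.firstDiff over getChildrenRegions, rebuilt for each node), the new code mutates location in place, pulling the end back to the start of each child that begins before the current end. The "(" and node-type tokens are pushed before the children are visited, but they share the same mutable Region instance, so the clamping narrows them retroactively. A distilled sketch of just the clamping rule on plain data (Pos and clampEnd are illustrative names, not from the codebase):

interface Pos { row: number; col: number; }

// Pull the parent's end back to the child's start if the child starts earlier.
function clampEnd(parentEnd: Pos, childStart: Pos): Pos {
  const childStartsEarlier =
    childStart.row < parentEnd.row ||
    (childStart.row === parentEnd.row && childStart.col < parentEnd.col);
  return childStartsEarlier ? childStart : parentEnd;
}

// For "var test = 1;" the program node spans (0,0)-(0,13) and its first child
// starts at (0,0), so the program token's region collapses to (0,0)-(0,0),
// matching the first mapping entry asserted in the new tokenizer tests below.
console.log(clampEnd({ row: 0, col: 13 }, { row: 0, col: 0 })); // { row: 0, col: 0 }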
8 changes: 4 additions & 4 deletions lib/src/lib/tokenizer/tokenizer.ts
@@ -12,13 +12,13 @@ export abstract class Tokenizer {
 
 
   /**
-   * Runs the tokenizer on a given Buffer. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
+   * Runs the parser on a given string. Returns a list of Tokens
+   * containing the stringified version of the token and the
    * corresponding position.
    *
    * @param text The text string to parse
    */
-  public abstract generateTokens(text: string): IterableIterator<Token>;
+  public abstract generateTokens(text: string): Token[];
 
   /**
    * Returns a tokenized version of the given file.
@@ -36,7 +36,7 @@ export abstract class Tokenizer {
    * @param text The buffer to stringify
    */
   public tokenize(text: string): string {
-    return Array.of(...this.generateTokens(text)).join();
+    return this.generateTokens(text).join();
   }
 
   /**
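The behavior of tokenize() is unchanged here: both the old Array.of(...this.generateTokens(text)).join() and the new this.generateTokens(text).join() call Array.prototype.join with no argument, which separates elements with commas — easy to misread as join(""). A quick reminder of that default:

["(", "program", ")"].join();   // "(,program,)"
["(", "program", ")"].join(""); // "(program)"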
10 changes: 5 additions & 5 deletions lib/src/test/dolos.test.ts
@@ -30,8 +30,8 @@ test("equal content should be a full match", async t => {
   t.is(fragments.length, 1);
   const match = fragments[0];
 
-  t.deepEqual(new Region(2, 2, 9, 37), match.leftSelection);
-  t.deepEqual(new Region(2, 2, 9, 37), match.rightSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.leftSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.rightSelection);
 
   t.is(0, match.leftkgrams.from);
   t.is(0, match.rightkgrams.from);
@@ -64,7 +64,7 @@ test("renamed variables should be a full match", async t => {
     const wereld = () => "wereld";
     function halloWereld() {
-      console.log(halo() + " " + wereld())
+      console.log(hallo() + " " + wereld())
     }
   `;
@@ -82,8 +82,8 @@ test("renamed variables should be a full match", async t => {
   t.is(fragments.length, 1);
   const match = fragments[0];
 
-  t.deepEqual(new Region(2, 2, 9, 37), match.leftSelection);
-  t.deepEqual(new Region(2, 2, 9, 37), match.rightSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.leftSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.rightSelection);
 
   t.is(0, match.leftkgrams.from);
   t.is(0, match.rightkgrams.from);
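The expected selections tighten from end column 37 to 32, presumably because node regions are now clamped to the start of their first child (see codeTokenizer.ts above), so the tokens closing the matched fragment cover a slightly narrower span; the match itself still spans rows 2 through 9.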
112 changes: 111 additions & 1 deletion lib/src/test/tokenizer.test.ts
@@ -1,5 +1,5 @@
 import test from "ava";
-import { File } from "@dodona/dolos-core";
+import { File, Region } from "@dodona/dolos-core";
 import { LanguagePicker } from "../lib/language.js";
 import { readPath } from "../lib/reader.js";
 
@@ -24,6 +24,27 @@ const languageFiles = {
   "verilog": "../samples/verilog/module.v"
 } as {[key: string]: string};
 
+const tokenLength = {
+  "../samples/bash/caesar.sh": 1185,
+  "../samples/c/caesar.c": 582,
+  "../samples/c-sharp/Caesar.cs": 606,
+  "../samples/char/caesar.txt": 3700,
+  "../samples/cpp/caesar.cpp": 801,
+  "../samples/elm/Caesar.elm": 753,
+  "../samples/groovy/caesar.groovy": 282,
+  "../samples/java/Caesar.java": 522,
+  "../samples/javascript/sample.js": 861,
+  "../samples/python/caesar.py": 309,
+  "../samples/php/caesar.php": 411,
+  "../samples/modelica/sample.mo": 7542,
+  "../samples/r/caesar.R": 594,
+  "../samples/scala/Caesar.scala": 366,
+  "../samples/sql/sample.sql": 543,
+  "../samples/tsx/sample.tsx": 1659,
+  "../samples/typescript/caesar.ts": 378,
+  "../samples/verilog/module.v": 2484
+} as {[key: string]: number};
+
 for (const [languageName, languageFile] of Object.entries(languageFiles)) {
   test(`LanguagePicker can find ${languageName} correctly by name`, async t => {
     const language = await new LanguagePicker().findLanguage(languageName);
@@ -49,6 +70,7 @@ for (const [languageName, languageFile] of Object.entries(languageFiles)) {
     const { tokens } = tokenizer.tokenizeFile(file);
     t.truthy(tokens);
     t.snapshot(tokens, "stable tokenization");
+    t.is(tokens.length, tokenLength[languageFile]);
   });
 }

@@ -87,3 +109,91 @@ test("should be able to parse larger files", async t => {
   const { tokens } = tokenizer.tokenizeFile(file);
   t.truthy(tokens);
 });
+
+test("should be able to correctly tokenize a variable", async t => {
+  const file = new File("long.js", "var test = 1;");
+  const language = await (new LanguagePicker().findLanguage("javascript"));
+  const tokenizer = await language.createTokenizer();
+
+  const { tokens, mapping } = tokenizer.tokenizeFile(file);
+  t.is(tokens.join(""), "(program(variable_declaration(variable_declarator(identifier)(number))))");
+  t.is(mapping.length, 15);
+  t.deepEqual([
+    new Region(0, 0, 0, 0),
+    new Region(0, 0, 0, 0),
+    new Region(0, 0, 0, 4),
+    new Region(0, 0, 0, 4),
+    new Region(0, 4, 0, 4),
+    new Region(0, 4, 0, 4),
+    new Region(0, 4, 0, 8),
+    new Region(0, 4, 0, 8),
+    new Region(0, 4, 0, 8),
+    new Region(0, 11, 0, 12),
+    new Region(0, 11, 0, 12),
+    new Region(0, 11, 0, 12),
+    new Region(0, 4, 0, 4),
+    new Region(0, 0, 0, 4),
+    new Region(0, 0, 0, 0)
+  ], mapping);
+});
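A sanity check on the expected count: tokenizeNode emits exactly three tokens per named node ("(", the node type, and ")"), and this snippet parses to five named nodes — program, variable_declaration, variable_declarator, identifier, number — so 5 × 3 = 15 tokens, with one mapping entry per token:

// Three tokens per named node in the s-expression token stream:
const namedNodes = ["program", "variable_declaration", "variable_declarator", "identifier", "number"];
console.log(namedNodes.length * 3); // 15, matching mapping.length above

The same arithmetic predicts the 45 entries asserted in the loop test below (15 named nodes).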

test("should be able to correctly tokenize a loop", async t => {
const file = new File("long.js", "let i = 0;\nwhile (i < 10) {\n i += 1;\n}");
const language = await (new LanguagePicker().findLanguage("javascript"));

const tokenizer = await language.createTokenizer();
const { tokens, mapping } = tokenizer.tokenizeFile(file);
t.is(tokens.join(""), "(program(lexical_declaration(variable_declarator(identifier)(number)))" +
"(while_statement(parenthesized_expression(binary_expression(identifier)(number)))" +
"(statement_block(expression_statement(augmented_assignment_expression(identifier)(number))))))");
t.is(mapping.length, 45);
t.deepEqual( [
new Region (0,0,0,0),
new Region (0,0,0,0),
new Region (0,0,0,4),
new Region (0,0,0,4),
new Region (0,4,0,4),
new Region (0,4,0,4),
new Region (0,4,0,5),
new Region (0,4,0,5),
new Region (0,4,0,5),
new Region (0,8,0,9),
new Region (0,8,0,9),
new Region (0,8,0,9),
new Region (0,4,0,4),
new Region (0,0,0,4),
new Region (1,0,1,6),
new Region (1,0,1,6),
new Region (1,6,1,7),
new Region (1,6,1,7),
new Region (1,7,1,7),
new Region (1,7,1,7),
new Region (1,7,1,8),
new Region (1,7,1,8),
new Region (1,7,1,8),
new Region (1,11,1,13),
new Region (1,11,1,13),
new Region (1,11,1,13),
new Region (1,7,1,7),
new Region (1,6,1,7),
new Region (1,15,2,2),
new Region (1,15,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,3),
new Region (2,2,2,3),
new Region (2,2,2,3),
new Region (2,7,2,8),
new Region (2,7,2,8),
new Region (2,7,2,8),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (1,15,2,2),
new Region (1,0,1,6),
new Region (0,0,0,0),
],mapping
);
});
