
Merge pull request #1630 from milachae/improve_tokenizer
Improve tokenizer performance
rien authored Oct 24, 2024
2 parents 63d9322 + 2b877e5 commit bb6841e
Showing 5 changed files with 161 additions and 56 deletions.
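The change replaces generator-based token streams (IterableIterator<Token>) with plain Token arrays throughout. That is the likely source of the speedup: each value pulled from a generator pays for a resumable function call plus an allocated { value, done } result object, while filling and scanning an array does not. A minimal benchmark sketch of that difference (illustrative only, not code from this repository):

function* generated(n: number): IterableIterator<number> {
  for (let i = 0; i < n; i++) yield i;
}

function pushed(n: number): number[] {
  const result: number[] = [];
  for (let i = 0; i < n; i++) result.push(i);
  return result;
}

// Sum the elements both ways and time each pass.
const n = 10_000_000;
let start = performance.now();
let sum = 0;
for (const x of generated(n)) sum += x;
console.log(`generator: ${(performance.now() - start).toFixed(0)} ms, sum ${sum}`);

start = performance.now();
sum = 0;
for (const x of pushed(n)) sum += x;
console.log(`array:     ${(performance.now() - start).toFixed(0)} ms, sum ${sum}`);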
23 changes: 11 additions & 12 deletions lib/src/lib/tokenizer/charTokenizer.ts
@@ -2,17 +2,16 @@ import { Region } from "@dodona/dolos-core";
 import { Token, Tokenizer } from "./tokenizer.js";
 
 export class CharTokenizer extends Tokenizer {
-  /**
-   * Runs the parser on a given string. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
-   * corresponding position.
-   *
-   * @param text The text string to parse
-   */
-  public *generateTokens(text: string): IterableIterator<Token> {
-    for (const [lineNumber, line] of text.split("\n").entries())
-      yield* line
-        .split("")
-        .map((char, col) => this.newToken(char, new Region(lineNumber, col, lineNumber, col + 1)));
+  generateTokens(text: string): Token[] {
+    const tokens: Token[] = [];
+    for (const [lineNumber, line] of text.split("\n").entries()) {
+      for (let col = 0; col < line.length; col++) {
+        tokens.push(this.newToken(line[col], new Region(lineNumber, col, lineNumber, col + 1)));
+      }
+    }
+
+    return tokens;
   }
 }
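For reference, a usage sketch of the new array-based API (construction details are elided, since the concrete Tokenizer classes may take constructor arguments; the expected regions follow from the newToken calls above):

// Each character becomes one token spanning a single column; the newline
// itself only advances the line counter and produces no token.
const tokens = charTokenizer.generateTokens("ab\nc");
// "a" -> Region(0, 0, 0, 1)
// "b" -> Region(0, 1, 0, 2)
// "c" -> Region(1, 0, 1, 1)
console.log(tokens.length); // 3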
64 changes: 30 additions & 34 deletions lib/src/lib/tokenizer/codeTokenizer.ts
@@ -1,5 +1,5 @@
 import { default as Parser, SyntaxNode } from "tree-sitter";
-import { Region, assert } from "@dodona/dolos-core";
+import { Region } from "@dodona/dolos-core";
 import { Token, Tokenizer } from "./tokenizer.js";
 import { ProgrammingLanguage } from "../language.js";
 
@@ -32,56 +32,52 @@ export class CodeTokenizer extends Tokenizer {
   }
 
   /**
-   * Runs the parser on a given string. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
+   * Runs the parser on a given string. Returns a list of Tokens
+   * containing the stringified version of the token and the
    * corresponding position.
    *
    * @param text The text string to parse
    */
-  public *generateTokens(text: string): IterableIterator<Token> {
+  public generateTokens(text: string): Token[] {
     const tree = this.parser.parse(text, undefined, { bufferSize: Math.max(32 * 1024, text.length * 2) });
-    yield* this.tokenizeNode(tree.rootNode);
+    const tokens: Token[] = [];
+    this.tokenizeNode(tree.rootNode, tokens);
+    return tokens;
   }
 
-  private *tokenizeNode(node: SyntaxNode): IterableIterator<Token> {
-    const fullSpan = new Region(
+  /**
+   * Tokenizes the given node and its child nodes. It will create a list of Tokens
+   * containing the stringified version of the token and the corresponding position.
+   *
+   * @param node The node (and child nodes) that will be tokenized.
+   * @param tokens A list of tokens that will be filled during the execution of the function
+   * @returns A tuple `(startRow, startCol)` representing the starting position
+   * of the given tokenized node.
+   */
+  private tokenizeNode(node: SyntaxNode, tokens: Token[]): [number, number] {
+    const location = new Region(
       node.startPosition.row,
       node.startPosition.column,
       node.endPosition.row,
       node.endPosition.column
     );
 
-    const location = Region.firstDiff(fullSpan, this.getChildrenRegions(node));
-    assert(location !== null, "There should be at least one diff'ed region");
-
-    yield this.newToken("(", location);
-
-    // "(node.type child1 child2 ...)"
-    yield this.newToken(node.type, location);
-
+    tokens.push(this.newToken("(", location));
+    tokens.push(this.newToken(node.type, location));
     for (const child of node.namedChildren) {
-      yield* this.tokenizeNode(child);
-    }
-    yield this.newToken(")", location);
-  }
+      const [childStartRow, childStartCol] = this.tokenizeNode(child, tokens);
 
-  private getChildrenRegions(node: SyntaxNode): Region[] {
-    const nodeToRegion = (node: SyntaxNode): Region => new Region(
-      node.startPosition.row,
-      node.startPosition.column,
-      node.endPosition.row,
-      node.endPosition.column
-    );
-
-    const getChildrenRegion =
-      (node: SyntaxNode): Region[] =>
-        node.children.reduce<Region[]>(
-          (list, child) =>
-            list.concat(getChildrenRegion(child))
-              .concat(nodeToRegion(node)),
-          []
-        );
+      // If the code is already captured in one of the children,
+      // the region of the current node can be shortened.
+      if ((childStartRow < location.endRow) || (childStartRow === location.endRow && childStartCol < location.endCol)) {
+        location.endRow = childStartRow;
+        location.endCol = childStartCol;
+      }
+    }
+    tokens.push(this.newToken(")", location));
 
-    return node.children.map(getChildrenRegion).flat();
+    // Also return the startRow and startCol, so the parent node can use them.
+    return [location.startRow, location.startCol];
   }
 }
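The rewrite also changes how a node's region is computed: instead of diffing the node's full span against every descendant region (Region.firstDiff over getChildrenRegions, rebuilt for each node), the new code mutates location in place, pulling the end back to the start of each child that begins before the current end. The "(" and node-type tokens are pushed before the children are visited, but they share the same mutable Region instance, so the clamping narrows them retroactively. A distilled sketch of just the clamping rule on plain data (Pos and clampEnd are illustrative names, not from the codebase):

interface Pos { row: number; col: number; }

// Pull the parent's end back to the child's start if the child starts earlier.
function clampEnd(parentEnd: Pos, childStart: Pos): Pos {
  const childStartsEarlier =
    childStart.row < parentEnd.row ||
    (childStart.row === parentEnd.row && childStart.col < parentEnd.col);
  return childStartsEarlier ? childStart : parentEnd;
}

// For "var test = 1;" the program node spans (0,0)-(0,13) and its first child
// starts at (0,0), so the program token's region collapses to (0,0)-(0,0),
// matching the first mapping entry asserted in the new tokenizer tests below.
console.log(clampEnd({ row: 0, col: 13 }, { row: 0, col: 0 })); // { row: 0, col: 0 }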
8 changes: 4 additions & 4 deletions lib/src/lib/tokenizer/tokenizer.ts
@@ -12,13 +12,13 @@ export abstract class Tokenizer {
 
 
   /**
-   * Runs the tokenizer on a given Buffer. Returns an async iterator returning
-   * tuples containing the stringified version of the token and the
+   * Runs the parser on a given string. Returns a list of Tokens
+   * containing the stringified version of the token and the
    * corresponding position.
    *
    * @param text The text string to parse
    */
-  public abstract generateTokens(text: string): IterableIterator<Token>;
+  public abstract generateTokens(text: string): Token[];
 
   /**
    * Returns a tokenized version of the given file.
@@ -36,7 +36,7 @@ export abstract class Tokenizer {
    * @param text The buffer to stringify
    */
   public tokenize(text: string): string {
-    return Array.of(...this.generateTokens(text)).join();
+    return this.generateTokens(text).join();
   }
 
   /**
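The behavior of tokenize() is unchanged here: both the old Array.of(...this.generateTokens(text)).join() and the new this.generateTokens(text).join() call Array.prototype.join with no argument, which separates elements with commas — easy to misread as join(""). A quick reminder of that default:

["(", "program", ")"].join();   // "(,program,)"
["(", "program", ")"].join(""); // "(program)"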
10 changes: 5 additions & 5 deletions lib/src/test/dolos.test.ts
@@ -30,8 +30,8 @@ test("equal content should be a full match", async t => {
   t.is(fragments.length, 1);
   const match = fragments[0];
 
-  t.deepEqual(new Region(2, 2, 9, 37), match.leftSelection);
-  t.deepEqual(new Region(2, 2, 9, 37), match.rightSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.leftSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.rightSelection);
 
   t.is(0, match.leftkgrams.from);
   t.is(0, match.rightkgrams.from);
@@ -64,7 +64,7 @@ test("renamed variables should be a full match", async t => {
     const wereld = () => "wereld";
     function halloWereld() {
-      console.log(halo() + " " + wereld())
+      console.log(hallo() + " " + wereld())
     }
   `;
@@ -82,8 +82,8 @@ test("renamed variables should be a full match", async t => {
   t.is(fragments.length, 1);
   const match = fragments[0];
 
-  t.deepEqual(new Region(2, 2, 9, 37), match.leftSelection);
-  t.deepEqual(new Region(2, 2, 9, 37), match.rightSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.leftSelection);
+  t.deepEqual(new Region(2, 2, 9, 32), match.rightSelection);
 
   t.is(0, match.leftkgrams.from);
   t.is(0, match.rightkgrams.from);
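The expected selections tighten from end column 37 to 32, presumably because node regions are now clamped to the start of their first child (see codeTokenizer.ts above), so the tokens closing the matched fragment cover a slightly narrower span; the match itself still spans rows 2 through 9.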
112 changes: 111 additions & 1 deletion lib/src/test/tokenizer.test.ts
@@ -1,5 +1,5 @@
 import test from "ava";
-import { File } from "@dodona/dolos-core";
+import { File, Region } from "@dodona/dolos-core";
 import { LanguagePicker } from "../lib/language.js";
 import { readPath } from "../lib/reader.js";
 
@@ -24,6 +24,27 @@ const languageFiles = {
   "verilog": "../samples/verilog/module.v"
 } as {[key: string]: string};
 
+const tokenLength = {
+  "../samples/bash/caesar.sh": 1185,
+  "../samples/c/caesar.c": 582,
+  "../samples/c-sharp/Caesar.cs": 606,
+  "../samples/char/caesar.txt": 3700,
+  "../samples/cpp/caesar.cpp": 801,
+  "../samples/elm/Caesar.elm": 753,
+  "../samples/groovy/caesar.groovy": 282,
+  "../samples/java/Caesar.java": 522,
+  "../samples/javascript/sample.js": 861,
+  "../samples/python/caesar.py": 309,
+  "../samples/php/caesar.php": 411,
+  "../samples/modelica/sample.mo": 7542,
+  "../samples/r/caesar.R": 594,
+  "../samples/scala/Caesar.scala": 366,
+  "../samples/sql/sample.sql": 543,
+  "../samples/tsx/sample.tsx": 1659,
+  "../samples/typescript/caesar.ts": 378,
+  "../samples/verilog/module.v": 2484
+} as {[key: string]: number};
+
 for (const [languageName, languageFile] of Object.entries(languageFiles)) {
   test(`LanguagePicker can find ${languageName} correctly by name`, async t => {
     const language = await new LanguagePicker().findLanguage(languageName);
@@ -49,6 +70,7 @@ for (const [languageName, languageFile] of Object.entries(languageFiles)) {
     const { tokens } = tokenizer.tokenizeFile(file);
     t.truthy(tokens);
     t.snapshot(tokens, "stable tokenization");
+    t.is(tokens.length, tokenLength[languageFile]);
   });
 }

@@ -87,3 +109,91 @@ test("should be able to parse larger files", async t => {
   const { tokens } = tokenizer.tokenizeFile(file);
   t.truthy(tokens);
 });
+
+test("should be able to correctly tokenize a variable", async t => {
+  const file = new File("long.js", "var test = 1;");
+  const language = await (new LanguagePicker().findLanguage("javascript"));
+  const tokenizer = await language.createTokenizer();
+
+  const { tokens, mapping } = tokenizer.tokenizeFile(file);
+  t.is(tokens.join(""), "(program(variable_declaration(variable_declarator(identifier)(number))))");
+  t.is(mapping.length, 15);
+  t.deepEqual([
+    new Region(0, 0, 0, 0),
+    new Region(0, 0, 0, 0),
+    new Region(0, 0, 0, 4),
+    new Region(0, 0, 0, 4),
+    new Region(0, 4, 0, 4),
+    new Region(0, 4, 0, 4),
+    new Region(0, 4, 0, 8),
+    new Region(0, 4, 0, 8),
+    new Region(0, 4, 0, 8),
+    new Region(0, 11, 0, 12),
+    new Region(0, 11, 0, 12),
+    new Region(0, 11, 0, 12),
+    new Region(0, 4, 0, 4),
+    new Region(0, 0, 0, 4),
+    new Region(0, 0, 0, 0)
+  ], mapping);
+});
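A sanity check on the expected count: tokenizeNode emits exactly three tokens per named node ("(", the node type, and ")"), and this snippet parses to five named nodes — program, variable_declaration, variable_declarator, identifier, number — so 5 × 3 = 15 tokens, with one mapping entry per token:

// Three tokens per named node in the s-expression token stream:
const namedNodes = ["program", "variable_declaration", "variable_declarator", "identifier", "number"];
console.log(namedNodes.length * 3); // 15, matching mapping.length above

The same arithmetic predicts the 45 entries asserted in the loop test below (15 named nodes).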

test("should be able to correctly tokenize a loop", async t => {
const file = new File("long.js", "let i = 0;\nwhile (i < 10) {\n i += 1;\n}");
const language = await (new LanguagePicker().findLanguage("javascript"));

const tokenizer = await language.createTokenizer();
const { tokens, mapping } = tokenizer.tokenizeFile(file);
t.is(tokens.join(""), "(program(lexical_declaration(variable_declarator(identifier)(number)))" +
"(while_statement(parenthesized_expression(binary_expression(identifier)(number)))" +
"(statement_block(expression_statement(augmented_assignment_expression(identifier)(number))))))");
t.is(mapping.length, 45);
t.deepEqual( [
new Region (0,0,0,0),
new Region (0,0,0,0),
new Region (0,0,0,4),
new Region (0,0,0,4),
new Region (0,4,0,4),
new Region (0,4,0,4),
new Region (0,4,0,5),
new Region (0,4,0,5),
new Region (0,4,0,5),
new Region (0,8,0,9),
new Region (0,8,0,9),
new Region (0,8,0,9),
new Region (0,4,0,4),
new Region (0,0,0,4),
new Region (1,0,1,6),
new Region (1,0,1,6),
new Region (1,6,1,7),
new Region (1,6,1,7),
new Region (1,7,1,7),
new Region (1,7,1,7),
new Region (1,7,1,8),
new Region (1,7,1,8),
new Region (1,7,1,8),
new Region (1,11,1,13),
new Region (1,11,1,13),
new Region (1,11,1,13),
new Region (1,7,1,7),
new Region (1,6,1,7),
new Region (1,15,2,2),
new Region (1,15,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (2,2,2,3),
new Region (2,2,2,3),
new Region (2,2,2,3),
new Region (2,7,2,8),
new Region (2,7,2,8),
new Region (2,7,2,8),
new Region (2,2,2,2),
new Region (2,2,2,2),
new Region (1,15,2,2),
new Region (1,0,1,6),
new Region (0,0,0,0),
],mapping
);
});
