Re-engineer index to match both partial and multiple words (#30)

* Initial work in progress
* Re-engineer index to match both partial and multiple words
* Remove @liquicode/lib-tokenize dependency
hadynz · Feb 26, 2022 · ff6a882 · ff6a882
1 parent c6184be
commit ff6a882
Show file tree

Hide file tree

Showing 14 changed files with 356 additions and 64 deletions.
diff --git a/manifest.json b/manifest.json
@@ -2,7 +2,7 @@
   "id": "obsidian-sidekick",
   "name": "Sidekick",
   "description": "A companion to identify hidden connections that match your tags and pages",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "minAppVersion": "0.13.8",
   "author": "Hady Osman",
   "authorUrl": "https://hady.geek.nz",

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "obsidian-sidekick",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "description": "A companion to identify hidden connections that match your tags and pages",
   "main": "src/index.ts",
   "repository": {

diff --git a/src/cmExtension/suggestionsExtension.ts b/src/cmExtension/suggestionsExtension.ts
@@ -16,11 +16,11 @@ import './suggestionsExtension.css';
 
 const SuggestionCandidateClass = 'cm-suggestion-candidate';
 
-const underlineDecoration = (start: number, end: number, keyword: string) =>
+const underlineDecoration = (start: number, end: number, indexKeyword: string) =>
   Decoration.mark({
     class: SuggestionCandidateClass,
     attributes: {
-      'data-keyword': keyword,
+      'data-index-keyword': indexKeyword,
       'data-position-start': `${start}`,
       'data-position-end': `${end}`,
     },
@@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
             const end = from + result.end;
 
             // Add the decoration
-            builder.add(start, end, underlineDecoration(start, end, result.keyword));
+            builder.add(start, end, underlineDecoration(start, end, result.indexKeyword));
           }
         }
 
@@ -89,13 +89,13 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
           }
 
           // Extract position and replacement text from target element data attributes state
-          const { positionStart, positionEnd, keyword } = target.dataset;
+          const { positionStart, positionEnd, indexKeyword } = target.dataset;
 
           // Show suggestions modal
           showSuggestionsModal({
             app,
             mouseEvent: e,
-            suggestions: search.getReplacementSuggestions(keyword),
+            suggestions: search.getReplacementSuggestions(indexKeyword),
             onClick: (replaceText) => {
               view.dispatch({
                 changes: {

diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts
@@ -1,14 +1,17 @@
+import _ from 'lodash';
 import lokijs from 'lokijs';
 import { TypedEmitter } from 'tiny-typed-emitter';
 import type { TFile } from 'obsidian';
 
-import { tokenize } from './utils';
+import { stemPhrase } from '../stemmers';
+import { WordPermutationsTokenizer } from '../tokenizers';
 import type { PluginHelper } from '../plugin-helper';
 
 type Document = {
   fileCreationTime: number;
   type: 'tag' | 'alias' | 'page' | 'page-token';
   keyword: string;
+  originalText: string;
   replaceText: string;
 };
 
@@ -19,6 +22,7 @@ interface IndexerEvents {
 
 export class Indexer extends TypedEmitter<IndexerEvents> {
   private documents: Collection<Document>;
+  private permutationTokenizer: WordPermutationsTokenizer;
 
   constructor(private pluginHelper: PluginHelper) {
     super();
@@ -28,15 +32,19 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents = db.addCollection<Document>('documents', {
       indices: ['fileCreationTime', 'keyword'],
     });
+
+    this.permutationTokenizer = new WordPermutationsTokenizer();
   }
 
   public getKeywords(): string[] {
     // Exclude any keywords associated with active file as we don't want recursive highlighting
     const exclusionFile = this.pluginHelper.activeFile;
 
-    return this.documents
+    const keywords = this.documents
       .where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime)
       .map((doc) => doc.keyword);
+
+    return _.uniq(keywords);
   }
 
   public getDocumentsByKeyword(keyword: string): Document[] {
@@ -62,15 +70,17 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents.insert({
       fileCreationTime: file.stat.ctime,
       type: 'page',
-      keyword: file.basename.toLowerCase(),
+      keyword: stemPhrase(file.basename),
+      originalText: file.basename,
       replaceText: `[[${file.basename}]]`,
     });
 
-    tokenize(file.basename).forEach((token) => {
+    this.permutationTokenizer.tokenize(file.basename).forEach((token) => {
       this.documents.insert({
         fileCreationTime: file.stat.ctime,
         type: 'page-token',
         keyword: token,
+        originalText: file.basename,
         replaceText: `[[${file.basename}]]`,
       });
     });
@@ -80,6 +90,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'alias',
         keyword: alias.toLowerCase(),
+        originalText: file.basename,
         replaceText: `[[${file.basename}|${alias}]]`,
       });
     });
@@ -89,6 +100,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'tag',
         keyword: tag.replace(/#/, '').toLowerCase(),
+        originalText: tag,
         replaceText: tag,
       });
     });

diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts
diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts
diff --git a/src/search/index.ts b/src/search/index.ts
@@ -1,17 +1,22 @@
 import _ from 'lodash';
-import { Trie, Emit } from '@tanishiking/aho-corasick';
+import { Trie } from '@tanishiking/aho-corasick';
 
-import { redactText } from './search.utils';
 import type { Indexer } from '../indexing/indexer';
+import { redactText } from './redactText';
+import { mapStemToOriginalText } from './mapStemToOriginalText';
+import { WordPunctStemTokenizer } from '../tokenizers';
 
-type SearchResult = {
+const tokenizer = new WordPunctStemTokenizer();
+
+export type SearchResult = {
   start: number;
   end: number;
-  keyword: string;
+  indexKeyword: string;
+  originalKeyword: string;
 };
 
-const isEqual = (a: Emit, b: Emit) => {
-  return a.start === b.start && a.keyword === b.keyword;
+const isEqual = (a: SearchResult, b: SearchResult) => {
+  return a.start === b.start && a.indexKeyword === b.indexKeyword;
 };
 
 export default class Search {
@@ -36,20 +41,20 @@ export default class Search {
   public find(text: string): SearchResult[] {
     const redactedText = redactText(text); // Redact text that we don't want to be searched
 
-    const results = this.trie.parseText(redactedText);
+    // Stem the text
+    const tokens = tokenizer.tokenize(redactedText);
+    const stemmedText = tokens.map((t) => t.stem).join('');
 
-    return this.mapToSearchResults(results);
-  }
+    // Search stemmed text
+    const emits = this.trie.parseText(stemmedText);
 
-  private mapToSearchResults(results: Emit[]): SearchResult[] {
-    return _.uniqWith(results, isEqual)
-      .filter((result) => this.keywordExistsInIndex(result.keyword))
-      .map((result) => ({
-        start: result.start,
-        end: result.end + 1,
-        keyword: result.keyword,
-      }))
-      .sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting
+    // Map stemmed results to original text
+    return _.chain(emits)
+      .map((emit) => mapStemToOriginalText(emit, tokens))
+      .uniqWith(isEqual)
+      .filter((result) => this.keywordExistsInIndex(result.indexKeyword))
+      .sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting
+      .value();
   }
 
   private keywordExistsInIndex(index: string): boolean {

diff --git a/src/search/mapStemToOriginalText.ts b/src/search/mapStemToOriginalText.ts
@@ -0,0 +1,28 @@
+import { Emit } from '@tanishiking/aho-corasick';
+
+import { SearchResult } from '../search/index';
+import { Token } from '../tokenizers';
+
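+/**
+ * Takes a raw trie match (whose start/end positions refer to the stemmed text)
+ * and maps it to the start/end positions of the original text from which the
+ * stems were created, using the tokens that lie fully inside the matched span
+ * @param searchResult match emitted by the trie for the stemmed text
+ * @param tokens tokens of the searched text, carrying both stem and original offsets
+ * @returns the match expressed in original-text positions, with both keyword forms
+ */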
+export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => {
+  const matchingTokens = tokens.filter(
+    (token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1
+  );
+
+  return {
+    start: matchingTokens[0].originalStart,
+    end: matchingTokens[matchingTokens.length - 1].originalEnd,
+    indexKeyword: matchingTokens
+      .map((token) => token.stem)
+      .join('')
+      .toLowerCase(),
+    originalKeyword: matchingTokens.map((token) => token.originalText).join(''),
+  };
+};
diff --git a/src/search/search.utils.spec.ts → src/search/redactText.spec.ts b/src/search/search.utils.spec.ts → src/search/redactText.spec.ts
@@ -1,4 +1,4 @@
-import { redactText } from './search.utils';
+import { redactText } from './redactText';
 
 describe('redactText', () => {
   it('Hashtags are redacted', () => {

diff --git a/src/search/search.utils.ts → src/search/redactText.ts b/src/search/search.utils.ts → src/search/redactText.ts
diff --git a/src/search/search.spec.ts b/src/search/search.spec.ts
@@ -0,0 +1,81 @@
+import { Indexer } from '../indexing/indexer';
+import Search from './index';
+
+const getKeywordsMockFn = jest.fn();
+
+jest.mock('../indexing/indexer', () => {
+  return {
+    Indexer: jest.fn().mockImplementation(() => {
+      return {
+        getKeywords: getKeywordsMockFn,
+        getDocumentsByKeyword: () => [{}],
+      };
+    }),
+  };
+});
+
+beforeEach(() => {
+  jest.clearAllMocks();
+});
+
+describe('Search class', () => {
+  it('Highlights single keywords that can be stemmed', () => {
+    getKeywordsMockFn.mockReturnValue(['search', 'note']);
+    const text = 'This is a note that I will be use for searching';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 10,
+        end: 14,
+        indexKeyword: 'note',
+        originalKeyword: 'note',
+      },
+      {
+        start: 38,
+        end: 47,
+        indexKeyword: 'search',
+        originalKeyword: 'searching',
+      },
+    ]);
+  });
+
+  it('Longer keyword matches are always prioritised for highlight', () => {
+    getKeywordsMockFn.mockReturnValue(['github', 'github fork']);
+    const text = 'I use GitHub Forks as part of my development flow';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 6,
+        end: 18,
+        indexKeyword: 'github fork',
+        originalKeyword: 'GitHub Forks',
+      },
+    ]);
+  });
+
+  it('Three word keyword is highlighted', () => {
+    getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']);
+    const text = 'Designing a shared client record is a great idea but challenging';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 12,
+        end: 32,
+        indexKeyword: 'share client record',
+        originalKeyword: 'shared client record',
+      },
+    ]);
+  });
+});
diff --git a/src/stemmers/index.ts b/src/stemmers/index.ts
@@ -0,0 +1,21 @@
+import { PorterStemmer } from 'natural';
+
+import { WordPunctStemTokenizer } from '../tokenizers';
+
+/**
+ * Stem a given phrase. If the phrase is made up of multiple words,
+ * the last word in the phrase is the only one that will be stemmed
+ * @param text input text
+ * @returns stemmed text
+ */
+export const stemLastWord = (text: string): string => {
+  return PorterStemmer.stem(text);
+};
+
+export const stemPhrase = (text: string): string => {
+  const tokenizer = new WordPunctStemTokenizer();
+  return tokenizer
+    .tokenize(text)
+    .map((t) => t.stem)
+    .join('');
+};