diff --git a/manifest.json b/manifest.json
index e30c5b6..e474c4e 100644
--- a/manifest.json
+++ b/manifest.json
@@ -2,7 +2,7 @@
   "id": "obsidian-sidekick",
   "name": "Sidekick",
   "description": "A companion to identify hidden connections that match your tags and pages",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "minAppVersion": "0.13.8",
   "author": "Hady Osman",
   "authorUrl": "https://hady.geek.nz",
diff --git a/package.json b/package.json
index caa8304..a1826e2 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "obsidian-sidekick",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "description": "A companion to identify hidden connections that match your tags and pages",
   "main": "src/index.ts",
   "repository": {
diff --git a/src/cmExtension/suggestionsExtension.ts b/src/cmExtension/suggestionsExtension.ts
index 718e8de..a43e0cb 100644
--- a/src/cmExtension/suggestionsExtension.ts
+++ b/src/cmExtension/suggestionsExtension.ts
@@ -16,11 +16,11 @@ import './suggestionsExtension.css';
 
 const SuggestionCandidateClass = 'cm-suggestion-candidate';
 
-const underlineDecoration = (start: number, end: number, keyword: string) =>
+const underlineDecoration = (start: number, end: number, indexKeyword: string) =>
   Decoration.mark({
     class: SuggestionCandidateClass,
     attributes: {
-      'data-keyword': keyword,
+      'data-index-keyword': indexKeyword,
       'data-position-start': `${start}`,
       'data-position-end': `${end}`,
     },
@@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin {
       view.dispatch({
         changes: {
diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts
index 787ae00..3fbc4c5 100644
--- a/src/indexing/indexer.ts
+++ b/src/indexing/indexer.ts
@@ -1,14 +1,17 @@
+import _ from 'lodash';
 import lokijs from 'lokijs';
 import { TypedEmitter } from 'tiny-typed-emitter';
 import type { TFile } from 'obsidian';
 
-import { tokenize } from './utils';
+import { stemPhrase } from '../stemmers';
+import { WordPermutationsTokenizer } from '../tokenizers';
 import type { PluginHelper } from '../plugin-helper';
 
 type Document = {
   fileCreationTime: number;
   type: 'tag' | 'alias' | 'page' | 'page-token';
   keyword: string;
+  originalText: string;
   replaceText: string;
 };
 
@@ -19,6 +22,7 @@ interface IndexerEvents {
 
 export class Indexer extends TypedEmitter<IndexerEvents> {
   private documents: Collection<Document>;
+  private permutationTokenizer: WordPermutationsTokenizer;
 
   constructor(private pluginHelper: PluginHelper) {
     super();
@@ -28,15 +32,19 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents = db.addCollection('documents', {
       indices: ['fileCreationTime', 'keyword'],
     });
+
+    this.permutationTokenizer = new WordPermutationsTokenizer();
   }
 
   public getKeywords(): string[] {
     // Exclude any keywords associated with active file as we don't want recursive highlighting
     const exclusionFile = this.pluginHelper.activeFile;
 
-    return this.documents
+    const keywords = this.documents
       .where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime)
      .map((doc) => doc.keyword);
+
+    return _.uniq(keywords);
   }
 
   public getDocumentsByKeyword(keyword: string): Document[] {
@@ -62,15 +70,17 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents.insert({
       fileCreationTime: file.stat.ctime,
       type: 'page',
-      keyword: file.basename.toLowerCase(),
+      keyword: stemPhrase(file.basename),
+      originalText: file.basename,
       replaceText: `[[${file.basename}]]`,
     });
 
-    tokenize(file.basename).forEach((token) => {
+    this.permutationTokenizer.tokenize(file.basename).forEach((token) => {
       this.documents.insert({
         fileCreationTime: file.stat.ctime,
         type: 'page-token',
         keyword: token,
+        originalText: file.basename,
         replaceText: `[[${file.basename}]]`,
       });
     });
@@ -80,6 +90,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'alias',
         keyword: alias.toLowerCase(),
+        originalText: file.basename,
         replaceText: `[[${file.basename}|${alias}]]`,
       });
     });
@@ -89,6 +100,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'tag',
         keyword: tag.replace(/#/, '').toLowerCase(),
+        originalText: tag,
         replaceText: tag,
       });
     });
diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts
deleted file mode 100644
index ab71c50..0000000
--- a/src/indexing/utils.spec.ts
+++ /dev/null
@@ -1,29 +0,0 @@
-import { tokenize } from './utils';
-
-describe('tokenize', () => {
-  const dataSet = [
-    {
-      sentence: 'The quick brown fox jumps over the lazy dog.',
-      expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'],
-    },
-    {
-      sentence: 'GitHub Forks',
-      expected: ['github', 'fork'],
-    },
-    {
-      sentence: 'John Doe',
-      expected: ['john', 'doe'],
-    },
-    {
-      sentence: 'Approximate Inference',
-      expected: ['approxim', 'infer'],
-    },
-  ];
-
-  dataSet.forEach(({ sentence, expected }) => {
-    it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => {
-      const tokens = tokenize(sentence);
-      expect(tokens).toEqual(expected);
-    });
-  });
-});
diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts
deleted file mode 100644
index 3318cd4..0000000
--- a/src/indexing/utils.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-import natural from 'natural';
-
-export const tokenize = (text: string): string[] => {
-  return natural.PorterStemmer.tokenizeAndStem(text);
-};
diff --git a/src/search/index.ts b/src/search/index.ts
index 2039ac7..403fee7 100644
--- a/src/search/index.ts
+++ b/src/search/index.ts
@@ -1,17 +1,22 @@
 import _ from 'lodash';
-import { Trie, Emit } from '@tanishiking/aho-corasick';
+import { Trie } from '@tanishiking/aho-corasick';
 
-import { redactText } from './search.utils';
 import type { Indexer } from '../indexing/indexer';
+import { redactText } from './redactText';
+import { mapStemToOriginalText } from './mapStemToOriginalText';
+import { WordPunctStemTokenizer } from '../tokenizers';
 
-type SearchResult = {
+const tokenizer = new WordPunctStemTokenizer();
+
+export type SearchResult = {
   start: number;
   end: number;
-  keyword: string;
+  indexKeyword: string;
+  originalKeyword: string;
 };
 
-const isEqual = (a: Emit, b: Emit) => {
-  return a.start === b.start && a.keyword === b.keyword;
+const isEqual = (a: SearchResult, b: SearchResult) => {
+  return a.start === b.start && a.indexKeyword === b.indexKeyword;
 };
 
 export default class Search {
@@ -36,20 +41,20 @@ export default class Search {
   public find(text: string): SearchResult[] {
     const redactedText = redactText(text); // Redact text that we don't want to be searched
 
-    const results = this.trie.parseText(redactedText);
+    // Stem the text
+    const tokens = tokenizer.tokenize(redactedText);
+    const stemmedText = tokens.map((t) => t.stem).join('');
 
-    return this.mapToSearchResults(results);
-  }
+    // Search stemmed text
+    const emits = this.trie.parseText(stemmedText);
 
-  private mapToSearchResults(results: Emit[]): SearchResult[] {
-    return _.uniqWith(results, isEqual)
-      .filter((result) => this.keywordExistsInIndex(result.keyword))
-      .map((result) => ({
-        start: result.start,
-        end: result.end + 1,
-        keyword: result.keyword,
-      }))
-      .sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting
+    // Map stemmed results to original text
+    return _.chain(emits)
+      .map((emit) => mapStemToOriginalText(emit, tokens))
+      .uniqWith(isEqual)
+      .filter((result) => this.keywordExistsInIndex(result.indexKeyword))
+      .sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting
+      .value();
   }
 
   private keywordExistsInIndex(index: string): boolean {
diff --git a/src/search/mapStemToOriginalText.ts b/src/search/mapStemToOriginalText.ts
new file mode 100644
index 0000000..ab323b0
--- /dev/null
+++ b/src/search/mapStemToOriginalText.ts
@@ -0,0 +1,28 @@
+import { Emit } from '@tanishiking/aho-corasick';
+
+import { SearchResult } from '../search/index';
+import { Token } from '../tokenizers';
+
+/**
+ * Takes a given search result (which has the start/end position and the "stemmed" keyword
+ * that was matched) and maps it to a new start/end position for the original keyword
+ * from which the stem was created
+ * @param searchResult
+ * @param tokens
+ * @returns
+ */
+export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => {
+  const matchingTokens = tokens.filter(
+    (token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1
+  );
+
+  return {
+    start: matchingTokens[0].originalStart,
+    end: matchingTokens[matchingTokens.length - 1].originalEnd,
+    indexKeyword: matchingTokens
+      .map((token) => token.stem)
+      .join('')
+      .toLowerCase(),
+    originalKeyword: matchingTokens.map((token) => token.originalText).join(''),
+  };
+};
diff --git a/src/search/search.utils.spec.ts b/src/search/redactText.spec.ts
similarity index 97%
rename from src/search/search.utils.spec.ts
rename to src/search/redactText.spec.ts
index 3639ed3..884e458 100644
--- a/src/search/search.utils.spec.ts
+++ b/src/search/redactText.spec.ts
@@ -1,4 +1,4 @@
-import { redactText } from './search.utils';
+import { redactText } from './redactText';
 
 describe('redactText', () => {
   it('Hashtags are redacted', () => {
diff --git a/src/search/search.utils.ts b/src/search/redactText.ts
similarity index 100%
rename from src/search/search.utils.ts
rename to src/search/redactText.ts
diff --git a/src/search/search.spec.ts b/src/search/search.spec.ts
new file mode 100644
index 0000000..d993249
--- /dev/null
+++ b/src/search/search.spec.ts
@@ -0,0 +1,81 @@
+import { Indexer } from '../indexing/indexer';
+import Search from './index';
+
+const getKeywordsMockFn = jest.fn();
+
+jest.mock('../indexing/indexer', () => {
+  return {
+    Indexer: jest.fn().mockImplementation(() => {
+      return {
+        getKeywords: getKeywordsMockFn,
+        getDocumentsByKeyword: () => [{}],
+      };
+    }),
+  };
+});
+
+beforeEach(() => {
+  jest.clearAllMocks();
+});
+
+describe('Search class', () => {
+  it('Highlights single keywords that can be stemmed', () => {
+    getKeywordsMockFn.mockReturnValue(['search', 'note']);
+    const text = 'This is a note that I will be use for searching';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 10,
+        end: 14,
+        indexKeyword: 'note',
+        originalKeyword: 'note',
+      },
+      {
+        start: 38,
+        end: 47,
+        indexKeyword: 'search',
+        originalKeyword: 'searching',
+      },
+    ]);
+  });
+
+  it('Longer keyword matches are always prioritised for highlight', () => {
+    getKeywordsMockFn.mockReturnValue(['github', 'github fork']);
+    const text = 'I use GitHub Forks as part of my development flow';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 6,
+        end: 18,
+        indexKeyword: 'github fork',
+        originalKeyword: 'GitHub Forks',
+      },
+    ]);
+  });
+
+  it('Three word keyword is highlighted', () => {
+    getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']);
+    const text = 'Designing a shared client record is a great idea but challenging';
+
+    const indexer = new Indexer(null);
+    const search = new Search(indexer);
+    const results = search.find(text);
+
+    expect(results).toEqual([
+      {
+        start: 12,
+        end: 32,
+        indexKeyword: 'share client record',
+        originalKeyword: 'shared client record',
+      },
+    ]);
+  });
+});
diff --git a/src/stemmers/index.ts b/src/stemmers/index.ts
new file mode 100644
index 0000000..8c3bc9a
--- /dev/null
+++ b/src/stemmers/index.ts
@@ -0,0 +1,21 @@
+import { PorterStemmer } from 'natural';
+
+import { WordPunctStemTokenizer } from '../tokenizers';
+
+/**
+ * Stem a given phrase. If the phrase is made up of multiple words,
+ * the last word in the phrase is the only one that will be stemmed
+ * @param text input text
+ * @returns stemmed text
+ */
+export const stemLastWord = (text: string): string => {
+  return PorterStemmer.stem(text);
+};
+
+export const stemPhrase = (text: string): string => {
+  const tokenizer = new WordPunctStemTokenizer();
+  return tokenizer
+    .tokenize(text)
+    .map((t) => t.stem)
+    .join('');
+};
diff --git a/src/tokenizers/index.ts b/src/tokenizers/index.ts
new file mode 100644
index 0000000..8291720
--- /dev/null
+++ b/src/tokenizers/index.ts
@@ -0,0 +1,86 @@
+import _ from 'lodash';
+import { PorterStemmer, NGrams } from 'natural';
+import { Trie } from '@tanishiking/aho-corasick';
+import * as natural from 'natural';
+
+import { stemLastWord } from '../stemmers';
+
+export type Token = {
+  index: number;
+  originalText: string;
+  originalStart: number;
+  originalEnd: number;
+  stem: string;
+  stemStart: number;
+  stemEnd: number;
+};
+
+export class WordPermutationsTokenizer {
+  private trie: Trie;
+
+  constructor() {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const stopWords: string[] = (natural as any).stopwords;
+
+    this.trie = new Trie(stopWords, {
+      allowOverlaps: false,
+      onlyWholeWords: true,
+      caseInsensitive: true,
+    });
+  }
+
+  public tokenize(text: string): string[] {
+    const tokens = PorterStemmer.tokenizeAndStem(text); // Strip punctuation and stop words, stem remaining words
+
+    if (tokens.length >= 5) {
+      return [...tokens, ...NGrams.bigrams(tokens).map((tokens) => tokens.join(' '))];
+    }
+
+    return this.combinations(tokens, 2, 2);
+  }
+
+  private combinations(arr: string[], min: number, max: number) {
+    return [...Array(max).keys()]
+      .reduce((result) => {
+        return arr.concat(
+          result.flatMap((val) =>
+            arr.filter((char) => char !== val).map((char) => `${val} ${char}`)
+          )
+        );
+      }, [])
+      .filter((val) => val.length >= min);
+  }
+}
+
+export class WordPunctStemTokenizer {
+  private pattern = /([\s]+|[A-zÀ-ÿ-]+|[0-9._]+|.|!|\?|'|"|:|;|,|-)/i;
+
+  public tokenize(text: string): Token[] {
+    const tokens = text.split(this.pattern);
+    return _.chain(tokens).without('').transform(this.stringToTokenAccumulator()).value();
+  }
+
+  private stringToTokenAccumulator() {
+    let originalCharIndex = 0;
+    let stemCharIndex = 0;
+
+    return (acc: Token[], token: string, index: number) => {
+      const stemmedToken = stemLastWord(token);
+
+      acc.push({
+        index,
+        originalText: token,
+        originalStart: originalCharIndex,
+        originalEnd: originalCharIndex + token.length,
+        stem: stemmedToken,
+        stemStart: stemCharIndex,
+        stemEnd: stemCharIndex + stemmedToken.length,
+      });
+
+      originalCharIndex += token.length;
+      stemCharIndex += stemmedToken.length;
+
+      return acc;
+    };
+  }
+}
diff --git a/src/tokenizers/tokenizer.spec.ts b/src/tokenizers/tokenizer.spec.ts
new file mode 100644
index 0000000..64d1fb4
--- /dev/null
+++ b/src/tokenizers/tokenizer.spec.ts
@@ -0,0 +1,93 @@
+import { WordPermutationsTokenizer, WordPunctStemTokenizer } from '.';
+
+describe('WordPermutationsTokenizer', () => {
+  const dataSet = [
+    {
+      description: 'Single word',
+      sentence: 'John',
+      expected: ['john'],
+    },
+    {
+      description: 'Two words with no stop words',
+      sentence: 'John Doe',
+      expected: ['john', 'doe', 'john doe', 'doe john'],
+    },
+    {
+      description: 'Two words (with one stop word at the start)',
+      sentence: 'The brothers Karamazov',
+      expected: ['brother', 'karamazov', 'brother karamazov', 'karamazov brother'],
+    },
+    {
+      description: 'Two words (with stop words throughout the sentence)',
+      sentence: 'An Officer and a Spy',
+      expected: ['offic', 'spy', 'offic spy', 'spy offic'],
+    },
+    {
+      description: 'Three words with no stop words',
+      sentence: 'GitHub Forking tutorial',
+      expected: [
+        'github',
+        'fork',
+        'tutori',
+        'github fork',
+        'github tutori',
+        'fork github',
+        'fork tutori',
+        'tutori github',
+        'tutori fork',
+      ],
+    },
+
+    {
+      description: 'Five words or more does not generate permutations',
+      sentence: 'Ten Arguments For Deleting Your Social Media Accounts Right Now',
+      expected: [
+        'ten',
+        'argument',
+        'delet',
+        'social',
+        'media',
+        'account',
+        'right',
+        'ten argument',
+        'argument delet',
+        'delet social',
+        'social media',
+        'media account',
+        'account right',
+      ],
+    },
+  ];
+
+  dataSet.forEach(({ description, sentence, expected }) => {
+    it(`Tokenize phrase permutations (${description})`, () => {
+      const tokenizer = new WordPermutationsTokenizer();
+      const tokens = tokenizer.tokenize(sentence);
+
+      expect(tokens).toEqual(expected);
+    });
+  });
+});
+
+describe('WordPunctStemTokenizer', () => {
+  it('Tokenize and stem a simple phrase', () => {
+    const sentence = 'The lazy dog jumped over the fence.';
+
+    const tokenizer = new WordPunctStemTokenizer();
+    const tokens = tokenizer.tokenize(sentence);
+
+    expect(tokens.length).toEqual(14);
+
+    expect(tokens[2]).toEqual({
+      index: 2,
+      originalText: 'lazy',
+      originalStart: 4,
+      originalEnd: 8,
+      stem: 'lazi',
+      stemStart: 4,
+      stemEnd: 8,
+    });
+
+    expect(tokens[6].stem).toEqual('jump');
+  });
+});
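For reviewers, a minimal sketch (not part of the diff) of how the pieces introduced above compose: tokenize the note text into stems, run the Aho-Corasick trie over the stemmed string, then map hits back to the original offsets for highlighting. The sample text, keyword list, and import paths are illustrative only, and redaction plus the Indexer wiring are omitted for brevity.

// Illustrative only — a simplified version of the flow in Search.find() (src/search/index.ts).
import { Trie } from '@tanishiking/aho-corasick';

import { WordPunctStemTokenizer } from './src/tokenizers';
import { mapStemToOriginalText } from './src/search/mapStemToOriginalText';

// 1. Tokenize the note text. Each Token records offsets for both the original
//    word and its stem, which is what makes the reverse mapping possible.
const tokenizer = new WordPunctStemTokenizer();
const tokens = tokenizer.tokenize('I will be searching my notes');
const stemmedText = tokens.map((t) => t.stem).join('');

// 2. Match index keywords (already stemmed by the Indexer) against the stemmed text.
const trie = new Trie(['search'], { allowOverlaps: false, onlyWholeWords: true, caseInsensitive: true });
const emits = trie.parseText(stemmedText);

// 3. Map each stemmed hit back to start/end positions in the original text.
const results = emits.map((emit) => mapStemToOriginalText(emit, tokens));
// Expected shape (offsets assume the tokenization above):
// [{ start: 10, end: 19, indexKeyword: 'search', originalKeyword: 'searching' }]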