Re-engineer index to match both partial and multiple words (#30)
* Initial work in progress

* Re-engineer index to match both partial and multiple words

* Remove @liquicode/lib-tokenize dependency
hadynz authored Feb 26, 2022
1 parent c6184be commit ff6a882
Showing 14 changed files with 356 additions and 64 deletions.
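At the heart of the change: keywords and note text are both stemmed before matching, so different word forms of the same keyword now line up. A minimal sketch of the idea, using the `natural` stemmer this commit introduces in `src/stemmers/index.ts` (outputs are the standard Porter results):

```ts
import { PorterStemmer } from 'natural';

// Both word forms reduce to the same Porter stem, which is what lets an
// indexed keyword match a partial/inflected form in note text.
console.log(PorterStemmer.stem('searching')); // 'search'
console.log(PorterStemmer.stem('search'));    // 'search'
```

Multi-word matching comes from indexing stemmed phrases and word permutations, as the diffs below show.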
2 changes: 1 addition & 1 deletion manifest.json
@@ -2,7 +2,7 @@
"id": "obsidian-sidekick",
"name": "Sidekick",
"description": "A companion to identify hidden connections that match your tags and pages",
"version": "1.4.3",
"version": "1.5.0",
"minAppVersion": "0.13.8",
"author": "Hady Osman",
"authorUrl": "https://hady.geek.nz",
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "obsidian-sidekick",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "description": "A companion to identify hidden connections that match your tags and pages",
   "main": "src/index.ts",
   "repository": {
10 changes: 5 additions & 5 deletions src/cmExtension/suggestionsExtension.ts
@@ -16,11 +16,11 @@ import './suggestionsExtension.css';

 const SuggestionCandidateClass = 'cm-suggestion-candidate';

-const underlineDecoration = (start: number, end: number, keyword: string) =>
+const underlineDecoration = (start: number, end: number, indexKeyword: string) =>
   Decoration.mark({
     class: SuggestionCandidateClass,
     attributes: {
-      'data-keyword': keyword,
+      'data-index-keyword': indexKeyword,
       'data-position-start': `${start}`,
       'data-position-end': `${end}`,
     },
@@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
       const end = from + result.end;

       // Add the decoration
-      builder.add(start, end, underlineDecoration(start, end, result.keyword));
+      builder.add(start, end, underlineDecoration(start, end, result.indexKeyword));
     }
   }

@@ -89,13 +89,13 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
       }

       // Extract position and replacement text from target element data attributes state
-      const { positionStart, positionEnd, keyword } = target.dataset;
+      const { positionStart, positionEnd, indexKeyword } = target.dataset;

       // Show suggestions modal
       showSuggestionsModal({
         app,
         mouseEvent: e,
-        suggestions: search.getReplacementSuggestions(keyword),
+        suggestions: search.getReplacementSuggestions(indexKeyword),
         onClick: (replaceText) => {
           view.dispatch({
             changes: {
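One note on the renamed data attribute: the DOM exposes `data-*` attributes on `dataset` in camelCase, which is why `data-index-keyword` destructures as `indexKeyword` above. A standalone illustration of that standard behavior:

```ts
const el = document.createElement('span');
el.setAttribute('data-index-keyword', 'github fork');
el.setAttribute('data-position-start', '6');

// data-index-keyword maps to dataset.indexKeyword (camelCase conversion)
const { indexKeyword, positionStart } = el.dataset;
console.log(indexKeyword, positionStart); // 'github fork' '6'
```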
20 changes: 16 additions & 4 deletions src/indexing/indexer.ts
@@ -1,14 +1,17 @@
+import _ from 'lodash';
 import lokijs from 'lokijs';
 import { TypedEmitter } from 'tiny-typed-emitter';
 import type { TFile } from 'obsidian';

-import { tokenize } from './utils';
+import { stemPhrase } from '../stemmers';
+import { WordPermutationsTokenizer } from '../tokenizers';
 import type { PluginHelper } from '../plugin-helper';

 type Document = {
   fileCreationTime: number;
   type: 'tag' | 'alias' | 'page' | 'page-token';
   keyword: string;
+  originalText: string;
   replaceText: string;
 };

@@ -19,6 +22,7 @@ interface IndexerEvents {

 export class Indexer extends TypedEmitter<IndexerEvents> {
   private documents: Collection<Document>;
+  private permutationTokenizer: WordPermutationsTokenizer;

   constructor(private pluginHelper: PluginHelper) {
     super();
@@ -28,15 +32,19 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents = db.addCollection<Document>('documents', {
       indices: ['fileCreationTime', 'keyword'],
     });
+
+    this.permutationTokenizer = new WordPermutationsTokenizer();
   }

   public getKeywords(): string[] {
     // Exclude any keywords associated with active file as we don't want recursive highlighting
     const exclusionFile = this.pluginHelper.activeFile;

-    return this.documents
+    const keywords = this.documents
       .where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime)
       .map((doc) => doc.keyword);
+
+    return _.uniq(keywords);
   }

   public getDocumentsByKeyword(keyword: string): Document[] {
@@ -62,15 +70,17 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
     this.documents.insert({
       fileCreationTime: file.stat.ctime,
       type: 'page',
-      keyword: file.basename.toLowerCase(),
+      keyword: stemPhrase(file.basename),
+      originalText: file.basename,
       replaceText: `[[${file.basename}]]`,
     });

-    tokenize(file.basename).forEach((token) => {
+    this.permutationTokenizer.tokenize(file.basename).forEach((token) => {
       this.documents.insert({
         fileCreationTime: file.stat.ctime,
         type: 'page-token',
         keyword: token,
+        originalText: file.basename,
         replaceText: `[[${file.basename}]]`,
       });
     });
@@ -80,6 +90,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'alias',
         keyword: alias.toLowerCase(),
+        originalText: file.basename,
         replaceText: `[[${file.basename}|${alias}]]`,
       });
     });
@@ -89,6 +100,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
         fileCreationTime: file.stat.ctime,
         type: 'tag',
         keyword: tag.replace(/#/, '').toLowerCase(),
+        originalText: tag,
         replaceText: tag,
       });
     });
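To make the new `originalText` field and document types concrete: a hypothetical note named `GitHub Forks`, with alias `GH Forks` and tag `#dev`, would be indexed roughly as follows (a sketch; the exact permutation tokens depend on `WordPermutationsTokenizer`, and `fileCreationTime` is omitted for brevity):

```ts
// Hypothetical documents the indexer would insert for that note.
// 'page' keywords are stemmed; aliases and tags are only lowercased.
const docs = [
  { type: 'page', keyword: 'github fork', originalText: 'GitHub Forks', replaceText: '[[GitHub Forks]]' },
  { type: 'page-token', keyword: 'github', originalText: 'GitHub Forks', replaceText: '[[GitHub Forks]]' },
  { type: 'page-token', keyword: 'fork', originalText: 'GitHub Forks', replaceText: '[[GitHub Forks]]' },
  { type: 'alias', keyword: 'gh forks', originalText: 'GitHub Forks', replaceText: '[[GitHub Forks|GH Forks]]' },
  { type: 'tag', keyword: 'dev', originalText: '#dev', replaceText: '#dev' },
];
```

Duplicate keywords across documents are expected (for example, two notes sharing a tag), hence the new `_.uniq` in `getKeywords`.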
29 changes: 0 additions & 29 deletions src/indexing/utils.spec.ts

This file was deleted.

5 changes: 0 additions & 5 deletions src/indexing/utils.ts

This file was deleted.

41 changes: 23 additions & 18 deletions src/search/index.ts
@@ -1,17 +1,22 @@
 import _ from 'lodash';
-import { Trie, Emit } from '@tanishiking/aho-corasick';
+import { Trie } from '@tanishiking/aho-corasick';

-import { redactText } from './search.utils';
 import type { Indexer } from '../indexing/indexer';
+import { redactText } from './redactText';
+import { mapStemToOriginalText } from './mapStemToOriginalText';
+import { WordPunctStemTokenizer } from '../tokenizers';

-type SearchResult = {
+const tokenizer = new WordPunctStemTokenizer();
+
+export type SearchResult = {
   start: number;
   end: number;
-  keyword: string;
+  indexKeyword: string;
+  originalKeyword: string;
 };

-const isEqual = (a: Emit, b: Emit) => {
-  return a.start === b.start && a.keyword === b.keyword;
+const isEqual = (a: SearchResult, b: SearchResult) => {
+  return a.start === b.start && a.indexKeyword === b.indexKeyword;
 };

export default class Search {
@@ -36,20 +41,20 @@ export default class Search {
   public find(text: string): SearchResult[] {
     const redactedText = redactText(text); // Redact text that we don't want to be searched

-    const results = this.trie.parseText(redactedText);
+    // Stem the text
+    const tokens = tokenizer.tokenize(redactedText);
+    const stemmedText = tokens.map((t) => t.stem).join('');

-    return this.mapToSearchResults(results);
-  }
+    // Search stemmed text
+    const emits = this.trie.parseText(stemmedText);

-  private mapToSearchResults(results: Emit[]): SearchResult[] {
-    return _.uniqWith(results, isEqual)
-      .filter((result) => this.keywordExistsInIndex(result.keyword))
-      .map((result) => ({
-        start: result.start,
-        end: result.end + 1,
-        keyword: result.keyword,
-      }))
-      .sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting
+    // Map stemmed results to original text
+    return _.chain(emits)
+      .map((emit) => mapStemToOriginalText(emit, tokens))
+      .uniqWith(isEqual)
+      .filter((result) => this.keywordExistsInIndex(result.indexKeyword))
+      .sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting
+      .value();
   }

   private keywordExistsInIndex(index: string): boolean {
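The pipeline above works because each token carries offsets in both coordinate spaces: where it sits in the original text, and where its stem sits in the concatenated stemmed text. The `Token` shape is not part of this diff, but inferred from how `mapStemToOriginalText` consumes it below, it is roughly:

```ts
// Inferred from usage; the real type lives in src/tokenizers (not shown in this commit).
type Token = {
  originalText: string;  // the word as it appears in the note
  stem: string;          // the stemmed form fed into the trie
  originalStart: number; // offsets into the original text
  originalEnd: number;
  stemStart: number;     // offsets into the stemmed text
  stemEnd: number;
};
```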
28 changes: 28 additions & 0 deletions src/search/mapStemToOriginalText.ts
@@ -0,0 +1,28 @@
import { Emit } from '@tanishiking/aho-corasick';

import { SearchResult } from '../search/index';
import { Token } from '../tokenizers';

/**
 * Takes a given search result (which has the start/end position and a "stemmed" keyword)
 * that was matched, and maps it to a new start/end position for the original keyword
 * that the stem was created from
 * @param searchResult
 * @param tokens
 * @returns
 */
export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => {
  const matchingTokens = tokens.filter(
    (token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1
  );

  return {
    start: matchingTokens[0].originalStart,
    end: matchingTokens[matchingTokens.length - 1].originalEnd,
    indexKeyword: matchingTokens
      .map((token) => token.stem)
      .join('')
      .toLowerCase(),
    originalKeyword: matchingTokens.map((token) => token.originalText).join(''),
  };
};
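A small worked example, with the input reduced to a single word so the offsets are easy to check by hand (the `Emit` is faked as a plain object carrying only the fields this function reads):

```ts
// Text: 'searching', which stems to 'search'.
const tokens = [
  {
    originalText: 'searching',
    stem: 'search',
    originalStart: 0,
    originalEnd: 9, // end-exclusive
    stemStart: 0,
    stemEnd: 6,
  },
];

// Trie emits are end-inclusive, so a hit over 'search' spans 0..5.
const emit = { start: 0, end: 5, keyword: 'search' };

mapStemToOriginalText(emit as any, tokens as any);
// => { start: 0, end: 9, indexKeyword: 'search', originalKeyword: 'searching' }
```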
src/search/{search.utils.spec.ts → redactText.spec.ts}
@@ -1,4 +1,4 @@
-import { redactText } from './search.utils';
+import { redactText } from './redactText';

describe('redactText', () => {
it('Hashtags are redacted', () => {
File renamed without changes.
81 changes: 81 additions & 0 deletions src/search/search.spec.ts
@@ -0,0 +1,81 @@
import { Indexer } from '../indexing/indexer';
import Search from './index';

const getKeywordsMockFn = jest.fn();

jest.mock('../indexing/indexer', () => {
  return {
    Indexer: jest.fn().mockImplementation(() => {
      return {
        getKeywords: getKeywordsMockFn,
        getDocumentsByKeyword: () => [{}],
      };
    }),
  };
});

beforeEach(() => {
  jest.clearAllMocks();
});

describe('Search class', () => {
  it('Highlights single keywords that can be stemmed', () => {
    getKeywordsMockFn.mockReturnValue(['search', 'note']);
    const text = 'This is a note that I will be use for searching';

    const indexer = new Indexer(null);
    const search = new Search(indexer);
    const results = search.find(text);

    expect(results).toEqual([
      {
        start: 10,
        end: 14,
        indexKeyword: 'note',
        originalKeyword: 'note',
      },
      {
        start: 38,
        end: 47,
        indexKeyword: 'search',
        originalKeyword: 'searching',
      },
    ]);
  });

  it('Longer keyword matches are always prioritised for highlight', () => {
    getKeywordsMockFn.mockReturnValue(['github', 'github fork']);
    const text = 'I use GitHub Forks as part of my development flow';

    const indexer = new Indexer(null);
    const search = new Search(indexer);
    const results = search.find(text);

    expect(results).toEqual([
      {
        start: 6,
        end: 18,
        indexKeyword: 'github fork',
        originalKeyword: 'GitHub Forks',
      },
    ]);
  });

  it('Three word keyword is highlighted', () => {
    getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']);
    const text = 'Designing a shared client record is a great idea but challenging';

    const indexer = new Indexer(null);
    const search = new Search(indexer);
    const results = search.find(text);

    expect(results).toEqual([
      {
        start: 12,
        end: 32,
        indexKeyword: 'share client record',
        originalKeyword: 'shared client record',
      },
    ]);
  });
});
21 changes: 21 additions & 0 deletions src/stemmers/index.ts
@@ -0,0 +1,21 @@
import { PorterStemmer } from 'natural';

import { WordPunctStemTokenizer } from '../tokenizers';

/**
 * Stem a given phrase. If the phrase is made up of multiple words,
 * the last word in the phrase is the only one that will be stemmed
 * @param text input text
 * @returns stemmed text
 */
export const stemLastWord = (text: string): string => {
  return PorterStemmer.stem(text);
};

export const stemPhrase = (text: string): string => {
  const tokenizer = new WordPunctStemTokenizer();
  return tokenizer
    .tokenize(text)
    .map((t) => t.stem)
    .join('');
};
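Assumed outputs to illustrate the difference between the two helpers. `PorterStemmer.stem` treats its whole input as a single token, so on a phrase only the trailing suffix changes (hence the name `stemLastWord`); the phrase example matches the expectation in `search.spec.ts` above:

```ts
// Sketch with assumed outputs:
stemLastWord('GitHub Forks');       // => 'github fork' (only the last word's suffix is stemmed)
stemPhrase('shared client record'); // => 'share client record' (every word is stemmed)
```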