Re-engineer index to match both partial and multiple words #30

Merged · 4 commits · Feb 26, 2022
2 changes: 1 addition & 1 deletion manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"id": "obsidian-sidekick",
"name": "Sidekick",
"description": "A companion to identify hidden connections that match your tags and pages",
"version": "1.4.3",
"version": "1.5.0",
"minAppVersion": "0.13.8",
"author": "Hady Osman",
"authorUrl": "https://hady.geek.nz",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "obsidian-sidekick",
"version": "1.4.3",
"version": "1.5.0",
"description": "A companion to identify hidden connections that match your tags and pages",
"main": "src/index.ts",
"repository": {
Expand Down
10 changes: 5 additions & 5 deletions src/cmExtension/suggestionsExtension.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ import './suggestionsExtension.css';

const SuggestionCandidateClass = 'cm-suggestion-candidate';

const underlineDecoration = (start: number, end: number, keyword: string) =>
const underlineDecoration = (start: number, end: number, indexKeyword: string) =>
Decoration.mark({
class: SuggestionCandidateClass,
attributes: {
'data-keyword': keyword,
'data-index-keyword': indexKeyword,
'data-position-start': `${start}`,
'data-position-end': `${end}`,
},
Expand Down Expand Up @@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
const end = from + result.end;

// Add the decoration
builder.add(start, end, underlineDecoration(start, end, result.keyword));
builder.add(start, end, underlineDecoration(start, end, result.indexKeyword));
}
}

Expand All @@ -89,13 +89,13 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin<Plugi
}

// Extract position and replacement text from target element data attributes state
const { positionStart, positionEnd, keyword } = target.dataset;
const { positionStart, positionEnd, indexKeyword } = target.dataset;

// Show suggestions modal
showSuggestionsModal({
app,
mouseEvent: e,
suggestions: search.getReplacementSuggestions(keyword),
suggestions: search.getReplacementSuggestions(indexKeyword),
onClick: (replaceText) => {
view.dispatch({
changes: {
Expand Down
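A side note on the rename above: the DOM exposes kebab-cased `data-*` attributes as camelCased keys on `element.dataset`, which is why the `'data-index-keyword'` attribute written by `underlineDecoration` pairs with `target.dataset.indexKeyword` in the click handler. A minimal sketch of that mapping (illustration only, not part of this PR):

```ts
// Kebab-cased data-* attributes surface on `dataset` with camelCased keys.
const el = document.createElement('span');
el.setAttribute('data-index-keyword', 'github fork');
el.setAttribute('data-position-start', '6');

console.log(el.dataset.indexKeyword); // "github fork"
console.log(el.dataset.positionStart); // "6" (dataset values are always strings)
```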
20 changes: 16 additions & 4 deletions src/indexing/indexer.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import _ from 'lodash';
import lokijs from 'lokijs';
import { TypedEmitter } from 'tiny-typed-emitter';
import type { TFile } from 'obsidian';

import { tokenize } from './utils';
import { stemPhrase } from '../stemmers';
import { WordPermutationsTokenizer } from '../tokenizers';
import type { PluginHelper } from '../plugin-helper';

type Document = {
fileCreationTime: number;
type: 'tag' | 'alias' | 'page' | 'page-token';
keyword: string;
originalText: string;
replaceText: string;
};

Expand All @@ -19,6 +22,7 @@ interface IndexerEvents {

export class Indexer extends TypedEmitter<IndexerEvents> {
private documents: Collection<Document>;
private permutationTokenizer: WordPermutationsTokenizer;

constructor(private pluginHelper: PluginHelper) {
super();
Expand All @@ -28,15 +32,19 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
this.documents = db.addCollection<Document>('documents', {
indices: ['fileCreationTime', 'keyword'],
});

this.permutationTokenizer = new WordPermutationsTokenizer();
}

public getKeywords(): string[] {
// Exclude any keywords associated with active file as we don't want recursive highlighting
const exclusionFile = this.pluginHelper.activeFile;

return this.documents
const keywords = this.documents
.where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime)
.map((doc) => doc.keyword);

return _.uniq(keywords);
}

public getDocumentsByKeyword(keyword: string): Document[] {
Expand All @@ -62,15 +70,17 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
this.documents.insert({
fileCreationTime: file.stat.ctime,
type: 'page',
keyword: file.basename.toLowerCase(),
keyword: stemPhrase(file.basename),
originalText: file.basename,
replaceText: `[[${file.basename}]]`,
});

tokenize(file.basename).forEach((token) => {
this.permutationTokenizer.tokenize(file.basename).forEach((token) => {
this.documents.insert({
fileCreationTime: file.stat.ctime,
type: 'page-token',
keyword: token,
originalText: file.basename,
replaceText: `[[${file.basename}]]`,
});
});
Expand All @@ -80,6 +90,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
fileCreationTime: file.stat.ctime,
type: 'alias',
keyword: alias.toLowerCase(),
originalText: file.basename,
replaceText: `[[${file.basename}|${alias}]]`,
});
});
Expand All @@ -89,6 +100,7 @@ export class Indexer extends TypedEmitter<IndexerEvents> {
fileCreationTime: file.stat.ctime,
type: 'tag',
keyword: tag.replace(/#/, '').toLowerCase(),
originalText: tag,
replaceText: tag,
});
});
Expand Down
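The new `_.uniq` in `getKeywords()` matters because keywords can now collide, both within a file (the stemmed `page` keyword versus its `page-token` permutations) and across files (two notes sharing a tag). A small illustration of the dedup, with invented sample data:

```ts
import _ from 'lodash';

// Hypothetical keywords emitted for a note named "GitHub Forks":
// the stemmed page entry plus permutation tokens can overlap.
const keywords = ['github fork', 'github', 'fork', 'github'];

console.log(_.uniq(keywords)); // ['github fork', 'github', 'fork']
```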
29 changes: 0 additions & 29 deletions src/indexing/utils.spec.ts

This file was deleted.

5 changes: 0 additions & 5 deletions src/indexing/utils.ts

This file was deleted.

41 changes: 23 additions & 18 deletions src/search/index.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
import _ from 'lodash';
import { Trie, Emit } from '@tanishiking/aho-corasick';
import { Trie } from '@tanishiking/aho-corasick';

import { redactText } from './search.utils';
import type { Indexer } from '../indexing/indexer';
import { redactText } from './redactText';
import { mapStemToOriginalText } from './mapStemToOriginalText';
import { WordPunctStemTokenizer } from '../tokenizers';

type SearchResult = {
const tokenizer = new WordPunctStemTokenizer();

export type SearchResult = {
start: number;
end: number;
keyword: string;
indexKeyword: string;
originalKeyword: string;
};

const isEqual = (a: Emit, b: Emit) => {
return a.start === b.start && a.keyword === b.keyword;
const isEqual = (a: SearchResult, b: SearchResult) => {
return a.start === b.start && a.indexKeyword === b.indexKeyword;
};

export default class Search {
Expand All @@ -36,20 +41,20 @@ export default class Search {
public find(text: string): SearchResult[] {
const redactedText = redactText(text); // Redact text that we don't want to be searched

const results = this.trie.parseText(redactedText);
// Stem the text
const tokens = tokenizer.tokenize(redactedText);
const stemmedText = tokens.map((t) => t.stem).join('');

return this.mapToSearchResults(results);
}
// Search stemmed text
const emits = this.trie.parseText(stemmedText);

private mapToSearchResults(results: Emit[]): SearchResult[] {
return _.uniqWith(results, isEqual)
.filter((result) => this.keywordExistsInIndex(result.keyword))
.map((result) => ({
start: result.start,
end: result.end + 1,
keyword: result.keyword,
}))
.sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting
// Map stemmed results to original text
return _.chain(emits)
.map((emit) => mapStemToOriginalText(emit, tokens))
.uniqWith(isEqual)
.filter((result) => this.keywordExistsInIndex(result.indexKeyword))
.sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting
.value();
}

private keywordExistsInIndex(index: string): boolean {
Expand Down
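To see the new `find()` pipeline in isolation: the text is tokenized while recording offsets in both the original string and the stemmed reconstruction, the trie runs over the joined stems, and each hit is projected back onto the original text. The sketch below walks through that offset bookkeeping under two stated substitutions: `String.prototype.indexOf` stands in for the Aho-Corasick trie, and per-word stemming via `natural`'s Porter stemmer stands in for the repo's own `WordPunctStemTokenizer`:

```ts
import { PorterStemmer } from 'natural';

type Tok = {
  stem: string;
  stemStart: number;
  stemEnd: number;
  originalStart: number;
  originalEnd: number;
};

// Split text into word/non-word runs, stemming word runs and tracking
// offsets in both the original text and the stemmed reconstruction.
const tokenize = (text: string): Tok[] => {
  const tokens: Tok[] = [];
  let stemPos = 0;
  for (const match of text.matchAll(/\w+|\W+/g)) {
    const original = match[0];
    const stem = /\w/.test(original) ? PorterStemmer.stem(original) : original;
    const originalStart = match.index ?? 0;
    tokens.push({
      stem,
      stemStart: stemPos,
      stemEnd: stemPos + stem.length,
      originalStart,
      originalEnd: originalStart + original.length,
    });
    stemPos += stem.length;
  }
  return tokens;
};

const text = 'I will be searching my notes';
const tokens = tokenize(text);
const stemmedText = tokens.map((t) => t.stem).join('');

// Stand-in for trie.parseText: locate one stemmed keyword in the stemmed text.
const keyword = PorterStemmer.stem('searching'); // 'search'
const start = stemmedText.indexOf(keyword);
const end = start + keyword.length;

// Project back: tokens falling fully inside [start, end) carry the original offsets.
const hit = tokens.filter((t) => t.stemStart >= start && t.stemEnd <= end);
console.log(text.slice(hit[0].originalStart, hit[hit.length - 1].originalEnd)); // 'searching'
```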
28 changes: 28 additions & 0 deletions src/search/mapStemToOriginalText.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { Emit } from '@tanishiking/aho-corasick';

import { SearchResult } from '../search/index';
import { Token } from '../tokenizers';

/**
* Takes a given search result (which has the start/end position and a "stemmed" keyword)
* that was matched, and maps them to a new start/end position for the original keyword
* which was stem was created from
* @param searchResult
* @param tokens
* @returns
*/
export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => {
const matchingTokens = tokens.filter(
(token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1
);

return {
start: matchingTokens[0].originalStart,
end: matchingTokens[matchingTokens.length - 1].originalEnd,
indexKeyword: matchingTokens
.map((token) => token.stem)
.join('')
.toLowerCase(),
originalKeyword: matchingTokens.map((token) => token.originalText).join(''),
};
};
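A usage sketch with hand-built tokens (all values invented for illustration; the `Emit` argument is treated structurally since only its `start`, `end`, and matched span matter here). Note that the trie reports an inclusive `end`, which is what the `searchResult.end + 1` in the filter accounts for:

```ts
import { mapStemToOriginalText } from './mapStemToOriginalText';
import type { Token } from '../tokenizers';

// Tokens for the original text "GitHub Forks"; its stemmed form is "github fork".
const tokens = [
  { stem: 'github', stemStart: 0, stemEnd: 6, originalStart: 0, originalEnd: 6, originalText: 'GitHub' },
  { stem: ' ', stemStart: 6, stemEnd: 7, originalStart: 6, originalEnd: 7, originalText: ' ' },
  { stem: 'fork', stemStart: 7, stemEnd: 11, originalStart: 7, originalEnd: 12, originalText: 'Forks' },
] as unknown as Token[];

// An Aho-Corasick hit spanning "github fork" in the stemmed text (inclusive end).
const emit = { start: 0, end: 10, keyword: 'github fork' };

console.log(mapStemToOriginalText(emit as any, tokens));
// → { start: 0, end: 12, indexKeyword: 'github fork', originalKeyword: 'GitHub Forks' }
```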
src/search/redactText.spec.ts (1 addition & 1 deletion; renamed from search.utils.spec.ts, per the import change below)

```diff
@@ -1,4 +1,4 @@
-import { redactText } from './search.utils';
+import { redactText } from './redactText';

 describe('redactText', () => {
   it('Hashtags are redacted', () => {
```

src/search/search.utils.ts → src/search/redactText.ts: file renamed without changes.
81 changes: 81 additions & 0 deletions src/search/search.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import { Indexer } from '../indexing/indexer';
import Search from './index';

const getKeywordsMockFn = jest.fn();

jest.mock('../indexing/indexer', () => {
return {
Indexer: jest.fn().mockImplementation(() => {
return {
getKeywords: getKeywordsMockFn,
getDocumentsByKeyword: () => [{}],
};
}),
};
});

beforeEach(() => {
jest.clearAllMocks();
});

describe('Search class', () => {
it('Highlights single keywords that can be stemmed', () => {
getKeywordsMockFn.mockReturnValue(['search', 'note']);
const text = 'This is a note that I will be use for searching';

const indexer = new Indexer(null);
const search = new Search(indexer);
const results = search.find(text);

expect(results).toEqual([
{
start: 10,
end: 14,
indexKeyword: 'note',
originalKeyword: 'note',
},
{
start: 38,
end: 47,
indexKeyword: 'search',
originalKeyword: 'searching',
},
]);
});

it('Longer keyword matches are always prioritised for highlight', () => {
getKeywordsMockFn.mockReturnValue(['github', 'github fork']);
const text = 'I use GitHub Forks as part of my development flow';

const indexer = new Indexer(null);
const search = new Search(indexer);
const results = search.find(text);

expect(results).toEqual([
{
start: 6,
end: 18,
indexKeyword: 'github fork',
originalKeyword: 'GitHub Forks',
},
]);
});

it('Three word keyword is highlighted', () => {
getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']);
const text = 'Designing a shared client record is a great idea but challenging';

const indexer = new Indexer(null);
const search = new Search(indexer);
const results = search.find(text);

expect(results).toEqual([
{
start: 12,
end: 32,
indexKeyword: 'share client record',
originalKeyword: 'shared client record',
},
]);
});
});
21 changes: 21 additions & 0 deletions src/stemmers/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { PorterStemmer } from 'natural';

import { WordPunctStemTokenizer } from '../tokenizers';

/**
* Stem a given phrase. If the phrase is made up of multiple words,
* the last word in the phrase is the only one that will be stemmed
* @param text input text
* @returns stemmed text
*/
export const stemLastWord = (text: string): string => {
return PorterStemmer.stem(text);
};

export const stemPhrase = (text: string): string => {
const tokenizer = new WordPunctStemTokenizer();
return tokenizer
.tokenize(text)
.map((t) => t.stem)
.join('');
};
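A quick usage sketch of the two helpers; the expected outputs assume `natural`'s Porter stemmer, which lowercases its input and strips suffixes at the end of the string (hence only the last word of a multi-word phrase changes under `stemLastWord`):

```ts
import { stemLastWord, stemPhrase } from './index';

console.log(stemLastWord('searching')); // 'search'
console.log(stemLastWord('GitHub Forks')); // 'github fork' (only the trailing suffix is touched)

// stemPhrase stems every word token while preserving the separators between them.
console.log(stemPhrase('Shared Client Records')); // 'share client record'
```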