-
Notifications
You must be signed in to change notification settings - Fork 355
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Utils and helpers for sequence salience, most notably token grouping code.
PiperOrigin-RevId: 606346156
- Loading branch information
Showing
8 changed files
with
243 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/** | ||
* @fileoverview Utils for working with tokenized text. | ||
*/ | ||
|
||
/**
 * Sentinel character that sentencepiece (SPM) emits in place of a space.
 * It is U+2581 (LOWER ONE EIGHTH BLOCK), which looks like an underscore but
 * is a distinct character; it is written as an escape here so it cannot be
 * confused with '_'.
 */
export const SPM_SPACE_SENTINEL = '\u2581';
|
||
/**
 * Makes sentencepiece output easier to read by swapping every occurrence of
 * the SPM space sentinel for an ordinary space. All other characters,
 * including newlines, are left untouched.
 *
 * @param text Text that may contain SPM space sentinels.
 * @returns The same text with each sentinel replaced by ' '.
 */
export function cleanSpmText(text: string): string {
  // Cut the text at every sentinel, then glue the pieces back with spaces.
  const pieces = text.split(SPM_SPACE_SENTINEL);
  return pieces.join(' ');
}
|
||
/**
 * Groups tokens into segments, using a regex to locate segment prefixes.
 *
 * The tokens are concatenated and `matcher` is run over the joined text.
 * Every match marks the start of a segment: the token that contains the
 * first character of the match opens a new group, and all following tokens
 * are appended to that group until another token opens the next one. Tokens
 * that precede the first match form their own leading group.
 *
 * When several matches begin inside the same token, that token opens a single
 * group and the extra matches are discarded, so they cannot trigger a
 * spurious split in front of a later token.
 *
 * @param tokens Token strings, in order; they are joined without a separator.
 * @param matcher Regex whose matches mark segment prefixes. It must carry the
 *     global (`g`) flag, because `String.prototype.matchAll` throws a
 *     TypeError for a non-global regex.
 * @returns The tokens, in their original order, partitioned into non-empty
 *     groups. An empty `tokens` array yields an empty result.
 */
export function groupTokensByRegexPrefix(
  tokens: string[],
  matcher: RegExp,
): string[][] {
  const text = tokens.join('');
  // Start offsets (chars into `text`) of every segment prefix, ascending.
  // Results of matchAll always carry an index, hence the assertion.
  const matchStarts = [...text.matchAll(matcher)].map((m) => m.index!);

  let textCharOffset = 0; // chars into text
  let matchIdx = 0; // index into matchStarts
  const groups: string[][] = [];
  let acc: string[] = [];
  for (const token of tokens) {
    const tokenEnd = textCharOffset + token.length;

    // Consume EVERY match that starts before the end of this token. Advancing
    // by only one would leave a stale match behind the cursor, which would
    // then wrongly split off the next token.
    let startsSegment = false;
    while ((matchStarts[matchIdx] ?? Infinity) < tokenEnd) {
      startsSegment = true;
      matchIdx += 1;
    }

    if (startsSegment) {
      // `acc` is only empty here when the very first token opens a segment;
      // in that case there is nothing to flush.
      if (acc.length > 0) groups.push(acc);
      acc = [];
    }

    acc.push(token);
    textCharOffset = tokenEnd;
  }
  // Finally, flush the group that is still open.
  if (acc.length > 0) groups.push(acc);
  return groups;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/** | ||
* Testing for token_utils.ts | ||
*/ | ||
|
||
import 'jasmine'; | ||
|
||
import * as tokenUtils from './token_utils'; | ||
|
||
describe('cleanSpmText test', () => {
  it('cleans magic underscores from SPM output', () => {
    // The input carries an SPM sentinel wherever the expected text has a
    // space; the newlines must survive unchanged.
    const spmText = 'Summarize▁this▁sentence:\n\nOnce▁upon▁a▁time';
    const expected = 'Summarize this sentence:\n\nOnce upon a time';

    const cleaned = tokenUtils.cleanSpmText(spmText);

    expect(cleaned).toEqual(expected);
  });
});
|
||
describe('groupTokensByRegexPrefix test', () => {
  /** One parameterized case: input tokens, prefix regex, expected groups. */
  interface GroupingTestCase {
    testcaseName: string;
    tokens: string[];
    regex: RegExp;
    expectedGroups: string[][];
  }

  // Token sequence shared by the newline-handling cases: a prompt line, two
  // newline tokens, then the start of a story. A fresh array is returned for
  // every case so that no case shares state with another.
  const promptAndStoryTokens = (): string[] => [
    'Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':', '\n', '\n', 'Once',
    '▁upon', '▁a', '▁time',
  ];

  const testCases: GroupingTestCase[] = [
    {
      testcaseName: 'groups tokens by word',
      tokens: ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'],
      // A word starts at any run of sentinels and/or whitespace.
      regex: /[▁\s]+/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize'],
        ['▁this'],
        ['▁sent', 'ence', ':'],
      ],
    },
    {
      testcaseName: 'groups tokens by word, handling newlines',
      tokens: promptAndStoryTokens(),
      // A run of consecutive newlines becomes its own segment (\s matches
      // \n), and the first non-\n character after it starts a new word.
      regex: /([▁\s]+)|(?<=\n)[^\n]/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize'],
        ['▁this'],
        ['▁sent', 'ence', ':'],
        ['\n', '\n'],
        ['Once'],
        ['▁upon'],
        ['▁a'],
        ['▁time'],
      ],
    },
    {
      testcaseName: 'groups tokens by sentence, simple version',
      tokens: [
        'Sent', 'ence', '▁one', '.', '▁Sent', 'ence', '▁two', '!', '▁Sent',
        'ence', '▁three', '?',
      ],
      // A sentence starts at sentinels/whitespace that follow one of [.?!].
      regex: /(?<=[.?!])[▁\s]+/g,
      expectedGroups: [
        ['Sent', 'ence', '▁one', '.'],
        ['▁Sent', 'ence', '▁two', '!'],
        ['▁Sent', 'ence', '▁three', '?'],
      ],
    },
    {
      testcaseName: 'groups tokens by sentence, handling newlines',
      tokens: promptAndStoryTokens(),
      // A sentence start is one of:
      //  - a run of consecutive \n, kept as its own segment
      //  - any non-\n character directly after a \n
      //  - whitespace or sentinels directly after punctuation [.?!]
      regex: /(\n+)|((?<=\n)[^\n])|((?<=[.?!])([▁\s]+))/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'],
        ['\n', '\n'],
        ['Once', '▁upon', '▁a', '▁time'],
      ],
    },
    {
      testcaseName: 'groups tokens by line',
      tokens: promptAndStoryTokens(),
      // The regex matches either a run of consecutive \n or a run of non-\n
      // characters, so each line and each run of newlines begins its own
      // segment.
      regex: /(\n+)|([^\n]+)/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'],
        ['\n', '\n'],
        ['Once', '▁upon', '▁a', '▁time'],
      ],
    },
  ];

  // Register one spec per case, in declaration order.
  for (const {testcaseName, tokens, regex, expectedGroups} of testCases) {
    it(testcaseName, () => {
      const actualGroups = tokenUtils.groupTokensByRegexPrefix(tokens, regex);
      expect(actualGroups).toEqual(expectedGroups);
    });
  }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters