Skip to content

Commit

Permalink
Optimize regexes used in tiktoken (#7020)
Browse files: browse the repository at this point in the history
* Optimize regexes used in tiktoken

* Add a comment citing the source of the regex patterns, and consolidate the duplicate regex from Roberta into the shared P50kBase regex
  • Loading branch information
stephentoub authored Feb 23, 2024
1 parent a139371 commit 4b89d98
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 16 deletions.
12 changes: 1 addition & 11 deletions src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Microsoft.ML.Tokenizers
{
Expand All @@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer
/// </summary>
public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer();

private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
#if NET7_0_OR_GREATER
[GeneratedRegex(PretokenizePattern)]
private static partial Regex PretokenizeRegex();
#else
private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled);
private static Regex PretokenizeRegex() => _regex;
#endif

/// <summary>
/// Splits the given string in multiple substrings at the word boundary, keeping track of the offsets of said substrings from the original string.
/// </summary>
Expand All @@ -40,7 +30,7 @@ public override IEnumerable<Split> PreTokenize(string sentence, bool skipSpecial
return Array.Empty<Split>();
}

return SplitSentence(sentence, PretokenizeRegex());
return SplitSentence(sentence, Tokenizer.P50kBaseRegex());
}
}
}
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer
/// </summary>
public static WhiteSpace Instance { get; } = new WhiteSpace();

private const string PretokenizePattern = @"\w+|[^\w\s]+";
private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
#if NET7_0_OR_GREATER
[GeneratedRegex(PretokenizePattern)]
private static partial Regex PretokenizeRegex();
Expand Down
10 changes: 6 additions & 4 deletions src/Microsoft.ML.Tokenizers/Tokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,10 @@ public static Task<Tokenizer> CreateByModelNameAsync(
}
}

private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
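// Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py,
// rewritten with the contraction alternatives factored behind a single leading apostrophe and, for cl100k_base, atomic groups.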

private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";

private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken";
private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
Expand All @@ -397,13 +399,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
private static partial Regex Cl100kBaseRegex();

[GeneratedRegex(P50kBaseRegexPattern)]
private static partial Regex P50kBaseRegex();
internal static partial Regex P50kBaseRegex();
#else
private static Regex? _cl100kBaseRegex;
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);

private static Regex? _p50kBaseRegex;
private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
#endif

/// <summary>
Expand Down

0 comments on commit 4b89d98

Please sign in to comment.