Skip to content

Commit d0aa2c2

Browse files
authored
Address the feedback on the tokenizer's library (dotnet#7024)
* Fix cache when calling EncodeToIds * Make EnglishRoberta _mergeRanks thread safe * Delete Trainer * Remove the setters on the Bpe properties * Remove Roberta and Tiktoken special casing in the Tokenizer and support the cases in the Model abstraction * Support text-embedding-3-small/large embedding * Remove redundant TokenToId abstraction and keep the one with the extra parameters * Enable creating Tiktoken asynchronously or directly using the tokenizer data * Add cancellationToken support in CreateAsync APIs * Rename sequence to text and Tokenize to Encode * Rename skipSpecialTokens to considerSpecialTokens * Rename TokenizerResult to EncodingResult * Make Token publicly immutable * Change offset tuples from (Index, End) to (Index, Length) * Rename NormalizedString method's parameters * Rename Model's methods to start with verb * Convert Model.GetVocab() method to a Vocab property * Some method's parameters and variable renaming * Remove Vocab and VocabSize from the abstraction * Cleanup normalization support * Minor Bpe cleanup * Resolve rebase change * Address the feedback
1 parent 4b89d98 commit d0aa2c2

31 files changed

+838
-6033
lines changed

src/Microsoft.ML.Tokenizers/TokenizerResult.cs renamed to src/Microsoft.ML.Tokenizers/EncodingResult.cs

+7-7
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,16 @@ namespace Microsoft.ML.Tokenizers
1111
/// <summary>
1212
/// The Encoding represents the output of a Tokenizer.
1313
/// </summary>
14-
public sealed class TokenizerResult
14+
public sealed class EncodingResult
1515
{
1616
/// <summary>
17-
/// Create a new object of the TokenizerResult object.
17+
/// Creates a new instance of the EncodingResult class.
1818
/// </summary>
1919
/// <param name="originalString">The original text that was encoded by the tokenizer.</param>
2020
/// <param name="normalizedString">The normalized form of the original text, or null if no normalization was applied.</param>
2121
/// <param name="splits">The splits produced by pre-tokenizing the text.</param>
2222
/// <param name="offsetsMappedToOriginalString">Indicates whether the offsets are mapped to the original string or to the normalized string.</param>
23-
public TokenizerResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
23+
public EncodingResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
2424
{
2525
OriginalString = originalString;
2626
NormalizedString = normalizedString;
@@ -47,7 +47,7 @@ public TokenizerResult(string originalString, string normalizedString, IEnumerab
4747
private List<Token>? _tokens;
4848
private List<string>? _tokensWords;
4949
private List<int>? _ids;
50-
private List<(int Index, int End)>? _offsets;
50+
private List<(int Index, int Length)>? _offsets;
5151

5252
internal void AddTokens(IReadOnlyList<Token> addedTokens)
5353
{
@@ -121,10 +121,10 @@ public IReadOnlyList<string> Tokens
121121
}
122122

123123
/// <summary>
124-
/// Gets The list of offsets. These offsets lets you slice the input string, and thus retrieve
124+
/// Gets the list of offsets. These offsets let you slice the input string, and thus retrieve
125125
/// the original part that led to producing the corresponding token.
126126
/// </summary>
127-
public IReadOnlyList<(int Index, int End)> Offsets
127+
public IReadOnlyList<(int Index, int Length)> Offsets
128128
{
129129
get
130130
{
@@ -138,7 +138,7 @@ public IReadOnlyList<string> Tokens
138138
return Array.Empty<(int, int)>();
139139
}
140140

141-
_offsets = new List<(int Index, int End)>(_tokens.Count);
141+
_offsets = new List<(int Index, int Length)>(_tokens.Count);
142142

143143
foreach (var token in _tokens)
144144
{

src/Microsoft.ML.Tokenizers/Model/BPE.cs

+137-143
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)