Commit 2e43416

[ConvertSlow] make sure the order is preserved for addedtokens (#31902)
* preserve the order
* oups
* oups
* nit
* trick
* fix issues
1 parent c43fd9d commit 2e43416

File tree

1 file changed: +32 additions, -9 deletions

src/transformers/convert_slow_tokenizer.py

Lines changed: 32 additions & 9 deletions
@@ -622,17 +622,40 @@ def decoder(self, replacement, add_prefix_space):
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
 
+        # control tokens are special
+        # user defined symbols are not
+        # both user and control tokens are AddedTokens
         # Add user defined symbols (type == 4) from sentnecepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
-        user_defined_symbols = [
-            AddedToken(token, normalized=False, special=False)
-            for token in [p.piece for p in self.proto.pieces if p.type == 4]
-        ]
-        control_symbols = [
-            AddedToken(token, normalized=False, special=True) for token in self.proto.trainer_spec.control_symbols
-        ]
-
-        tokenizer.add_tokens(user_defined_symbols + control_symbols)
 
+        tokens_to_add = {
+            id: AddedToken(token, normalized=False, special=special)
+            for id, token, special in [
+                (id, p.piece, p.type == 3) for id, p in enumerate(self.proto.pieces) if p.type in [3, 4]
+            ]
+        }
+        tokens_to_add = [k for _, k in sorted(tokens_to_add.items(), key=lambda x: x[0])]
+        if len(tokens_to_add) > 0:
+            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
+            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
+            # individual tokens would repeatedly rebuild a trie, which can be slow.
+            is_last_special = None
+            tokens = []
+            for token in tokens_to_add:
+                is_special = token.special
+                if is_last_special is None or is_last_special == is_special:
+                    tokens.append(token)
+                else:
+                    if is_last_special:
+                        tokenizer.add_special_tokens(tokens)
+                    else:
+                        tokenizer.add_tokens(tokens)
+                    tokens = [token]
+                is_last_special = is_special
+            if tokens:
+                if is_last_special:
+                    tokenizer.add_special_tokens(tokens)
+                else:
+                    tokenizer.add_tokens(tokens)
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:

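The new code collects both control pieces (type == 3, special) and user defined pieces (type == 4, not special) keyed by their sentencepiece id, sorts them by id so the original vocabulary order is preserved, and then flushes runs of consecutive tokens that share the same `special` flag, so the backend tokenizer rebuilds its trie once per run instead of once per token. Below is a minimal sketch of that batching idea, using `itertools.groupby` in place of the manual accumulator from the diff; the helper name `add_tokens_in_batches` is illustrative and not part of this commit.

    from itertools import groupby

    from tokenizers import AddedToken, Tokenizer


    def add_tokens_in_batches(tokenizer: Tokenizer, tokens_to_add: list[AddedToken]) -> None:
        # `tokens_to_add` is assumed to already be sorted by sentencepiece id,
        # as in the diff above, so iteration order is the order to preserve.
        for is_special, run in groupby(tokens_to_add, key=lambda t: t.special):
            batch = list(run)
            # One call per run of same-flagged tokens: the added-vocabulary trie
            # is rebuilt once per batch rather than once per token.
            if is_special:
                tokenizer.add_special_tokens(batch)
            else:
                tokenizer.add_tokens(batch)

Grouping only adjacent tokens (rather than splitting into one special and one non-special bucket) is what keeps the insertion order, and therefore the assigned token ids, identical to the order in the sentencepiece model.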