@@ -622,17 +622,40 @@ def decoder(self, replacement, add_prefix_space):
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
 
+        # control tokens are special
+        # user defined symbols are not
+        # both user and control tokens are AddedTokens
         # Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
-        user_defined_symbols = [
-            AddedToken(token, normalized=False, special=False)
-            for token in [p.piece for p in self.proto.pieces if p.type == 4]
-        ]
-        control_symbols = [
-            AddedToken(token, normalized=False, special=True) for token in self.proto.trainer_spec.control_symbols
-        ]
-
-        tokenizer.add_tokens(user_defined_symbols + control_symbols)
 
+        tokens_to_add = {
+            id: AddedToken(token, normalized=False, special=special)
+            for id, token, special in [
+                (id, p.piece, p.type == 3) for id, p in enumerate(self.proto.pieces) if p.type in [3, 4]
+            ]
+        }
+        tokens_to_add = [k for _, k in sorted(tokens_to_add.items(), key=lambda x: x[0])]
+        if len(tokens_to_add) > 0:
+            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
+            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
+            # individual tokens would repeatedly rebuild a trie, which can be slow.
+            is_last_special = None
+            tokens = []
+            for token in tokens_to_add:
+                is_special = token.special
+                if is_last_special is None or is_last_special == is_special:
+                    tokens.append(token)
+                else:
+                    if is_last_special:
+                        tokenizer.add_special_tokens(tokens)
+                    else:
+                        tokenizer.add_tokens(tokens)
+                    tokens = [token]
+                is_last_special = is_special
+            if tokens:
+                if is_last_special:
+                    tokenizer.add_special_tokens(tokens)
+                else:
+                    tokenizer.add_tokens(tokens)
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:
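For illustration, below is a minimal standalone sketch of the batching idea in the diff: sentencepiece pieces of type 3 (CONTROL) become special AddedTokens, pieces of type 4 (USER_DEFINED) become non-special ones, and consecutive runs with the same `special` flag are added in a single call so the tokenizer's trie is rebuilt once per run rather than once per token. The FakeAddedToken / FakeTokenizer classes and the add_in_batches helper are hypothetical stand-ins, not the real tokenizers API, and the sketch uses itertools.groupby instead of the manual is_last_special tracking above; the grouping behaviour is the same.

from dataclasses import dataclass
from itertools import groupby
from typing import List

# Hypothetical stand-ins for tokenizers.AddedToken and tokenizers.Tokenizer,
# used only to make the run-batching idea executable without the real library.
@dataclass
class FakeAddedToken:
    content: str
    special: bool

class FakeTokenizer:
    def __init__(self):
        self.calls: List[str] = []

    def add_tokens(self, tokens):
        self.calls.append(f"add_tokens({[t.content for t in tokens]})")

    def add_special_tokens(self, tokens):
        self.calls.append(f"add_special_tokens({[t.content for t in tokens]})")

def add_in_batches(tokenizer, tokens_to_add):
    # Group consecutive tokens that share the same `special` flag so each run
    # results in exactly one add_* call (one trie rebuild) instead of one per token.
    for is_special, run in groupby(tokens_to_add, key=lambda t: t.special):
        run = list(run)
        if is_special:
            tokenizer.add_special_tokens(run)
        else:
            tokenizer.add_tokens(run)

tok = FakeTokenizer()
add_in_batches(
    tok,
    [
        FakeAddedToken("<s>", True),         # control piece (type == 3) -> special
        FakeAddedToken("</s>", True),
        FakeAddedToken("<extra_0>", False),  # user defined piece (type == 4) -> not special
        FakeAddedToken("<extra_1>", False),
        FakeAddedToken("<mask>", True),
    ],
)
print("\n".join(tok.calls))
# add_special_tokens(['<s>', '</s>'])
# add_tokens(['<extra_0>', '<extra_1>'])
# add_special_tokens(['<mask>'])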