
Commit 9fe0a98

craymichael authored and facebook-github-bot committed
Improve tokenizer pretty-pretty logic + __call__ method (pytorch#1417)
Summary: Use the __call__ method of tokenizers that returns a BatchEncoding with offsets. This allows us to grab text from the fully decoded string and not make assumptions about how many tokens correspond to a single string.

Differential Revision: D64998804
1 parent 85d3130 commit 9fe0a98
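
The mechanism described in the summary, as a minimal sketch: decode the attributed ids once, re-encode the decoded string with return_offsets_mapping=True, and slice that string by the returned character spans instead of decoding id-by-id. This sketch assumes a Hugging Face fast tokenizer (gpt2 is only an illustration, not something the commit prescribes); any tokenizer whose __call__ returns a BatchEncoding with an offset_mapping behaves the same way.

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")  # fast tokenizer -> offsets available
    ids = tok.encode("Attribution over LLM tokens", add_special_tokens=False)

    txt = tok.decode(ids)
    enc = tok(txt, return_offsets_mapping=True, add_special_tokens=False)
    spans = enc["offset_mapping"]                       # one (start, end) pair per token
    pieces = [txt[start:end] for start, end in spans]   # text per token, no Ġ artifacts
    print(pieces)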

File tree

5 files changed: +329, -21 lines changed


captum/_utils/typing.py

Lines changed: 15 additions & 3 deletions
@@ -53,12 +53,23 @@ class TokenizerLike(Protocol):
     LLM attribution methods."""
 
     @overload
-    def encode(self, text: str, return_tensors: None = None) -> List[int]: ...
+    def encode(
+        self, text: str, add_special_tokens: bool = ..., return_tensors: None = ...
+    ) -> List[int]: ...
+
     @overload
-    def encode(self, text: str, return_tensors: Literal["pt"]) -> Tensor: ...
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = ...,
+        return_tensors: Literal["pt"] = ...,
+    ) -> Tensor: ...
 
     def encode(
-        self, text: str, return_tensors: Optional[str] = None
+        self,
+        text: str,
+        add_special_tokens: bool = True,
+        return_tensors: Optional[str] = None,
     ) -> Union[List[int], Tensor]: ...
 
     def decode(self, token_ids: Tensor) -> str: ...
@@ -84,5 +95,6 @@ def convert_tokens_to_ids(
     def __call__(
         self,
         text: Optional[Union[str, List[str], List[List[str]]]] = None,
+        add_special_tokens: bool = True,
         return_offsets_mapping: bool = False,
     ) -> BatchEncodingType: ...
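
As a sanity check on the widened protocol (an assumption on my part, not something stated in the diff): a Hugging Face fast tokenizer should satisfy TokenizerLike, since its encode and __call__ accept add_special_tokens and it can return offset mappings.

    from transformers import AutoTokenizer
    from captum._utils.typing import TokenizerLike

    tok: TokenizerLike = AutoTokenizer.from_pretrained("gpt2")
    ids = tok.encode("hello world", add_special_tokens=False)          # List[int]
    enc = tok("hello world", add_special_tokens=False, return_offsets_mapping=True)
    print(ids, enc["input_ids"], enc["offset_mapping"])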

captum/attr/_core/llm_attr.py

Lines changed: 66 additions & 5 deletions
@@ -1,4 +1,7 @@
 # pyre-strict
+
+import warnings
+
 from copy import copy
 
 from textwrap import shorten
@@ -216,6 +219,11 @@ def plot_seq_attr(
         return fig, ax
 
 
+def _clean_up_pretty_token(token: str) -> str:
+    """Remove newlines and leading/trailing whitespace from token."""
+    return token.replace("\n", "\\n").strip()
+
+
 def _convert_ids_to_pretty_tokens(ids: Tensor, tokenizer: TokenizerLike) -> List[str]:
     """
     Convert ids to tokens without ugly unicode characters (e.g., Ġ). See:
@@ -230,10 +238,63 @@ def _convert_ids_to_pretty_tokens(ids: Tensor, tokenizer: TokenizerLike) -> List
     > BPE splitting mostly to avoid digesting spaces since the standard BPE algorithm
     > used spaces in its process
     """
+    txt = tokenizer.decode(ids)
+    # Don't add special tokens (they're either already there, or we don't want them)
+    enc = tokenizer(txt, return_offsets_mapping=True, add_special_tokens=False)
+    input_ids = cast(List[int], enc["input_ids"])
+    offset_mapping = cast(List[Tuple[int, int]], enc["offset_mapping"])
+
+    pretty_tokens = []
+    end_prev = -1
+    idx = 0
+    for i, (input_id, offset) in enumerate(zip(input_ids, offset_mapping)):
+        start, end = offset
+        if start == end:
+            # For the case where offsets are not set properly (the end and start are
+            # equal for all tokens - fall back on the start of the next span in the
+            # offset mapping)
+            if (i + 1) < len(input_ids):
+                end = offset_mapping[i + 1][0]
+            else:
+                end = len(txt)
+        if input_id != ids[idx]:
+            # When the re-encoded string doesn't match the original encoding we skip
+            # this token and hope for the best, falling back on a naive method. This
+            # can happen when a tokenizer might add a token that corresponds to
+            # a space only when add_special_tokens=False.
+            warnings.warn(
+                f"(i={i}) input_id {input_id} != ids[idx] {ids[idx]} (corresponding "
+                f"to text: {repr(txt[start:end])}). Skipping this token.",
+                stacklevel=2,
+            )
+            continue
+        pretty_tokens.append(
+            _clean_up_pretty_token(txt[start:end])
+            + (" [OVERLAP]" if end_prev > start else "")
+        )
+        end_prev = end
+        idx += 1
+    if len(pretty_tokens) != len(ids):
+        warnings.warn(
+            f"Pretty tokens length {len(pretty_tokens)} != ids length {len(ids)}! "
+            "Falling back to naive decoding logic.",
+            stacklevel=2,
+        )
+        return _convert_ids_to_pretty_tokens_fallback(ids, tokenizer)
+    return pretty_tokens
+
+
+def _convert_ids_to_pretty_tokens_fallback(
+    ids: Tensor, tokenizer: TokenizerLike
+) -> List[str]:
+    """
+    Fallback function that naively handles logic when multiple ids map to one string.
+    """
     pretty_tokens = []
     idx = 0
     while idx < len(ids):
         decoded = tokenizer.decode(ids[idx])
+        decoded_pretty = _clean_up_pretty_token(decoded)
         # Handle case where single token (e.g. unicode) is split into multiple IDs
         # NOTE: This logic will fail if a tokenizer splits a token into 3+ IDs
         if decoded.strip() == "�" and tokenizer.encode(decoded) != [ids[idx]]:
@@ -244,17 +305,17 @@ def _convert_ids_to_pretty_tokens(ids: Tensor, tokenizer: TokenizerLike) -> List
             ]:
                 # Both tokens are from a split, combine them
                 decoded = tokenizer.decode(ids[idx : idx + 2])
-                pretty_tokens.append(decoded + "[1/2]")
-                pretty_tokens.append(decoded + "[2/2]")
+                pretty_tokens.append(decoded_pretty)
+                pretty_tokens.append(decoded_pretty + " [OVERLAP]")
             else:
                 # Treat tokens as separate
-                pretty_tokens.append(decoded)
-                pretty_tokens.append(decoded_next)
+                pretty_tokens.append(decoded_pretty)
+                pretty_tokens.append(_clean_up_pretty_token(decoded_next))
             idx += 2
         else:
             # Just a normal token
             idx += 1
-            pretty_tokens.append(decoded)
+            pretty_tokens.append(decoded_pretty)
     return pretty_tokens
 
 
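
A hedged usage sketch of the reworked helper (it is a private function, and the output below is illustrative rather than asserted; assumes the gpt2 fast tokenizer from transformers): newlines are escaped by _clean_up_pretty_token, and a token whose character span overlaps the previous span is rendered with an " [OVERLAP]" suffix instead of the old "[1/2]"/"[2/2]" labels.

    import torch
    from transformers import AutoTokenizer
    from captum.attr._core.llm_attr import _convert_ids_to_pretty_tokens

    tok = AutoTokenizer.from_pretrained("gpt2")
    ids = torch.tensor(tok.encode("Hello\nworld 🙂", add_special_tokens=False))
    print(_convert_ids_to_pretty_tokens(ids, tok))
    # e.g. the emoji decodes to several byte-level ids that share one character span,
    # so each id after the first in that span is shown with " [OVERLAP]".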

tests/attr/test_interpretable_input.py

Lines changed: 15 additions & 3 deletions
@@ -20,12 +20,23 @@ def __init__(self, vocab_list) -> None:
         self.unk_idx = len(vocab_list) + 1
 
     @overload
-    def encode(self, text: str, return_tensors: None = None) -> List[int]: ...
+    def encode(
+        self, text: str, add_special_tokens: bool = ..., return_tensors: None = ...
+    ) -> List[int]: ...
+
     @overload
-    def encode(self, text: str, return_tensors: Literal["pt"]) -> Tensor: ...
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = ...,
+        return_tensors: Literal["pt"] = ...,
+    ) -> Tensor: ...
 
     def encode(
-        self, text: str, return_tensors: Optional[str] = "pt"
+        self,
+        text: str,
+        add_special_tokens: bool = True,
+        return_tensors: Optional[str] = "pt",
     ) -> Union[List[int], Tensor]:
         assert return_tensors == "pt"
         return torch.tensor([self.convert_tokens_to_ids(text.split(" "))])
@@ -72,6 +83,7 @@ def decode(self, token_ids: Tensor) -> str:
     def __call__(
         self,
         text: Optional[Union[str, List[str], List[List[str]]]] = None,
+        add_special_tokens: bool = True,
         return_offsets_mapping: bool = False,
     ) -> BatchEncodingType:
         raise NotImplementedError

tests/attr/test_llm_attr.py

Lines changed: 38 additions & 7 deletions
@@ -3,6 +3,8 @@
 # pyre-strict
 
 import copy
+
+from collections import UserDict
 from typing import (
     Any,
     cast,
@@ -40,24 +42,38 @@ class DummyTokenizer:
     vocab_size: int = 256
     sos: int = 0
     unk: int = 1
-    special_tokens: Dict[int, str] = {sos: "<sos>", unk: "<unk>"}
+    sos_str: str = "<sos>"
+    special_tokens: Dict[int, str] = {sos: sos_str, unk: "<unk>"}
 
     @overload
-    def encode(self, text: str, return_tensors: None = None) -> List[int]: ...
+    def encode(
+        self, text: str, add_special_tokens: bool = ..., return_tensors: None = ...
+    ) -> List[int]: ...
+
     @overload
-    def encode(self, text: str, return_tensors: Literal["pt"]) -> Tensor: ...
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = ...,
+        return_tensors: Literal["pt"] = ...,
+    ) -> Tensor: ...
 
     def encode(
-        self, text: str, return_tensors: Optional[str] = None
+        self,
+        text: str,
+        add_special_tokens: bool = True,
+        return_tensors: Optional[str] = None,
     ) -> Union[List[int], Tensor]:
         tokens = text.split(" ")
 
         tokens_ids: Union[List[int], Tensor] = [
-            ord(s[0]) if len(s) == 1 else self.unk for s in tokens
+            ord(s[0]) if len(s) == 1 else (self.sos if s == self.sos_str else self.unk)
+            for s in tokens
         ]
 
         # start with sos
-        tokens_ids = [self.sos, *tokens_ids]
+        if add_special_tokens:
+            tokens_ids = [self.sos, *tokens_ids]
 
         if return_tensors:
             return torch.tensor([tokens_ids])
@@ -100,9 +116,24 @@ def decode(self, token_ids: Tensor) -> str:
     def __call__(
         self,
         text: Optional[Union[str, List[str], List[List[str]]]] = None,
+        add_special_tokens: bool = True,
         return_offsets_mapping: bool = False,
     ) -> BatchEncodingType:
-        raise NotImplementedError
+        assert isinstance(text, str)
+        input_ids = self.encode(text, add_special_tokens=add_special_tokens)
+
+        result: BatchEncodingType = UserDict()
+        result["input_ids"] = input_ids
+
+        if return_offsets_mapping:
+            offset_mapping = []
+            idx = 0
+            for token in text.split(" "):
+                offset_mapping.append((idx - (0 if idx == 0 else 1), idx + len(token)))
+                idx += len(token) + 1  # +1 for space
+            result["offset_mapping"] = offset_mapping
+
+        return result
 
 
 class Result(NamedTuple):
