Bump transformers version #1631

Merged · 7 commits · Nov 5, 2024
13 changes: 13 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -119,6 +119,15 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 ExampleType = Literal['prompt_response', 'chat']
 TokenizedExample = dict[str, list[dict[str, list[int]]]]

+_DEFAULT_CHAT_TEMPLATE = (
+    '{% for message in messages %}'
+    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
+    '{% endfor %}'
+    '{% if add_generation_prompt %}'
+    "{{ '<|im_start|>assistant\n' }}"
+    '{% endif %}'
+)
+

 def _get_example_type(example: Example) -> ExampleType:
     """Determines the type of the input example.
@@ -243,17 +252,21 @@ def slice_out_last_turn(
     messages_through_current_turn: list[dict[str, str]],
     conversation_through_previous_turn: str,
 ) -> tuple[str, str]:
+    chat_template = None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
+
     try:
         full_conversation = tokenizer.apply_chat_template(
             messages_through_current_turn,
             tokenize=False,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
         prompt_with_history = tokenizer.apply_chat_template(
             messages_through_current_turn[:-1],
             tokenize=False,
             add_generation_prompt=True,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
     except Exception as e:
         raise ChatTemplateError(
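Context for the `_DEFAULT_CHAT_TEMPLATE` fallback above: recent transformers releases removed the implicit default chat template, so `apply_chat_template` raises for tokenizers that ship without one, and this constant restores a ChatML-style fallback. A minimal sketch of the resulting behavior, assuming `gpt2` as an arbitrary tokenizer with no chat template of its own:

```python
from transformers import AutoTokenizer

# Mirrors _DEFAULT_CHAT_TEMPLATE from the diff above (ChatML-style).
_DEFAULT_CHAT_TEMPLATE = (
    '{% for message in messages %}'
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    '{% endfor %}'
    '{% if add_generation_prompt %}'
    "{{ '<|im_start|>assistant\n' }}"
    '{% endif %}'
)

tokenizer = AutoTokenizer.from_pretrained('gpt2')
messages = [{'role': 'user', 'content': 'Hello!'}]

# Only supply the fallback when the tokenizer has no template of its own,
# mirroring the `chat_template = None if ... else ...` line in the diff.
chat_template = (
    None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
)
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    chat_template=chat_template,
)
print(text)
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```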
4 changes: 4 additions & 0 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -99,13 +99,17 @@ class InvalidConfigAccessError(KeyError):
     # Not set but llama modeling code tries to read this attribute
     'partial_rotary_factor',

+    # This key is accessed with a default of hidden_size / num_attention_heads
+    'head_dim',
+
     # Benign transformers attributes needed for __init__
     '_get_generation_defaults',
     'label2id',
     'id2label',
     'torch_dtype',
     'problem_type',
     '__class__',
+    '_get_global_generation_defaults',
 }
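For context on this allowlist: per the comments in the diff, MPT runs llama modeling code against a restricted config that raises `InvalidConfigAccessError` for any attribute outside this set, so each new attribute that a newer transformers release probes (here `head_dim` and `_get_global_generation_defaults`) must be added explicitly. An illustrative sketch of the pattern, simplified rather than the exact llm-foundry implementation:

```python
class InvalidConfigAccessError(KeyError):
    pass

# Attributes that transformers is allowed to probe on the restricted config.
_ALLOWED_KEYS = {
    'hidden_size',
    'num_attention_heads',
    'head_dim',  # read with a default of hidden_size / num_attention_heads
    '_get_global_generation_defaults',
}

class GuardedConfig:
    def __init__(self, **values):
        object.__setattr__(self, '_values', values)

    def __getattr__(self, key: str):
        # Invoked only when normal attribute lookup fails.
        if key not in _ALLOWED_KEYS:
            raise InvalidConfigAccessError(key)
        return self._values.get(key)

cfg = GuardedConfig(hidden_size=768, num_attention_heads=12)
assert cfg.head_dim is None  # allowlisted probe: tolerated, returns the unset value
# cfg.rope_theta would raise InvalidConfigAccessError: fail fast on unknown reads
```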
4 changes: 4 additions & 0 deletions llmfoundry/tokenizers/tiktoken.py
@@ -69,6 +69,7 @@ def __init__(
         bos_token: Optional[str] = '<|endoftext|>',
         pad_token: Optional[str] = None,
         errors: str = 'replace',
+        chat_template: Optional[str] = None,
         **kwargs: Any,
     ):
         """Constructor creates a tiktoken tokenizer to use as the underlying.
@@ -90,6 +91,8 @@ def __init__(
             errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
                 [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
                 Defaults to `"replace"`.
+            chat_template (Optional[str], optional): The Hugging Face chat template. Default will use the ``default_chat_template``
+                set on this class.
             kwargs (Any): Other relevant keyword arguments.
         """
         try:
@@ -178,6 +181,7 @@ def pickle_Encoding(enc: Encoding):
             bos_token=bos_token,
             pad_token=pad_token,
             errors=errors,
+            chat_template=chat_template or self.default_chat_template,
             **kwargs,
         )
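The new `chat_template` argument threads through to the underlying tokenizer constructor, and `chat_template or self.default_chat_template` keeps the old behavior when the argument is omitted. A hedged usage sketch; the wrapper class name and the `encoding_name` value are assumptions about this file rather than verified API:

```python
# Assumed: TiktokenTokenizerWrapper is the class defined in this file and
# accepts an encoding_name; the template literal is a placeholder.
from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

custom_template = (
    "{% for m in messages %}{{ m['role'] + ': ' + m['content'] + '\n' }}{% endfor %}"
)

tokenizer = TiktokenTokenizerWrapper(
    encoding_name='cl100k_base',
    chat_template=custom_template,  # omit to fall back to default_chat_template
)
print(tokenizer.chat_template)
```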
2 changes: 1 addition & 1 deletion setup.py
@@ -55,7 +55,7 @@
     'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27',
     'mlflow>=2.14.1,<2.18',
     'accelerate>=0.25,<0.34',  # for HF inference `device_map`
-    'transformers>=4.43.2,<4.44',
+    'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
     'torch>=2.4.0,<2.4.1',
     'datasets>=2.19,<2.20',
4 changes: 4 additions & 0 deletions tests/data/test_template_tokenization.py
@@ -7,6 +7,7 @@
 import transformers

 from llmfoundry.data.finetuning.tasks import (
+    _DEFAULT_CHAT_TEMPLATE,
     _slice_chat_formatted_example,
     dataset_constructor,
     tokenize_formatted_example,
@@ -304,6 +305,9 @@ def test_multi_turn_chat_slicing(
     if use_date_string:
         tok.chat_template = "{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{{- \"Today Date: \" + date_string }}\n"

+    if not tok.chat_template:
+        tok.chat_template = _DEFAULT_CHAT_TEMPLATE
+
     templated_prompt_response_turns = _slice_chat_formatted_example(
         example,
         tok,
4 changes: 4 additions & 0 deletions tests/models/test_model.py
@@ -1668,6 +1668,10 @@ def check_hf_model_equivalence(
     del expected_model_config_dict['_name_or_path']
     del new_model_config_dict['_name_or_path']

+    # Transformers changes this key on load from disk
+    del expected_model_config_dict['_attn_implementation_autoset']
+    del new_model_config_dict['_attn_implementation_autoset']
+
     assert expected_model_config_dict == new_model_config_dict
     assert sum(p.numel() for p in model1.parameters()
                ) == sum(p.numel() for p in model2.parameters())
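The comment in the diff explains the deletion: transformers now sets `_attn_implementation_autoset` when a model is loaded from disk, so a freshly built config and a round-tripped one differ on that key even when the models match. A small sketch of the idiom with a hypothetical `configs_equal` helper; unlike the test's `del`, `pop` with a default also tolerates configs that lack the key:

```python
def configs_equal(cfg_a: dict, cfg_b: dict) -> bool:
    # Keys transformers mutates on a save/load round trip.
    volatile = ('_name_or_path', '_attn_implementation_autoset')
    a, b = dict(cfg_a), dict(cfg_b)
    for key in volatile:
        a.pop(key, None)
        b.pop(key, None)
    return a == b

assert configs_equal(
    {'d_model': 768, '_attn_implementation_autoset': True},
    {'d_model': 768},
)
```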
4 changes: 4 additions & 0 deletions tests/tokenizers/test_tokenizer.py
@@ -6,6 +6,7 @@
 from omegaconf import OmegaConf as om
 from transformers import AutoTokenizer

+from llmfoundry.data.finetuning.tasks import _DEFAULT_CHAT_TEMPLATE
 from llmfoundry.tokenizers.utils import get_date_string
@@ -115,6 +116,9 @@ def test_tokenizer_date_string(tokenizer_name: str, use_date_string: bool):
     if use_date_string:
         tokenizer.chat_template = "{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{{- \"Today Date: \" + date_string }}\n"

+    if not tokenizer.chat_template:
+        tokenizer.chat_template = _DEFAULT_CHAT_TEMPLATE
+
     token_ids = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
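What the `date_string` test exercises: extra keyword arguments to `apply_chat_template` are forwarded into the Jinja rendering context, so templates such as Llama 3.1's can read a `date_string` variable. A minimal sketch, assuming `gpt2` as a stand-in tokenizer and reusing the template the test assigns:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.chat_template = (
    '{%- if not date_string is defined %}\n'
    '    {%- set date_string = "26 Jul 2024" %}\n'
    '{%- endif %}\n'
    '{{- "Today Date: " + date_string }}\n'
)

# The extra kwarg becomes a template variable during rendering.
text = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'hi'}],
    tokenize=False,
    date_string='5 Nov 2024',
)
assert text.startswith('Today Date: 5 Nov 2024')
```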