
Commit a0ae025

Authored by KuuCi, Vincent Chen, and dakinggg
Bump Transformers to v4.49.0 (#1735)
Co-authored-by: Vincent Chen <[email protected]>
Co-authored-by: Daniel King <[email protected]>
1 parent c66bc22 commit a0ae025

16 files changed (+204, -120 lines)


llmfoundry/data/text_data.py

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ def __init__(

     # How to tokenize a text sample to a token sample
     def _tokenize(self, text_sample: Mapping) -> dict[str, list[int]]:
-        if self.tokenizer._pad_token is None:
+        if self.tokenizer.pad_token is None:
            # Some tokenizers (e.g. GPT2 tokenizer) have no padding token which causes bugs
            raise RuntimeError(
                'If tokenizing on-the-fly, tokenizer must have a pad_token_id',
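The change swaps the tokenizer's private `_pad_token` attribute for the public `pad_token` property; recent transformers releases no longer reliably expose the underscore-prefixed attributes, so the guard now goes through the public API. A minimal sketch of the same check outside LLM Foundry, assuming an ordinary Hugging Face tokenizer (gpt2 is used purely as an example of a tokenizer that ships without a pad token):

```python
from transformers import AutoTokenizer

# gpt2 is just an illustrative tokenizer that has no padding token by default.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

if tokenizer.pad_token is None:
    # Same situation the guard above protects against: either raise, as
    # LLM Foundry does, or assign a pad token explicitly before padding.
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer('hello world', padding='max_length', max_length=8)
print(batch['input_ids'])
```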

llmfoundry/data/utils.py

Lines changed: 1 addition & 1 deletion
@@ -211,7 +211,7 @@ def get_text_collator(
     collate_fn = transformers.DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
         mlm=mlm_probability is not None,
-        mlm_probability=mlm_probability,
+        mlm_probability=mlm_probability if mlm_probability else 0,
     )

     if (eos_token_id is not None) or (bos_token_id is not None):
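Passing `mlm_probability=None` through to the collator is the part that no longer works cleanly under transformers 4.49: the collator appears to validate `mlm_probability` at construction time, so the change substitutes 0 whenever MLM is disabled (the value is unused in that case). A hedged sketch of the same construction, independent of LLM Foundry's `get_text_collator` wrapper:

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

mlm_probability = None  # causal-LM training: no masked-token objective

# Mirror the change above: never hand the collator a None probability.
collate_fn = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=mlm_probability is not None,
    mlm_probability=mlm_probability if mlm_probability else 0,
)

batch = collate_fn([tokenizer('hello world'), tokenizer('another sample')])
print(batch['input_ids'].shape, batch['labels'].shape)
```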

llmfoundry/models/hf/hf_base.py

Lines changed: 5 additions & 5 deletions
@@ -228,13 +228,13 @@ def build_inner_model(
         Returns:
             Union[PreTrainedModel, 'PeftModel']: The built inner model.
         """
-        if not trust_remote_code and pretrained_model_name_or_path.startswith(
-            'mosaicml/mpt',
-        ):
+        if pretrained_model_name_or_path.startswith('mosaicml/mpt',):
             raise ValueError(
-                'trust_remote_code must be set to True for MPT models. Without this, the MPT model code will come from the transformers library, '
+                'The MPT series of models on the Hugging Face Hub is no longer supported by LLM Foundry. '
+                +
+                'Please use an older version of LLM Foundry (<0.18) or use a different model. '
                 +
-                'which is significantly slower and not compatible with the LLM foundry training code, rather than the code release by MosaicML.',
+                'Please open a GitHub issue if this is a problem for you and we can help you downgrade or work around the issue.',
             )
         # Resolve "mixed" init device to either "cpu" or "meta"
         resolved_init_device = hf_get_init_device(init_device)

mcli/mcli-hf-eval.yaml

Lines changed: 5 additions & 4 deletions
@@ -28,19 +28,20 @@ parameters:

   models:
   -
-    model_name: mosaicml/mpt-7b-instruct
+    model_name: meta-llama/Meta-Llama-3-8B
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: meta-llama/Meta-Llama-3-8B
      kwargs:
        model_max_length: ${max_seq_len}

     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+      pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
       init_device: mixed
       pretrained: true
-      use_auth_token: false
+      # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
+      use_auth_token: true

   # FSDP config for model sharding
   fsdp_config:
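The configs in this commit move from openly available MPT checkpoints to the gated meta-llama/Meta-Llama-3-8B weights, so `use_auth_token: true` plus an `HF_TOKEN` in the run environment is now required. A small pre-flight check, a sketch only, using the standard huggingface_hub login call:

```python
import os

from huggingface_hub import login

# Fail fast if the token for gated Llama 3 checkpoints is not configured.
token = os.environ.get('HF_TOKEN')
if token is None:
    raise RuntimeError(
        'Set HF_TOKEN (and request access to meta-llama/Meta-Llama-3-8B) '
        'before launching this run.',
    )

login(token=token)  # makes the token available to later from_pretrained calls
```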

mcli/mcli-llama2-finetune.yaml renamed to mcli/mcli-llama3-finetune.yaml

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@ compute:
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
   variables:
-    tokenizer_name: meta-llama/Llama-2-7b-hf
+    tokenizer_name: meta-llama/Meta-Llama-3-8B
     global_seed: 17
     max_seq_len: 4096

@@ -38,9 +38,9 @@ parameters:
   model:
     name: hf_causal_lm
     init_device: mixed
-    pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
+    pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
     pretrained: true
-    # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
+    # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
     use_auth_token: true
     use_flash_attention_2: true

scripts/eval/yamls/hf_eval.yaml

Lines changed: 4 additions & 3 deletions
@@ -23,16 +23,17 @@ models:
       model_max_length: ${variables.max_seq_len}
 # # if you are evaluating more than one model, list them all as YAML blocks without variable interpolation
 # -
-#   model_name: mosaicml/mpt-7b
+#   model_name: meta-llama/Meta-Llama-3-8B
 #   model:
 #     name: hf_causal_lm
-#     pretrained_model_name_or_path: mosaicml/mpt-7b
+#     pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
 #     init_device: cpu
 #     pretrained: true
 #     config_overrides:
 #       max_seq_len: ${variables.max_seq_len}
+#     use_auth_token: true
 #   tokenizer:
-#     name: mosaicml/mpt-7b
+#     name: meta-llama/Meta-Llama-3-8B
 #     kwargs:
 #       model_max_length: ${variables.max_seq_len}

scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml renamed to scripts/train/finetune_example/llama-3-8b-arc-easy--gpu.yaml

Lines changed: 4 additions & 2 deletions
@@ -7,18 +7,20 @@ run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME

 # Model
 model:
   name: hf_causal_lm
-  pretrained_model_name_or_path: mosaicml/mpt-7b
+  pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
   pretrained: true  # false: only use the architecture; true: initialize with pretrained weights
   config_overrides:
     max_seq_len: ${max_seq_len}
     attn_config:
       attn_impl: flash
       # Set this to `true` if using `train_loader.dataset.packing_ratio` below
       attn_uses_sequence_id: false
+  # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
+  use_auth_token: true

 # Tokenizer
 tokenizer:
-  name: mosaicml/mpt-7b
+  name: meta-llama/Meta-Llama-3-8B
   kwargs:
     model_max_length: ${max_seq_len}

scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml renamed to scripts/train/yamls/finetune/llama-3-8b_domain_adapt.yaml

Lines changed: 4 additions & 2 deletions
@@ -15,16 +15,18 @@ run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME
 model:
   name: hf_causal_lm
   pretrained: true
-  pretrained_model_name_or_path: mosaicml/mpt-7b
+  pretrained_model_name_or_path: meta-llama/Meta-Llama-3-8B
   config_overrides:
     max_seq_len: ${variables.max_seq_len}
     attn_config:
       attn_impl: flash
       attn_uses_sequence_id: false
+  # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
+  use_auth_token: true

 # Tokenizer
 tokenizer:
-  name: mosaicml/mpt-7b
+  name: meta-llama/Meta-Llama-3-8B
   kwargs:
     model_max_length: ${variables.max_seq_len}

scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ model:
       attn_impl: flash
       # Set this to `true` if using `train_loader.dataset.packing_ratio` below
       attn_uses_sequence_id: false
+  use_auth_token: false

 # Tokenizer
 tokenizer:

setup.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
     'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.29.0,<0.30',
     'mlflow>=2.14.1,<2.19',
     'accelerate>=0.25,<1.4',  # for HF inference `device_map`
-    'transformers>=4.43.2,<4.47',
+    'transformers>=v4.49.0,<4.50',
     'mosaicml-streaming>=0.11.0,<0.12',
     'torch>=2.5.1,<2.5.2',
     'datasets>=3.3.2,<3.4',
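With the dependency floor moving from the 4.43-4.46 range to 4.49.x, a quick environment check can save a confusing failure later. A minimal sketch using the packaging library (not part of this diff):

```python
import transformers
from packaging import version

installed = version.parse(transformers.__version__)

# The new pin from setup.py: transformers >= 4.49.0 and < 4.50
assert version.parse('4.49.0') <= installed < version.parse('4.50'), (
    f'transformers {installed} does not satisfy the 4.49.x pin'
)
print(f'transformers {installed} is within the pinned range')
```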

tests/a_scripts/inference/test_convert_composer_to_hf.py

Lines changed: 109 additions & 17 deletions
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import contextlib
+import glob
 import json
 import math
 import os
@@ -220,11 +221,43 @@ def check_hf_tokenizer_equivalence(
         if attr1 is None and attr2 is None:
             continue

-        attr_value1 = attr1 if isinstance(attr1, str) else attr1.content
-        attr_value2 = attr2 if isinstance(attr2, str) else attr2.content
+        # Handle the case when the attribute is an AddedToken object
+        attr_value1 = attr1 if isinstance(
+            attr1,
+            str,
+        ) else attr1.content if hasattr(attr1, 'content') else str(attr1)
+        attr_value2 = attr2 if isinstance(
+            attr2,
+            str,
+        ) else attr2.content if hasattr(attr2, 'content') else str(attr2)
         assert attr_value1 == attr_value2

-    assert tokenizer1.__dict__ == tokenizer2.__dict__
+    # Ignore 'extra_special_tokens' as it was added by the transformers library during save/load
+    if 'extra_special_tokens' in tokenizer2.init_kwargs and 'extra_special_tokens' not in tokenizer1.init_kwargs:
+        tokenizer2.init_kwargs.pop('extra_special_tokens')
+
+    # Process special tokens map and added tokens decoder
+    for dict_map_key in ['_special_tokens_map', '_added_tokens_decoder']:
+        if dict_map_key in tokenizer1.__dict__ and dict_map_key in tokenizer2.__dict__:
+            # Get the nested dictionaries
+            token_map1 = tokenizer1.__dict__[dict_map_key]
+            token_map2 = tokenizer2.__dict__[dict_map_key]
+
+            # Process values in the first tokenizer's map
+            for key in list(token_map1.keys()):
+                if hasattr(token_map1[key], 'content'):
+                    token_map1[key] = token_map1[key].content
+
+            # Process values in the second tokenizer's map
+            for key in list(token_map2.keys()):
+                if hasattr(token_map2[key], 'content'):
+                    token_map2[key] = token_map2[key].content
+
+    # Final comparison of dictionaries
+    t1_dict = tokenizer1.__dict__
+    t2_dict = tokenizer2.__dict__
+
+    assert t1_dict == t2_dict, 'Tokenizer dictionaries are not equal'


 def remove_moe_world_size(config: MPTConfig):
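The rewritten comparison tolerates special tokens coming back either as plain strings or as AddedToken objects, which varies across transformers versions and save/load paths. A standalone sketch of the same normalization, using only the public transformers.AddedToken type:

```python
from transformers import AddedToken

def normalize_token(token) -> str:
    """Collapse str / AddedToken / other representations to plain text."""
    if isinstance(token, str):
        return token
    # AddedToken stores the literal text on .content; fall back to str() otherwise.
    return token.content if hasattr(token, 'content') else str(token)

assert normalize_token('<pad>') == '<pad>'
assert normalize_token(AddedToken('<pad>')) == '<pad>'
```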
@@ -274,6 +307,52 @@ def check_hf_model_equivalence(
         assert torch.equal(p1.cpu(), p2.cpu())


+def check_safetensors_precision(
+    model_path: str,
+    model: torch.nn.Module,
+    expected_precision: torch.dtype,
+    tolerance: float = 0.2,
+):
+    """Verify that the safetensors files in model_path have the expected size.
+
+    Args:
+        model_path: Path to the directory containing the safetensors files
+        model: The original model to count parameters from
+        expected_precision: The expected precision (torch.float32, torch.bfloat16, etc.)
+        tolerance: Allowed deviation from expected file size (as a ratio)
+
+    Returns:
+        bool: True if the safetensors files have the expected size, False otherwise
+    """
+    total_params = sum(p.numel() for p in model.parameters())
+    # Determine expected bytes per parameter based on precision
+    bytes_per_param = {
+        torch.float32: 4,
+        torch.float16: 2,
+        torch.bfloat16: 2,
+        torch.int8: 1,
+    }.get(expected_precision)
+    assert bytes_per_param
+
+    expected_size = total_params * bytes_per_param
+
+    safetensors_files = glob.glob(os.path.join(model_path, '*.safetensors'))
+    if not safetensors_files:
+        # If no safetensors files found, check pytorch_model.bin
+        safetensors_files = glob.glob(
+            os.path.join(model_path, 'pytorch_model*.bin'),
+        )
+
+    if not safetensors_files:
+        return False
+
+    total_size = sum(os.path.getsize(f) for f in safetensors_files)
+    size_ratio = total_size / expected_size
+
+    is_correct_size = (1.0 - tolerance) <= size_ratio <= (1.0 + tolerance)
+    return is_correct_size
+
+
 # TODO(GRT-2435): Change to fixture
 def delete_transformers_cache():
     # Only delete the files on local rank 0, otherwise race conditions are created
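The new helper's check is a simple size heuristic: parameter count times bytes per element, compared against the bytes actually on disk within a tolerance. A worked example of the arithmetic, with a purely illustrative parameter count:

```python
import torch

total_params = 7_000_000_000  # hypothetical 7B-parameter model, for illustration only
bytes_per_param = {torch.float32: 4, torch.float16: 2, torch.bfloat16: 2, torch.int8: 1}

# A bfloat16 export of a 7B model should land near 14 GB on disk.
expected_bytes = total_params * bytes_per_param[torch.bfloat16]
print(f'expected checkpoint size: {expected_bytes / 1e9:.1f} GB')
```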
@@ -578,15 +657,27 @@ def test_huggingface_conversion_callback_interval(
     assert len(normal_checkpoints) == expected_normal_checkpoints
     assert len(huggingface_checkpoints) == expected_hf_checkpoints

+    # Get path to the last checkpoint
+    checkpoint_path = os.path.join(
+        tmp_path,
+        'checkpoints',
+        'huggingface',
+        f'ba{batches_per_epoch}',
+    )
+
+    # Verify the safetensors file size matches the expected precision
+    is_size_correct = check_safetensors_precision(
+        model_path=checkpoint_path,
+        model=trainer.state.model.model,
+        expected_precision=precision,
+    )
+    assert is_size_correct, f"Safetensors file size doesn't match expected precision {precision_str}"
+
     # Load the last huggingface checkpoint
     loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
-        os.path.join(
-            tmp_path,
-            'checkpoints',
-            'huggingface',
-            f'ba{batches_per_epoch}',
-        ),
+        checkpoint_path,
         trust_remote_code=True,
+        torch_dtype=precision,
     )

     # Check that the loaded model has the correct precision, and then set it back
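The new `torch_dtype=precision` argument makes `from_pretrained` load the exported weights directly in the precision the checkpointer wrote, rather than the config default, which is what lets the per-parameter dtype assertion in the next hunk hold. A small standalone sketch (gpt2 is just a convenient public checkpoint used for illustration):

```python
import torch
import transformers

# Load a checkpoint with an explicit dtype and confirm the parameters match it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    'gpt2',
    torch_dtype=torch.bfloat16,
)
assert next(model.parameters()).dtype == torch.bfloat16
```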
@@ -603,15 +694,16 @@ def test_huggingface_conversion_callback_interval(
     loaded_model.config.init_device = original_model.model.config.init_device

     loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
-        os.path.join(
-            tmp_path,
-            'checkpoints',
-            'huggingface',
-            f'ba{batches_per_epoch}',
-        ),
+        checkpoint_path,
         trust_remote_code=True,
     )

+    # Also check that at least one parameter has the expected precision
+    for param_name, param in loaded_model.named_parameters():
+        assert param.dtype == precision, \
+            f'Parameter {param_name} has dtype {param.dtype}, expected {precision}'
+        break
+
     check_hf_model_equivalence(
         trainer.state.model.model.to(precision),
         loaded_model,
@@ -873,6 +965,7 @@ def _assert_checkpoint_equivalence(
     loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
         checkpoint_path,
         trust_remote_code=True,
+        torch_dtype=precision,
     )

     # Check that the loaded model has the correct precision, and then set it back
@@ -1426,7 +1519,6 @@ def test_mptmoe_huggingface_conversion_callback(
     device_batch_size = 1
     dataset_size = 2
     precision_str = 'float32'
-    precision = torch.float32
     batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))

     checkpointer_callback = HuggingFaceCheckpointer(
@@ -1617,7 +1709,7 @@ def test_mptmoe_huggingface_conversion_callback(

     # Check that the loaded model has the correct precision, and then set it back
     # to the original for the equivalence check
-    assert loaded_model.config.torch_dtype == precision
+    assert loaded_model.config.torch_dtype == precision_str
     loaded_model.config.torch_dtype = original_model.model.config.torch_dtype

     loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
