diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index dcc1c5491a..6b4bd25936 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -119,6 +119,15 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 ExampleType = Literal['prompt_response', 'chat']
 TokenizedExample = dict[str, list[dict[str, list[int]]]]
 
+_DEFAULT_CHAT_TEMPLATE = (
+    '{% for message in messages %}'
+    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
+    '{% endfor %}'
+    '{% if add_generation_prompt %}'
+    "{{ '<|im_start|>assistant\n' }}"
+    '{% endif %}'
+)
+
 
 def _get_example_type(example: Example) -> ExampleType:
     """Determines the type of the input example.
@@ -243,17 +252,21 @@ def slice_out_last_turn(
         messages_through_current_turn: list[dict[str, str]],
         conversation_through_previous_turn: str,
     ) -> tuple[str, str]:
+        chat_template = None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
+
         try:
             full_conversation = tokenizer.apply_chat_template(
                 messages_through_current_turn,
                 tokenize=False,
                 date_string=get_date_string(),
+                chat_template=chat_template,
             )
             prompt_with_history = tokenizer.apply_chat_template(
                 messages_through_current_turn[:-1],
                 tokenize=False,
                 add_generation_prompt=True,
                 date_string=get_date_string(),
+                chat_template=chat_template,
             )
         except Exception as e:
             raise ChatTemplateError(
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index da576b29e1..0afb493844 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -99,6 +99,9 @@ class InvalidConfigAccessError(KeyError):
     # Not set but llama modeling code tries to read this attribute
     'partial_rotary_factor',
 
+    # This key is accessed with a default of hidden_size / num_attention_heads
+    'head_dim',
+
     # Benign transformers attributes needed for __init__
     '_get_generation_defaults',
     'label2id',
@@ -106,6 +109,7 @@ class InvalidConfigAccessError(KeyError):
     'torch_dtype',
     'problem_type',
     '__class__',
+    '_get_global_generation_defaults',
 }
 
 
diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py
index 6458ad3ba4..36944589d0 100644
--- a/llmfoundry/tokenizers/tiktoken.py
+++ b/llmfoundry/tokenizers/tiktoken.py
@@ -69,6 +69,7 @@ def __init__(
         bos_token: Optional[str] = '<|endoftext|>',
         pad_token: Optional[str] = None,
         errors: str = 'replace',
+        chat_template: Optional[str] = None,
         **kwargs: Any,
     ):
         """Constructor creates a tiktoken tokenizer to use as the underlying.
@@ -90,6 +91,8 @@ def __init__(
             errors (str, optional): Paradigm to follow when decoding bytes to UTF-8.
                 See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode)
                 for more information. Defaults to `"replace"`.
+            chat_template (Optional[str], optional): The Hugging Face chat template. Default will use the ``default_chat_template``
+                set on this class.
             kwargs (Any): Other relevant keyword arguments.
""" try: @@ -178,6 +181,7 @@ def pickle_Encoding(enc: Encoding): bos_token=bos_token, pad_token=pad_token, errors=errors, + chat_template=chat_template or self.default_chat_template, **kwargs, ) diff --git a/setup.py b/setup.py index 86d696ed5c..72543b24c8 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27', 'mlflow>=2.14.1,<2.18', 'accelerate>=0.25,<0.34', # for HF inference `device_map` - 'transformers>=4.43.2,<4.44', + 'transformers>=4.43.2,<4.47', 'mosaicml-streaming>=0.9.0,<0.10', 'torch>=2.4.0,<2.4.1', 'datasets>=2.19,<2.20', diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py index 0697894bb2..65f8669ce6 100644 --- a/tests/data/test_template_tokenization.py +++ b/tests/data/test_template_tokenization.py @@ -7,6 +7,7 @@ import transformers from llmfoundry.data.finetuning.tasks import ( + _DEFAULT_CHAT_TEMPLATE, _slice_chat_formatted_example, dataset_constructor, tokenize_formatted_example, @@ -304,6 +305,9 @@ def test_multi_turn_chat_slicing( if use_date_string: tok.chat_template = "{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{{- \"Today Date: \" + date_string }}\n" + if not tok.chat_template: + tok.chat_template = _DEFAULT_CHAT_TEMPLATE + templated_prompt_response_turns = _slice_chat_formatted_example( example, tok, diff --git a/tests/models/test_model.py b/tests/models/test_model.py index a7769c237d..8a6290d5c4 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -1668,6 +1668,10 @@ def check_hf_model_equivalence( del expected_model_config_dict['_name_or_path'] del new_model_config_dict['_name_or_path'] + # Transformers changes this key on load from disk + del expected_model_config_dict['_attn_implementation_autoset'] + del new_model_config_dict['_attn_implementation_autoset'] + assert expected_model_config_dict == new_model_config_dict assert sum(p.numel() for p in model1.parameters() ) == sum(p.numel() for p in model2.parameters()) diff --git a/tests/tokenizers/test_tokenizer.py b/tests/tokenizers/test_tokenizer.py index d42f810214..5c6c07d4cc 100644 --- a/tests/tokenizers/test_tokenizer.py +++ b/tests/tokenizers/test_tokenizer.py @@ -6,6 +6,7 @@ from omegaconf import OmegaConf as om from transformers import AutoTokenizer +from llmfoundry.data.finetuning.tasks import _DEFAULT_CHAT_TEMPLATE from llmfoundry.tokenizers.utils import get_date_string @@ -115,6 +116,9 @@ def test_tokenizer_date_string(tokenizer_name: str, use_date_string: bool): if use_date_string: tokenizer.chat_template = "{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{{- \"Today Date: \" + date_string }}\n" + if not tokenizer.chat_template: + tokenizer.chat_template = _DEFAULT_CHAT_TEMPLATE + token_ids = tokenizer.apply_chat_template( messages, add_generation_prompt=True,