
Commit 3b1fc4a

Use allenai/c4 instead of c4 dataset (#1554)
Co-authored-by: Eitan Turok <[email protected]>
Parent: dc58bb7
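
The rename tracks the Hugging Face Hub, where C4 is hosted under the `allenai` namespace and the bare `c4` identifier is deprecated. A minimal sketch of what downstream loads look like after this change (illustrative, not code from the diff):

```python
# Minimal sketch: loading C4 by its namespaced Hub id with the `datasets` library.
from datasets import load_dataset

# Before this commit llm-foundry passed 'c4' here; it now passes 'allenai/c4'.
ds = load_dataset('allenai/c4', 'en', split='validation', streaming=True)
sample = next(iter(ds))
print(sample['text'][:80])  # C4 records carry 'text', 'timestamp', and 'url' fields
```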

15 files changed: 37 additions, 34 deletions

README.md

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ cd scripts
 
 # Convert C4 dataset to StreamingDataset format
 python data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 

TUTORIAL.md

Lines changed: 2 additions & 2 deletions
@@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su
 <!--pytest.mark.skip-->
 ```bash
 python scripts/data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-adaptation-data --splits train_small val_small \
   --concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
   --compression zstd
@@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared
 <!--pytest.mark.skip-->
 ```bash
 python scripts/data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer gpt2 \
   --eos_text '<|endoftext|>' \

llmfoundry/command_utils/data_prep/convert_dataset_hf.py

Lines changed: 2 additions & 2 deletions
@@ -158,7 +158,7 @@ def __init__(
     truncated_samples=100,
 )
 
-CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
+CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}
 
 
 def build_hf_dataset(
@@ -335,7 +335,7 @@ def convert_dataset_hf(
         dataset_constants = CONSTS[dataset]
     except KeyError:
         raise ValueError(
-            f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.',
+            f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "allenai/c4" are supported.',
         )
 
     if concat_tokens is not None and tokenizer is not None:
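
Because the `CONSTS` key doubles as the user-facing `--dataset` value, configs that still pass the old identifier now fail fast with the error above. A runnable sketch of that lookup behavior (the constants are stand-in stubs, not the real objects from this file):

```python
# Hedged sketch of the lookup pattern in the hunk above; stubs replace the
# real constants objects.
c4constants, pileconstants = object(), object()

CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}

def lookup(dataset: str) -> object:
    try:
        return CONSTS[dataset]
    except KeyError:
        raise ValueError(
            f'Constants for dataset "{dataset}" not found. '
            'Currently only "the_pile" and "allenai/c4" are supported.',
        )

lookup('allenai/c4')   # resolves to the C4 constants
try:
    lookup('c4')       # the pre-#1554 identifier no longer resolves
except ValueError as err:
    print(err)
```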

llmfoundry/command_utils/data_prep/convert_dataset_json.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def build_hf_dataset(
         no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries
         tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use
         data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset.
-            Typically "all" (The Pile) or "en" (c4).
+            Typically "all" (The Pile) or "en" (allenai/c4).
 
     Returns:
         An IterableDataset.

mcli/mcli-1b-max-seq-len-8k.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   python data_prep/convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml

mcli/mcli-1b.yaml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   python data_prep/convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py train/yamls/pretrain/mpt-1b.yaml \

mcli/mcli-pretokenize-oci-upload.yaml

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ command: |
 
   # Run the dataset conversion
   python convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 \
     --splits val_small val train_small train \
     --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'

scripts/data_prep/README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`.
 ```bash
 # Convert C4 dataset to StreamingDataset format
 python convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
   --compression zstd
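
For reference, the same conversion can be driven from Python. The sketch below mirrors the CLI call above using the keyword names visible in this commit's test diffs; the import path and the remaining keywords are assumptions, not verbatim repo code:

```python
# Hedged sketch: Python-API counterpart of the CLI above. dataset/data_subset/
# splits/out_root match the test calls in this commit; the other keywords and
# the import path are assumptions.
from llmfoundry.command_utils import convert_dataset_hf  # assumed import path

convert_dataset_hf(
    dataset='allenai/c4',   # was 'c4' before this commit
    data_subset='en',
    splits=['train_small', 'val_small'],
    out_root='my-copy-c4',
    compression='zstd',
    concat_tokens=2048,
    tokenizer='EleutherAI/gpt-neox-20b',
    tokenizer_kwargs={},
    bos_text='',
    eos_text='<|endoftext|>',
    no_wrap=False,
    num_workers=None,
)
```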

scripts/train/README.md

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md#
 
 To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming. Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format.
 
-As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here.
+As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here.
 
 We first convert the dataset from its native format (a collection of zipped JSONs)
 to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files.
@@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth.
 You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total. Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding
 <!--pytest.mark.skip-->
 ```bash
-python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 ```
 
 Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc. The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space!
 <!--pytest.mark.skip-->
 ```bash
-python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 ```
 
 For any of the above commands, you can also choose to compress the `.mds` files.
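
Once converted, the `.mds` shards can be read back with MosaicML's streaming library. A minimal sketch (not part of the diff), assuming the `train_small` folder produced by the command above; treat the exact sample field name as an assumption:

```python
# Hedged sketch: reading converted shards locally with mosaicml-streaming.
from streaming import StreamingDataset

ds = StreamingDataset(local='./my-copy-c4/train_small', shuffle=False)
print(len(ds))       # number of pre-tokenized samples
print(ds[0].keys())  # with --concat_tokens, each sample holds a fixed-length
                     # token buffer (a 'tokens' field in llm-foundry's converter)
```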

scripts/train/benchmarking/submit_benchmarks.py

Lines changed: 1 addition & 1 deletion
@@ -479,7 +479,7 @@ def run_config(
     if args.data_remote is None:
         command += f"""
         cd llm-foundry/scripts
-        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
+        python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
         composer train/train.py /mnt/config/parameters.yaml
         """
     else:

tests/a_scripts/data_prep/test_convert_dataset_hf.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
     # test calling it directly
     path = os.path.join(tmp_path, 'my-copy-c4-1')
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=['val_xsmall'],
         out_root=path,

tests/a_scripts/eval/test_eval.py

Lines changed: 6 additions & 5 deletions
@@ -121,7 +121,7 @@ def test_loader_eval(
 
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -157,16 +157,17 @@
     print(inmemorylogger.data.keys())
 
     # Checks for first eval dataloader
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )
 
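The widened assertions also show why only the key strings changed: the metric key embeds the dataloader label verbatim, so a label containing a slash just nests one level deeper. A one-line reconstruction (the key template is inferred from the test, not from Composer's source):

```python
# Inferred key construction: the eval dataloader label is interpolated as-is.
label = 'allenai/c4'
key = f'metrics/eval/{label}/LanguageCrossEntropy'
assert key == 'metrics/eval/allenai/c4/LanguageCrossEntropy'
```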

tests/a_scripts/train/test_train.py

Lines changed: 12 additions & 10 deletions
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     assert isinstance(inmemorylogger, InMemoryLogger)
 
     # Checks for first eval dataloader
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )
 
@@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     test_cfg.eval_loader = om.create([first_eval_loader])
     test_cfg.eval_subset_num_batches = 1  # -1 to evaluate on all batches
     test_cfg.max_duration = '1ba'
@@ -226,15 +227,16 @@
         0]  # pyright: ignore [reportGeneralTypeIssues]
     assert isinstance(inmemorylogger, InMemoryLogger)
 
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )

tests/data/test_dataloader.py

Lines changed: 3 additions & 3 deletions
@@ -204,7 +204,7 @@ def test_correct_padding(
     shutil.rmtree(path, ignore_errors=True)
     if pretokenize:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -219,7 +219,7 @@ def test_correct_padding(
         )
     else:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
            data_subset='en',
            splits=[split],
            out_root=path,
@@ -233,7 +233,7 @@ def test_correct_padding(
            num_workers=None,
         )
     if not os.path.isdir(path):
-        raise RuntimeError(f'c4 dataset at {path} not set up as expected')
+        raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')
 
     test_cfg = get_config(
         conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',

tests/data_utils.py

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:
 
     # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=[downloaded_split],
         out_root=c4_dir,
