diff --git a/chapters/en/chapter7/6.mdx b/chapters/en/chapter7/6.mdx
index 927c2be9f..27dc8cbbd 100644
--- a/chapters/en/chapter7/6.mdx
+++ b/chapters/en/chapter7/6.mdx
@@ -67,6 +67,11 @@ False True
 We can use this to create a function that will stream the dataset and filter the elements we want:
 
 ```py
+from collections import defaultdict
+from tqdm import tqdm
+from datasets import Dataset
+
+
 def filter_streaming_dataset(dataset, filters):
     filtered_dict = defaultdict(list)
     total = 0
@@ -105,7 +110,7 @@ Filtering the full dataset can take 2-3h depending on your machine and bandwidth
 from datasets import load_dataset, DatasetDict
 
 ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
-ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="train")
+ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
 
 raw_datasets = DatasetDict(
     {
@@ -347,7 +352,7 @@ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_ten
 Let's have a look at an example:
 
 ```py
-out = data_collator([tokenized_dataset["train"][i] for i in range(5)])
+out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
 for key in out:
     print(f"{key} shape: {out[key].shape}")
 ```
@@ -799,6 +804,8 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
 Now that we have sent our `train_dataloader` to `accelerator.prepare()`, we can use its length to compute the number of training steps. Remember that we should always do this after preparing the dataloader, as that method will change its length. We use a classic linear schedule from the learning rate to 0:
 
 ```py
+from transformers import get_scheduler
+
 num_train_epochs = 1
 num_update_steps_per_epoch = len(train_dataloader)
 num_training_steps = num_train_epochs * num_update_steps_per_epoch
@@ -856,7 +863,7 @@ model.train()
 completed_steps = 0
 for epoch in range(num_train_epochs):
     for step, batch in tqdm(
-        enumerate(train_dataloader, start=1), total=len(train_dataloader)
+        enumerate(train_dataloader, start=1), total=num_training_steps
     ):
         logits = model(batch["input_ids"]).logits
         loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
diff --git a/chapters/en/chapter7/7.mdx b/chapters/en/chapter7/7.mdx
index 756500fa7..d8e1942e4 100644
--- a/chapters/en/chapter7/7.mdx
+++ b/chapters/en/chapter7/7.mdx
@@ -955,7 +955,7 @@ Note that while the training happens, each time the model is saved (here, every
 Once the training is complete, we can finally evaluate our model (and pray we didn't spend all that compute time on nothing). The `predict()` method of the `Trainer` will return a tuple where the first elements will be the predictions of the model (here a pair with the start and end logits). We send this to our `compute_metrics()` function:
 
 ```python
-predictions, _ = trainer.predict(validation_dataset)
+predictions, _, _ = trainer.predict(validation_dataset)
 start_logits, end_logits = predictions
 compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])
 ```