From 0a61dfc7edc3f9096b408447bec1e4ed28e2aa1b Mon Sep 17 00:00:00 2001
From: Loki
Date: Thu, 8 Sep 2022 20:21:54 +0000
Subject: [PATCH] Removing stale files

---
 .../scripts/launch_pt_dt_sm_native.py         |  34 -
 .../scripts/launch_sm_training_compiler.py    |   9 -
 .../language-modeling/scripts/run_mlm.py      | 600 ------------------
 .../scripts/requirements.txt                  |   1 -
 .../vision_transformer/scripts/run_mae.py     | 390 ------------
 .../vision_transformer/scripts/run_mim.py     | 472 --------------
 .../vision-transformer-p4-fp32.ipynb          | 542 ----------------
 7 files changed, 2048 deletions(-)
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_pt_dt_sm_native.py
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_sm_training_compiler.py
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/run_mlm.py
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/requirements.txt
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mae.py
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mim.py
 delete mode 100644 sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/vision-transformer-p4-fp32.ipynb

diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_pt_dt_sm_native.py b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_pt_dt_sm_native.py
deleted file mode 100644
index 28727d0074..0000000000
--- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_pt_dt_sm_native.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import argparse
-import os, subprocess
-from pdb import run
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    # hyperparameters sent by the client are passed as command-line arguments to the script.
-    parser.add_argument("--training_script", type=str, default="run_mlm.py")
-    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
-    parser.add_argument("--output_dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
-
-    args, rem_args = parser.parse_known_args()
-    print("Parsed Arguments: ", vars(args), rem_args)
-    os.environ["GPU_NUM_DEVICES"] = str(args.n_gpus)
-
-    # native torch distributed as benchmark
-    training_command = "python -m torch.distributed.launch "
-    training_command += f"--nproc_per_node={args.n_gpus} "
-    training_command += "--nnodes=1 --node_rank=0 --master_addr=127.0.0.1 --master_port=1234 "
-
-    training_command += args.training_script + " "
-
-    # output directory
-    training_command += f"--output_dir {args.output_dir} "
-    for i in range(0, len(rem_args), 2):
-        arg, value = rem_args[i], rem_args[i + 1]
-        if value == "True":
-            training_command += f"{arg} "
-        elif value != "False":
-            training_command += f"{arg} {value} "
-
-    print("Training Command: ", training_command)
-    subprocess.check_call(training_command, shell=True)
diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_sm_training_compiler.py b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_sm_training_compiler.py
deleted file mode 100644
index 655af389a2..0000000000
--- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/launch_sm_training_compiler.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import subprocess
-import sys
-
-if __name__ == "__main__":
-    arguments_command = " ".join([arg for arg in sys.argv[1:]])
-    """
-    The following line will take care of setting up inter node communication as well as managing intra node workers for each GPU.
-    """
-    subprocess.check_call("python -m torch_xla.distributed.sm_dist " + arguments_command, shell=True)
diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/run_mlm.py b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/run_mlm.py
deleted file mode 100644
index 3a1375bf86..0000000000
--- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/language-modeling/scripts/run_mlm.py
+++ /dev/null
@@ -1,600 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team All rights reserved.
-# Modifications Copyright 2021 Amazon.com, Inc. or its affiliates. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset.
-
-Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
-https://huggingface.co/models?filter=masked-lm
-"""
-# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
- -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -from datasets import load_dataset - -import transformers -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - AutoConfig, - AutoModelForMaskedLM, - AutoTokenizer, - DataCollatorForLanguageModeling, - HfArgumentParser, - Trainer, - TrainingArguments, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.10.0") - -require_version( - "datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt" -) - -logger = logging.getLogger(__name__) -MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={ - "help": "If training from scratch, pass a model type from the list: " - + ", ".join(MODEL_TYPES) - }, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained config name or path if not the same as model_name"}, - ) - tokenizer_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={ - "help": "Where do you want to store the pretrained models downloaded from huggingface.co" - }, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={ - "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not." - }, - ) - model_revision: str = field( - default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and ( - self.config_name is not None or self.model_name_or_path is not None - ): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - dataset_config_name: Optional[str] = field( - default=None, - metadata={ - "help": "The configuration name of the dataset to use (via the datasets library)." 
- }, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a text file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mlm_probability: float = field( - default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} - ) - line_by_line: bool = field( - default=False, - metadata={ - "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in [ - "csv", - "json", - "txt", - ], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in [ - "csv", - "json", - "txt", - ], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub - # - # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this - # behavior (see below) - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if extension == "txt": - extension = "text" - raw_datasets = load_dataset( - extension, data_files=data_files, cache_dir=model_args.cache_dir - ) - - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
- ) - - if model_args.model_name_or_path: - model = AutoModelForMaskedLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." - ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - if data_args.line_by_line: - # When using line_by_line, we just tokenize each nonempty line. - padding = "max_length" if data_args.pad_to_max_length else False - - def tokenize_function(examples): - # Remove empty lines - examples[text_column_name] = [ - line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() - ] - return tokenizer( - examples[text_column_name], - padding=padding, - truncation=True, - max_length=max_seq_length, - # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it - # receives the `special_tokens_mask`. - return_special_tokens_mask=True, - ) - - with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) - else: - # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. - # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more - # efficient when it receives the `special_tokens_mask`. - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_special_tokens_mask=True) - - with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of - # max_seq_length. - def group_texts(examples): - # Concatenate all texts. 
- concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length - # Split by chunks of max_len. - result = { - k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] - for k, t in concatenated_examples.items() - } - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a - # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value - # might be slower to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = tokenized_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = tokenized_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - # Data collator - # This one will take care of randomly masking the tokens. 
- pad_to_multiple_of_8 = ( - data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length - ) - data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, - mlm_probability=data_args.mlm_probability, - pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, - ) - - # Initialize our Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples - if data_args.max_train_samples is not None - else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = ( - data_args.max_eval_samples - if data_args.max_eval_samples is not None - else len(eval_dataset) - ) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/requirements.txt b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/requirements.txt deleted file mode 100644 index 2e6ab725a9..0000000000 --- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -accelerate \ No newline at end of file diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mae.py b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mae.py deleted file mode 100644 index 2ef182d6a2..0000000000 --- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mae.py +++ /dev/null @@ -1,390 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# Modifications Copyright 2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -from datasets import load_dataset -from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor -from torchvision.transforms.functional import InterpolationMode - -import transformers -from transformers import ( - HfArgumentParser, - Trainer, - TrainingArguments, - ViTFeatureExtractor, - ViTMAEConfig, - ViTMAEForPreTraining, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - - -""" Pre-training a 🤗 ViT model as an MAE (masked autoencoder), as proposed in https://arxiv.org/abs/2111.06377.""" - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.21.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - dataset_name: Optional[str] = field( - default="cifar10", metadata={"help": "Name of a dataset from the datasets package"} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - image_column_name: Optional[str] = field( - default=None, metadata={"help": "The column name of the images in the files."} - ) - train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."}) - validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."}) - train_val_split: Optional[float] = field( - default=0.15, metadata={"help": "Percent to split off of train for validation."} - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - def __post_init__(self): - data_files = dict() - if self.train_dir is not None: - data_files["train"] = self.train_dir - if self.validation_dir is not None: - data_files["val"] = self.validation_dir - self.data_files = data_files if data_files else None - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/feature extractor we are going to pre-train. 
- """ - - model_name_or_path: str = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name_or_path"} - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - mask_ratio: float = field( - default=0.75, metadata={"help": "The ratio of the number of masked tokens in the input sequence."} - ) - norm_pix_loss: bool = field( - default=True, metadata={"help": "Whether or not to train with normalized pixel values as target."} - ) - - -@dataclass -class CustomTrainingArguments(TrainingArguments): - base_learning_rate: float = field( - default=1e-3, metadata={"help": "Base learning rate: absolute_lr = base_lr * total_batch_size / 256."} - ) - - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - return {"pixel_values": pixel_values} - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CustomTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- send_example_telemetry("run_mae", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Initialize our dataset. - ds = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - data_files=data_args.data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - - # If we don't have a validation split, split off a percentage of train as validation. - data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split - if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: - split = ds["train"].train_test_split(data_args.train_val_split) - ds["train"] = split["train"] - ds["validation"] = split["test"] - - # Load pretrained model and feature extractor - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = ViTMAEConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = ViTMAEConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = ViTMAEConfig() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - # adapt config - config.update( - { - "mask_ratio": model_args.mask_ratio, - "norm_pix_loss": model_args.norm_pix_loss, - } - ) - - # create feature extractor - if model_args.feature_extractor_name: - feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs) - elif model_args.model_name_or_path: - feature_extractor = ViTFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - feature_extractor = ViTFeatureExtractor() - - # create model - if model_args.model_name_or_path: - model = ViTMAEForPreTraining.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = ViTMAEForPreTraining(config) - - if training_args.do_train: - column_names = ds["train"].column_names - else: - column_names = ds["validation"].column_names - - if data_args.image_column_name is not None: - image_column_name = data_args.image_column_name - elif "image" in column_names: - image_column_name = "image" - elif "img" in column_names: - image_column_name = "img" - else: - image_column_name = column_names[0] - - # transformations as done in original MAE paper - # source: https://github.com/facebookresearch/mae/blob/main/main_pretrain.py - transforms = Compose( - [ - Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), - RandomResizedCrop(feature_extractor.size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC), - RandomHorizontalFlip(), - ToTensor(), - Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), - ] - ) - - def preprocess_images(examples): - """Preprocess a batch of images by applying transforms.""" - - examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]] - return examples - - if training_args.do_train: - if "train" not in ds: - raise ValueError("--do_train requires a train dataset") - if data_args.max_train_samples is not None: - ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) - # Set the training transforms - ds["train"].set_transform(preprocess_images) - - if training_args.do_eval: - if "validation" not in ds: - raise ValueError("--do_eval requires a validation dataset") - if data_args.max_eval_samples is not None: - ds["validation"] = ( - ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) - ) - # Set the validation transforms - ds["validation"].set_transform(preprocess_images) - - # Compute absolute learning rate - total_train_batch_size = ( - 
training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size - ) - if training_args.base_learning_rate is not None: - training_args.learning_rate = training_args.base_learning_rate * total_train_batch_size / 256 - - # Initialize our trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=ds["train"] if training_args.do_train else None, - eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=feature_extractor, - data_collator=collate_fn, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Write model card and (optionally) push to hub - kwargs = { - "tasks": "masked-auto-encoding", - "dataset": data_args.dataset_name, - "tags": ["masked-auto-encoding"], - } - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() \ No newline at end of file diff --git a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mim.py b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mim.py deleted file mode 100644 index e4f8b84af2..0000000000 --- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/scripts/run_mim.py +++ /dev/null @@ -1,472 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# Modifications Copyright 2022 Amazon.com, Inc. or its affiliates. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import torch -from datasets import load_dataset -from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor - -import transformers -from transformers import ( - CONFIG_MAPPING, - FEATURE_EXTRACTOR_MAPPING, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, - AutoConfig, - AutoFeatureExtractor, - AutoModelForMaskedImageModeling, - HfArgumentParser, - Trainer, - TrainingArguments, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - - -""" Pre-training a 🤗 Transformers model for simple masked image modeling (SimMIM). -Any model supported by the AutoModelForMaskedImageModeling API can be used. -""" - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.21.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - Using `HfArgumentParser` we can turn this class into argparse arguments to be able to - specify them on the command line. - """ - - dataset_name: Optional[str] = field( - default="cifar10", metadata={"help": "Name of a dataset from the datasets package"} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - image_column_name: Optional[str] = field( - default=None, - metadata={"help": "The column name of the images in the files. If not set, will try to use 'image' or 'img'."}, - ) - train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."}) - validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."}) - train_val_split: Optional[float] = field( - default=0.15, metadata={"help": "Percent to split off of train for validation."} - ) - mask_patch_size: int = field(default=32, metadata={"help": "The size of the square patches to use for masking."}) - mask_ratio: float = field( - default=0.6, - metadata={"help": "Percentage of patches to mask."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." 
- ) - }, - ) - - def __post_init__(self): - data_files = dict() - if self.train_dir is not None: - data_files["train"] = self.train_dir - if self.validation_dir is not None: - data_files["val"] = self.validation_dir - self.data_files = data_files if data_files else None - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/feature extractor we are going to pre-train. - """ - - model_name_or_path: str = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization. Can be a local path to a pytorch_model.bin or a " - "checkpoint identifier on the hub. " - "Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_name_or_path: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store (cache) the pretrained models/datasets downloaded from the hub"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - image_size: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The size (resolution) of each image. If not specified, will use `image_size` of the configuration." - ) - }, - ) - patch_size: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The size (resolution) of each patch. If not specified, will use `patch_size` of the configuration." - ) - }, - ) - encoder_stride: Optional[int] = field( - default=None, - metadata={"help": "Stride to use for the encoder."}, - ) - - -class MaskGenerator: - """ - A class to generate boolean masks for the pretraining task. - A mask is a 1D tensor of shape (model_patch_size**2,) where the value is either 0 or 1, - where 1 indicates "masked". 
- """ - - def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_ratio=0.6): - self.input_size = input_size - self.mask_patch_size = mask_patch_size - self.model_patch_size = model_patch_size - self.mask_ratio = mask_ratio - - if self.input_size % self.mask_patch_size != 0: - raise ValueError("Input size must be divisible by mask patch size") - if self.mask_patch_size % self.model_patch_size != 0: - raise ValueError("Mask patch size must be divisible by model patch size") - - self.rand_size = self.input_size // self.mask_patch_size - self.scale = self.mask_patch_size // self.model_patch_size - - self.token_count = self.rand_size**2 - self.mask_count = int(np.ceil(self.token_count * self.mask_ratio)) - - def __call__(self): - mask_idx = np.random.permutation(self.token_count)[: self.mask_count] - mask = np.zeros(self.token_count, dtype=int) - mask[mask_idx] = 1 - - mask = mask.reshape((self.rand_size, self.rand_size)) - mask = mask.repeat(self.scale, axis=0).repeat(self.scale, axis=1) - - return torch.tensor(mask.flatten()) - - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - mask = torch.stack([example["mask"] for example in examples]) - return {"pixel_values": pixel_values, "bool_masked_pos": mask} - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_mim", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." 
- ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Initialize our dataset. - ds = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - data_files=data_args.data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - - # If we don't have a validation split, split off a percentage of train as validation. - data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split - if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: - split = ds["train"].train_test_split(data_args.train_val_split) - ds["train"] = split["train"] - ds["validation"] = split["test"] - - # Create config - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name_or_path: - config = AutoConfig.from_pretrained(model_args.config_name_or_path, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - # make sure the decoder_type is "simmim" (only relevant for BEiT) - if hasattr(config, "decoder_type"): - config.decoder_type = "simmim" - - # adapt config - model_args.image_size = model_args.image_size if model_args.image_size is not None else config.image_size - model_args.patch_size = model_args.patch_size if model_args.patch_size is not None else config.patch_size - model_args.encoder_stride = ( - model_args.encoder_stride if model_args.encoder_stride is not None else config.encoder_stride - ) - - config.update( - { - "image_size": model_args.image_size, - "patch_size": model_args.patch_size, - "encoder_stride": model_args.encoder_stride, - } - ) - - # create feature extractor - if model_args.feature_extractor_name: - feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.feature_extractor_name, **config_kwargs) - elif model_args.model_name_or_path: - feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - FEATURE_EXTRACTOR_TYPES = { - conf.model_type: feature_extractor_class - for conf, feature_extractor_class in FEATURE_EXTRACTOR_MAPPING.items() - } - feature_extractor = FEATURE_EXTRACTOR_TYPES[model_args.model_type]() - - # create model - if model_args.model_name_or_path: - model = AutoModelForMaskedImageModeling.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = AutoModelForMaskedImageModeling.from_config(config) 
- - if training_args.do_train: - column_names = ds["train"].column_names - else: - column_names = ds["validation"].column_names - - if data_args.image_column_name is not None: - image_column_name = data_args.image_column_name - elif "image" in column_names: - image_column_name = "image" - elif "img" in column_names: - image_column_name = "img" - else: - image_column_name = column_names[0] - - # transformations as done in original SimMIM paper - # source: https://github.com/microsoft/SimMIM/blob/main/data/data_simmim.py - transforms = Compose( - [ - Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), - RandomResizedCrop(model_args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)), - RandomHorizontalFlip(), - ToTensor(), - Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), - ] - ) - - # create mask generator - mask_generator = MaskGenerator( - input_size=model_args.image_size, - mask_patch_size=data_args.mask_patch_size, - model_patch_size=model_args.patch_size, - mask_ratio=data_args.mask_ratio, - ) - - def preprocess_images(examples): - """Preprocess a batch of images by applying transforms + creating a corresponding mask, indicating - which patches to mask.""" - - examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]] - examples["mask"] = [mask_generator() for i in range(len(examples[image_column_name]))] - - return examples - - if training_args.do_train: - if "train" not in ds: - raise ValueError("--do_train requires a train dataset") - if data_args.max_train_samples is not None: - ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) - # Set the training transforms - ds["train"].set_transform(preprocess_images) - - if training_args.do_eval: - if "validation" not in ds: - raise ValueError("--do_eval requires a validation dataset") - if data_args.max_eval_samples is not None: - ds["validation"] = ( - ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) - ) - # Set the validation transforms - ds["validation"].set_transform(preprocess_images) - - # Initialize our trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=ds["train"] if training_args.do_train else None, - eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=feature_extractor, - data_collator=collate_fn, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Write model card and (optionally) push to hub - kwargs = { - "finetuned_from": model_args.model_name_or_path, - "tasks": "masked-image-modeling", - "dataset": data_args.dataset_name, - "tags": ["masked-image-modeling"], - } - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() \ No newline at end of file diff --git 
a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/vision-transformer-p4-fp32.ipynb b/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/vision-transformer-p4-fp32.ipynb deleted file mode 100644 index 98d748ea49..0000000000 --- a/sagemaker-training-compiler/huggingface/pytorch_multiple_gpu_single_node/vision_transformer/vision-transformer-p4-fp32.ipynb +++ /dev/null @@ -1,542 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compile and Train a Vision Transformer Model on the MNIST Dataset Using Single-Node, Multi-GPU Distributed Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. [Introduction](#Introduction)  \n", - "2. [Development Environment](#Development-Environment)\n", - " 1. [Installation](#Installation)  \n", - " 2. [SageMaker environment](#SageMaker-environment)\n", - "3. [SageMaker Training Job](#SageMaker-Training-Job)  \n", - " 1. [Training Setup](#Training-Setup)  \n", - " 2. [Training with Native PyTorch](#Training-with-Native-PyTorch)  \n", - " 3. [Training with Optimized PyTorch](#Training-with-Optimized-PyTorch)  \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SageMaker Training Compiler Overview\n", - "\n", - "SageMaker Training Compiler is a capability of SageMaker that applies hard-to-implement optimizations to reduce training time on GPU instances. The compiler optimizes DL models to accelerate training by using SageMaker machine learning (ML) GPU instances more efficiently. SageMaker Training Compiler is available at no additional charge within SageMaker and can help reduce total billable time as it accelerates training. \n", - "\n", - "SageMaker Training Compiler is integrated into the AWS Deep Learning Containers (DLCs). Using the SageMaker Training Compiler-enabled AWS DLCs, you can compile and optimize training jobs on GPU instances with minimal changes to your code. Bring your deep learning models to SageMaker and enable SageMaker Training Compiler to accelerate your training jobs on SageMaker ML instances for accelerated computing. \n", - "\n", - "For more information, see [SageMaker Training Compiler](https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html) in the *Amazon SageMaker Developer Guide*.\n", - "\n", - "## Introduction\n", - "\n", - "In this demo, you'll use Hugging Face's `transformers` and `datasets` libraries with Amazon SageMaker Training Compiler to train a Vision Transformer (`ViT`) model on the `MNIST` dataset with masked image modeling. To get started, we need to set up the environment with a few prerequisite steps, for permissions, configurations, and so on. \n", - "\n", - "**NOTE:** You can run this demo in SageMaker Studio, SageMaker notebook instances, or your local machine with AWS CLI set up. If using SageMaker Studio or SageMaker notebook instances, make sure you choose one of the PyTorch-based kernels, `Python 3 (PyTorch x.y Python 3.x CPU Optimized)` or `conda_pytorch_p38` respectively.\n", - "\n", - "**NOTE:** This notebook uses a single `ml.p4d.24xlarge` instance, which has 8 GPUs. 
If you don't have enough quota, see [Request a service quota increase for SageMaker resources](https://docs.aws.amazon.com/sagemaker/latest/dg/regions-quotas.html#service-limit-increase-request-procedure). " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Development Environment " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Installation\n", - "\n", - "This example notebook requires the **SageMaker Python SDK v2.70.0** and **transformers v4.11.0**." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", - "Requirement already satisfied: sagemaker in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (2.103.0)\n", - "Requirement already satisfied: botocore in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (1.24.19)\n", - "Collecting botocore\n", - " Downloading botocore-1.27.52-py3-none-any.whl (9.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.0/9.0 MB\u001b[0m \u001b[31m84.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: boto3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (1.24.48)\n", - "Collecting boto3\n", - " Downloading boto3-1.24.52-py3-none-any.whl (132 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.5/132.5 KB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: awscli in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (1.25.48)\n", - "Collecting awscli\n", - " Downloading awscli-1.25.52-py3-none-any.whl (3.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: attrs<22,>=20.3.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (21.2.0)\n", - "Requirement already satisfied: numpy<2.0,>=1.9.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (1.21.2)\n", - "Requirement already satisfied: smdebug-rulesconfig==1.0.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (1.0.1)\n", - "Requirement already satisfied: pathos in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (0.2.8)\n", - "Requirement already satisfied: google-pasta in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (0.2.0)\n", - "Requirement already satisfied: pandas in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (1.3.4)\n", - "Requirement already satisfied: protobuf3-to-dict<1.0,>=0.1.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (0.1.5)\n", - "Requirement already satisfied: importlib-metadata<5.0,>=1.4.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (4.8.2)\n", - "Requirement already satisfied: protobuf<4.0,>=3.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (3.19.4)\n", - 
"Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from sagemaker) (21.3)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from botocore) (2.8.2)\n", - "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from botocore) (1.26.8)\n", - "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from botocore) (0.10.0)\n", - "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from boto3) (0.6.0)\n", - "Requirement already satisfied: colorama<0.4.5,>=0.2.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from awscli) (0.4.3)\n", - "Requirement already satisfied: docutils<0.17,>=0.10 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from awscli) (0.15.2)\n", - "Requirement already satisfied: PyYAML<5.5,>=3.10 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from awscli) (5.4.1)\n", - "Requirement already satisfied: rsa<4.8,>=3.1.2 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from awscli) (4.7.2)\n", - "Requirement already satisfied: zipp>=0.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from importlib-metadata<5.0,>=1.4.0->sagemaker) (3.6.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from packaging>=20.0->sagemaker) (3.0.6)\n", - "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from protobuf3-to-dict<1.0,>=0.1.5->sagemaker) (1.16.0)\n", - "Requirement already satisfied: pyasn1>=0.1.3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from rsa<4.8,>=3.1.2->awscli) (0.4.8)\n", - "Requirement already satisfied: pytz>=2017.3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pandas->sagemaker) (2021.3)\n", - "Requirement already satisfied: dill>=0.3.4 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pathos->sagemaker) (0.3.4)\n", - "Requirement already satisfied: multiprocess>=0.70.12 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pathos->sagemaker) (0.70.12.2)\n", - "Requirement already satisfied: pox>=0.3.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pathos->sagemaker) (0.3.0)\n", - "Requirement already satisfied: ppft>=1.6.6.4 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pathos->sagemaker) (1.6.6.4)\n", - "Installing collected packages: botocore, boto3, awscli\n", - " Attempting uninstall: botocore\n", - " Found existing installation: botocore 1.24.19\n", - " Uninstalling botocore-1.24.19:\n", - " Successfully uninstalled botocore-1.24.19\n", - " Attempting uninstall: boto3\n", - " Found existing installation: boto3 1.24.48\n", - " Uninstalling boto3-1.24.48:\n", - " Successfully uninstalled boto3-1.24.48\n", - " Attempting uninstall: awscli\n", - " Found existing installation: awscli 1.25.48\n", - " Uninstalling awscli-1.25.48:\n", - " Successfully uninstalled awscli-1.25.48\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into 
account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.27.52 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed awscli-1.25.52 boto3-1.24.52 botocore-1.27.52\n", - "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n", - "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install sagemaker botocore boto3 awscli --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", - "Collecting transformers\n", - " Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.7/4.7 MB\u001b[0m \u001b[31m52.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting datasets\n", - " Downloading datasets-2.4.0-py3-none-any.whl (365 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m365.7/365.7 KB\u001b[0m \u001b[31m68.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (21.3)\n", - "Requirement already satisfied: requests in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (2.26.0)\n", - "Requirement already satisfied: numpy>=1.17 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (1.21.2)\n", - "Requirement already satisfied: regex!=2019.12.17 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (2021.11.10)\n", - "Collecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", - " Downloading tokenizers-0.12.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m49.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (4.62.3)\n", - "Requirement already satisfied: pyyaml>=5.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (5.4.1)\n", - "Collecting huggingface-hub<1.0,>=0.1.0\n", - " Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.5/101.5 KB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: filelock in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from transformers) (3.4.0)\n", - "Requirement already satisfied: multiprocess in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (0.70.12.2)\n", - "Requirement already satisfied: aiohttp in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (3.8.1)\n", - "Collecting responses<0.19\n", - " 
Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Requirement already satisfied: pandas in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (1.3.4)\n", - "Requirement already satisfied: fsspec[http]>=2021.11.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (2021.11.1)\n", - "Requirement already satisfied: pyarrow>=6.0.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (7.0.0)\n", - "Requirement already satisfied: dill<0.3.6 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from datasets) (0.3.4)\n", - "Collecting xxhash\n", - " Downloading xxhash-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.1/212.1 KB\u001b[0m \u001b[31m52.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (4.0.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from packaging>=20.0->transformers) (3.0.6)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from requests->transformers) (2021.10.8)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from requests->transformers) (2.0.7)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from requests->transformers) (1.26.8)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from requests->transformers) (3.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (21.2.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (5.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (1.2.0)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from aiohttp->datasets) (1.7.2)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from pandas->datasets) (2021.3)\n", - "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.16.0)\n", - "Installing collected packages: tokenizers, xxhash, 
responses, huggingface-hub, transformers, datasets\n", - "Successfully installed datasets-2.4.0 huggingface-hub-0.8.1 responses-0.18.0 tokenizers-0.12.1 transformers-4.21.1 xxhash-3.0.0\n", - "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n", - "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -U transformers datasets --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker: 2.103.0\n", - "transformers: 4.21.1\n" - ] - } - ], - "source": [ - "import botocore\n", - "import boto3\n", - "import sagemaker\n", - "import transformers\n", - "import pandas as pd\n", - "\n", - "print(f\"sagemaker: {sagemaker.__version__}\")\n", - "print(f\"transformers: {transformers.__version__}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy and run the following code if you need to upgrade ipywidgets for `datasets` library and restart kernel. This is only needed when prerpocessing is done in the notebook.\n", - "\n", - "```python\n", - "%%capture\n", - "import IPython\n", - "!conda install -c conda-forge ipywidgets -y\n", - "# has to restart kernel for the updates to be applied\n", - "IPython.Application.instance().kernel.do_shutdown(True) \n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SageMaker environment " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker role arn: arn:aws:iam::875423407011:role/SageMakerRole\n", - "sagemaker bucket: sagemaker-us-west-2-875423407011\n", - "sagemaker session region: us-west-2\n" - ] - } - ], - "source": [ - "import sagemaker\n", - "\n", - "sess = sagemaker.Session()\n", - "\n", - "# SageMaker session bucket -> used for uploading data, models and logs\n", - "# SageMaker will automatically create this bucket if it does not exist\n", - "sagemaker_session_bucket = None\n", - "if sagemaker_session_bucket is None and sess is not None:\n", - " # set to default bucket if a bucket name is not given\n", - " sagemaker_session_bucket = sess.default_bucket()\n", - "\n", - "role = sagemaker.get_execution_role()\n", - "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", - "\n", - "print(f\"sagemaker role arn: {role}\")\n", - "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", - "print(f\"sagemaker session region: {sess.boto_region_name}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SageMaker Training Job\n", - "\n", - "To create a SageMaker training job, we use a `HuggingFace` estimator. 
Using the estimator, you can define which training script SageMaker should use through `entry_point`, which `instance_type` to use for training, which `hyperparameters` to pass, and so on.\n", - "\n", - "When a SageMaker training job starts, SageMaker takes care of starting and managing all the required machine learning instances, picks up the `HuggingFace` Deep Learning Container, uploads your training script, and downloads the data from `sagemaker_session_bucket` into the container at `/opt/ml/input/data`.\n", - "\n", - "In the following section, you learn how to set up two versions of the SageMaker `HuggingFace` estimator, a native one without the compiler and an optimized one with the compiler." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up the basic training configuration: the number of training epochs, the GPU instance type to train on, and the number of GPUs used when scaling the learning rate with the global batch size." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "EPOCHS = 1\n", - "\n", - "# SageMaker Training Compiler currently only supports training on GPU\n", - "# Select Instance type for training\n", - "INSTANCE_TYPE = \"ml.p4d.24xlarge\"\n", - "NUM_GPUS = 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training with Native PyTorch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `PER_DEVICE_BATCH_SIZE` in the following code cells is the largest batch that fits into the memory of a single GPU on the `ml.p4d.24xlarge` instance used here. If you change the model, instance type, or other parameters, you need to experiment to find the largest batch size that fits into GPU memory." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.huggingface import HuggingFace\n", - "\n", - "kwargs = dict(\n", - " source_dir=\"scripts\",\n", - " instance_type=INSTANCE_TYPE,\n", - " role=role,\n", - " py_version=\"py38\",\n", - " disable_profiler=True,\n", - " debugger_hook_config=False,\n", - " volume_size=60,\n", - ")\n", - "\n", - "PER_DEVICE_BATCH_SIZE=248\n", - "cluster_size=1\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "200 pr-huggingface-pytorch-training-2022-08-23-21-10-36-273\n", - "208 pr-huggingface-pytorch-training-2022-08-23-21-10-36-995\n", - "216 pr-huggingface-pytorch-training-2022-08-23-21-10-40-499\n", - "224 pr-huggingface-pytorch-training-2022-08-23-21-10-41-041\n", - "232 pr-huggingface-pytorch-training-2022-08-23-21-10-44-361\n", - "240 pr-huggingface-pytorch-training-2022-08-23-21-10-45-961\n", - "248 pr-huggingface-pytorch-training-2022-08-23-21-10-47-342\n", - "256 pr-huggingface-pytorch-training-2022-08-23-21-10-49-527\n", - "264 pr-huggingface-pytorch-training-2022-08-23-21-10-53-981\n", - "272 pr-huggingface-pytorch-training-2022-08-23-21-10-54-513\n" - ] - } - ], - "source": [ - "from sagemaker.huggingface import HuggingFace\n", - "\n", - "\n", - "# The original LR was set for a batch of 8. 
Here we are scaling learning rate with batch size.\n", - "GLOBAL_BATCH_SIZE = PER_DEVICE_BATCH_SIZE * NUM_GPUS * cluster_size\n", - "LEARNING_RATE = float(\"2e-5\") / 8 * GLOBAL_BATCH_SIZE\n", - "\n", - "# configure the training job\n", - "huggingface_estimator = HuggingFace(\n", - " image_uri=\"669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-huggingface-pytorch-training:1.11.0-transformers4.21.1-gpu-py38-cu113-ubuntu20.04-pr-1824-2022-08-08-10-57-02\",\n", - " instance_count=cluster_size,\n", - " entry_point='run_mim.py',\n", - " hyperparameters={\n", - " 'model_type': 'vit',\n", - " 'dataset_name': 'mnist',\n", - " 'output_dir': '/opt/ml/model',\n", - " 'overwrite_output_dir': True,\n", - " 'remove_unused_columns': 'False',\n", - " 'label_names' : 'bool_masked_pos',\n", - " 'do_train': True,\n", - " 'do_eval': False,\n", - " 'learning_rate': LEARNING_RATE,\n", - " 'weight_decay': 0.05,\n", - " 'num_train_epochs': EPOCHS,\n", - " 'per_device_train_batch_size': PER_DEVICE_BATCH_SIZE,\n", - " 'per_device_eval_batch_size': PER_DEVICE_BATCH_SIZE,\n", - " 'logging_strategy': 'epoch',\n", - " 'evaluation_strategy': 'no',\n", - " 'save_strategy': 'no',\n", - " 'save_total_limit': 3,\n", - " },\n", - " distribution={'smdistributed': {'dataparallel': {'enabled': True}}},\n", - " **kwargs,\n", - ")\n", - "\n", - "# start training with our uploaded datasets as input\n", - "huggingface_estimator.fit(wait=False)\n", - "\n", - "# The name of the training job.\n", - "print(huggingface_estimator.latest_training_job.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training with Optimized PyTorch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compilation through Training Compiler changes the memory footprint of the model. Most commonly, this manifests as a reduction in memory utilization and a consequent increase in the largest batch size that can fit on the GPU. Note that if you want to change the batch size, you must adjust the learning rate appropriately.\n", - "\n", - "**Note:** We recommend you to turn the SageMaker Debugger's profiling and debugging tools off when you use compilation to avoid additional overheads." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "248 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-40-712\n", - "256 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-41-485\n", - "264 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-44-498\n", - "272 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-46-143\n", - "280 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-46-682\n", - "288 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-51-186\n", - "296 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-52-597\n", - "304 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-53-330\n", - "312 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-56-047\n", - "320 pr-huggingface-pytorch-trcomp-training-2022-08-23-21-45-56-676\n" - ] - } - ], - "source": [ - "from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig\n", - "TrainingCompilerConfig.validate = lambda *args, **kwargs:None\n", - "\n", - "NEW_PER_DEVICE_BATCH_SIZE=248\n", - "cluster_size=1\n", - "\n", - "# The original LR was set for a batch of 8. 
Here we are scaling learning rate with batch size.\n", - "GLOBAL_BATCH_SIZE = NEW_PER_DEVICE_BATCH_SIZE * NUM_GPUS * cluster_size\n", - "LEARNING_RATE = float(\"2e-5\") / 8 * GLOBAL_BATCH_SIZE\n", - "\n", - "# configure the training job\n", - "optimized_estimator = HuggingFace(\n", - " image_uri=\"669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-huggingface-pytorch-trcomp-training:1.11.0-transformers4.21.1-gpu-py38-cu113-ubuntu20.04-pr-2032-2022-08-19-18-27-39\",\n", - " compiler_config=TrainingCompilerConfig(),\n", - " instance_count=cluster_size,\n", - " entry_point='run_mim.py',\n", - " hyperparameters={\n", - " 'model_type': 'vit',\n", - " 'dataset_name': 'mnist',\n", - " 'output_dir': '/opt/ml/model',\n", - " 'overwrite_output_dir': True,\n", - " 'remove_unused_columns': 'False',\n", - " 'label_names' : 'bool_masked_pos',\n", - " 'do_train': True,\n", - " 'do_eval': False,\n", - " 'learning_rate': LEARNING_RATE,\n", - " 'weight_decay': 0.05,\n", - " 'num_train_epochs': EPOCHS,\n", - " 'per_device_train_batch_size': NEW_PER_DEVICE_BATCH_SIZE,\n", - " 'per_device_eval_batch_size': PER_DEVICE_BATCH_SIZE,\n", - " 'logging_strategy': 'epoch',\n", - " 'evaluation_strategy': 'no',\n", - " 'save_strategy': 'no',\n", - " 'save_total_limit': 3,\n", - " 'sagemaker_pytorch_xla_multi_worker_enabled': True,\n", - " },\n", - " **kwargs,\n", - ")\n", - "\n", - "# start training with our uploaded datasets as input\n", - "optimized_estimator.fit(wait=False)\n", - "\n", - "# The name of the training job.\n", - "print(optimized_estimator.latest_training_job.name)" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "interpreter": { - "hash": "c281c456f1b8161c8906f4af2c08ed2c40c50136979eaae69688b01f70e9f4a9" - }, - "kernelspec": { - "display_name": "conda_pytorch_p38", - "language": "python", - "name": "conda_pytorch_p38" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
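The removed notebook launches both the native and the compiler-optimized jobs asynchronously (`fit(wait=False)`) and only prints the job names, leaving any comparison of the two runs to the reader. Below is a minimal sketch of such a comparison, assuming both jobs have completed; the `job_times` helper and the use of boto3's `describe_training_job` are the editor's illustration, not code from the removed notebook.

```python
import boto3

sm_client = boto3.client("sagemaker")


def job_times(job_name):
    """Return (training seconds, billable seconds) for a completed training job."""
    desc = sm_client.describe_training_job(TrainingJobName=job_name)
    return desc.get("TrainingTimeInSeconds"), desc.get("BillableTimeInSeconds")


# huggingface_estimator and optimized_estimator come from the notebook cells above
native_time, native_billable = job_times(huggingface_estimator.latest_training_job.name)
optimized_time, optimized_billable = job_times(optimized_estimator.latest_training_job.name)

print(f"Native PyTorch    - training: {native_time}s, billable: {native_billable}s")
print(f"Training Compiler - training: {optimized_time}s, billable: {optimized_billable}s")
if native_billable and optimized_billable:
    savings = 100 * (1 - optimized_billable / native_billable)
    print(f"Billable time reduction with SageMaker Training Compiler: {savings:.1f}%")
```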