From f861504466bf19c607e3c8407be6194a565afc00 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 10 May 2022 08:22:42 -0700 Subject: [PATCH] [Deepspeed] add many more models to the model zoo test (#12695) * model zoo take 2 * add deberta * new param for zero2 * doc update * doc update * add layoutlm * bump deepspeed * add deberta-v2, funnel, longformer * new models * style * add t5_v1 * update TAPAS status * reorg problematic models * move doc to another PR * style * fix checkpoint check test * making progress on more models running * cleanup * new version * cleanup --- setup.py | 4 +- src/transformers/dependency_versions_table.py | 2 +- tests/deepspeed/test_deepspeed.py | 7 +- tests/deepspeed/test_model_zoo.py | 160 ++++++++++++++---- 4 files changed, 131 insertions(+), 42 deletions(-) diff --git a/setup.py b/setup.py index 386d1c25c51dcb..7ff6a2cf1c78d4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the documentation. - + If releasing on a special branch, copy the updated README.md on the main branch for your the commit you will make for the post-release and run `make fix-copies` on the main branch as well. @@ -102,7 +102,7 @@ "cookiecutter==1.7.3", "dataclasses", "datasets", - "deepspeed>=0.6.0", + "deepspeed>=0.6.4", "fairscale>0.3", "faiss-cpu", "fastapi", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index c6ae12ff84e8f8..0de926adaf8f7a 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -9,7 +9,7 @@ "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.6.0", + "deepspeed": "deepspeed>=0.6.4", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 9fba62815b0134..03896fca8fef19 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -522,7 +522,7 @@ def test_gradient_accumulation(self, stage, dtype): # see the note above how to get identical loss on a small bs self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) - def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): + def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype): # adapted from TrainerIntegrationCommon.check_saved_checkpoints file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] @@ -534,7 +534,8 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): else: raise ValueError(f"unknown stage {stage}") - ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt") + if dtype == "bf16": + ds_file_list.append("bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt") for step in range(freq, total, freq): checkpoint = os.path.join(output_dir, f"checkpoint-{step}") @@ -578,7 +579,7 @@ def test_save_checkpoints(self, stage, dtype): trainer.train() total = int(self.n_epochs * 64 / self.batch_size) - self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) + self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage, dtype) @parameterized.expand(params, name_func=parameterized_custom_name_func) def test_can_resume_training_errors(self, stage, dtype): diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 94559604935a7e..189452c609d361 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -42,51 +42,99 @@ set_seed(42) +FIXTURE_DIRECTORY = get_tests_dir("fixtures") +ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir())) +DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__)) + # default torch.distributed port DEFAULT_MASTER_PORT = "10999" -# translation -FSMT_TINY = "stas/tiny-wmt19-en-de" -BART_TINY = "sshleifer/bart-tiny-random" T5_SMALL = "t5-small" -T5_TINY = "patrickvonplaten/t5-tiny-random" -MBART_TINY = "sshleifer/tiny-mbart" -MARIAN_TINY = "sshleifer/tiny-marian-en-de" - -# summarization -PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random" -# causal lm +# *** Working Models *** +ALBERT_TINY = "hf-internal-testing/tiny-albert" +BART_TINY = "sshleifer/bart-tiny-random" +BERT_TINY = "hf-internal-testing/tiny-bert" +BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus" +BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird" +BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot" +DEBERTA_TINY = "hf-internal-testing/tiny-random-deberta" +DEBERTA_V2_TINY = "hf-internal-testing/tiny-random-deberta-v2" +DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased" +ELECTRA_TINY = "hf-internal-testing/tiny-electra" +FLAUBERT_TINY = "hf-internal-testing/tiny-random-flaubert" +FSMT_TINY = "stas/tiny-wmt19-en-de" +FUNNEL_TINY = "hf-internal-testing/tiny-random-funnel" GPT2_TINY = "sshleifer/tiny-gpt2" +GPTJ_TINY = "hf-internal-testing/tiny-random-gptj" +GPT_NEO_TINY = "hf-internal-testing/tiny-random-gpt_neo" +LAYOUTLM_TINY = "hf-internal-testing/tiny-layoutlm" +LED_TINY = "hf-internal-testing/tiny-random-led" +LONGFORMER_TINY = "hf-internal-testing/tiny-random-longformer" +M2M_100_TINY = "stas/tiny-m2m_100" # hf tiny model is unsuitable +MARIAN_TINY = "sshleifer/tiny-marian-en-de" +MBART_TINY = "sshleifer/tiny-mbart" +MOBILEBERT_TINY = "hf-internal-testing/tiny-random-mobilebert" +MPNET_TINY = "hf-internal-testing/tiny-random-mpnet" +PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random" +PROPHETNET_TINY = "hf-internal-testing/tiny-random-prophetnet" +ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" +SQUEEZEBERT_TINY = "hf-internal-testing/tiny-random-squeezebert" +T5_TINY = "patrickvonplaten/t5-tiny-random" +T5_V1_TINY = "hf-internal-testing/tiny-random-t5-v1.1" +VIT_TINY = "hf-internal-testing/tiny-random-vit" XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta" +XLNET_TINY = "sshleifer/tiny-xlnet-base-cased" -# question-answering -ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" -# masked lm -DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased" -ELECTRA_TINY = "hf-internal-testing/tiny-electra" +# *** To Fix *** -# classification -XLNET_TINY = "sshleifer/tiny-xlnet-base-cased" -BERT_TINY = "hf-internal-testing/tiny-bert" -FIXTURE_DIRECTORY = get_tests_dir("fixtures") -ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir())) +# *** tiny model issues *** +# missing model files: +MT5_TINY = "hf-internal-testing/tiny-random-mt5" +CAMEMBERT_TINY = "hf-internal-testing/tiny-random-camembert" +OPENAI_GPT_TINY = "hf-internal-testing/tiny-random-openai-gpt" + +# missing tokenizer files +CONVBERT_TINY = "hf-internal-testing/tiny-random-convbert" +LAYOUTLMV2_TINY = "hf-internal-testing/tiny-random-layoutlmv2" +HUBERT_TINY = "hf-internal-testing/tiny-random-hubert" + +# issues with tokenizer +CTRL_TINY = "hf-internal-testing/tiny-random-ctrl" +TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl" # same as ctrl -# TODO: to add: -# albert -# deberta -# funnel -# longformer -# dpr -# gpt_neo -# camembert -# deberta-v2 -# m2m_100 -# tapas -# vit -# big_bird +# other issues with tiny models +IBERT_TINY = "hf-internal-testing/tiny-random-ibert" # multiple issues with either mlm/qa/clas +REFORMER_TINY = "hf-internal-testing/tiny-random-reformer" # multiple issues with either mlm/qa/clas + +# *** Lacking official examples to test with *** +# or not working with examples +DPR_TINY = "hf-internal-testing/tiny-random-dpr" +# - "dpr" examples/research_projects/rag-end2end-retriever/ +RAG_TINY = "hf-internal-testing/tiny-random-rag" +# - "rag" research_projects +LUKE_TINY = "" +# - "luke" Entities classes - no plan to make such example +LXMERT_TINY = "hf-internal-testing/tiny-random-lxmert" +# - "lxmert" doesn't work with run_qa.py +CLIP_TINY = "hf-internal-testing/tiny-random-clip" +# - "clip" nothing under pytorch examples - XXX: Suraj is working on adding some - check by end of Sep +SPEECH_TO_TEXT_TINY = "hf-internal-testing/tiny-random-speech_to_text" +# - "speech_to_text", nothing under pytorch examples + + +# *** Reactive mode *** +# models with low usage, unstable API, things about to change - do nothing about the following until someone runs into a problem +TAPAS_TINY = "hf-internal-testing/tiny-random-tapas" +# additional notes on tapas +# 1. requires torch_scatter - skip if it's not installed? +# 2. "Table must be of type pd.DataFrame" failure + + +# TODO: new models to add: +# def get_launcher(distributed=False): @@ -113,35 +161,68 @@ def make_task_cmds(): --overwrite_output_dir """.split() - # XXX: try to cover as many models as possible once (it's enough to run on one task per model) + # try to cover as many models as possible once (it's enough to run on one task per model) # but need a tiny model for each # - # should have T5_TINY, etc. global var defined + # should have "{model_type.upper()}_TINY" corresponding vars defined, e.g., T5_TINY, etc. tasks2models = dict( trans=[ "bart", "fsmt", + "m2m_100", "marian", "mbart", "t5", + "t5_v1", + # "mt5", missing model files ], sum=[ "pegasus", ], clm=[ + "big_bird", + "bigbird_pegasus", + "blenderbot", "gpt2", + "gpt_neo", + "gptj", "xlm-roberta", + "prophetnet", + # "camembert", missing model files ], mlm=[ - "electra", + "albert", + "deberta", + "deberta-v2", "distilbert", + "electra", + "flaubert", + "funnel", + "layoutlm", + # "reformer", # multiple issues with either mlm/qa/clas ], qa=[ + "led", + "longformer", + "mobilebert", + "mpnet", "roberta", + "squeezebert", + # "convbert", # missing tokenizer files + # "layoutlmv2", missing model files ], clas=[ "bert", "xlnet", + # "hubert", # missing tokenizer files + # "ibert", # multiple issues with either mlm/qa/clas + # "transfo-xl", # tokenizer issues as ctrl + # "ctrl", # tokenizer issues + # "openai-gpt", missing model files + # "tapas", multiple issues + ], + img_clas=[ + "vit", ], ) @@ -180,6 +261,13 @@ def make_task_cmds(): --max_seq_length 12 --task_name MRPC """, + img_clas=f""" + {scripts_dir}/image-classification/run_image_classification.py + --dataset_name hf-internal-testing/cats_vs_dogs_sample + --remove_unused_columns False + --max_steps 10 + --feature_extractor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json + """, ) launcher = get_launcher(distributed=True)