[Deepspeed] add many more models to the model zoo test (huggingface#12695)

* model zoo take 2

* add deberta

* new param for zero2

* doc update

* doc update

* add layoutlm

* bump deepspeed

* add deberta-v2, funnel, longformer

* new models

* style

* add t5_v1

* update TAPAS status

* reorg problematic models

* move doc to another PR

* style

* fix checkpoint check test

* making progress on more models running

* cleanup

* new version

* cleanup
stas00 authored May 10, 2022
1 parent 9aeacfe commit f861504
Showing 4 changed files with 131 additions and 42 deletions.
4 changes: 2 additions & 2 deletions setup.py
@@ -19,7 +19,7 @@
 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
    documentation.
 If releasing on a special branch, copy the updated README.md on the main branch for your the commit you will make
 for the post-release and run `make fix-copies` on the main branch as well.
@@ -102,7 +102,7 @@
     "cookiecutter==1.7.3",
     "dataclasses",
     "datasets",
-    "deepspeed>=0.6.0",
+    "deepspeed>=0.6.4",
     "fairscale>0.3",
     "faiss-cpu",
     "fastapi",
2 changes: 1 addition & 1 deletion src/transformers/dependency_versions_table.py
@@ -9,7 +9,7 @@
     "cookiecutter": "cookiecutter==1.7.3",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.6.0",
+    "deepspeed": "deepspeed>=0.6.4",
     "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
     "fastapi": "fastapi",
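Note that the floor is raised in two places that must stay in sync: the `deepspeed>=0.6.4` spec in `setup.py` and the generated `dependency_versions_table.py`, which transformers consults for runtime version checks. A quick way to confirm an environment meets the new pin — a minimal sketch using the standard library and `packaging`, not the repo's own helper:

```python
# Minimal sketch: check the installed deepspeed against the bumped pin.
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("deepspeed"))
required = Version("0.6.4")  # the new floor in this commit
if installed < required:
    raise RuntimeError(f"deepspeed {installed} is older than required {required}")
print(f"OK: deepspeed {installed} >= {required}")
```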
7 changes: 4 additions & 3 deletions tests/deepspeed/test_deepspeed.py
@@ -522,7 +522,7 @@ def test_gradient_accumulation(self, stage, dtype):
         # see the note above how to get identical loss on a small bs
         self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
 
-    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage):
+    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
         # adapted from TrainerIntegrationCommon.check_saved_checkpoints
 
         file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"]
@@ -534,7 +534,8 @@ def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage):
         else:
             raise ValueError(f"unknown stage {stage}")
 
-        ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt")
+        if dtype == "bf16":
+            ds_file_list.append("bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt")
 
         for step in range(freq, total, freq):
             checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
@@ -578,7 +579,7 @@ def test_save_checkpoints(self, stage, dtype):
         trainer.train()
 
         total = int(self.n_epochs * 64 / self.batch_size)
-        self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)
+        self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage, dtype)
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
     def test_can_resume_training_errors(self, stage, dtype):
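For context on the bf16 change above: with ZeRO, DeepSpeed writes optimizer-state shards under each `checkpoint-<step>` directory, and when training in bf16 the shard filename gains a `bf16_` prefix — which is what the new `dtype` argument lets the test assert. A standalone sketch of the same existence check (the `global_step<step>` subdirectory and the zero2/zero3 model-state filenames are assumptions based on this test file, not guaranteed DeepSpeed API):

```python
# Sketch: reproduce the checkpoint-layout check outside the test suite.
import os


def expected_ds_files(stage: str, dtype: str) -> list:
    # model-state shard names assumed from the test's zero2/zero3 branches
    if stage == "zero2":
        files = ["mp_rank_00_model_states.pt"]
    elif stage == "zero3":
        files = ["zero_pp_rank_0_mp_rank_00_model_states.pt"]
    else:
        raise ValueError(f"unknown stage {stage}")
    if dtype == "bf16":
        # this commit: bf16 runs prefix the ZeRO optimizer-states file
        files.append("bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt")
    return files


def checkpoint_complete(output_dir: str, step: int, stage: str, dtype: str) -> bool:
    ckpt_dir = os.path.join(output_dir, f"checkpoint-{step}")
    ds_dir = os.path.join(ckpt_dir, f"global_step{step}")  # assumed layout
    return all(
        os.path.isfile(os.path.join(ds_dir, f))
        for f in expected_ds_files(stage, dtype)
    )
```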
160 changes: 124 additions & 36 deletions tests/deepspeed/test_model_zoo.py
@@ -42,51 +42,99 @@
 
 set_seed(42)
 
+FIXTURE_DIRECTORY = get_tests_dir("fixtures")
+ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir()))
+DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__))
+
 # default torch.distributed port
 DEFAULT_MASTER_PORT = "10999"
 
-# translation
-FSMT_TINY = "stas/tiny-wmt19-en-de"
-BART_TINY = "sshleifer/bart-tiny-random"
 T5_SMALL = "t5-small"
-T5_TINY = "patrickvonplaten/t5-tiny-random"
-MBART_TINY = "sshleifer/tiny-mbart"
-MARIAN_TINY = "sshleifer/tiny-marian-en-de"
-
-# summarization
-PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random"
 
-# causal lm
+# *** Working Models ***
+ALBERT_TINY = "hf-internal-testing/tiny-albert"
+BART_TINY = "sshleifer/bart-tiny-random"
+BERT_TINY = "hf-internal-testing/tiny-bert"
+BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus"
+BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird"
+BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot"
+DEBERTA_TINY = "hf-internal-testing/tiny-random-deberta"
+DEBERTA_V2_TINY = "hf-internal-testing/tiny-random-deberta-v2"
+DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased"
+ELECTRA_TINY = "hf-internal-testing/tiny-electra"
+FLAUBERT_TINY = "hf-internal-testing/tiny-random-flaubert"
+FSMT_TINY = "stas/tiny-wmt19-en-de"
+FUNNEL_TINY = "hf-internal-testing/tiny-random-funnel"
 GPT2_TINY = "sshleifer/tiny-gpt2"
+GPTJ_TINY = "hf-internal-testing/tiny-random-gptj"
+GPT_NEO_TINY = "hf-internal-testing/tiny-random-gpt_neo"
+LAYOUTLM_TINY = "hf-internal-testing/tiny-layoutlm"
+LED_TINY = "hf-internal-testing/tiny-random-led"
+LONGFORMER_TINY = "hf-internal-testing/tiny-random-longformer"
+M2M_100_TINY = "stas/tiny-m2m_100"  # hf tiny model is unsuitable
+MARIAN_TINY = "sshleifer/tiny-marian-en-de"
+MBART_TINY = "sshleifer/tiny-mbart"
+MOBILEBERT_TINY = "hf-internal-testing/tiny-random-mobilebert"
+MPNET_TINY = "hf-internal-testing/tiny-random-mpnet"
+PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random"
+PROPHETNET_TINY = "hf-internal-testing/tiny-random-prophetnet"
+ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
+SQUEEZEBERT_TINY = "hf-internal-testing/tiny-random-squeezebert"
+T5_TINY = "patrickvonplaten/t5-tiny-random"
+T5_V1_TINY = "hf-internal-testing/tiny-random-t5-v1.1"
+VIT_TINY = "hf-internal-testing/tiny-random-vit"
 XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta"
+XLNET_TINY = "sshleifer/tiny-xlnet-base-cased"
 
-# question-answering
-ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
-
-# masked lm
-DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased"
-ELECTRA_TINY = "hf-internal-testing/tiny-electra"
+# *** To Fix ***
 
-# classification
-XLNET_TINY = "sshleifer/tiny-xlnet-base-cased"
-BERT_TINY = "hf-internal-testing/tiny-bert"
 
-FIXTURE_DIRECTORY = get_tests_dir("fixtures")
-ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir()))
+# *** tiny model issues ***
+# missing model files:
+MT5_TINY = "hf-internal-testing/tiny-random-mt5"
+CAMEMBERT_TINY = "hf-internal-testing/tiny-random-camembert"
+OPENAI_GPT_TINY = "hf-internal-testing/tiny-random-openai-gpt"
+
+# missing tokenizer files
+CONVBERT_TINY = "hf-internal-testing/tiny-random-convbert"
+LAYOUTLMV2_TINY = "hf-internal-testing/tiny-random-layoutlmv2"
+HUBERT_TINY = "hf-internal-testing/tiny-random-hubert"
+
+# issues with tokenizer
+CTRL_TINY = "hf-internal-testing/tiny-random-ctrl"
+TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl"  # same as ctrl
+
-# TODO: to add:
-# albert
-# deberta
-# funnel
-# longformer
-# dpr
-# gpt_neo
-# camembert
-# deberta-v2
-# m2m_100
-# tapas
-# vit
-# big_bird
+# other issues with tiny models
+IBERT_TINY = "hf-internal-testing/tiny-random-ibert"  # multiple issues with either mlm/qa/clas
+REFORMER_TINY = "hf-internal-testing/tiny-random-reformer"  # multiple issues with either mlm/qa/clas
 
+# *** Lacking official examples to test with ***
+# or not working with examples
+DPR_TINY = "hf-internal-testing/tiny-random-dpr"
+# - "dpr" examples/research_projects/rag-end2end-retriever/
+RAG_TINY = "hf-internal-testing/tiny-random-rag"
+# - "rag" research_projects
+LUKE_TINY = ""
+# - "luke" Entities classes - no plan to make such example
+LXMERT_TINY = "hf-internal-testing/tiny-random-lxmert"
+# - "lxmert" doesn't work with run_qa.py
+CLIP_TINY = "hf-internal-testing/tiny-random-clip"
+# - "clip" nothing under pytorch examples - XXX: Suraj is working on adding some - check by end of Sep
+SPEECH_TO_TEXT_TINY = "hf-internal-testing/tiny-random-speech_to_text"
+# - "speech_to_text", nothing under pytorch examples
+
+
+# *** Reactive mode ***
+# models with low usage, unstable API, things about to change - do nothing about the following until someone runs into a problem
 TAPAS_TINY = "hf-internal-testing/tiny-random-tapas"
+# additional notes on tapas
+# 1. requires torch_scatter - skip if it's not installed?
+# 2. "Table must be of type pd.DataFrame" failure
 
 
+# TODO: new models to add:
+#
 
 
 def get_launcher(distributed=False):
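All of the ids above point at deliberately tiny, randomly initialized checkpoints: full architectures with shrunken dimensions, so a forward/backward pass under DeepSpeed finishes in seconds. A hedged illustration of what loading one looks like (uses `T5_TINY` from the list; any of the working models would do):

```python
# Sketch: a "tiny" checkpoint is a full architecture with tiny dims.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random")  # T5_TINY
model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random")
print(f"{model.num_parameters():,} parameters")  # orders of magnitude below t5-small
```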
@@ -113,35 +161,68 @@ def make_task_cmds():
         --overwrite_output_dir
         """.split()
 
-    # XXX: try to cover as many models as possible once (it's enough to run on one task per model)
+    # try to cover as many models as possible once (it's enough to run on one task per model)
     # but need a tiny model for each
     #
-    # should have T5_TINY, etc. global var defined
+    # should have "{model_type.upper()}_TINY" corresponding vars defined, e.g., T5_TINY, etc.
     tasks2models = dict(
         trans=[
             "bart",
             "fsmt",
+            "m2m_100",
             "marian",
             "mbart",
             "t5",
+            "t5_v1",
+            # "mt5", missing model files
         ],
         sum=[
             "pegasus",
         ],
         clm=[
+            "big_bird",
+            "bigbird_pegasus",
+            "blenderbot",
             "gpt2",
+            "gpt_neo",
+            "gptj",
             "xlm-roberta",
+            "prophetnet",
+            # "camembert", missing model files
         ],
         mlm=[
-            "electra",
+            "albert",
+            "deberta",
+            "deberta-v2",
             "distilbert",
+            "electra",
+            "flaubert",
+            "funnel",
+            "layoutlm",
+            # "reformer", # multiple issues with either mlm/qa/clas
         ],
         qa=[
+            "led",
+            "longformer",
+            "mobilebert",
+            "mpnet",
             "roberta",
+            "squeezebert",
+            # "convbert", # missing tokenizer files
+            # "layoutlmv2", missing model files
         ],
         clas=[
             "bert",
             "xlnet",
+            # "hubert", # missing tokenizer files
+            # "ibert", # multiple issues with either mlm/qa/clas
+            # "transfo-xl", # tokenizer issues as ctrl
+            # "ctrl", # tokenizer issues
+            # "openai-gpt", missing model files
+            # "tapas", multiple issues
+        ],
+        img_clas=[
+            "vit",
         ],
     )
 
@@ -180,6 +261,13 @@ def make_task_cmds():
             --max_seq_length 12
             --task_name MRPC
         """,
+        img_clas=f"""
+            {scripts_dir}/image-classification/run_image_classification.py
+            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --remove_unused_columns False
+            --max_steps 10
+            --feature_extractor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
+        """,
     )
 
     launcher = get_launcher(distributed=True)
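The naming convention enforced by the comment above ("{model_type.upper()}_TINY") is what lets make_task_cmds() map each entry in tasks2models to a checkpoint without a second lookup table. A hedged sketch of that resolution step (the helper names and flag assembly here are illustrative, not a verbatim copy; the real function also prepends the deepspeed launcher returned by `get_launcher(distributed=True)` to each command):

```python
# Sketch: resolve "deberta-v2" -> DEBERTA_V2_TINY -> its hub model id,
# then splice the id into the per-task command template.
def resolve_tiny_model(model_type: str, namespace: dict) -> str:
    # hyphens in model types map to underscores in the global var names
    var = f"{model_type.upper().replace('-', '_')}_TINY"
    return namespace[var]  # e.g. globals() in the test module


def build_cmds(tasks2models: dict, tasks2cmds: dict, namespace: dict) -> dict:
    cmds = {}
    for task, models in tasks2models.items():
        for model_type in models:
            model_id = resolve_tiny_model(model_type, namespace)
            cmds[f"{task}_{model_type}"] = tasks2cmds[task] + [
                "--model_name_or_path",
                model_id,
            ]
    return cmds
```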
