From d31f742a51a86a8ae1f23dc9f07624e6df2090ad Mon Sep 17 00:00:00 2001
From: Lysandre
Date: Tue, 13 Feb 2024 16:43:23 +0100
Subject: [PATCH] Script & Manual edition

---
README.md | 16 +- README_de.md | 18 +- README_es.md | 18 +- README_fr.md | 18 +- README_hd.md | 18 +- README_ja.md | 18 +- README_ko.md | 18 +- README_pt-br.md | 18 +- README_ru.md | 18 +- README_te.md | 18 +- README_zh-hans.md | 18 +- README_zh-hant.md | 18 +- docs/source/de/add_tensorflow_model.md | 2 +- docs/source/de/autoclass_tutorial.md | 12 +- docs/source/de/installation.md | 4 +- docs/source/de/model_sharing.md | 2 +- docs/source/de/pipeline_tutorial.md | 4 +- docs/source/de/preprocessing.md | 2 +- docs/source/de/quicktour.md | 4 +- docs/source/de/run_scripts.md | 26 +-- docs/source/de/training.md | 10 +- docs/source/en/add_tensorflow_model.md | 2 +- docs/source/en/autoclass_tutorial.md | 12 +- docs/source/en/benchmarks.md | 38 ++--- docs/source/en/big_models.md | 2 +- docs/source/en/community.md | 4 +- docs/source/en/create_a_model.md | 22 +-- docs/source/en/custom_tools.md | 2 +- docs/source/en/deepspeed.md | 12 +- docs/source/en/generation_strategies.md | 20 +-- docs/source/en/glossary.md | 8 +- docs/source/en/installation.md | 2 +- docs/source/en/internal/generation_utils.md | 4 +- docs/source/en/main_classes/output.md | 4 +- docs/source/en/main_classes/pipelines.md | 2 +- docs/source/en/model_doc/auto.md | 2 +- docs/source/en/model_doc/bert-generation.md | 6 +- docs/source/en/model_doc/distilbert.md | 6 +- docs/source/en/model_doc/encoder-decoder.md | 8 +- docs/source/en/model_doc/gpt_bigcode.md | 2 +- docs/source/en/model_doc/qdqbert.md | 2 +- .../en/model_doc/speech-encoder-decoder.md | 4 +- docs/source/en/model_doc/t5.md | 34 ++-- docs/source/en/model_doc/transfo-xl.md | 4 +- .../en/model_doc/vision-encoder-decoder.md | 6 +- docs/source/en/model_doc/visual_bert.md | 2 +- docs/source/en/model_memory_anatomy.md | 4 +- docs/source/en/model_sharing.md | 2 +- docs/source/en/multilingual.md | 34 ++-- docs/source/en/perf_hardware.md | 6 +- docs/source/en/perf_infer_gpu_one.md | 4 +- docs/source/en/perf_train_cpu.md | 2 +- docs/source/en/perf_train_cpu_many.md | 6 +- docs/source/en/perf_train_gpu_many.md | 6 +- docs/source/en/perf_train_gpu_one.md | 2 +- docs/source/en/perf_train_special.md | 2 +- docs/source/en/perplexity.md | 2 +- docs/source/en/pipeline_tutorial.md | 2 +- docs/source/en/pipeline_webserver.md | 2 +- docs/source/en/preprocessing.md | 2 +- docs/source/en/quicktour.md | 12 +- docs/source/en/run_scripts.md | 26 +-- docs/source/en/serialization.md | 8 +- docs/source/en/task_summary.md | 2 +- docs/source/en/tasks/language_modeling.md | 8 +- .../en/tasks/masked_language_modeling.md | 8 +- docs/source/en/tasks/multiple_choice.md | 8 +- docs/source/en/tasks/prompting.md | 2 +- docs/source/en/tasks/question_answering.md | 8 +- .../en/tasks/sequence_classification.md | 8 +- docs/source/en/tasks/summarization.md | 4 +- docs/source/en/tasks/token_classification.md | 8 +- docs/source/en/tasks/translation.md | 4 +- docs/source/en/tf_xla.md | 12 +- docs/source/en/tflite.md | 4 +- docs/source/en/tokenizer_summary.md | 4 +- docs/source/en/torchscript.md | 4 +- docs/source/en/trainer.md | 4 +- docs/source/en/training.md | 10 +- docs/source/en/troubleshooting.md | 6 +- docs/source/es/autoclass_tutorial.md | 12 +- docs/source/es/community.md | 4 +- .../source/es/converting_tensorflow_models.md | 4 +- docs/source/es/create_a_model.md | 22 +-- docs/source/es/glossary.md | 8 +- docs/source/es/installation.md | 
4 +- docs/source/es/model_sharing.md | 2 +- docs/source/es/multilingual.md | 34 ++-- docs/source/es/perplexity.md | 2 +- docs/source/es/pipeline_tutorial.md | 4 +- docs/source/es/preprocessing.md | 2 +- docs/source/es/run_scripts.md | 26 +-- docs/source/es/serialization.md | 28 ++-- docs/source/es/tasks/language_modeling.md | 20 +-- docs/source/es/tasks/multiple_choice.md | 8 +- docs/source/es/tasks/question_answering.md | 8 +- docs/source/es/tasks/summarization.md | 8 +- docs/source/es/training.md | 8 +- docs/source/fr/autoclass_tutorial.md | 12 +- docs/source/fr/installation.md | 2 +- docs/source/fr/quicktour.md | 12 +- docs/source/hi/pipeline_tutorial.md | 2 +- docs/source/it/autoclass_tutorial.md | 12 +- docs/source/it/big_models.md | 2 +- docs/source/it/community.md | 4 +- .../source/it/converting_tensorflow_models.md | 4 +- docs/source/it/create_a_model.md | 22 +-- docs/source/it/installation.md | 4 +- docs/source/it/migration.md | 18 +-- docs/source/it/model_sharing.md | 2 +- docs/source/it/multilingual.md | 34 ++-- docs/source/it/perf_hardware.md | 6 +- docs/source/it/perf_train_cpu.md | 2 +- docs/source/it/perf_train_cpu_many.md | 4 +- docs/source/it/pipeline_tutorial.md | 4 +- docs/source/it/preprocessing.md | 2 +- docs/source/it/run_scripts.md | 26 +-- docs/source/it/serialization.md | 28 ++-- docs/source/it/training.md | 8 +- docs/source/ja/add_tensorflow_model.md | 2 +- docs/source/ja/autoclass_tutorial.md | 12 +- docs/source/ja/benchmarks.md | 38 ++--- docs/source/ja/big_models.md | 2 +- docs/source/ja/community.md | 4 +- docs/source/ja/create_a_model.md | 22 +-- docs/source/ja/custom_tools.md | 2 +- docs/source/ja/generation_strategies.md | 20 +-- docs/source/ja/glossary.md | 8 +- docs/source/ja/installation.md | 4 +- docs/source/ja/internal/generation_utils.md | 4 +- docs/source/ja/main_classes/deepspeed.md | 14 +- docs/source/ja/main_classes/output.md | 4 +- docs/source/ja/main_classes/pipelines.md | 2 +- docs/source/ja/main_classes/trainer.md | 6 +- docs/source/ja/model_doc/auto.md | 2 +- docs/source/ja/model_doc/bert-generation.md | 6 +- docs/source/ja/model_doc/cpm.md | 2 +- docs/source/ja/model_doc/ctrl.md | 6 +- docs/source/ja/model_doc/dialogpt.md | 2 +- docs/source/ja/model_memory_anatomy.md | 4 +- docs/source/ja/model_sharing.md | 2 +- docs/source/ja/multilingual.md | 34 ++-- docs/source/ja/perf_hardware.md | 4 +- docs/source/ja/perf_train_cpu.md | 2 +- docs/source/ja/perf_train_cpu_many.md | 4 +- docs/source/ja/perf_train_gpu_many.md | 6 +- docs/source/ja/perf_train_gpu_one.md | 2 +- docs/source/ja/perplexity.md | 2 +- docs/source/ja/pipeline_tutorial.md | 2 +- docs/source/ja/pipeline_webserver.md | 2 +- docs/source/ja/preprocessing.md | 2 +- docs/source/ja/quicktour.md | 12 +- docs/source/ja/run_scripts.md | 26 +-- docs/source/ja/serialization.md | 8 +- docs/source/ja/task_summary.md | 2 +- docs/source/ja/tasks/language_modeling.md | 8 +- .../ja/tasks/masked_language_modeling.md | 8 +- docs/source/ja/tasks/multiple_choice.md | 8 +- docs/source/ja/tasks/prompting.md | 2 +- docs/source/ja/tasks/question_answering.md | 8 +- docs/source/ja/tasks/summarization.md | 4 +- docs/source/ja/tasks/token_classification.md | 8 +- docs/source/ja/tasks/translation.md | 4 +- docs/source/ja/tf_xla.md | 12 +- docs/source/ja/tflite.md | 4 +- docs/source/ja/tokenizer_summary.md | 4 +- docs/source/ja/torchscript.md | 4 +- docs/source/ja/training.md | 10 +- docs/source/ja/troubleshooting.md | 6 +- docs/source/ko/add_tensorflow_model.md | 2 +- docs/source/ko/autoclass_tutorial.md | 12 
+- docs/source/ko/big_models.md | 2 +- docs/source/ko/community.md | 4 +- docs/source/ko/create_a_model.md | 22 +-- docs/source/ko/custom_tools.md | 2 +- docs/source/ko/installation.md | 4 +- docs/source/ko/model_memory_anatomy.md | 4 +- docs/source/ko/model_sharing.md | 2 +- docs/source/ko/multilingual.md | 34 ++-- docs/source/ko/perf_hardware.md | 6 +- docs/source/ko/perf_train_cpu.md | 2 +- docs/source/ko/perf_train_cpu_many.md | 4 +- docs/source/ko/perf_train_gpu_many.md | 6 +- docs/source/ko/perplexity.md | 2 +- docs/source/ko/pipeline_tutorial.md | 2 +- docs/source/ko/pipeline_webserver.md | 2 +- docs/source/ko/preprocessing.md | 2 +- docs/source/ko/quicktour.md | 12 +- docs/source/ko/run_scripts.md | 26 +-- docs/source/ko/serialization.md | 8 +- docs/source/ko/task_summary.md | 2 +- docs/source/ko/tasks/language_modeling.md | 8 +- .../ko/tasks/masked_language_modeling.md | 8 +- docs/source/ko/tasks/multiple_choice.md | 8 +- docs/source/ko/tasks/question_answering.md | 8 +- .../ko/tasks/sequence_classification.md | 8 +- docs/source/ko/tasks/summarization.md | 4 +- docs/source/ko/tasks/token_classification.md | 8 +- docs/source/ko/tasks/translation.md | 4 +- docs/source/ko/tf_xla.md | 12 +- docs/source/ko/tflite.md | 4 +- docs/source/ko/tokenizer_summary.md | 4 +- docs/source/ko/torchscript.md | 4 +- docs/source/ko/training.md | 10 +- docs/source/ko/troubleshooting.md | 6 +- .../source/pt/converting_tensorflow_models.md | 4 +- docs/source/pt/create_a_model.md | 22 +-- docs/source/pt/installation.md | 4 +- docs/source/pt/multilingual.md | 34 ++-- docs/source/pt/pipeline_tutorial.md | 4 +- docs/source/pt/quicktour.md | 2 +- docs/source/pt/run_scripts.md | 26 +-- docs/source/pt/serialization.md | 24 +-- .../pt/tasks/sequence_classification.md | 8 +- docs/source/pt/tasks/token_classification.md | 8 +- docs/source/pt/training.md | 8 +- docs/source/te/quicktour.md | 12 +- docs/source/zh/autoclass_tutorial.md | 12 +- docs/source/zh/big_models.md | 2 +- docs/source/zh/create_a_model.md | 22 +-- docs/source/zh/installation.md | 4 +- docs/source/zh/internal/generation_utils.md | 4 +- docs/source/zh/main_classes/deepspeed.md | 14 +- docs/source/zh/main_classes/output.md | 4 +- docs/source/zh/main_classes/pipelines.md | 2 +- docs/source/zh/main_classes/trainer.md | 6 +- docs/source/zh/model_sharing.md | 2 +- docs/source/zh/multilingual.md | 34 ++-- docs/source/zh/perf_hardware.md | 4 +- docs/source/zh/pipeline_tutorial.md | 2 +- docs/source/zh/preprocessing.md | 2 +- docs/source/zh/quicktour.md | 12 +- docs/source/zh/run_scripts.md | 26 +-- docs/source/zh/serialization.md | 8 +- docs/source/zh/task_summary.md | 2 +- docs/source/zh/tf_xla.md | 12 +- docs/source/zh/tflite.md | 4 +- docs/source/zh/tokenizer_summary.md | 4 +- docs/source/zh/training.md | 10 +- examples/README.md | 4 +- examples/flax/image-captioning/README.md | 2 +- examples/flax/language-modeling/README.md | 16 +- examples/flax/question-answering/README.md | 4 +- examples/flax/test_flax_examples.py | 14 +- examples/flax/text-classification/README.md | 2 +- examples/flax/token-classification/README.md | 2 +- examples/legacy/benchmarking/README.md | 4 +- examples/legacy/question-answering/README.md | 10 +- examples/legacy/run_camembert.py | 4 +- examples/legacy/run_openai_gpt.py | 4 +- examples/legacy/run_transfo_xl.py | 2 +- examples/legacy/seq2seq/README.md | 2 +- examples/legacy/seq2seq/old_test_datasets.py | 2 +- examples/legacy/seq2seq/pack_dataset.py | 2 +- .../legacy/seq2seq/run_distributed_eval.py | 2 +- 
examples/legacy/seq2seq/run_eval.py | 2 +- .../legacy/token-classification/README.md | 8 +- .../legacy/token-classification/utils_ner.py | 2 +- examples/pytorch/README.md | 4 +- .../pytorch/contrastive-image-text/README.md | 4 +- examples/pytorch/language-modeling/README.md | 18 +-- examples/pytorch/multiple-choice/README.md | 6 +- examples/pytorch/old_test_xla_examples.py | 2 +- examples/pytorch/question-answering/README.md | 12 +- examples/pytorch/summarization/README.md | 12 +- .../summarization/run_summarization.py | 10 +- .../run_summarization_no_trainer.py | 10 +- examples/pytorch/test_accelerate_examples.py | 14 +- examples/pytorch/test_pytorch_examples.py | 22 +-- .../pytorch/text-classification/README.md | 14 +- examples/pytorch/text-generation/README.md | 4 +- .../run_generation_contrastive_search.py | 2 +- .../pytorch/token-classification/README.md | 8 +- examples/pytorch/translation/README.md | 8 +- .../pytorch/translation/run_translation.py | 10 +- .../bert-loses-patience/README.md | 2 +- .../pabee/modeling_pabee_albert.py | 4 +- .../pabee/modeling_pabee_bert.py | 4 +- .../test_run_glue_with_pabee.py | 2 +- ...ert_bertabs_original_pytorch_checkpoint.py | 2 +- .../bertabs/modeling_bertabs.py | 2 +- .../bertabs/run_summarization.py | 2 +- .../research_projects/codeparrot/README.md | 6 +- .../codeparrot/scripts/arguments.py | 4 +- .../deebert/test_glue_deebert.py | 12 +- .../information-gain-filtration/README.md | 2 +- .../information-gain-filtration/igf/igf.py | 4 +- .../run_clm_igf.py | 25 +-- .../research_projects/jax-projects/README.md | 22 +-- .../jax-projects/dataset-streaming/README.md | 6 +- .../jax-projects/hybrid_clip/README.md | 10 +- .../hybrid_clip/modeling_hybrid_clip.py | 6 +- .../jax-projects/model_parallel/README.md | 2 +- .../research_projects/longform-qa/eli5_app.py | 2 +- examples/research_projects/mlm_wwm/README.md | 4 +- examples/research_projects/mm-imdb/README.md | 2 +- .../movement-pruning/README.md | 8 +- .../research_projects/performer/README.md | 4 +- examples/research_projects/pplm/run_pplm.py | 8 +- .../pplm/run_pplm_discrim_train.py | 9 +- .../quantization-qdqbert/README.md | 30 ++-- examples/tensorflow/benchmarking/README.md | 4 +- .../contrastive-image-text/README.md | 2 +- .../language-modeling-tpu/run_mlm.py | 2 +- .../tensorflow/language-modeling/README.md | 8 +- examples/tensorflow/multiple-choice/README.md | 2 +- .../tensorflow/question-answering/README.md | 2 +- .../summarization/run_summarization.py | 10 +- .../tensorflow/test_tensorflow_examples.py | 14 +- .../tensorflow/text-classification/README.md | 4 +- .../tensorflow/token-classification/README.md | 4 +- examples/tensorflow/translation/README.md | 4 +- hubconf.py | 28 ++-- scripts/benchmark/trainer-benchmark.py | 2 +- .../benchmark/benchmark_args_utils.py | 2 +- .../commands/add_new_model_like.py | 2 +- src/transformers/commands/train.py | 2 +- src/transformers/configuration_utils.py | 9 +- src/transformers/convert_graph_to_onnx.py | 4 +- .../convert_pytorch_checkpoint_to_tf2.py | 14 +- ...nvert_tf_hub_seq_to_seq_bert_to_pytorch.py | 2 +- src/transformers/dynamic_module_utils.py | 8 +- src/transformers/feature_extraction_utils.py | 3 +- .../generation/configuration_utils.py | 7 +- src/transformers/generation/logits_process.py | 60 +++---- src/transformers/generation/streamers.py | 8 +- src/transformers/generation/tf_utils.py | 16 +- src/transformers/generation/utils.py | 43 +++-- src/transformers/image_processing_utils.py | 3 +- src/transformers/integrations/bitsandbytes.py | 2 +- 
src/transformers/modelcard.py | 6 +- src/transformers/modeling_flax_utils.py | 14 +- src/transformers/modeling_tf_utils.py | 8 +- src/transformers/modeling_utils.py | 10 +- .../models/albert/configuration_albert.py | 18 +-- .../models/albert/modeling_albert.py | 26 +-- .../models/albert/modeling_flax_albert.py | 6 +- .../models/albert/modeling_tf_albert.py | 26 +-- .../models/albert/tokenization_albert.py | 32 ++-- .../models/albert/tokenization_albert_fast.py | 48 +++--- .../models/align/convert_align_tf_to_hf.py | 2 +- src/transformers/models/auto/auto_factory.py | 8 +- .../models/auto/configuration_auto.py | 9 +- .../models/auto/feature_extraction_auto.py | 12 +- .../models/auto/image_processing_auto.py | 10 +- src/transformers/models/auto/modeling_auto.py | 2 +- .../models/auto/modeling_flax_auto.py | 4 +- .../models/auto/modeling_tf_auto.py | 4 +- .../models/auto/processing_auto.py | 3 +- .../models/auto/tokenization_auto.py | 20 +-- .../models/bark/processing_bark.py | 3 +- .../models/bert/configuration_bert.py | 44 ++--- ..._bert_pytorch_checkpoint_to_original_tf.py | 2 +- src/transformers/models/bert/modeling_bert.py | 40 ++--- .../models/bert/modeling_flax_bert.py | 10 +- .../models/bert/modeling_tf_bert.py | 36 ++--- .../models/bert/tokenization_bert.py | 104 ++++++------ .../models/bert/tokenization_bert_fast.py | 152 +++++++++--------- .../models/bert/tokenization_bert_tf.py | 4 +- .../convert_blip_original_pytorch_to_hf.py | 2 +- .../camembert/configuration_camembert.py | 8 +- .../models/camembert/modeling_camembert.py | 12 +- .../models/camembert/modeling_tf_camembert.py | 2 +- .../camembert/tokenization_camembert.py | 4 +- .../camembert/tokenization_camembert_fast.py | 6 +- .../models/ctrl/modeling_tf_ctrl.py | 2 +- .../models/ctrl/tokenization_ctrl.py | 6 +- ...original_gluonnlp_checkpoint_to_pytorch.py | 2 +- .../models/deprecated/mmbt/modeling_mmbt.py | 4 +- .../transfo_xl/configuration_transfo_xl.py | 4 +- .../transfo_xl/modeling_tf_transfo_xl.py | 4 +- .../transfo_xl/modeling_transfo_xl.py | 4 +- .../transfo_xl/tokenization_transfo_xl.py | 8 +- ...vert_dpr_original_checkpoint_to_pytorch.py | 6 +- .../configuration_encoder_decoder.py | 4 +- .../modeling_encoder_decoder.py | 10 +- .../modeling_flax_encoder_decoder.py | 18 +-- .../modeling_tf_encoder_decoder.py | 10 +- .../models/flaubert/modeling_flaubert.py | 4 +- .../models/git/convert_git_to_pytorch.py | 4 +- .../models/gpt2/configuration_gpt2.py | 12 +- .../models/gpt2/modeling_flax_gpt2.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 30 ++-- .../models/gpt2/modeling_tf_gpt2.py | 18 +-- .../models/gpt2/tokenization_gpt2.py | 32 ++-- .../models/gpt2/tokenization_gpt2_fast.py | 42 ++--- .../models/gpt2/tokenization_gpt2_tf.py | 4 +- .../gpt_neox/tokenization_gpt_neox_fast.py | 2 +- ...onvert_instructblip_original_to_pytorch.py | 2 +- .../models/llama/tokenization_llama.py | 4 +- .../longformer/tokenization_longformer.py | 2 +- .../tokenization_longformer_fast.py | 2 +- .../models/longt5/modeling_flax_longt5.py | 10 +- .../configuration_megatron_bert.py | 4 +- ...eckpoint_reshaping_and_interoperability.py | 2 +- .../convert_megatron_gpt2_checkpoint.py | 4 +- .../models/mgp_str/processing_mgp_str.py | 4 +- src/transformers/models/mt5/modeling_mt5.py | 12 +- .../musicgen/convert_musicgen_transformers.py | 4 +- .../models/musicgen/modeling_musicgen.py | 8 +- .../models/openai/configuration_openai.py | 8 +- .../models/openai/modeling_openai.py | 10 +- .../models/openai/modeling_tf_openai.py | 10 +- 
.../models/openai/tokenization_openai.py | 10 +- .../models/openai/tokenization_openai_fast.py | 14 +- .../models/prophetnet/modeling_prophetnet.py | 4 +- .../models/qdqbert/configuration_qdqbert.py | 8 +- .../models/qdqbert/modeling_qdqbert.py | 14 +- src/transformers/models/rag/modeling_rag.py | 6 +- .../models/rag/modeling_tf_rag.py | 6 +- .../models/roberta/configuration_roberta.py | 14 +- .../models/roberta/modeling_flax_roberta.py | 2 +- .../models/roberta/modeling_roberta.py | 20 +-- .../models/roberta/modeling_tf_roberta.py | 10 +- .../models/roberta/tokenization_roberta.py | 42 ++--- .../roberta/tokenization_roberta_fast.py | 58 +++---- .../configuration_roberta_prelayernorm.py | 2 +- .../modeling_roberta_prelayernorm.py | 2 +- .../configuration_speech_encoder_decoder.py | 2 +- .../modeling_flax_speech_encoder_decoder.py | 4 - .../modeling_speech_encoder_decoder.py | 6 +- .../switch_transformers/convert_big_switch.py | 2 +- .../models/t5/configuration_t5.py | 12 +- .../models/t5/modeling_flax_t5.py | 22 +-- src/transformers/models/t5/modeling_t5.py | 42 ++--- src/transformers/models/t5/modeling_tf_t5.py | 22 +-- src/transformers/models/t5/tokenization_t5.py | 24 +-- .../models/t5/tokenization_t5_fast.py | 30 ++-- .../trocr/convert_trocr_unilm_to_pytorch.py | 2 +- src/transformers/models/umt5/modeling_umt5.py | 2 +- .../vilt/convert_vilt_original_to_pytorch.py | 2 +- .../configuration_vision_encoder_decoder.py | 2 +- .../modeling_flax_vision_encoder_decoder.py | 12 +- .../modeling_tf_vision_encoder_decoder.py | 8 +- .../modeling_vision_encoder_decoder.py | 4 +- .../modeling_flax_vision_text_dual_encoder.py | 10 +- .../modeling_tf_vision_text_dual_encoder.py | 10 +- .../modeling_vision_text_dual_encoder.py | 10 +- .../visual_bert/modeling_visual_bert.py | 12 +- .../processing_wav2vec2_with_lm.py | 3 +- .../models/xlm/configuration_xlm.py | 22 +-- .../models/xlm/modeling_tf_xlm.py | 22 +-- src/transformers/models/xlm/modeling_xlm.py | 26 +-- .../models/xlm/tokenization_xlm.py | 84 +++++----- .../xlm_prophetnet/modeling_xlm_prophetnet.py | 4 +- .../xlm_roberta/configuration_xlm_roberta.py | 26 +-- .../xlm_roberta/modeling_flax_xlm_roberta.py | 6 +- .../xlm_roberta/modeling_tf_xlm_roberta.py | 6 +- .../xlm_roberta/modeling_xlm_roberta.py | 20 +-- .../xlm_roberta/tokenization_xlm_roberta.py | 32 ++-- .../tokenization_xlm_roberta_fast.py | 52 +++--- .../configuration_xlm_roberta_xl.py | 4 +- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 6 +- .../models/xlnet/configuration_xlnet.py | 6 +- .../models/xlnet/modeling_tf_xlnet.py | 10 +- .../models/xlnet/modeling_xlnet.py | 14 +- .../models/xlnet/tokenization_xlnet.py | 8 +- .../models/xlnet/tokenization_xlnet_fast.py | 12 +- src/transformers/models/xmod/modeling_xmod.py | 2 +- src/transformers/pipelines/__init__.py | 4 +- .../pipelines/feature_extraction.py | 2 +- src/transformers/pipelines/fill_mask.py | 4 +- .../pipelines/text2text_generation.py | 4 +- .../pipelines/text_classification.py | 2 +- src/transformers/pipelines/text_generation.py | 4 +- src/transformers/processing_utils.py | 3 +- .../quantizers/quantizer_bnb_4bit.py | 2 +- .../quantizers/quantizer_bnb_8bit.py | 2 +- src/transformers/testing_utils.py | 4 +- src/transformers/tokenization_utils.py | 4 +- src/transformers/tokenization_utils_base.py | 14 +- src/transformers/training_args_seq2seq.py | 3 +- src/transformers/utils/hub.py | 8 +- src/transformers/utils/quantization_config.py | 2 - tests/deepspeed/test_deepspeed.py | 2 +- tests/deepspeed/test_model_zoo.py | 10 +- 
tests/fsdp/test_fsdp.py | 2 +- tests/generation/test_configuration_utils.py | 2 +- tests/generation/test_framework_agnostic.py | 10 +- tests/generation/test_streamers.py | 4 +- tests/generation/test_utils.py | 32 ++-- tests/models/albert/test_modeling_albert.py | 2 +- .../albert/test_modeling_flax_albert.py | 4 +- .../models/albert/test_modeling_tf_albert.py | 2 +- .../models/albert/test_tokenization_albert.py | 2 +- tests/models/auto/test_configuration_auto.py | 2 +- tests/models/auto/test_modeling_flax_auto.py | 8 +- tests/models/auto/test_modeling_tf_auto.py | 8 +- tests/models/auto/test_modeling_tf_pytorch.py | 8 +- tests/models/auto/test_tokenization_auto.py | 14 +- tests/models/bert/test_modeling_bert.py | 2 +- tests/models/bert/test_modeling_flax_bert.py | 2 +- tests/models/bert/test_tokenization_bert.py | 2 +- .../models/bert/test_tokenization_bert_tf.py | 2 +- .../test_tokenization_bert_japanese.py | 2 +- .../camembert/test_modeling_camembert.py | 2 +- .../camembert/test_tokenization_camembert.py | 2 +- tests/models/dpr/test_tokenization_dpr.py | 4 +- .../test_modeling_encoder_decoder.py | 34 ++-- .../test_modeling_flax_encoder_decoder.py | 20 ++- .../test_modeling_tf_encoder_decoder.py | 30 ++-- tests/models/gpt2/test_modeling_flax_gpt2.py | 6 +- tests/models/gpt2/test_modeling_gpt2.py | 24 +-- tests/models/gpt2/test_modeling_tf_gpt2.py | 36 ++--- .../models/gpt2/test_tokenization_gpt2_tf.py | 4 +- .../gpt_neo/test_modeling_flax_gpt_neo.py | 4 +- tests/models/gptj/test_modeling_flax_gptj.py | 4 +- .../test_tokenization_longformer.py | 2 +- .../markuplm/test_tokenization_markuplm.py | 2 +- .../test_tokenization_mobilebert.py | 2 +- tests/models/mt5/test_modeling_mt5.py | 4 +- tests/models/openai/test_modeling_openai.py | 2 +- .../models/openai/test_modeling_tf_openai.py | 2 +- .../pix2struct/test_processor_pix2struct.py | 2 +- tests/models/qdqbert/test_modeling_qdqbert.py | 2 +- tests/models/realm/test_tokenization_realm.py | 2 +- .../roberta/test_modeling_flax_roberta.py | 2 +- tests/models/roberta/test_modeling_roberta.py | 6 +- .../roberta/test_modeling_tf_roberta.py | 6 +- .../roberta/test_tokenization_roberta.py | 2 +- ...test_modeling_flax_roberta_prelayernorm.py | 2 +- ...st_modeling_flax_speech_encoder_decoder.py | 4 +- .../test_modeling_speech_encoder_decoder.py | 4 +- .../test_modeling_switch_transformers.py | 4 +- tests/models/t5/test_modeling_flax_t5.py | 16 +- tests/models/t5/test_modeling_t5.py | 34 ++-- tests/models/t5/test_modeling_tf_t5.py | 40 ++--- tests/models/t5/test_tokenization_t5.py | 16 +- tests/models/umt5/test_modeling_umt5.py | 2 +- ...st_modeling_flax_vision_encoder_decoder.py | 4 +- ...test_modeling_tf_vision_encoder_decoder.py | 20 ++- tests/models/xlm/test_modeling_tf_xlm.py | 2 +- tests/models/xlm/test_modeling_xlm.py | 2 +- tests/models/xlm/test_tokenization_xlm.py | 2 +- .../test_modeling_flax_xlm_roberta.py | 4 +- .../xlm_roberta/test_modeling_xlm_roberta.py | 4 +- .../test_tokenization_xlm_roberta.py | 4 +- tests/models/xlnet/test_modeling_tf_xlnet.py | 2 +- tests/models/xlnet/test_modeling_xlnet.py | 2 +- tests/models/xlnet/test_tokenization_xlnet.py | 4 +- tests/models/xmod/test_modeling_xmod.py | 2 +- tests/pipelines/test_pipelines_common.py | 2 +- tests/pipelines/test_pipelines_fill_mask.py | 4 +- .../test_pipelines_token_classification.py | 2 +- tests/pipelines/test_pipelines_zero_shot.py | 8 +- tests/quantization/bnb/test_4bit.py | 20 +-- tests/quantization/bnb/test_mixed_int8.py | 26 +-- .../test_multi_node_data_parallel.py | 6 +- 
.../test_multi_node_model_parallel.py | 4 +- tests/sagemaker/test_single_node_gpu.py | 4 +- tests/test_configuration_utils.py | 2 +- tests/test_modeling_utils.py | 12 +- tests/test_tokenization_common.py | 2 +- tests/test_tokenization_utils.py | 10 +- tests/tokenization/test_tokenization_fast.py | 4 +- tests/tokenization/test_tokenization_utils.py | 24 +-- tests/trainer/test_trainer.py | 10 +- tests/trainer/test_trainer_seq2seq.py | 8 +- tests/utils/test_add_new_model_like.py | 16 +- tests/utils/test_hub_utils.py | 6 +- utils/check_config_docstrings.py | 4 +- 561 files changed, 2685 insertions(+), 2690 deletions(-) diff --git a/README.md b/README.md index 1ca78f1e5a338b..b7077ce61032ba 100644 --- a/README.md +++ b/README.md @@ -89,13 +89,13 @@ You can test most of our models directly on their pages from the [model hub](htt Here are a few examples: In Natural Language Processing: -- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Masked word completion with BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) - [Text generation with Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) -- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Natural Language Inference with RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Question answering with 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Question answering with DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Translation with T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) In Computer Vision: - [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224) @@ -201,8 +201,8 @@ In addition to `pipeline`, to download and use any of the pretrained models on y ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ 
-212,8 +212,8 @@ And here is the equivalent code for TensorFlow: ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_de.md b/README_de.md index 22fe8b13fe9b31..f21bebdc781120 100644 --- a/README_de.md +++ b/README_de.md @@ -90,13 +90,13 @@ Hier sind einige Beispiele: In der Computerlinguistik: -- [Maskierte Wortvervollständigung mit BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Maskierte Wortvervollständigung mit BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Eigennamenerkennung mit Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Textgenerierung mit GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [Natural Language Inference mit RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Textgenerierung mit GPT-2](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [Natural Language Inference mit RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Automatische Textzusammenfassung mit BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Question Answering mit 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Maschinelle Übersetzung mit T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Question Answering mit DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Maschinelle Übersetzung mit T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) In der Computer Vision: @@ -197,8 +197,8 @@ Zusätzlich zur `pipeline` benötigt es nur drei Zeilen Code, um eines der vortr ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -209,8 +209,8 @@ Und hier ist der entsprechende Code für 
TensorFlow: ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_es.md b/README_es.md index 8a814ff476ee21..9dfbf8931abada 100644 --- a/README_es.md +++ b/README_es.md @@ -84,13 +84,13 @@ Puedes probar la mayoría de nuestros modelos directamente en sus páginas desde Aquí hay algunos ejemplos: En procesamiento del lenguaje natural: -- [Terminación de palabras enmascaradas con BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Terminación de palabras enmascaradas con BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Reconocimiento del nombre de la entidad con Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Generación de texto con GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [Inferencia del lenguaje natural con RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Generación de texto con GPT-2](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [Inferencia del lenguaje natural con RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Resumen con BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Responder a preguntas con 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Traducción con T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Responder a preguntas con DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Traducción con T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) En visión de ordenador: - [Clasificación de imágenes con ViT](https://huggingface.co/google/vit-base-patch16-224) @@ -174,8 +174,8 @@ Además de `pipeline`, para descargar y usar cualquiera de los modelos previamen ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) 
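Aside (not part of the patch): every hunk above performs the same mechanical substitution, replacing legacy canonical checkpoint IDs such as `bert-base-uncased`, `gpt2`, `roberta-large-mnli` and `t5-base` with their organization-prefixed counterparts `google-bert/bert-base-uncased`, `openai-community/gpt2`, `FacebookAI/roberta-large-mnli` and `google-t5/t5-base`. The sketch below is an illustration rather than part of the diff; it assumes `transformers` and PyTorch are installed, and shows one of the renamed checkpoints used through the high-level `pipeline` API, mirroring the fill-mask demo linked in the READMEs.

```python
# Illustrative sketch only (assumed setup: `pip install transformers torch`); not taken from this patch.
from transformers import pipeline

# "google-bert/bert-base-uncased" is the organization-prefixed ID that this patch
# switches the docs and examples to; it refers to the checkpoint previously
# referenced as "bert-base-uncased".
unmasker = pipeline("fill-mask", model="google-bert/bert-base-uncased")

# Same prompt as the masked word completion demo linked in the README.
print(unmasker("Paris is the [MASK] of France.", top_k=1))

# The AutoTokenizer / AutoModel calls shown in the hunks above accept the new ID
# the same way, e.g. AutoTokenizer.from_pretrained("google-bert/bert-base-uncased").
```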
@@ -185,8 +185,8 @@ Y aquí está el código equivalente para TensorFlow: ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_fr.md b/README_fr.md index d5672cca881bae..75ebdd315f651d 100644 --- a/README_fr.md +++ b/README_fr.md @@ -89,13 +89,13 @@ Vous pouvez tester la plupart de nos modèles directement sur leurs pages du [hu Voici quelques exemples : En traitement du langage naturel : -- [Complétion de mots masqués avec BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Complétion de mots masqués avec BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Reconnaissance d'entités nommées avec Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Génération de texte avec GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [Inférence de langage naturel avec RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Génération de texte avec GPT-2](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [Inférence de langage naturel avec RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Résumé avec BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Réponse aux questions avec 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Traduction avec T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Réponse aux questions avec DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Traduction avec T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) En vision par ordinateur : - [Classification d'images avec ViT](https://huggingface.co/google/vit-base-patch16-224) @@ -194,8 +194,8 @@ En plus de `pipeline`, pour télécharger et utiliser n'importe lequel des modè ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") inputs = tokenizer("Bonjour le monde !", return_tensors="pt") outputs = 
model(**inputs) @@ -206,8 +206,8 @@ Et voici le code équivalent pour TensorFlow : ```python from transformers import AutoTokenizer, TFAutoModel -tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -model = TFAutoModel.from_pretrained("bert-base-uncased") +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") inputs = tokenizer("Bonjour le monde !", return_tensors="tf") outputs = model(**inputs) diff --git a/README_hd.md b/README_hd.md index e4ebddbea9de31..6402c3ee5eb7fc 100644 --- a/README_hd.md +++ b/README_hd.md @@ -99,13 +99,13 @@ checkpoint: जाँच बिंदु आप सबसे सीधे मॉडल पृष्ठ पर परीक्षण कर सकते हैं [model hub](https://huggingface.co/models) मॉडल पर। हम [निजी मॉडल होस्टिंग, मॉडल संस्करण, और अनुमान एपीआई](https://huggingface.co/pricing) भी प्रदान करते हैं।。 यहाँ कुछ उदाहरण हैं: -- [शब्द को भरने के लिए मास्क के रूप में BERT का प्रयोग करें](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [शब्द को भरने के लिए मास्क के रूप में BERT का प्रयोग करें](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [इलेक्ट्रा के साथ नामित इकाई पहचान](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [जीपीटी-2 के साथ टेक्स्ट जनरेशन](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [रॉबर्टा के साथ प्राकृतिक भाषा निष्कर्ष](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [जीपीटी-2 के साथ टेक्स्ट जनरेशन](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [रॉबर्टा के साथ प्राकृतिक भाषा निष्कर्ष](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [बार्ट के साथ पाठ सारांश](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [डिस्टिलबर्ट के साथ 
प्रश्नोत्तर](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [अनुवाद के लिए T5 का प्रयोग करें](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [डिस्टिलबर्ट के साथ प्रश्नोत्तर](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [अनुवाद के लिए T5 का प्रयोग करें](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) **[Write With Transformer](https://transformer.huggingface.co)**,हगिंग फेस टीम द्वारा बनाया गया, यह एक आधिकारिक पाठ पीढ़ी है demo。 @@ -151,8 +151,8 @@ checkpoint: जाँच बिंदु ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -161,8 +161,8 
@@ checkpoint: जाँच बिंदु ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_ja.md b/README_ja.md index 4cb4b4309d7a8d..bd8a058b7b1b96 100644 --- a/README_ja.md +++ b/README_ja.md @@ -119,13 +119,13 @@ user: ユーザ 以下はその一例です: 自然言語処理にて: -- [BERTによるマスクドワード補完](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [BERTによるマスクドワード補完](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Electraによる名前実体認識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [GPT-2によるテキスト生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [RoBERTaによる自然言語推論](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [GPT-2によるテキスト生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [RoBERTaによる自然言語推論](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [BARTによる要約](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- 
[DistilBERTによる質問応答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [T5による翻訳](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [DistilBERTによる質問応答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [T5による翻訳](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) コンピュータビジョンにて: - [ViTによる画像分類](https://huggingface.co/google/vit-base-patch16-224) @@ -208,8 +208,8 @@ Hugging Faceチームによって作られた **[トランスフォーマーを ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -219,8 +219,8 @@ Hugging Faceチームによって作られた **[トランスフォーマーを ```python >>> from transformers import 
AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_ko.md b/README_ko.md index d00bd7c443256a..533ab4685bce09 100644 --- a/README_ko.md +++ b/README_ko.md @@ -74,13 +74,13 @@ limitations under the License. 대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다. 예시: -- [BERT로 마스킹된 단어 완성하기](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [BERT로 마스킹된 단어 완성하기](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Electra를 이용한 개체명 인식](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [GPT-2로 텍스트 생성하기](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [RoBERTa로 자연어 추론하기](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [GPT-2로 텍스트 생성하기](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [RoBERTa로 자연어 추론하기](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [BART를 이용한 요약](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [DistilBERT를 이용한 질문 
답변](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [T5로 번역하기](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [DistilBERT를 이용한 질문 답변](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [T5로 번역하기](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) **[Transformer와 글쓰기](https://transformer.huggingface.co)** 는 이 저장소의 텍스트 생성 능력에 관한 Hugging Face 팀의 공식 데모입니다. @@ -126,8 +126,8 @@ limitations under the License. ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -136,8 +136,8 @@ limitations under the License. 
```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_pt-br.md b/README_pt-br.md index ab40f607c78314..40841bd82b9f8a 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -93,13 +93,13 @@ Aqui estão alguns exemplos: Em Processamento de Linguagem Natural: -- [Completar palavra mascarada com BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Completar palavra mascarada com BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Reconhecimento de Entidades Nomeadas com Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Geração de texto com GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C) -- [Inferência de Linguagem Natural com RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Geração de texto com GPT-2](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C) +- [Inferência de Linguagem Natural com RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Sumarização com BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Resposta a perguntas com 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Tradução com T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Resposta a perguntas com DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Tradução com T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) Em Visão Computacional: @@ -204,8 +204,8 @@ Além do `pipeline`, para baixar e usar qualquer um dos modelos pré-treinados e ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -216,8 +216,8 @@ E aqui está o código equivalente para TensorFlow: ```python >>> from 
transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_ru.md b/README_ru.md index 718258d7f967d1..3e6f3d54f27e22 100644 --- a/README_ru.md +++ b/README_ru.md @@ -89,13 +89,13 @@ limitations under the License. Вот несколько примеров: В области NLP ( Обработка текстов на естественном языке ): -- [Маскированное заполнение слов с помощью BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [Маскированное заполнение слов с помощью BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Распознавание сущностей с помощью Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Генерация текста с помощью GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [Выводы на естественном языке с помощью RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [Генерация текста с помощью GPT-2](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [Выводы на естественном языке с помощью RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [Обобщение с помощью BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Ответы на вопросы с помощью 
DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Перевод с помощью T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [Ответы на вопросы с помощью DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [Перевод с помощью T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) В области компьютерного зрения: - [Классификация изображений с помощью ViT](https://huggingface.co/google/vit-base-patch16-224) @@ -196,8 +196,8 @@ Hugging Face Hub. 
Мы хотим, чтобы Transformers позволил ра ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Привет мир!", return_tensors="pt") >>> outputs = model(**inputs) @@ -207,8 +207,8 @@ Hugging Face Hub. Мы хотим, чтобы Transformers позволил ра ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Привет мир!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_te.md b/README_te.md index 2706cfdc6ea07f..2c0b97dada67ed 100644 --- a/README_te.md +++ b/README_te.md @@ -91,13 +91,13 @@ limitations under the License. ఇక్కడ కొన్ని ఉదాహరణలు ఉన్నాయి: సహజ భాషా ప్రాసెసింగ్‌లో: -- [BERT తో మాస్క్‌డ్ వర్డ్ కంప్లీషన్](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [BERT తో మాస్క్‌డ్ వర్డ్ కంప్లీషన్](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Electra తో పేరు ఎంటిటీ గుర్తింపు](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [GPT-2 తో టెక్స్ట్ జనరేషన్](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [RoBERTa తో సహజ భాషా అనుమితి](https://huggingface.co/roberta-large-mnli?text=The+dog+was+Lost.+Nobody+lost+any+animal) +- [GPT-2 తో టెక్స్ట్ జనరేషన్](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [RoBERTa తో సహజ భాషా అనుమితి](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+Lost.+Nobody+lost+any+animal) - [BART తో సారాంశం](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [DistilBERT తో ప్రశ్న 
సమాధానం](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [T5 తో అనువాదం](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [DistilBERT తో ప్రశ్న సమాధానం](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [T5 తో అనువాదం](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) కంప్యూటర్ దృష్టిలో: - [VIT తో చిత్ర వర్గీకరణ](https://huggingface.co/google/vit-base-patch16-224) @@ -198,8 +198,8 @@ limitations under the License. ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -209,8 +209,8 @@ limitations under the License. 
```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_zh-hans.md b/README_zh-hans.md index b98e94791d8164..f2b9b38273bfba 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -99,13 +99,13 @@ checkpoint: 检查点 你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理API](https://huggingface.co/pricing)。 这里是一些例子: -- [用 BERT 做掩码填词](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [用 BERT 做掩码填词](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [用 RoBERTa 做自然语言推理](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [用 RoBERTa 做自然语言推理](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [用 DistilBERT 
做问答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [用 T5 做翻译](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [用 DistilBERT 做问答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [用 T5 做翻译](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) **[Write With Transformer](https://transformer.huggingface.co)**,由抱抱脸团队打造,是一个文本生成的官方 demo。 @@ -151,8 +151,8 @@ checkpoint: 检查点 ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -161,8 +161,8 @@ checkpoint: 检查点 ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = 
AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/README_zh-hant.md b/README_zh-hant.md index b5c74ee1999eeb..1d5155529aa0a3 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -111,13 +111,13 @@ user: 使用者 你可以直接在 [model hub](https://huggingface.co/models) 上測試大多數的模型。我們也提供了 [私有模型託管、模型版本管理以及推論API](https://huggingface.co/pricing)。 這裡是一些範例: -- [用 BERT 做遮蓋填詞](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) +- [用 BERT 做遮蓋填詞](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [用 Electra 做專有名詞辨識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+) -- [用 RoBERTa 做自然語言推論](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) +- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) +- [用 RoBERTa 做自然語言推論](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) - [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [用 DistilBERT 
做問答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [用 T5 做翻譯](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +- [用 DistilBERT 做問答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) +- [用 T5 做翻譯](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) **[Write With Transformer](https://transformer.huggingface.co)**,由 Hugging Face 團隊所打造,是一個文本生成的官方 demo。 @@ -163,8 +163,8 @@ user: 使用者 ```python >>> from transformers import AutoTokenizer, AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = AutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="pt") >>> outputs = model(**inputs) @@ -173,8 +173,8 @@ user: 使用者 ```python >>> from transformers import AutoTokenizer, TFAutoModel ->>> tokenizer = 
AutoTokenizer.from_pretrained("bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("Hello world!", return_tensors="tf") >>> outputs = model(**inputs) diff --git a/docs/source/de/add_tensorflow_model.md b/docs/source/de/add_tensorflow_model.md index 23702f2d301d74..8488acbe709b64 100644 --- a/docs/source/de/add_tensorflow_model.md +++ b/docs/source/de/add_tensorflow_model.md @@ -42,7 +42,7 @@ Sind Sie unsicher, ob das Modell, das Sie verwenden möchten, bereits eine entsp   Überprüfen Sie das Feld `model_type` in der `config.json` des Modells Ihrer Wahl -([Beispiel](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). Wenn der entsprechende Modellordner in +([Beispiel](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14)). Wenn der entsprechende Modellordner in 🤗 Transformers eine Datei hat, deren Name mit "modeling_tf" beginnt, bedeutet dies, dass es eine entsprechende TensorFlow Architektur hat ([Beispiel](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). diff --git a/docs/source/de/autoclass_tutorial.md b/docs/source/de/autoclass_tutorial.md index 7707f7b39b4910..5dea87ca552c1a 100644 --- a/docs/source/de/autoclass_tutorial.md +++ b/docs/source/de/autoclass_tutorial.md @@ -20,7 +20,7 @@ Bei so vielen verschiedenen Transformator-Architekturen kann es eine Herausforde -Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/bert-base-uncased) eine Architektur, während `bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. +Denken Sie daran, dass sich die Architektur auf das Skelett des Modells bezieht und die Checkpoints die Gewichte für eine bestimmte Architektur sind. Zum Beispiel ist [BERT](https://huggingface.co/google-bert/bert-base-uncased) eine Architektur, während `google-bert/bert-base-uncased` ein Checkpoint ist. Modell ist ein allgemeiner Begriff, der entweder Architektur oder Prüfpunkt bedeuten kann. 
@@ -40,7 +40,7 @@ Laden Sie einen Tokenizer mit [`AutoTokenizer.from_pretrained`]: ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") ``` Dann tokenisieren Sie Ihre Eingabe wie unten gezeigt: @@ -88,7 +88,7 @@ Mit den `AutoModelFor`-Klassen können Sie schließlich ein vortrainiertes Model ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: @@ -96,7 +96,7 @@ Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur ```py >>> from transformers import AutoModelForTokenClassification ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` @@ -115,7 +115,7 @@ Mit den Klassen `TFAutoModelFor` schließlich können Sie ein vortrainiertes Mod ```py >>> from transformers import TFAutoModelForSequenceClassification ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur für eine andere Aufgabe zu laden: @@ -123,7 +123,7 @@ Sie können denselben Prüfpunkt problemlos wiederverwenden, um eine Architektur ```py >>> from transformers import TFAutoModelForTokenClassification ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Im Allgemeinen empfehlen wir, die Klasse "AutoTokenizer" und die Klasse "TFAutoModelFor" zu verwenden, um vortrainierte Instanzen von Modellen zu laden. Dadurch wird sichergestellt, dass Sie jedes Mal die richtige Architektur laden. Im nächsten [Tutorial] (Vorverarbeitung) erfahren Sie, wie Sie Ihren neu geladenen Tokenizer, Feature Extractor und Prozessor verwenden, um einen Datensatz für die Feinabstimmung vorzuverarbeiten. diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md index acf41bcbe45c9d..55d0f2d8512d47 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -173,14 +173,14 @@ Fügen sie [🤗 Datasets](https://huggingface.co/docs/datasets/) zu Ihrem Offli So würden Sie beispielsweise ein Programm in einem normalen Netzwerk mit einer Firewall für externe Instanzen mit dem folgenden Befehl ausführen: ```bash -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ... ``` Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: ```bash HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... 
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ... ``` Das Skript sollte nun laufen, ohne sich aufzuhängen oder eine Zeitüberschreitung abzuwarten, da es weiß, dass es nur nach lokalen Dateien suchen soll. diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md index 415277e00e5ee9..6bbb6e10cb4942 100644 --- a/docs/source/de/model_sharing.md +++ b/docs/source/de/model_sharing.md @@ -229,4 +229,4 @@ Um sicherzustellen, dass die Benutzer die Fähigkeiten, Grenzen, möglichen Verz * Manuelles Erstellen und Hochladen einer "README.md"-Datei. * Klicken Sie auf die Schaltfläche **Modellkarte bearbeiten** in Ihrem Modell-Repository. -Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). \ No newline at end of file +Werfen Sie einen Blick auf die DistilBert [model card](https://huggingface.co/distilbert/distilbert-base-uncased) als gutes Beispiel für die Art von Informationen, die eine Modellkarte enthalten sollte. Weitere Details über andere Optionen, die Sie in der Datei "README.md" einstellen können, wie z.B. den Kohlenstoff-Fußabdruck eines Modells oder Beispiele für Widgets, finden Sie in der Dokumentation [hier](https://huggingface.co/docs/hub/models-cards). \ No newline at end of file diff --git a/docs/source/de/pipeline_tutorial.md b/docs/source/de/pipeline_tutorial.md index 96aa60e357f8d5..5106af9b2fafc7 100644 --- a/docs/source/de/pipeline_tutorial.md +++ b/docs/source/de/pipeline_tutorial.md @@ -76,8 +76,8 @@ Die [`pipeline`] akzeptiert jedes Modell aus dem [Hub](https://huggingface.co/mo ```py >>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") ``` Erstellen Sie eine [`pipeline`] für Ihre Aufgabe, und geben Sie das Modell und den Tokenizer an, die Sie geladen haben: diff --git a/docs/source/de/preprocessing.md b/docs/source/de/preprocessing.md index cf7b37bc9de90b..b56a5c0ae4ca1c 100644 --- a/docs/source/de/preprocessing.md +++ b/docs/source/de/preprocessing.md @@ -45,7 +45,7 @@ Laden Sie einen vortrainierten Tokenizer mit [`AutoTokenizer.from_pretrained`]: ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") ``` Dann übergeben Sie Ihren Satz an den Tokenizer: diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md index 0046124a1c8284..01cd7200750c4d 100644 --- a/docs/source/de/quicktour.md +++ b/docs/source/de/quicktour.md @@ -89,7 +89,7 @@ Importieren sie die [`pipeline`] und spezifizieren sie die Aufgabe, welche sie l >>> classifier = pipeline("sentiment-analysis") ``` -Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse 
herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: +Die Pipeline lädt ein standardmäßiges [vortrainiertes Modell](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) und einen Tokenizer für die Stimmungs-Analyse herunter und speichert sie. Jetzt können Sie den "Klassifikator" auf Ihren Zieltext anwenden: ```py >>> classifier("We are very happy to show you the 🤗 Transformers library.") @@ -407,7 +407,7 @@ Beginnen Sie mit dem Import von [`AutoConfig`] und laden Sie dann das trainierte ```py >>> from transformers import AutoConfig ->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12) +>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12) ``` diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md index 52ff281a02babe..61a0754ea92628 100644 --- a/docs/source/de/run_scripts.md +++ b/docs/source/de/run_scripts.md @@ -87,11 +87,11 @@ pip install -r requirements.txt -Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Dann nimmt das Skript eine Feinabstimmung eines Datensatzes mit dem [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) auf einer Architektur vor, die eine Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem Datensatz [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Dann nimmt das Skript eine Feinabstimmung eines Datensatzes mit dem [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) auf einer Architektur vor, die eine Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/google-t5/t5-small) auf dem Datensatz [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. ```bash python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ @@ -105,11 +105,11 @@ python examples/pytorch/summarization/run_summarization.py \ ``` -Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Anschließend nimmt das Skript die Feinabstimmung eines Datensatzes mit Keras auf einer Architektur vor, die die Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) Datensatz durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. 
Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. +Das Beispielskript lädt einen Datensatz aus der 🤗 [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Anschließend nimmt das Skript die Feinabstimmung eines Datensatzes mit Keras auf einer Architektur vor, die die Zusammenfassung unterstützt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/google-t5/t5-small) auf dem [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) Datensatz durchgeführt wird. Das T5-Modell benötigt aufgrund der Art und Weise, wie es trainiert wurde, ein zusätzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung weiß T5, dass es sich um eine Zusammenfassungsaufgabe handelt. ```bash python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --dataset_name cnn_dailymail \ --dataset_config "3.0.0" \ --output_dir /tmp/tst-summarization \ @@ -133,7 +133,7 @@ Der [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) unt torchrun \ --nproc_per_node 8 pytorch/summarization/run_summarization.py \ --fp16 \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ @@ -157,7 +157,7 @@ Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistun ```bash python xla_spawn.py --num_cores 8 \ summarization/run_summarization.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ @@ -176,7 +176,7 @@ Tensor Processing Units (TPUs) sind speziell für die Beschleunigung der Leistun ```bash python run_summarization.py \ --tpu name_of_tpu_resource \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --dataset_name cnn_dailymail \ --dataset_config "3.0.0" \ --output_dir /tmp/tst-summarization \ @@ -214,7 +214,7 @@ Jetzt sind Sie bereit, das Training zu starten: ```bash accelerate launch run_summarization_no_trainer.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --dataset_name cnn_dailymail \ --dataset_config "3.0.0" \ --source_prefix "summarize: " \ @@ -233,7 +233,7 @@ Ein Zusammenfassungsskript, das einen benutzerdefinierten Datensatz verwendet, w ```bash python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --train_file path_to_csv_or_jsonlines_file \ @@ -258,7 +258,7 @@ Es ist oft eine gute Idee, Ihr Skript an einer kleineren Anzahl von Beispielen f ```bash python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --max_train_samples 50 \ --max_eval_samples 50 \ --max_predict_samples 50 \ @@ -288,7 +288,7 @@ Die erste Methode verwendet das Argument `output_dir previous_output_dir`, um da ```bash python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ @@ -305,7 +305,7 @@ Die zweite Methode verwendet das Argument `Resume_from_checkpoint path_to_specif ```bash python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name 
cnn_dailymail \ @@ -335,7 +335,7 @@ Das folgende Beispiel zeigt, wie Sie ein Modell mit einem bestimmten Repository- ```bash python examples/pytorch/summarization/run_summarization.py - --model_name_or_path t5-small \ + --model_name_or_path google-t5/t5-small \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ diff --git a/docs/source/de/training.md b/docs/source/de/training.md index e87aa458135bc7..7b1bd3e5d0c368 100644 --- a/docs/source/de/training.md +++ b/docs/source/de/training.md @@ -48,7 +48,7 @@ Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten u ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") >>> def tokenize_function(examples): @@ -86,7 +86,7 @@ Beginnen Sie mit dem Laden Ihres Modells und geben Sie die Anzahl der erwarteten ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) ``` @@ -187,7 +187,7 @@ Wir können sie also ohne Tokenisierung direkt in ein NumPy-Array konvertieren! ```py from transformers import AutoTokenizer -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") tokenized_data = tokenizer(dataset["text"], return_tensors="np", padding=True) # Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras tokenized_data = dict(tokenized_data) @@ -202,7 +202,7 @@ from transformers import TFAutoModelForSequenceClassification from tensorflow.keras.optimizers import Adam # Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased") +model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased") # Lower learning rates are often better for fine-tuning transformers model.compile(optimizer=Adam(3e-5)) @@ -333,7 +333,7 @@ Laden Sie Ihr Modell mit der Anzahl der erwarteten Kennzeichnungen: ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) ``` ### Optimierer und Lernratensteuerung diff --git a/docs/source/en/add_tensorflow_model.md b/docs/source/en/add_tensorflow_model.md index b2ff9bb8998642..52c7e3b1ada118 100644 --- a/docs/source/en/add_tensorflow_model.md +++ b/docs/source/en/add_tensorflow_model.md @@ -42,7 +42,7 @@ Are you unsure whether the model you wish to use already has a corresponding Ten   Check the `model_type` field of the `config.json` of your model of choice -([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in +([example](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in 🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)). 
diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md index d52ba3fbc98ffa..eacfdb441c2099 100644 --- a/docs/source/en/autoclass_tutorial.md +++ b/docs/source/en/autoclass_tutorial.md @@ -20,7 +20,7 @@ With so many different Transformer architectures, it can be challenging to creat -Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. +Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/google-bert/bert-base-uncased) is an architecture, while `google-bert/bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. @@ -42,7 +42,7 @@ Load a tokenizer with [`AutoTokenizer.from_pretrained`]: ```py >>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") ``` Then tokenize your input as shown below: @@ -143,7 +143,7 @@ The `AutoModelFor` classes let you load a pretrained model for a given task (see ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Easily reuse the same checkpoint to load an architecture for a different task: @@ -151,7 +151,7 @@ Easily reuse the same checkpoint to load an architecture for a different task: ```py >>> from transformers import AutoModelForTokenClassification ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` @@ -170,7 +170,7 @@ Finally, the `TFAutoModelFor` classes let you load a pretrained model for a give ```py >>> from transformers import TFAutoModelForSequenceClassification ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Easily reuse the same checkpoint to load an architecture for a different task: @@ -178,7 +178,7 @@ Easily reuse the same checkpoint to load an architecture for a different task: ```py >>> from transformers import TFAutoModelForTokenClassification ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased") +>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. 
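Editor's aside: to see the renamed identifier used end to end with the auto classes from the hunk above, here is a minimal sketch (the input sentence is made up; the snippet is illustrative, not part of the patch):

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# The classification head on this base checkpoint is freshly initialized,
# so the logits below are placeholders until you fine-tune the model.
inputs = tokenizer("Auto classes pick the matching architecture for you.", return_tensors="pt")
logits = model(**inputs).logits
print(logits.shape)  # torch.Size([1, 2]) with the default two-label head
```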
diff --git a/docs/source/en/benchmarks.md b/docs/source/en/benchmarks.md index 5023d248697904..1fd61cc8de4029 100644 --- a/docs/source/en/benchmarks.md +++ b/docs/source/en/benchmarks.md @@ -48,7 +48,7 @@ The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an ```py >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments ->>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) >>> benchmark = PyTorchBenchmark(args) ``` @@ -57,7 +57,7 @@ The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments >>> args = TensorFlowBenchmarkArguments( -... models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] ... ) >>> benchmark = TensorFlowBenchmark(args) ``` @@ -89,20 +89,20 @@ An instantiated benchmark object can then simply be run by calling `benchmark.ru -------------------------------------------------------------------------------- Model Name Batch Size Seq Length Time in s -------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.006 -bert-base-uncased 8 32 0.006 -bert-base-uncased 8 128 0.018 -bert-base-uncased 8 512 0.088 +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 -------------------------------------------------------------------------------- ==================== INFERENCE - MEMORY - RESULT ==================== -------------------------------------------------------------------------------- Model Name Batch Size Seq Length Memory in MB -------------------------------------------------------------------------------- -bert-base-uncased 8 8 1227 -bert-base-uncased 8 32 1281 -bert-base-uncased 8 128 1307 -bert-base-uncased 8 512 1539 +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 -------------------------------------------------------------------------------- ==================== ENVIRONMENT INFORMATION ==================== @@ -146,20 +146,20 @@ An instantiated benchmark object can then simply be run by calling `benchmark.ru -------------------------------------------------------------------------------- Model Name Batch Size Seq Length Time in s -------------------------------------------------------------------------------- -bert-base-uncased 8 8 0.005 -bert-base-uncased 8 32 0.008 -bert-base-uncased 8 128 0.022 -bert-base-uncased 8 512 0.105 +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 -------------------------------------------------------------------------------- ==================== INFERENCE - MEMORY - RESULT ==================== -------------------------------------------------------------------------------- Model Name Batch Size Seq Length Memory in MB -------------------------------------------------------------------------------- -bert-base-uncased 8 8 1330 -bert-base-uncased 8 32 1330 -bert-base-uncased 8 128 1330 
-bert-base-uncased 8 512 1770 +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 -------------------------------------------------------------------------------- ==================== ENVIRONMENT INFORMATION ==================== @@ -197,7 +197,7 @@ when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate _.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. -Instead of benchmarking pre-trained models via their model identifier, _e.g._ `bert-base-uncased`, the user can +Instead of benchmarking pre-trained models via their model identifier, _e.g._ `google-bert/bert-base-uncased`, the user can alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of configurations must be inserted with the benchmark args as follows. diff --git a/docs/source/en/big_models.md b/docs/source/en/big_models.md index 9b57e433176094..729d32ca202951 100644 --- a/docs/source/en/big_models.md +++ b/docs/source/en/big_models.md @@ -42,7 +42,7 @@ You can control the maximum size before sharding with the `max_shard_size` param ```py from transformers import AutoModel -model = AutoModel.from_pretrained("bert-base-cased") +model = AutoModel.from_pretrained("google-bert/bert-base-cased") ``` If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights: diff --git a/docs/source/en/community.md b/docs/source/en/community.md index 1666a9e3e20c49..7890cb22ca5882 100644 --- a/docs/source/en/community.md +++ b/docs/source/en/community.md @@ -43,8 +43,8 @@ This page regroups resources around 🤗 Transformers developed by the community |[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| |[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| |[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| -|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| -|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *google-bert/bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *FacebookAI/roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| |[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| |[Evaluate TAPAS on Table Fact Checking 
(TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| |[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| diff --git a/docs/source/en/create_a_model.md b/docs/source/en/create_a_model.md index 7f810e8107e4dd..29f26c59984aa3 100644 --- a/docs/source/en/create_a_model.md +++ b/docs/source/en/create_a_model.md @@ -87,7 +87,7 @@ DistilBertConfig { Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: ```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4) +>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4) ``` Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: @@ -128,13 +128,13 @@ This creates a model with random values instead of pretrained weights. You won't Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: ```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased") +>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") ``` When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: ```py ->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) ``` @@ -152,13 +152,13 @@ This creates a model with random values instead of pretrained weights. You won't Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: ```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") ``` When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. 
However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: ```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config) +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) ``` @@ -174,7 +174,7 @@ For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model ```py >>> from transformers import DistilBertForSequenceClassification ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. @@ -182,7 +182,7 @@ Easily reuse this checkpoint for another task by switching to a different model ```py >>> from transformers import DistilBertForQuestionAnswering ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") ``` @@ -191,7 +191,7 @@ For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT mode ```py >>> from transformers import TFDistilBertForSequenceClassification ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") ``` Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. 
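Editor's aside: the point that both task heads sit on the same pretrained backbone can be checked directly. A small sketch, assuming the standard `distilbert`, `classifier`, and `qa_outputs` attribute names of the DistilBERT task classes (not quoted from the patch):

```python
from transformers import DistilBertForQuestionAnswering, DistilBertForSequenceClassification

checkpoint = "distilbert/distilbert-base-uncased"
clf_model = DistilBertForSequenceClassification.from_pretrained(checkpoint)
qa_model = DistilBertForQuestionAnswering.from_pretrained(checkpoint)

# Both task models wrap the same pretrained DistilBERT backbone; only the
# task-specific head on top differs (classifier vs. qa_outputs).
print(type(clf_model.distilbert).__name__, type(qa_model.distilbert).__name__)
print(hasattr(clf_model, "classifier"), hasattr(qa_model, "qa_outputs"))
```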
@@ -199,7 +199,7 @@ Easily reuse this checkpoint for another task by switching to a different model ```py >>> from transformers import TFDistilBertForQuestionAnswering ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") ``` @@ -232,7 +232,7 @@ It is important to remember the vocabulary from a custom tokenizer will be diffe ```py >>> from transformers import DistilBertTokenizer ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased") ``` Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: @@ -240,7 +240,7 @@ Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: ```py >>> from transformers import DistilBertTokenizerFast ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased") ``` diff --git a/docs/source/en/custom_tools.md b/docs/source/en/custom_tools.md index 4221679c79d958..9b7d1dcab67e6c 100644 --- a/docs/source/en/custom_tools.md +++ b/docs/source/en/custom_tools.md @@ -586,7 +586,7 @@ model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) print(model.id) ``` -For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`. +For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'google-t5/t5-base`. How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the main attributes necessary. We'll create a class that inherits from it: diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 90eaa8386238a9..eacd6e1c1071c8 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -266,7 +266,7 @@ from transformers import T5ForConditionalGeneration, T5Config import deepspeed with deepspeed.zero.Init(): - config = T5Config.from_pretrained("t5-small") + config = T5Config.from_pretrained("google-t5/t5-small") model = T5ForConditionalGeneration(config) ``` @@ -276,7 +276,7 @@ For pretrained models, the DeepSped config file needs to have `is_deepspeed_zero from transformers import AutoModel, Trainer, TrainingArguments training_args = TrainingArguments(..., deepspeed=ds_config) -model = AutoModel.from_pretrained("t5-small") +model = AutoModel.from_pretrained("google-t5/t5-small") trainer = Trainer(model=model, args=training_args, ...) ``` @@ -601,7 +601,7 @@ To deploy DeepSpeed on multiple GPUs, add the `--num_gpus` parameter. If you wan ```bash deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ --deepspeed tests/deepspeed/ds_config_zero3.json \ ---model_name_or_path t5-small --per_device_train_batch_size 1 \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ --dataset_name wmt16 --dataset_config "ro-en" \ @@ -616,7 +616,7 @@ To deploy DeepSpeed on a single GPU, add the `--num_gpus` parameter. 
It isn't ne ```bash deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ --deepspeed tests/deepspeed/ds_config_zero2.json \ ---model_name_or_path t5-small --per_device_train_batch_size 1 \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ --dataset_name wmt16 --dataset_config "ro-en" \ @@ -949,7 +949,7 @@ import deepspeed ds_config = {...} # deepspeed config object or path to the file # must run before instantiating the model to detect zero 3 dschf = HfDeepSpeedConfig(ds_config) # keep this object alive -model = AutoModel.from_pretrained("gpt2") +model = AutoModel.from_pretrained("openai-community/gpt2") engine = deepspeed.initialize(model=model, config_params=ds_config, ...) ``` @@ -966,7 +966,7 @@ import deepspeed ds_config = {...} # deepspeed config object or path to the file # must run before instantiating the model to detect zero 3 dschf = HfDeepSpeedConfig(ds_config) # keep this object alive -config = AutoConfig.from_pretrained("gpt2") +config = AutoConfig.from_pretrained("openai-community/gpt2") model = AutoModel.from_config(config) engine = deepspeed.initialize(model=model, config_params=ds_config, ...) ``` diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index df91c36c610b71..c4378551e6146c 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -54,7 +54,7 @@ When you load a model explicitly, you can inspect the generation configuration t ```python >>> from transformers import AutoModelForCausalLM ->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") +>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> model.generation_config GenerationConfig { "bos_token_id": 50256, @@ -121,8 +121,8 @@ one for summarization with beam search). You must have the right Hub permissions ```python >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig ->>> tokenizer = AutoTokenizer.from_pretrained("t5-small") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") +>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") >>> translation_generation_config = GenerationConfig( ... 
num_beams=4, @@ -162,8 +162,8 @@ your screen, one word at a time: ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer ->>> tok = AutoTokenizer.from_pretrained("gpt2") ->>> model = AutoModelForCausalLM.from_pretrained("gpt2") +>>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2") +>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt") >>> streamer = TextStreamer(tok) @@ -187,7 +187,7 @@ Here, we'll show some of the parameters that control the decoding strategies and >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> prompt = "I look forward to" ->>> checkpoint = "distilgpt2" +>>> checkpoint = "distilbert/distilgpt2" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> inputs = tokenizer(prompt, return_tensors="pt") @@ -208,7 +208,7 @@ The two main parameters that enable and control the behavior of contrastive sear ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> checkpoint = "gpt2-large" +>>> checkpoint = "openai-community/gpt2-large" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) @@ -235,7 +235,7 @@ To enable multinomial sampling set `do_sample=True` and `num_beams=1`. >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed >>> set_seed(0) # For reproducibility ->>> checkpoint = "gpt2-large" +>>> checkpoint = "openai-community/gpt2-large" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) @@ -260,7 +260,7 @@ To enable this decoding strategy, specify the `num_beams` (aka number of hypothe >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> prompt = "It is astonishing how one can" ->>> checkpoint = "gpt2-medium" +>>> checkpoint = "openai-community/gpt2-medium" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> inputs = tokenizer(prompt, return_tensors="pt") @@ -283,7 +283,7 @@ the `num_beams` greater than 1, and set `do_sample=True` to use this decoding st >>> set_seed(0) # For reproducibility >>> prompt = "translate English to German: The house is wonderful." ->>> checkpoint = "t5-small" +>>> checkpoint = "google-t5/t5-small" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> inputs = tokenizer(prompt, return_tensors="pt") diff --git a/docs/source/en/glossary.md b/docs/source/en/glossary.md index 96f5cbd0e66884..f3c2c50d705ab6 100644 --- a/docs/source/en/glossary.md +++ b/docs/source/en/glossary.md @@ -34,7 +34,7 @@ For example, consider these two sequences: ```python >>> from transformers import BertTokenizer ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased") >>> sequence_a = "This is a short sequence." >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A." @@ -159,7 +159,7 @@ The process of selecting and transforming raw data into a set of features that a In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers. The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for -`bert-base-uncased`). +`google-bert/bert-base-uncased`). 
For an input of size `[batch_size, sequence_length]`, the memory required to store the intermediate feed forward embeddings `[batch_size, sequence_length, config.intermediate_size]` can account for a large fraction of the memory @@ -212,7 +212,7 @@ tokenizer, which is a [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) tokenize ```python >>> from transformers import BertTokenizer ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased") >>> sequence = "A Titan RTX has 24GB of VRAM" ``` @@ -467,7 +467,7 @@ arguments (and not a list, like before) like this: ```python >>> from transformers import BertTokenizer ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased") >>> sequence_a = "HuggingFace is based in NYC" >>> sequence_b = "Where is HuggingFace based?" diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index a7b916fe484152..7ece8eae44cabd 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -179,7 +179,7 @@ Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline train ```bash HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ -python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ... +python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ... ``` This script should run without hanging or waiting to timeout because it won't attempt to download the model from the Hub. diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index 452921d88c0e87..0fa15ddbcf1943 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -38,8 +38,8 @@ Here's an example: ```python from transformers import GPT2Tokenizer, GPT2LMHeadModel -tokenizer = GPT2Tokenizer.from_pretrained("gpt2") -model = GPT2LMHeadModel.from_pretrained("gpt2") +tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") +model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt") generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) diff --git a/docs/source/en/main_classes/output.md b/docs/source/en/main_classes/output.md index 64101fd824454a..3567cf62c44e2d 100644 --- a/docs/source/en/main_classes/output.md +++ b/docs/source/en/main_classes/output.md @@ -26,8 +26,8 @@ Let's see how this looks in an example: from transformers import BertTokenizer, BertForSequenceClassification import torch -tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") -model = BertForSequenceClassification.from_pretrained("bert-base-uncased") +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased") inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index 61bdf3729a7e0a..1e8f93f3ba8e5e 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -43,7 +43,7 @@ If you want to use a specific model from the 
[hub](https://huggingface.co) you c the hub already defines it: ```python ->>> pipe = pipeline(model="roberta-large-mnli") +>>> pipe = pipeline(model="FacebookAI/roberta-large-mnli") >>> pipe("This restaurant is awesome") [{'label': 'NEUTRAL', 'score': 0.7313136458396912}] ``` diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md index 9dbaaf3acbbbb6..036b8b81ca6b48 100644 --- a/docs/source/en/model_doc/auto.md +++ b/docs/source/en/model_doc/auto.md @@ -25,7 +25,7 @@ Instantiating one of [`AutoConfig`], [`AutoModel`], and ```python -model = AutoModel.from_pretrained("bert-base-cased") +model = AutoModel.from_pretrained("google-bert/bert-base-cased") ``` will create a model that is an instance of [`BertModel`]. diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 7edbf38694ed39..40c2fbaa212e6b 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -44,15 +44,15 @@ subsequent fine-tuning: ```python >>> # leverage checkpoints for Bert2Bert model... >>> # use BERT's cls token as BOS token and sep token as EOS token ->>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102) +>>> encoder = BertGenerationEncoder.from_pretrained("google-bert/bert-large-uncased", bos_token_id=101, eos_token_id=102) >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token >>> decoder = BertGenerationDecoder.from_pretrained( -... "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102 +... "google-bert/bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102 ... ) >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder) >>> # create tokenizer... ->>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased") >>> input_ids = tokenizer( ... "This is a long article to summarize", add_special_tokens=False, return_tensors="pt" diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md index bd39260d3ca492..844927e71984a9 100644 --- a/docs/source/en/model_doc/distilbert.md +++ b/docs/source/en/model_doc/distilbert.md @@ -34,7 +34,7 @@ The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, li distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than -*bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language +*google-bert/bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark. 
The abstract from the paper is the following: @@ -152,8 +152,8 @@ To load and run a model using Flash Attention 2, refer to the snippet below: >>> device = "cuda" # the device to load the model onto ->>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased') ->>> model = AutoModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +>>> tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased') +>>> model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2") >>> text = "Replace me by any text you'd like." diff --git a/docs/source/en/model_doc/encoder-decoder.md b/docs/source/en/model_doc/encoder-decoder.md index 54c9f750647606..4bd0e6f188fe15 100644 --- a/docs/source/en/model_doc/encoder-decoder.md +++ b/docs/source/en/model_doc/encoder-decoder.md @@ -55,8 +55,8 @@ To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_ ```python >>> from transformers import EncoderDecoderModel, BertTokenizer ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") ``` ## Loading an existing `EncoderDecoderModel` checkpoint and perform inference. @@ -119,8 +119,8 @@ target sequence). ```python >>> from transformers import BertTokenizer, EncoderDecoderModel ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") ->>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") >>> model.config.decoder_start_token_id = tokenizer.cls_token_id >>> model.config.pad_token_id = tokenizer.pad_token_id diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index b3cb078e2a140c..1635a9f50dd08e 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -38,7 +38,7 @@ The main differences compared to GPT2. - Use jit to fuse the attention fp32 casting, masking, softmax, and scaling. - Combine the attention and causal masks into a single one, pre-computed for the whole model instead of every layer. - Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?) -- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original gpt2 model). +- Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model). 
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575) diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index 9ee42ff3b49d37..19b829d0bc5d19 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -39,7 +39,7 @@ This model was contributed by [shangz](https://huggingface.co/shangz). - QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model. - QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` -- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *bert-base-uncased*), and +- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *google-bert/bert-base-uncased*), and perform Quantization Aware Training/Post Training Quantization. - A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for SQUAD task can be found at [transformers/examples/research_projects/quantization-qdqbert/](examples/research_projects/quantization-qdqbert/). diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md index b036f27e1865d8..7e2bcef98abce8 100644 --- a/docs/source/en/model_doc/speech-encoder-decoder.md +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -52,7 +52,7 @@ To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecode >>> from transformers import SpeechEncoderDecoderModel >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( -... "facebook/hubert-large-ll60k", "bert-base-uncased" +... "facebook/hubert-large-ll60k", "google-bert/bert-base-uncased" ... ) ``` @@ -93,7 +93,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq >>> from datasets import load_dataset >>> encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder ->>> decoder_id = "bert-base-uncased" # text decoder +>>> decoder_id = "google-bert/bert-base-uncased" # text decoder >>> feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) >>> tokenizer = AutoTokenizer.from_pretrained(decoder_id) diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md index b8a062cbbe59ed..70e80c459f082b 100644 --- a/docs/source/en/model_doc/t5.md +++ b/docs/source/en/model_doc/t5.md @@ -64,15 +64,15 @@ for summarization: *summarize: ...*. T5 comes in different sizes: -- [t5-small](https://huggingface.co/t5-small) +- [google-t5/t5-small](https://huggingface.co/google-t5/t5-small) -- [t5-base](https://huggingface.co/t5-base) +- [google-t5/t5-base](https://huggingface.co/google-t5/t5-base) -- [t5-large](https://huggingface.co/t5-large) +- [google-t5/t5-large](https://huggingface.co/google-t5/t5-large) -- [t5-3b](https://huggingface.co/t5-3b) +- [google-t5/t5-3b](https://huggingface.co/google-t5/t5-3b) -- [t5-11b](https://huggingface.co/t5-11b). +- [google-t5/t5-11b](https://huggingface.co/google-t5/t5-11b). 
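Editor's aside: to compare the T5 sizes listed above without downloading any weights, a configs-only sketch (the `d_model`/`num_layers` attribute names are standard T5Config fields, assumed rather than quoted from the patch):

```python
from transformers import AutoConfig

# Configs only, so nothing heavy is downloaded even for the larger variants.
for name in ["google-t5/t5-small", "google-t5/t5-base", "google-t5/t5-large"]:
    config = AutoConfig.from_pretrained(name)
    # d_model is the hidden size, num_layers the number of encoder blocks.
    print(f"{name}: d_model={config.d_model}, num_layers={config.num_layers}")
```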
Based on the original T5 model, Google has released some follow-up works: @@ -121,8 +121,8 @@ processed as follows: ```python >>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids @@ -146,8 +146,8 @@ the model as follows: ```python >>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids >>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids @@ -183,8 +183,8 @@ ignored. The code example below illustrates all of this. >>> from transformers import T5Tokenizer, T5ForConditionalGeneration >>> import torch ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> # the following 2 hyperparameters are task-specific >>> max_source_length = 512 @@ -258,8 +258,8 @@ generation works in general in encoder-decoder models. ```python >>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids >>> outputs = model.generate(input_ids) @@ -275,8 +275,8 @@ The example above only shows a single example. You can also do batched inference ```python >>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> task_prefix = "translate English to German: " >>> # use different length sentences to test batching @@ -301,8 +301,8 @@ The predicted tokens will then be placed between the sentinel tokens. 
```python >>> from transformers import T5Tokenizer, T5ForConditionalGeneration ->>> tokenizer = T5Tokenizer.from_pretrained("t5-small") ->>> model = T5ForConditionalGeneration.from_pretrained("t5-small") +>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md index dae7e532be66f3..c80d9352b5aef6 100644 --- a/docs/source/en/model_doc/transfo-xl.md +++ b/docs/source/en/model_doc/transfo-xl.md @@ -22,7 +22,7 @@ This model is in maintenance mode only, so we won't accept any new PRs changing We recommend switching to more recent models for improved security. -In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub. +In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub. You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the usage of `pickle.load()`: @@ -33,7 +33,7 @@ from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel os.environ["TRUST_REMOTE_CODE"] = "True" -checkpoint = 'transfo-xl-wt103' +checkpoint = 'transfo-xl/transfo-xl-wt103' revision = '40a186da79458c9f9de846edfaea79c412137f97' tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision) diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md index 89d89896a2e247..41159b7fc5f9a8 100644 --- a/docs/source/en/model_doc/vision-encoder-decoder.md +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -58,7 +58,7 @@ To do so, the `VisionEncoderDecoderModel` class provides a [`VisionEncoderDecode >>> from transformers import VisionEncoderDecoderModel >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased" +... "microsoft/swin-base-patch4-window7-224-in22k", "google-bert/bert-base-uncased" ... ) ``` @@ -123,9 +123,9 @@ images) and `labels` (which are the `input_ids` of the encoded target sequence). >>> from datasets import load_dataset >>> image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( -... "google/vit-base-patch16-224-in21k", "bert-base-uncased" +... "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased" ... 
) >>> model.config.decoder_start_token_id = tokenizer.cls_token_id diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md index 1db218f1a53147..95e5ae4e84a28d 100644 --- a/docs/source/en/model_doc/visual_bert.md +++ b/docs/source/en/model_doc/visual_bert.md @@ -73,7 +73,7 @@ The following example shows how to get the last hidden state using [`VisualBertM >>> from transformers import BertTokenizer, VisualBertModel >>> model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") ->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") >>> inputs = tokenizer("What is the man eating?", return_tensors="pt") >>> # this is a custom function that returns the visual embeddings given the image path diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md index 0a0d5bb5b8bf42..c820681a7af0fc 100644 --- a/docs/source/en/model_memory_anatomy.md +++ b/docs/source/en/model_memory_anatomy.md @@ -92,7 +92,7 @@ We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how muc ## Load Model -First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check +First, we load the `google-bert/bert-large-uncased` model. We load the model weights directly to the GPU so that we can check how much space just the weights use. @@ -100,7 +100,7 @@ how much space just the weights use. >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda") +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda") >>> print_gpu_utilization() GPU memory occupied: 2631 MB. ``` diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index 84d287570da192..6ec4d9fa2a9280 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -229,4 +229,4 @@ To make sure users understand your model's capabilities, limitations, potential * Manually creating and uploading a `README.md` file. * Clicking on the **Edit model card** button in your model repository. -Take a look at the DistilBert [model card](https://huggingface.co/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). +Take a look at the DistilBert [model card](https://huggingface.co/distilbert/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). diff --git a/docs/source/en/multilingual.md b/docs/source/en/multilingual.md index 9bf904a3b3738e..30a63eea28c8c7 100644 --- a/docs/source/en/multilingual.md +++ b/docs/source/en/multilingual.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. 
Some models, like [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. +There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. Some models, like [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. ## XLM @@ -28,24 +28,24 @@ XLM has ten different checkpoints, only one of which is monolingual. The nine re The following XLM models use language embeddings to specify the language used at inference: -- `xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `xlm-clm-ende-1024` (Causal language modeling, English-German) +- `FacebookAI/xlm-mlm-ende-1024` (Masked language modeling, English-German) +- `FacebookAI/xlm-mlm-enfr-1024` (Masked language modeling, English-French) +- `FacebookAI/xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) +- `FacebookAI/xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) +- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) +- `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French) +- `FacebookAI/xlm-clm-ende-1024` (Causal language modeling, English-German) Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. -In this example, load the `xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): +In this example, load the `FacebookAI/xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): ```py >>> import torch >>> from transformers import XLMTokenizer, XLMWithLMHeadModel ->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024") +>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024") ``` The `lang2id` attribute of the tokenizer displays this model's languages and their ids: @@ -83,8 +83,8 @@ The [run_generation.py](https://github.com/huggingface/transformers/tree/main/ex The following XLM models do not require language embeddings during inference: -- `xlm-mlm-17-1280` (Masked language modeling, 17 languages) -- `xlm-mlm-100-1280` (Masked language modeling, 100 languages) +- `FacebookAI/xlm-mlm-17-1280` (Masked language modeling, 17 languages) +- `FacebookAI/xlm-mlm-100-1280` (Masked language modeling, 100 languages) These models are used for generic sentence representations, unlike the previous XLM checkpoints. 
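Editor's aside: for the XLM checkpoints that do need language embeddings, the `lang2id` mapping mentioned above can be inspected with the renamed identifier like this (tokenizer only, so it stays lightweight; illustrative, not part of the patch):

```python
from transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")

# Maps language codes to the ids the language embeddings expect,
# e.g. {'en': 0, 'fr': 1} for this English-French checkpoint.
print(tokenizer.lang2id)
```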
@@ -92,8 +92,8 @@ These models are used for generic sentence representations, unlike the previous The following BERT models can be used for multilingual tasks: -- `bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) -- `bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) +- `google-bert/bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) +- `google-bert/bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) These models do not require language embeddings during inference. They should identify the language from the context and infer accordingly. @@ -102,8 +102,8 @@ context and infer accordingly. The following XLM-RoBERTa models can be used for multilingual tasks: -- `xlm-roberta-base` (Masked language modeling, 100 languages) -- `xlm-roberta-large` (Masked language modeling, 100 languages) +- `FacebookAI/xlm-roberta-base` (Masked language modeling, 100 languages) +- `FacebookAI/xlm-roberta-large` (Masked language modeling, 100 languages) XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. diff --git a/docs/source/en/perf_hardware.md b/docs/source/en/perf_hardware.md index 187bdd27b57b42..c42b58483bebd2 100644 --- a/docs/source/en/perf_hardware.md +++ b/docs/source/en/perf_hardware.md @@ -116,7 +116,7 @@ Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvid So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. -Let's compare the execution of a gpt2 language model training over a small sample of wikitext. +Let's compare the execution of a openai-community/gpt2 language model training over a small sample of wikitext. The results are: @@ -135,7 +135,7 @@ Here is the full benchmark code and outputs: # DDP w/ NVLink rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 @@ -144,7 +144,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \ # DDP w/o NVLink rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index d3dd2ae00f9573..745a0f98a595fd 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -348,7 +348,7 @@ ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. 
You'll from optimum.onnxruntime import ORTModelForSequenceClassification ort_model = ORTModelForSequenceClassification.from_pretrained( - "distilbert-base-uncased-finetuned-sst-2-english", + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", export=True, provider="CUDAExecutionProvider", ) @@ -360,7 +360,7 @@ Now you're free to use the model for inference: from optimum.pipelines import pipeline from transformers import AutoTokenizer -tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english") pipeline = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0") result = pipeline("Both the music and visual were astounding, not to mention the actors performance.") diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 19b76c169d3f9c..14a52792d1f7d8 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -52,7 +52,7 @@ Take an example of the use cases on [Transformers question-answering](https://gi - Training with IPEX using BF16 auto mixed precision on CPU:
 python run_qa.py \
---model_name_or_path bert-base-uncased \
+--model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
 --do_eval \
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index 9312d4b9116358..53f7f7f9295dea 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -90,7 +90,7 @@ The following command enables training with 2 processes on one Xeon node, with o
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -119,7 +119,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -210,7 +210,7 @@ spec:
                 - torchrun
                 - /workspace/transformers/examples/pytorch/question-answering/run_qa.py
                 - --model_name_or_path
-                - "bert-large-uncased"
+                - "google-bert/bert-large-uncased"
                 - --dataset_name
                 - "squad"
                 - --do_train
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 30c7aedfa38928..db1c3c3ef4ed8a 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -143,7 +143,7 @@ Here is the benchmarking code and outputs:
 ```bash
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 python examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
@@ -154,7 +154,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 ```bash
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
@@ -165,7 +165,7 @@ torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 ```bash
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md
index 9a81a622cc12a5..1d885ba03646c7 100644
--- a/docs/source/en/perf_train_gpu_one.md
+++ b/docs/source/en/perf_train_gpu_one.md
@@ -248,7 +248,7 @@ Let's take a closer look at two alternatives to AdamW optimizer:
 1. `adafactor` which is available in [`Trainer`]
 2. `adamw_bnb_8bit` is also available in Trainer, but a third-party integration is provided below for demonstration.
 
-For comparison, for a 3B-parameter model, like “t5-3b”: 
+For comparison, for a 3B-parameter model, like “google-t5/t5-3b”: 
 * A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB)
 * Adafactor optimizer will need more than 12GB. It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra.
 * 8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized.
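For context, the arithmetic above maps directly onto the `optim` argument of [`TrainingArguments`]. A minimal sketch (not part of the patch; the output directories are placeholders and a small checkpoint stands in for a 3B-parameter model):

```py
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Small stand-in checkpoint; the memory numbers above assume ~3B parameters.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Adafactor: slightly more than 4 bytes of optimizer state per parameter.
adafactor_args = TrainingArguments(output_dir="/tmp/test-adafactor", optim="adafactor")

# 8-bit AdamW: roughly 2 bytes per parameter (requires `bitsandbytes` to be installed).
bnb_args = TrainingArguments(output_dir="/tmp/test-adamw-bnb-8bit", optim="adamw_bnb_8bit")

trainer = Trainer(model=model, args=adafactor_args)
```

Swapping `args=adafactor_args` for `bnb_args` is the only change needed to compare the two optimizers.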
diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md
index b9bbe32897dbd6..d98d3e0e32e5a0 100644
--- a/docs/source/en/perf_train_special.md
+++ b/docs/source/en/perf_train_special.md
@@ -45,7 +45,7 @@ pip install torch torchvision torchaudio
 export TASK_NAME=mrpc
 
 python examples/pytorch/text-classification/run_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
 - --use_mps_device \
   --do_train \
diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md
index 18abc0305b0ef5..7555619fe488d2 100644
--- a/docs/source/en/perplexity.md
+++ b/docs/source/en/perplexity.md
@@ -75,7 +75,7 @@ Let's demonstrate this process with GPT-2.
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 
 device = "cuda"
-model_id = "gpt2-large"
+model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 ```
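For context, the checkpoint loaded above feeds the strided sliding-window evaluation this guide builds up to. A minimal sketch of that loop (not part of the patch; the input text and stride are illustrative):

```py
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai-community/gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

encodings = tokenizer("Hugging Face is a community-based open-source platform for machine learning.", return_tensors="pt")

max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in range(0, seq_len, stride):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # number of new tokens scored in this window
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # already-scored tokens only provide context

    with torch.no_grad():
        neg_log_likelihood = model(input_ids, labels=target_ids).loss

    nlls.append(neg_log_likelihood * trg_len)
    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
print(ppl)
```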
diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md
index 460fc17274a800..e3e4e2e5cb6b7e 100644
--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@@ -185,7 +185,7 @@ def data():
         yield f"My example {i}"
 
 
-pipe = pipeline(model="gpt2", device=0)
+pipe = pipeline(model="openai-community/gpt2", device=0)
 generated_characters = 0
 for out in pipe(data()):
     generated_characters += len(out[0]["generated_text"])
diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md
index 38ef28d498c615..17b5fbd958dd30 100644
--- a/docs/source/en/pipeline_webserver.md
+++ b/docs/source/en/pipeline_webserver.md
@@ -48,7 +48,7 @@ async def homepage(request):
 
 
 async def server_loop(q):
-    pipe = pipeline(model="bert-base-uncased")
+    pipe = pipeline(model="google-bert/bert-base-uncased")
     while True:
         (string, response_q) = await q.get()
         out = pipe(string)
diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md
index 04e9688c905e75..82381057d3742b 100644
--- a/docs/source/en/preprocessing.md
+++ b/docs/source/en/preprocessing.md
@@ -54,7 +54,7 @@ Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pret
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 Then pass your text to the tokenizer:
diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md
index d49943da17a14c..904e0bbc745340 100644
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -77,7 +77,7 @@ Start by creating an instance of [`pipeline`] and specifying a task you want to
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text:
+The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text:
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
@@ -384,7 +384,7 @@ Start by importing [`AutoConfig`], and then load the pretrained model you want t
 ```py
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
 
 
@@ -421,7 +421,7 @@ Depending on your task, you'll typically pass the following parameters to [`Trai
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments:
@@ -443,7 +443,7 @@ Depending on your task, you'll typically pass the following parameters to [`Trai
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 4. Load a dataset:
@@ -515,7 +515,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor:
@@ -523,7 +523,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. Create a function to tokenize the dataset:
diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md
index 0652bb1da5e4a7..845befc5638133 100644
--- a/docs/source/en/run_scripts.md
+++ b/docs/source/en/run_scripts.md
@@ -87,11 +87,11 @@ pip install -r requirements.txt
 
 
 
-The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task.
+The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/google-t5/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task.
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -105,11 +105,11 @@ python examples/pytorch/summarization/run_summarization.py \
 ```
 
 
-The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task.
+The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/google-t5/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task.
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -133,7 +133,7 @@ The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) sup
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -157,7 +157,7 @@ Tensor Processing Units (TPUs) are specifically designed to accelerate performan
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -176,7 +176,7 @@ Tensor Processing Units (TPUs) are specifically designed to accelerate performan
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -214,7 +214,7 @@ Now you are ready to launch the training:
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -233,7 +233,7 @@ A summarization script using a custom dataset would look like this:
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -258,7 +258,7 @@ It is often a good idea to run your script on a smaller number of dataset exampl
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -288,7 +288,7 @@ The first method uses the `output_dir previous_output_dir` argument to resume tr
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -305,7 +305,7 @@ The second method uses the `resume_from_checkpoint path_to_specific_checkpoint`
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -335,7 +335,7 @@ The following example shows how to upload a model with a specific repository nam
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/en/serialization.md b/docs/source/en/serialization.md
index 9fec884a8be451..5995d9042de6fb 100644
--- a/docs/source/en/serialization.md
+++ b/docs/source/en/serialization.md
@@ -70,10 +70,10 @@ or view help in command line:
 optimum-cli export onnx --help
 ```
 
-To export a model's checkpoint from the 🤗 Hub, for example, `distilbert-base-uncased-distilled-squad`, run the following command: 
+To export a model's checkpoint from the 🤗 Hub, for example, `distilbert/distilbert-base-uncased-distilled-squad`, run the following command: 
 
 ```bash
-optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
 ```
 
 You should see the logs indicating progress and showing where the resulting `model.onnx` is saved, like this:
@@ -166,7 +166,7 @@ pip install transformers[onnx]
 Use `transformers.onnx` package as a Python module to export a checkpoint using a ready-made configuration:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 This exports an ONNX graph of the checkpoint defined by the `--model` argument. Pass any checkpoint on the 🤗 Hub or one that's stored locally.
@@ -177,7 +177,7 @@ load and run the model with ONNX Runtime as follows:
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
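The hunk stops just before the inference call. Assuming the exported DistilBERT graph exposes a `last_hidden_state` output (as in the surrounding guide), the next step would look roughly like:

```py
>>> # Sketch, not part of the patch: feed the tokenized NumPy inputs to ONNX Runtime.
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```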
diff --git a/docs/source/en/task_summary.md b/docs/source/en/task_summary.md
index 4a79e79e05452e..8f7eb041f1f2d7 100644
--- a/docs/source/en/task_summary.md
+++ b/docs/source/en/task_summary.md
@@ -268,7 +268,7 @@ In the early days, translation models were mostly monolingual, but recently, the
 >>> from transformers import pipeline
 
 >>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
->>> translator = pipeline(task="translation", model="t5-small")
+>>> translator = pipeline(task="translation", model="google-t5/t5-small")
 >>> translator(text)
 [{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
 ```
diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md
index 1236e23410ecdd..4022867a027af7 100644
--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@@ -29,7 +29,7 @@ the left. This means the model cannot see future tokens. GPT-2 is an example of
 
 This guide will show you how to:
 
-1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.
+1. Finetune [DistilGPT2](https://huggingface.co/distilbert/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.
 2. Use your finetuned model for inference.
 
 
@@ -110,7 +110,7 @@ The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 ```
 
 You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to
@@ -236,7 +236,7 @@ You're ready to start training your model now! Load DistilGPT2 with [`AutoModelF
 ```py
 >>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
 
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 At this point, only three steps remain:
@@ -300,7 +300,7 @@ Then you can load DistilGPT2 with [`TFAutoModelForCausalLM`]:
 ```py
 >>> from transformers import TFAutoModelForCausalLM
 
->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md
index 27a8f2f4911bb0..de91cd587a6a0c 100644
--- a/docs/source/en/tasks/masked_language_modeling.md
+++ b/docs/source/en/tasks/masked_language_modeling.md
@@ -26,7 +26,7 @@ require a good contextual understanding of an entire sequence. BERT is an exampl
 
 This guide will show you how to:
 
-1. Finetune [DistilRoBERTa](https://huggingface.co/distilroberta-base) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.
+1. Finetune [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.
 2. Use your finetuned model for inference.
 
 
@@ -105,7 +105,7 @@ For masked language modeling, the next step is to load a DistilRoBERTa tokenizer
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
 ```
 
 You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process#flatten) method:
@@ -226,7 +226,7 @@ You're ready to start training your model now! Load DistilRoBERTa with [`AutoMod
 ```py
 >>> from transformers import AutoModelForMaskedLM
 
->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 At this point, only three steps remain:
@@ -291,7 +291,7 @@ Then you can load DistilRoBERTa with [`TFAutoModelForMaskedLM`]:
 ```py
 >>> from transformers import TFAutoModelForMaskedLM
 
->>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md
index 938d3ba461bb87..5cf17448f0a66a 100644
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@@ -22,7 +22,7 @@ A multiple choice task is similar to question answering, except several candidat
 
 This guide will show you how to:
 
-1. Finetune [BERT](https://huggingface.co/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
+1. Finetune [BERT](https://huggingface.co/google-bert/bert-base-uncased) on the `regular` configuration of the [SWAG](https://huggingface.co/datasets/swag) dataset to select the best answer given multiple options and some context.
 2. Use your finetuned model for inference.
 
 
@@ -90,7 +90,7 @@ The next step is to load a BERT tokenizer to process the sentence starts and the
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 The preprocessing function you want to create needs to:
@@ -253,7 +253,7 @@ You're ready to start training your model now! Load BERT with [`AutoModelForMult
 ```py
 >>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
 
->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 At this point, only three steps remain:
@@ -317,7 +317,7 @@ Then you can load BERT with [`TFAutoModelForMultipleChoice`]:
 ```py
 >>> from transformers import TFAutoModelForMultipleChoice
 
->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md
index 8933b47dbfb751..1746e36fb9675f 100644
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@@ -76,7 +76,7 @@ Run inference with decoder-only models with the `text-generation` pipeline:
 
 >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
 
->>> generator = pipeline('text-generation', model = 'gpt2')
+>>> generator = pipeline('text-generation', model = 'openai-community/gpt2')
 >>> prompt = "Hello, I'm a language model"
 
 >>> generator(prompt, max_length = 30)
diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md
index 7c228061ff8e71..5d65dc8201cfda 100644
--- a/docs/source/en/tasks/question_answering.md
+++ b/docs/source/en/tasks/question_answering.md
@@ -27,7 +27,7 @@ Question answering tasks return an answer given a question. If you've ever asked
 
 This guide will show you how to:
 
-1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering.
+1. Finetune [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) on the [SQuAD](https://huggingface.co/datasets/squad) dataset for extractive question answering.
 2. Use your finetuned model for inference.
 
 
@@ -100,7 +100,7 @@ The next step is to load a DistilBERT tokenizer to process the `question` and `c
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 There are a few preprocessing steps particular to question answering tasks you should be aware of:
@@ -206,7 +206,7 @@ You're ready to start training your model now! Load DistilBERT with [`AutoModelF
 ```py
 >>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
 
->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 At this point, only three steps remain:
@@ -271,7 +271,7 @@ Then you can load DistilBERT with [`TFAutoModelForQuestionAnswering`]:
 ```py
 >>> from transformers import TFAutoModelForQuestionAnswering
 
->>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Convert your datasets to the `tf.data.Dataset` format with [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md
index f597dede7e9164..8459ae4c08babe 100644
--- a/docs/source/en/tasks/sequence_classification.md
+++ b/docs/source/en/tasks/sequence_classification.md
@@ -24,7 +24,7 @@ Text classification is a common NLP task that assigns a label or class to text.
 
 This guide will show you how to:
 
-1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative.
+1. Finetune [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) on the [IMDb](https://huggingface.co/datasets/imdb) dataset to determine whether a movie review is positive or negative.
 2. Use your finetuned model for inference.
 
 
@@ -87,7 +87,7 @@ The next step is to load a DistilBERT tokenizer to preprocess the `text` field:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length:
@@ -169,7 +169,7 @@ You're ready to start training your model now! Load DistilBERT with [`AutoModelF
 >>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
 >>> model = AutoModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
 ... )
 ```
 
@@ -243,7 +243,7 @@ Then you can load DistilBERT with [`TFAutoModelForSequenceClassification`] along
 >>> from transformers import TFAutoModelForSequenceClassification
 
 >>> model = TFAutoModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
 ... )
 ```
 
diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md
index 535d20ff492b49..28dd3f5a49ebe3 100644
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@@ -27,7 +27,7 @@ Summarization creates a shorter version of a document or an article that capture
 
 This guide will show you how to:
 
-1. Finetune [T5](https://huggingface.co/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization.
+1. Finetune [T5](https://huggingface.co/google-t5/t5-small) on the California state bill subset of the [BillSum](https://huggingface.co/datasets/billsum) dataset for abstractive summarization.
 2. Use your finetuned model for inference.
 
 
@@ -92,7 +92,7 @@ The next step is to load a T5 tokenizer to process `text` and `summary`:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/en/tasks/token_classification.md b/docs/source/en/tasks/token_classification.md
index 9bcb7750c2bf62..791737b677c871 100644
--- a/docs/source/en/tasks/token_classification.md
+++ b/docs/source/en/tasks/token_classification.md
@@ -24,7 +24,7 @@ Token classification assigns a label to individual tokens in a sentence. One of
 
 This guide will show you how to:
 
-1. Finetune [DistilBERT](https://huggingface.co/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities.
+1. Finetune [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) on the [WNUT 17](https://huggingface.co/datasets/wnut_17) dataset to detect new entities.
 2. Use your finetuned model for inference.
 
 
@@ -110,7 +110,7 @@ The next step is to load a DistilBERT tokenizer to preprocess the `tokens` field
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example:
@@ -272,7 +272,7 @@ You're ready to start training your model now! Load DistilBERT with [`AutoModelF
 >>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
 
 >>> model = AutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 
@@ -343,7 +343,7 @@ Then you can load DistilBERT with [`TFAutoModelForTokenClassification`] along wi
 >>> from transformers import TFAutoModelForTokenClassification
 
 >>> model = TFAutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 
diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md
index 9c73e97bff366f..f0433a0dad797d 100644
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@@ -24,7 +24,7 @@ Translation converts a sequence of text from one language to another. It is one
 
 This guide will show you how to:
 
-1. Finetune [T5](https://huggingface.co/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French.
+1. Finetune [T5](https://huggingface.co/google-t5/t5-small) on the English-French subset of the [OPUS Books](https://huggingface.co/datasets/opus_books) dataset to translate English text to French.
 2. Use your finetuned model for inference.
 
 
@@ -88,7 +88,7 @@ The next step is to load a T5 tokenizer to process the English-French language p
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md
index 5f6a360dd8d5e2..86ed1035fccc9e 100644
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@@ -85,8 +85,8 @@ from transformers.utils import check_min_version
 check_min_version("4.21.0")
 
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 # One line to create an XLA generation function
@@ -114,8 +114,8 @@ To ensure `xla_generate()` always operates with the same input shapes, you can s
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 xla_generate = tf.function(model.generate, jit_compile=True)
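For context, the fixed input shape that keeps `xla_generate()` from retracing is applied at tokenization time. A minimal sketch (not part of the patch; the pad-token choice, `max_length`, and `max_new_tokens` are illustrative):

```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no dedicated pad token
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

xla_generate = tf.function(model.generate, jit_compile=True)

# Padding every batch to the same length means the function is traced once and reused.
tokenized_inputs = tokenizer(["TensorFlow is"], padding="max_length", max_length=8, return_tensors="tf")
generated = xla_generate(**tokenized_inputs, max_new_tokens=16)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```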
@@ -135,8 +135,8 @@ import time
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
 xla_generate = tf.function(model.generate, jit_compile=True)
 
diff --git a/docs/source/en/tflite.md b/docs/source/en/tflite.md
index 7b7735c992eac9..09434a81508d35 100644
--- a/docs/source/en/tflite.md
+++ b/docs/source/en/tflite.md
@@ -38,10 +38,10 @@ or view help in command line:
 optimum-cli export tflite --help
 ```
 
-To export a model's checkpoint from the 🤗 Hub, for example, `bert-base-uncased`, run the following command:
+To export a model's checkpoint from the 🤗 Hub, for example, `google-bert/bert-base-uncased`, run the following command:
 
 ```bash
-optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
 ```
 
 You should see the logs indicating progress and showing where the resulting `model.tflite` is saved, like this:
diff --git a/docs/source/en/tokenizer_summary.md b/docs/source/en/tokenizer_summary.md
index 99c52244bb04b7..fbe8f6f7a17743 100644
--- a/docs/source/en/tokenizer_summary.md
+++ b/docs/source/en/tokenizer_summary.md
@@ -109,7 +109,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans
 ```py
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 >>> tokenizer.tokenize("I have a new GPU!")
 ["i", "have", "a", "new", "gp", "##u", "!"]
 ```
@@ -123,7 +123,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex
 ```py
 >>> from transformers import XLNetTokenizer
 
->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased")
 >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
 ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
 ```
diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md
index adf34b2ea699d3..171e337ca7f846 100644
--- a/docs/source/en/torchscript.md
+++ b/docs/source/en/torchscript.md
@@ -97,7 +97,7 @@ class and then save it to disk under the filename `traced_bert.pt`:
 from transformers import BertModel, BertTokenizer, BertConfig
 import torch
 
-enc = BertTokenizer.from_pretrained("bert-base-uncased")
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 # Tokenizing input text
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -132,7 +132,7 @@ model = BertModel(config)
 model.eval()
 
 # If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
-model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
 
 # Creating the trace
 traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md
index 2c8ca7d3459e1a..22ef9a0c160e9c 100644
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@@ -376,7 +376,7 @@ For example, to run the [run_glue.py](https://github.com/huggingface/transformer
 ```bash
 accelerate launch \
     ./examples/pytorch/text-classification/run_glue.py \
-    --model_name_or_path bert-base-cased \
+    --model_name_or_path google-bert/bert-base-cased \
     --task_name $TASK_NAME \
     --do_train \
     --do_eval \
@@ -399,7 +399,7 @@ accelerate launch --num_processes=2 \
     --fsdp_sharding_strategy=1 \
     --fsdp_state_dict_type=FULL_STATE_DICT \
     ./examples/pytorch/text-classification/run_glue.py
-    --model_name_or_path bert-base-cased \
+    --model_name_or_path google-bert/bert-base-cased \
     --task_name $TASK_NAME \
     --do_train \
     --do_eval \
diff --git a/docs/source/en/training.md b/docs/source/en/training.md
index 8e81048bf54e0e..4bd72aa9f6384d 100644
--- a/docs/source/en/training.md
+++ b/docs/source/en/training.md
@@ -48,7 +48,7 @@ As you now know, you need a tokenizer to process the text and include a padding
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -86,7 +86,7 @@ Start by loading your model and specify the number of expected labels. From the
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -187,7 +187,7 @@ so we can just convert that directly to a NumPy array without tokenization!
 ```py
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
 # Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
 tokenized_data = dict(tokenized_data)
@@ -202,7 +202,7 @@ from transformers import TFAutoModelForSequenceClassification
 from tensorflow.keras.optimizers import Adam
 
 # Load and compile our model
-model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
 # Lower learning rates are often better for fine-tuning transformers
 model.compile(optimizer=Adam(3e-5))  # No loss argument!
 
@@ -334,7 +334,7 @@ Load your model with the number of expected labels:
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Optimizer and learning rate scheduler
diff --git a/docs/source/en/troubleshooting.md b/docs/source/en/troubleshooting.md
index 29b032dd2799f7..c1bf338c13bebb 100644
--- a/docs/source/en/troubleshooting.md
+++ b/docs/source/en/troubleshooting.md
@@ -134,7 +134,7 @@ In some cases, the output `hidden_state` may be incorrect if the `input_ids` inc
 >>> from transformers import AutoModelForSequenceClassification
 >>> import torch
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 >>> model.config.pad_token_id
 0
 ```
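For context, the usual fix is to pass an `attention_mask` so the pad token (id 0 for this checkpoint) does not distort the logits. A minimal sketch with arbitrary example token ids (not part of the patch):

```py
>>> import torch
>>> from transformers import AutoModelForSequenceClassification

>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]])
>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]])
>>> output = model(input_ids, attention_mask=attention_mask)
```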
@@ -191,8 +191,8 @@ For instance, you'll see this error in the following example because there is no
 ```py
 >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
 
->>> processor = AutoProcessor.from_pretrained("gpt2-medium")
->>> model = AutoModelForQuestionAnswering.from_pretrained("gpt2-medium")
+>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium")
 ValueError: Unrecognized configuration class  for this kind of AutoModel: AutoModelForQuestionAnswering.
 Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ...
 ```
diff --git a/docs/source/es/autoclass_tutorial.md b/docs/source/es/autoclass_tutorial.md
index 8b3ddd230b6bc9..cea44c3c1ea6cf 100644
--- a/docs/source/es/autoclass_tutorial.md
+++ b/docs/source/es/autoclass_tutorial.md
@@ -20,7 +20,7 @@ Con tantas arquitecturas diferentes de Transformer puede ser retador crear una p
 
 
 
-Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint.
+Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/google-bert/bert-base-uncased) es una arquitectura, mientras que `google-bert/bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint.
 
 
 
@@ -40,7 +40,7 @@ Carga un tokenizador con [`AutoTokenizer.from_pretrained`]:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 Luego tokeniza tu input como lo mostrado a continuación:
@@ -88,7 +88,7 @@ Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente:
@@ -96,7 +96,7 @@ Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para algun
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Ésto asegurará que cargues la arquitectura correcta en cada ocasión. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning.
@@ -107,7 +107,7 @@ Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente:
@@ -115,7 +115,7 @@ Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para algun
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Ésto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning.
diff --git a/docs/source/es/community.md b/docs/source/es/community.md
index c230618a214ae6..71153fbc8336f6 100644
--- a/docs/source/es/community.md
+++ b/docs/source/es/community.md
@@ -43,8 +43,8 @@ Esta página agrupa los recursos de 🤗 Transformers desarrollados por la comun
 |[Ajustar a Roberta para el análisis de sentimientos](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Cómo ajustar un modelo de Roberta para el análisis de sentimientos | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
 |[Evaluación de modelos de generación de preguntas](https://github.com/flexudy-pipe/qugeev) | ¿Qué tan precisas son las respuestas a las preguntas generadas por tu modelo de transformador seq2seq? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
 |[Clasificar texto con DistilBERT y Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Cómo ajustar DistilBERT para la clasificación de texto en TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
-|[Aprovechar BERT para el resumen de codificador y decodificador en CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* con un punto de control *bert-base-uncased* para resumir en CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
-|[Aprovechar RoBERTa para el resumen de codificador-decodificador en BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* compartido con un punto de control *roberta-base* para resumir en BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
+|[Aprovechar BERT para el resumen de codificador y decodificador en CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* con un punto de control *google-bert/bert-base-uncased* para resumir en CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
+|[Aprovechar RoBERTa para el resumen de codificador-decodificador en BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Cómo iniciar en caliente un *EncoderDecoderModel* compartido con un punto de control *FacebookAI/roberta-base* para resumir en BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
 |[Ajustar TAPAS en Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Cómo ajustar *TapasForQuestionAnswering* con un punto de control *tapas-base* en el conjunto de datos del Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
 |[Evaluar TAPAS en Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Cómo evaluar un *TapasForSequenceClassification* ajustado con un punto de control *tapas-base-finetuned-tabfact* usando una combinación de 🤗 conjuntos de datos y 🤗 bibliotecas de transformadores | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
 |[Ajustar de mBART para traducción](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Cómo ajustar mBART utilizando Seq2SeqTrainer para la traducción del hindi al inglés | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
diff --git a/docs/source/es/converting_tensorflow_models.md b/docs/source/es/converting_tensorflow_models.md
index 8e5b1ad1e288f2..f56eb02d87006a 100644
--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@@ -87,9 +87,9 @@ transformers-cli convert --model_type gpt \
 Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entrenado (más información [aquí](https://github.com/openai/gpt-2)):
 
 ```bash
-export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
 
-transformers-cli convert --model_type gpt2 \
+transformers-cli convert --model_type openai-community/gpt2 \
   --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
   --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
   [--config OPENAI_GPT2_CONFIG] \
diff --git a/docs/source/es/create_a_model.md b/docs/source/es/create_a_model.md
index 5d6349370539c6..560fbd74e3851c 100644
--- a/docs/source/es/create_a_model.md
+++ b/docs/source/es/create_a_model.md
@@ -86,7 +86,7 @@ DistilBertConfig {
 Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro.
@@ -128,13 +128,13 @@ Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos de
 Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -153,13 +153,13 @@ Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos de
 Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Cuando cargues tus pesos del preentrenamiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -177,7 +177,7 @@ Por ejemplo,  [`DistilBertForSequenceClassification`] es un modelo DistilBERT ba
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. 
@@ -186,7 +186,7 @@ Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilme
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -196,7 +196,7 @@ Por ejemplo,  [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*. 
@@ -205,7 +205,7 @@ Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilme
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -239,7 +239,7 @@ Es importante recordar que los vocabularios que provienen de un *tokenizer* pers
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]:
@@ -248,7 +248,7 @@ Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]:
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
diff --git a/docs/source/es/glossary.md b/docs/source/es/glossary.md
index f2729a41f98983..790fa1fecbe69a 100644
--- a/docs/source/es/glossary.md
+++ b/docs/source/es/glossary.md
@@ -33,7 +33,7 @@ Por ejemplo, considera estas dos secuencias:
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 >>> sequence_a = "This is a short sequence."
 >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
@@ -145,7 +145,7 @@ El proceso de seleccionar y transformar datos crudos en un conjunto de caracter
 
 ### feed forward chunking
 
-En cada bloque de atención residual en los transformadores, la capa de autoatención suele ir seguida de 2 capas de avance. El tamaño de embedding intermedio de las capas de avance suele ser mayor que el tamaño oculto del modelo (por ejemplo, para `bert-base-uncased`).
+En cada bloque de atención residual en los transformadores, la capa de autoatención suele ir seguida de 2 capas de avance. El tamaño de embedding intermedio de las capas de avance suele ser mayor que el tamaño oculto del modelo (por ejemplo, para `google-bert/bert-base-uncased`).
 
 Para una entrada de tamaño `[batch_size, sequence_length]`, la memoria requerida para almacenar los embeddings intermedios de avance `[batch_size, sequence_length, config.intermediate_size]` puede representar una gran fracción del uso de memoria. Los autores de [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) observaron que, dado que el cálculo es independiente de la dimensión `sequence_length`, es matemáticamente equivalente calcular los embeddings de salida de ambas capas de avance  `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n` individualmente y concatenarlos después a `[batch_size, sequence_length, config.hidden_size]` con `n = sequence_length`, lo que intercambia el aumento del tiempo de cálculo por una reducción en el uso de memoria, pero produce un resultado matemáticamente **equivalente**.
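The memory/compute trade-off described here is easy to verify numerically. The following sketch uses an arbitrary toy feed-forward block (not the Reformer implementation) to show that chunking over positions reproduces the full computation:

```python
import torch
from torch import nn

hidden_size, intermediate_size, seq_len, batch = 16, 64, 8, 2
ff1 = nn.Linear(hidden_size, intermediate_size)
ff2 = nn.Linear(intermediate_size, hidden_size)

x = torch.randn(batch, seq_len, hidden_size)

# Full computation: materializes [batch, seq_len, intermediate_size] at once
full = ff2(torch.relu(ff1(x)))

# Chunked computation: one position at a time, then concatenate along seq_len
chunks = [ff2(torch.relu(ff1(x[:, i : i + 1, :]))) for i in range(seq_len)]
chunked = torch.cat(chunks, dim=1)

print(torch.allclose(full, chunked, atol=1e-6))  # True
```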
 
@@ -188,7 +188,7 @@ Cada tokenizador funciona de manera diferente, pero el mecanismo subyacente sigu
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 >>> sequence = "A Titan RTX has 24GB of VRAM"
 ```
@@ -415,7 +415,7 @@ Podemos utilizar nuestro tokenizador para generar automáticamente una oración
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 >>> sequence_a = "HuggingFace is based in NYC"
 >>> sequence_b = "Where is HuggingFace based?"
 
diff --git a/docs/source/es/installation.md b/docs/source/es/installation.md
index 7ff9a92411d285..b79d0af4a46436 100644
--- a/docs/source/es/installation.md
+++ b/docs/source/es/installation.md
@@ -165,14 +165,14 @@ Puedes añadir [🤗 Datasets](https://huggingface.co/docs/datasets/) al flujo d
 Por ejemplo, normalmente ejecutarías un programa en una red normal con firewall para instancias externas con el siguiente comando:
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 Ejecuta este mismo programa en una instancia offline con el siguiente comando:
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 El script ahora debería ejecutarse sin bloquearse ni esperar a que se agote el tiempo de espera porque sabe que solo debe buscar archivos locales.
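The same offline behaviour can also be requested from Python; a minimal sketch, assuming the checkpoint from the example above has already been downloaded to the local cache:

```python
import os

# Set the variables before importing transformers so offline mode is picked up
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# local_files_only=True also skips any network lookup for this specific load
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", local_files_only=True)
```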
diff --git a/docs/source/es/model_sharing.md b/docs/source/es/model_sharing.md
index 7e99e8066bf89a..43cf0b8eddb8f7 100644
--- a/docs/source/es/model_sharing.md
+++ b/docs/source/es/model_sharing.md
@@ -220,4 +220,4 @@ Para asegurarnos que los usuarios entiendan las capacidades de tu modelo, sus li
 * Elaborando y subiendo manualmente el archivo `README.md`.
 * Dando click en el botón **Edit model card** dentro del repositorio.
 
-Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md` como la huella de carbono del modelo o ejemplos de widgets. Consulta la documentación [aquí](https://huggingface.co/docs/hub/models-cards).
+Toma un momento para ver la [tarjeta de modelo](https://huggingface.co/distilbert/distilbert-base-uncased) de DistilBert para que tengas un buen ejemplo del tipo de información que debería incluir. Consulta [la documentación](https://huggingface.co/docs/hub/models-cards) para más detalles acerca de otras opciones que puedes controlar dentro del archivo `README.md`, como la huella de carbono del modelo o ejemplos de widgets.
diff --git a/docs/source/es/multilingual.md b/docs/source/es/multilingual.md
index fa60cac68c269a..d49d54f196d54e 100644
--- a/docs/source/es/multilingual.md
+++ b/docs/source/es/multilingual.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia.
+Existen varios modelos multilingües en 🤗 Transformers y su uso para inferencia difiere de los modelos monolingües. Sin embargo, no *todos* los usos de los modelos multilingües son diferentes. Algunos modelos, como [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased), pueden utilizarse igual que un modelo monolingüe. Esta guía te enseñará cómo utilizar modelos multilingües cuyo uso difiere en la inferencia.
 
 ## XLM
 
@@ -28,24 +28,24 @@ XLM tiene diez checkpoints diferentes de los cuales solo uno es monolingüe. Los
 
 Los siguientes modelos XLM usan language embeddings para especificar el lenguaje utilizado en la inferencia:
 
-- `xlm-mlm-ende-1024` (Masked language modeling, English-German)
-- `xlm-mlm-enfr-1024` (Masked language modeling, English-French)
-- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian)
-- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages)
-- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages)
-- `xlm-clm-enfr-1024` (Causal language modeling, English-French)
-- `xlm-clm-ende-1024` (Causal language modeling, English-German)
+- `FacebookAI/xlm-mlm-ende-1024` (Masked language modeling, English-German)
+- `FacebookAI/xlm-mlm-enfr-1024` (Masked language modeling, English-French)
+- `FacebookAI/xlm-mlm-enro-1024` (Masked language modeling, English-Romanian)
+- `FacebookAI/xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages)
+- `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French)
+- `FacebookAI/xlm-clm-ende-1024` (Causal language modeling, English-German)
 
 Los language embeddings son representados como un tensor de las mismas dimensiones que los `input_ids` pasados al modelo. Los valores de estos tensores dependen del idioma utilizado y se identifican mediante los atributos `lang2id` y `id2lang` del tokenizador.
 
-En este ejemplo, carga el checkpoint `xlm-clm-enfr-1024` (Causal language modeling, English-French):
+En este ejemplo, carga el checkpoint `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French):
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 El atributo `lang2id` del tokenizador muestra los idiomas de este modelo y sus ids:
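The guide goes on to inspect `tokenizer.lang2id`; building on the checkpoint loaded above, a short sketch of constructing the language tensor and passing it to the model (the English prompt is arbitrary) looks like this:

```python
import torch

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size 1

# lang2id maps each language code to an integer id
language_id = tokenizer.lang2id["en"]

# The language tensor must have the same shape as input_ids
langs = torch.full_like(input_ids, language_id)

outputs = model(input_ids, langs=langs)
```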
@@ -83,8 +83,8 @@ El script [run_generation.py](https://github.com/huggingface/transformers/tree/m
 
 Los siguientes modelos XLM no requieren language embeddings durante la inferencia:
 
-- `xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas)
-- `xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas)
+- `FacebookAI/xlm-mlm-17-1280` (modelado de lenguaje enmascarado, 17 idiomas)
+- `FacebookAI/xlm-mlm-100-1280` (modelado de lenguaje enmascarado, 100 idiomas)
 
 Estos modelos se utilizan para representaciones genéricas de frases a diferencia de los anteriores checkpoints XLM.
 
@@ -92,8 +92,8 @@ Estos modelos se utilizan para representaciones genéricas de frases a diferenci
 
 Los siguientes modelos de BERT pueden utilizarse para tareas multilingües:
 
-- `bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas)
-- `bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas)
+- `google-bert/bert-base-multilingual-uncased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 102 idiomas)
+- `google-bert/bert-base-multilingual-cased` (modelado de lenguaje enmascarado + predicción de la siguiente oración, 104 idiomas)
 
 Estos modelos no requieren language embeddings durante la inferencia. Deben identificar la lengua a partir del
 contexto e inferir en consecuencia.
@@ -102,8 +102,8 @@ contexto e inferir en consecuencia.
 
 Los siguientes modelos de XLM-RoBERTa pueden utilizarse para tareas multilingües:
 
-- `xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas)
-- `xlm-roberta-large` (Modelado de lenguaje enmascarado, 100 idiomas)
+- `FacebookAI/xlm-roberta-base` (modelado de lenguaje enmascarado, 100 idiomas)
+- `FacebookAI/xlm-roberta-large` (modelado de lenguaje enmascarado, 100 idiomas)
 
 XLM-RoBERTa se entrenó con 2,5 TB de datos CommonCrawl recién creados y depurados en 100 idiomas. Proporciona fuertes ventajas sobre los modelos multilingües publicados anteriormente como mBERT o XLM en tareas posteriores como la clasificación, el etiquetado de secuencias y la respuesta a preguntas.
 
diff --git a/docs/source/es/perplexity.md b/docs/source/es/perplexity.md
index 3e96e9865586f7..f07dc663f5524e 100644
--- a/docs/source/es/perplexity.md
+++ b/docs/source/es/perplexity.md
@@ -57,7 +57,7 @@ Demostremos este proceso con GPT-2.
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 
 device = "cuda"
-model_id = "gpt2-large"
+model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 ```
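Continuing from the snippet above (a simplified sketch: a single short sequence, no sliding window over long texts), perplexity is the exponential of the average negative log-likelihood obtained by passing the inputs as labels:

```python
import torch

text = "Perplexity measures how well a model predicts a sequence."
encodings = tokenizer(text, return_tensors="pt").to(device)

with torch.no_grad():
    # labels=input_ids makes the model return the mean cross-entropy loss
    outputs = model(encodings.input_ids, labels=encodings.input_ids)

ppl = torch.exp(outputs.loss)
print(ppl.item())
```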
diff --git a/docs/source/es/pipeline_tutorial.md b/docs/source/es/pipeline_tutorial.md
index 0f77c3c3db8395..279f3593ba95c5 100644
--- a/docs/source/es/pipeline_tutorial.md
+++ b/docs/source/es/pipeline_tutorial.md
@@ -74,8 +74,8 @@ El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/m
 ```py
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 Crea un [`pipeline`] para tu tarea y especifica el modelo y el tokenizador que cargaste:
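A sketch of the step this sentence introduces (the prompt is arbitrary; `text-generation` matches the causal LM loaded above):

```python
from transformers import pipeline

# Reuse the model and tokenizer loaded in the snippet above
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
generator("Hugging Face is a", max_new_tokens=20)
```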
diff --git a/docs/source/es/preprocessing.md b/docs/source/es/preprocessing.md
index a0ac11ff05c637..8486d6a0687abc 100644
--- a/docs/source/es/preprocessing.md
+++ b/docs/source/es/preprocessing.md
@@ -45,7 +45,7 @@ Carga un tokenizador pre-entrenado con [`AutoTokenizer.from_pretrained`]:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 A continuación, pasa tu frase al tokenizador:
diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md
index 8b762fdddc28fc..ff1afa340c9a1d 100644
--- a/docs/source/es/run_scripts.md
+++ b/docs/source/es/run_scripts.md
@@ -87,11 +87,11 @@ pip install -r requirements.txt
 
 
 
-El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
+El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos con [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) en una arquitectura que soporta la tarea de resumen. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/google-t5/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -105,11 +105,11 @@ python examples/pytorch/summarization/run_summarization.py \
 ```
 
 
-El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
+El script de ejemplo descarga y preprocesa un conjunto de datos de la biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Luego, el script ajusta un conjunto de datos utilizando Keras en una arquitectura que soporta la tarea de resumir. El siguiente ejemplo muestra cómo ajustar un [T5-small](https://huggingface.co/google-t5/t5-small) en el conjunto de datos [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). El modelo T5 requiere un argumento adicional `source_prefix` debido a cómo fue entrenado. Este aviso le permite a T5 saber que se trata de una tarea de resumir.
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -133,7 +133,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -157,7 +157,7 @@ Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicament
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -176,7 +176,7 @@ Las Unidades de Procesamiento de Tensor (TPUs) están diseñadas específicament
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -214,7 +214,7 @@ Todo listo para iniciar el entrenamiento:
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -233,7 +233,7 @@ Un script para resumir que utiliza un conjunto de datos personalizado se vera as
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -258,7 +258,7 @@ A veces, es una buena idea ejecutar tu secuencia de comandos en una cantidad men
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -288,7 +288,7 @@ El primer método utiliza el argumento `output_dir previous_output_dir` para rea
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -305,7 +305,7 @@ El segundo método utiliza el argumento `resume_from_checkpoint path_to_specific
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -335,7 +335,7 @@ El siguiente ejemplo muestra cómo cargar un modelo con un nombre de repositorio
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/es/serialization.md b/docs/source/es/serialization.md
index 9c24ba72f3d42f..3ad7d089853053 100644
--- a/docs/source/es/serialization.md
+++ b/docs/source/es/serialization.md
@@ -137,7 +137,7 @@ optional arguments:
 Exportar un checkpoint usando una configuración a la medida se puede hacer de la siguiente manera:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 que debería mostrar los siguientes registros:
@@ -152,7 +152,7 @@ All good, model saved at: onnx/model.onnx
 ```
 
 Esto exporta un grafo ONNX del checkpoint definido por el argumento `--model`. 
-En este ejemplo, es un modelo `distilbert-base-uncased`, pero puede ser cualquier
+En este ejemplo, es un modelo `distilbert/distilbert-base-uncased`, pero puede ser cualquier
 checkpoint en Hugging Face Hub o que esté almacenado localmente.
 
 El archivo `model.onnx` resultante se puede ejecutar en uno de los 
@@ -164,7 +164,7 @@ modelo con [ONNX Runtime](https://onnxruntime.ai/) de la siguiente manera:
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
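The snippet stops just before running the session; a plausible continuation (assuming `last_hidden_state` as the output name, which is what a base DistilBERT export produces, as noted later in this guide) would be:

```python
# ONNX Runtime returns a list of NumPy arrays, one per requested output name
outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
print(outputs[0].shape)  # (batch_size, sequence_length, hidden_size)
```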
@@ -201,8 +201,8 @@ y guardar un checkpoint de la siguiente manera:
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 >>> # Load tokenizer and PyTorch weights form the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-pt-checkpoint")
 >>> pt_model.save_pretrained("local-pt-checkpoint")
@@ -220,8 +220,8 @@ python -m transformers.onnx --model=local-pt-checkpoint onnx/
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
 
 >>> # Load tokenizer and TensorFlow weights from the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-tf-checkpoint")
 >>> tf_model.save_pretrained("local-tf-checkpoint")
@@ -267,7 +267,7 @@ Le puedes pasar una de estas características al argumento `--feature` en el paq
 Por ejemplo, para exportar un modelo de clasificación de texto, podemos elegir un modelo ya ajustado del Hub y ejecutar:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased-finetuned-sst-2-english \
                             --feature=sequence-classification onnx/
 ```
 
@@ -283,7 +283,7 @@ All good, model saved at: onnx/model.onnx
 ```
 
 Ten en cuenta que, en este caso, los nombres de salida del modelo ajustado son `logits` en lugar de `last_hidden_state` 
-que vimos anteriormente con el checkpoint `distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado 
+que vimos anteriormente con el checkpoint `distilbert/distilbert-base-uncased`. Esto es de esperarse ya que el modelo ajustado 
 tiene un cabezal de clasificación secuencial.
 
 
@@ -362,7 +362,7 @@ instancia proporcionando la configuración del modelo base de la siguiente maner
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config = DistilBertOnnxConfig(config)
 ```
 
@@ -393,7 +393,7 @@ exportar DistilBERT con un cabezal de clasificación de secuencias, podríamos u
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification")
 >>> print(onnx_config_for_seq_clf.outputs)
 OrderedDict([('logits', {0: 'batch'})])
@@ -420,7 +420,7 @@ y la ruta para guardar el archivo exportado:
 >>> from transformers import AutoTokenizer, AutoModel
 
 >>> onnx_path = Path("model.onnx")
->>> model_ckpt = "distilbert-base-uncased"
+>>> model_ckpt = "distilbert/distilbert-base-uncased"
 >>> base_model = AutoModel.from_pretrained(model_ckpt)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
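For completeness, a sketch of how these pieces feed the legacy `transformers.onnx.export` function, reusing `base_model`, `tokenizer` and `onnx_path` from the snippet above (the `DistilBertOnnxConfig` import and the exact call follow the usage shown earlier in this guide):

```python
from transformers.models.distilbert import DistilBertOnnxConfig
from transformers.onnx import export

# Build the ONNX configuration from the base model's configuration
onnx_config = DistilBertOnnxConfig(base_model.config)

# export() writes the graph to onnx_path and returns the matched input/output names
onnx_inputs, onnx_outputs = export(
    tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path
)
```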
 
@@ -550,7 +550,7 @@ con la clase `BertConfig` y luego se guarda en el disco con el nombre de archivo
 from transformers import BertModel, BertTokenizer, BertConfig
 import torch
 
-enc = BertTokenizer.from_pretrained("bert-base-uncased")
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 # Tokenizing input text
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -585,7 +585,7 @@ model = BertModel(config)
 model.eval()
 
 # If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
-model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
 
 # Creating the trace
 traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
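A natural follow-up to the trace above, sketched here with an arbitrary file name, is saving the traced module and reloading it for inference:

```python
# Persist the traced graph to disk
torch.jit.save(traced_model, "traced_bert.pt")

# Reload it later without needing the original Python model class
loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()

with torch.no_grad():
    outputs = loaded_model(tokens_tensor, segments_tensors)
```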
diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md
index b3f22f0846335d..010d1bccae7bbf 100644
--- a/docs/source/es/tasks/language_modeling.md
+++ b/docs/source/es/tasks/language_modeling.md
@@ -26,11 +26,11 @@ El modelado de lenguaje causal predice el siguiente token en una secuencia de to
 
 El modelado de lenguaje por enmascaramiento predice un token enmascarado en una secuencia, y el modelo puede considerar los tokens bidireccionalmente.
 
-Esta guía te mostrará cómo realizar fine-tuning [DistilGPT2](https://huggingface.co/distilgpt2) para modelos de lenguaje causales y [DistilRoBERTa](https://huggingface.co/distilroberta-base) para modelos de lenguaje por enmascaramiento en el [r/askscience](https://www.reddit.com/r/askscience/) subdataset [ELI5](https://huggingface.co/datasets/eli5). 
+Esta guía te mostrará cómo realizar fine-tuning de [DistilGPT2](https://huggingface.co/distilbert/distilgpt2) para modelos de lenguaje causales y de [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) para modelos de lenguaje por enmascaramiento en el subconjunto [r/askscience](https://www.reddit.com/r/askscience/) del dataset [ELI5](https://huggingface.co/datasets/eli5).
 
 
 
-Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía!
+¡Puedes realizar fine-tuning a otras arquitecturas para modelos de lenguaje como [GPT-Neo](https://huggingface.co/EleutherAI/gpt-neo-125M), [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) y [BERT](https://huggingface.co/google-bert/bert-base-uncased) siguiendo los mismos pasos presentados en esta guía!
 
 Mira la [página de tarea](https://huggingface.co/tasks/text-generation) para generación de texto y la [página de tarea](https://huggingface.co/tasks/fill-mask) para modelos de lenguajes por enmascaramiento para obtener más información sobre los modelos, datasets, y métricas asociadas.
 
@@ -81,7 +81,7 @@ Para modelados de lenguaje causales carga el tokenizador DistilGPT2 para procesa
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 ```
 
 
@@ -91,7 +91,7 @@ Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERT
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
 ```
 
 Extrae el subcampo `text` desde su estructura anidada con el método [`flatten`](https://huggingface.co/docs/datasets/process#flatten):
@@ -203,7 +203,7 @@ Para modelados de lenguajes por enmascaramiento usa el mismo [`DataCollatorForLa
 
 ## Modelado de lenguaje causal
 
-El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilgpt2) para generar nuevo texto.
+El modelado de lenguaje causal es frecuentemente utilizado para generación de texto. Esta sección te muestra cómo realizar fine-tuning a [DistilGPT2](https://huggingface.co/distilbert/distilgpt2) para generar nuevo texto.
 
 ### Entrenamiento
 
@@ -214,7 +214,7 @@ Carga DistilGPT2 con [`AutoModelForCausalLM`]:
 ```py
 >>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
 
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 
@@ -288,7 +288,7 @@ Carga DistilGPT2 con [`TFAutoModelForCausalLM`]:
 ```py
 >>> from transformers import TFAutoModelForCausalLM
 
->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
@@ -309,7 +309,7 @@ Llama a [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) par
 
 ## Modelado de lenguaje por enmascaramiento
 
-El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña como realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilroberta-base) para predecir una palabra enmascarada.
+El modelado de lenguaje por enmascaramiento es también conocido como una tarea de rellenar la máscara, pues predice un token enmascarado dada una secuencia. Los modelos de lenguaje por enmascaramiento requieren una buena comprensión del contexto de una secuencia entera, en lugar de solo el contexto a la izquierda. Esta sección te enseña cómo realizar el fine-tuning de [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) para predecir una palabra enmascarada.
 
 ### Entrenamiento
 
@@ -320,7 +320,7 @@ Carga DistilRoBERTa con [`AutoModelForMaskedlM`]:
 ```py
 >>> from transformers import AutoModelForMaskedLM
 
->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 
@@ -395,7 +395,7 @@ Carga DistilRoBERTa con [`TFAutoModelForMaskedLM`]:
 ```py
 >>> from transformers import TFAutoModelForMaskedLM
 
->>> model = TFAutoModelForCausalLM.from_pretrained("distilroberta-base")
+>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/es/tasks/multiple_choice.md b/docs/source/es/tasks/multiple_choice.md
index 8391dcbdd5ebbd..ca2e3d15f63546 100644
--- a/docs/source/es/tasks/multiple_choice.md
+++ b/docs/source/es/tasks/multiple_choice.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 La tarea de selección múltiple es parecida a la de responder preguntas, con la excepción de que se dan varias opciones de respuesta junto con el contexto. El modelo se entrena para escoger la respuesta correcta
 entre varias opciones a partir del contexto dado.
 
-Esta guía te mostrará como hacerle fine-tuning a [BERT](https://huggingface.co/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma
+Esta guía te mostrará cómo hacerle fine-tuning a [BERT](https://huggingface.co/google-bert/bert-base-uncased) en la configuración `regular` del dataset [SWAG](https://huggingface.co/datasets/swag), de forma
 que seleccione la mejor respuesta a partir de varias opciones y algún contexto.
 
 ## Cargar el dataset SWAG
@@ -58,7 +58,7 @@ Carga el tokenizer de BERT para procesar el comienzo de cada oración y los cuat
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 La función de preprocesamiento debe hacer lo siguiente:
@@ -194,7 +194,7 @@ Carga el modelo BERT con [`AutoModelForMultipleChoice`]:
 ```py
 >>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
 
->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 
@@ -274,7 +274,7 @@ Carga el modelo BERT con [`TFAutoModelForMultipleChoice`]:
 ```py
 >>> from transformers import TFAutoModelForMultipleChoice
 
->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/es/tasks/question_answering.md b/docs/source/es/tasks/question_answering.md
index 2aa896142e2ead..5cd59f6b064f71 100644
--- a/docs/source/es/tasks/question_answering.md
+++ b/docs/source/es/tasks/question_answering.md
@@ -23,7 +23,7 @@ La respuesta a preguntas devuelve una respuesta a partir de una pregunta dada. E
 - Extractiva: extraer la respuesta a partir del contexto dado.
 - Abstractiva: generar una respuesta que responda correctamente la pregunta a partir del contexto dado.
 
-Esta guía te mostrará como hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva.
+Esta guía te mostrará cómo hacer fine-tuning de [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) en el dataset [SQuAD](https://huggingface.co/datasets/squad) para responder preguntas de forma extractiva.
 
 
 
@@ -64,7 +64,7 @@ Carga el tokenizer de DistilBERT para procesar los campos `question` (pregunta)
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Hay algunos pasos de preprocesamiento específicos para la tarea de respuesta a preguntas que debes tener en cuenta:
@@ -164,7 +164,7 @@ Carga el modelo DistilBERT con [`AutoModelForQuestionAnswering`]:
 ```py
 >>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
 
->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -247,7 +247,7 @@ Carga el modelo DistilBERT con [`TFAutoModelForQuestionAnswering`]:
 ```py
 >>> from transformers import TFAutoModelForQuestionAnswering
 
->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
+>>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Configura el modelo para entrenarlo con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/es/tasks/summarization.md b/docs/source/es/tasks/summarization.md
index b545e4216e5de1..19ceb90b22cbb2 100644
--- a/docs/source/es/tasks/summarization.md
+++ b/docs/source/es/tasks/summarization.md
@@ -23,7 +23,7 @@ La generación de resúmenes (summarization, en inglés) crea una versión más
 - Extractiva: Extrae la información más relevante de un documento.
 - Abstractiva: Genera un texto nuevo que captura la información más importante.
 
-Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva.
+Esta guía te mostrará cómo puedes hacer fine-tuning del modelo [T5](https://huggingface.co/google-t5/t5-small) sobre el subset de proyectos de ley del estado de California, dentro del dataset [BillSum](https://huggingface.co/datasets/billsum) para hacer generación de resúmenes abstractiva.
 
 
 
@@ -65,7 +65,7 @@ Carga el tokenizador T5 para procesar `text` y `summary`:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
 ```
 
 La función de preprocesamiento necesita:
@@ -122,7 +122,7 @@ Carga T5 con [`AutoModelForSeq2SeqLM`]:
 ```py
 >>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
 
->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
 ```
 
 
@@ -200,7 +200,7 @@ Carga T5 con [`TFAutoModelForSeq2SeqLM`]:
 ```py
 >>> from transformers import TFAutoModelForSeq2SeqLM
 
->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
 ```
 
 Configura el modelo para entrenamiento con [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/es/training.md b/docs/source/es/training.md
index 4f224b0797a3b9..fef44ed3f9ff72 100644
--- a/docs/source/es/training.md
+++ b/docs/source/es/training.md
@@ -48,7 +48,7 @@ Como ya sabes, necesitas un tokenizador para procesar el texto e incluir una est
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -78,7 +78,7 @@ Comienza cargando tu modelo y especifica el número de labels previstas. A parti
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -200,7 +200,7 @@ Carguemos un modelo TensorFlow con el número esperado de labels:
 >>> import tensorflow as tf
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 A continuación, compila y aplica fine-tuning a tu modelo con [`fit`](https://keras.io/api/models/model_training_apis/) como lo harías con cualquier otro modelo de Keras:
@@ -275,7 +275,7 @@ Carga tu modelo con el número de labels previstas:
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Optimiza y programa el learning rate
diff --git a/docs/source/fr/autoclass_tutorial.md b/docs/source/fr/autoclass_tutorial.md
index 392e2a6807e55d..f569966d0c6043 100644
--- a/docs/source/fr/autoclass_tutorial.md
+++ b/docs/source/fr/autoclass_tutorial.md
@@ -20,7 +20,7 @@ Avec autant d'architectures Transformer différentes, il peut être difficile d'
 
 
 
-Rappel, l'architecture fait référence au squelette du modèle et l'ensemble de poids contient les poids pour une architecture donnée. Par exemple, [BERT](https://huggingface.co/bert-base-uncased) est une architecture, tandis que `bert-base-uncased` est un ensemble de poids. Le terme modèle est général et peut signifier soit architecture soit ensemble de poids.
+Rappel, l'architecture fait référence au squelette du modèle et l'ensemble de poids contient les poids pour une architecture donnée. Par exemple, [BERT](https://huggingface.co/google-bert/bert-base-uncased) est une architecture, tandis que `google-bert/bert-base-uncased` est un ensemble de poids. Le terme modèle est général et peut signifier soit architecture soit ensemble de poids.
 
 
 
@@ -41,7 +41,7 @@ Chargez un tokenizer avec [`AutoTokenizer.from_pretrained`]:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 Puis, transformez votre texte initial comme montré ci-dessous:
@@ -99,7 +99,7 @@ Enfin, les classes `AutoModelFor` vous permettent de charger un modèle pré-ent
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Réutilisez facilement le même ensemble de poids pour charger une architecture pour une tâche différente :
@@ -107,7 +107,7 @@ Réutilisez facilement le même ensemble de poids pour charger une architecture
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -126,7 +126,7 @@ Enfin, les classes `TFAutoModelFor` vous permettent de charger un modèle pré-e
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Réutilisez facilement le même ensemble de poids pour charger une architecture pour une tâche différente :
@@ -134,7 +134,7 @@ Réutilisez facilement le même ensemble de poids pour charger une architecture
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 En général, nous recommandons d'utiliser les classes `AutoTokenizer` et `TFAutoModelFor` pour charger des instances pré-entraînées de tokenizers et modèles respectivement. Cela vous permettra de charger la bonne architecture à chaque fois. Dans le prochain [tutoriel](preprocessing), vous apprenez à utiliser un tokenizer, processeur d'image, extracteur de caractéristiques et processeur pour pré-traiter un jeu de données pour le fine-tuning.
diff --git a/docs/source/fr/installation.md b/docs/source/fr/installation.md
index 793a1eec82ec27..cd68911bc3564d 100644
--- a/docs/source/fr/installation.md
+++ b/docs/source/fr/installation.md
@@ -181,7 +181,7 @@ Ajoutez [🤗 Datasets](https://huggingface.co/docs/datasets/) à votre processu
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 Le script devrait maintenant s'exécuter sans rester en attente ou attendre une expiration, car il n'essaiera pas de télécharger des modèles sur le Hub.
diff --git a/docs/source/fr/quicktour.md b/docs/source/fr/quicktour.md
index 666a931f825f1c..f76764f103387a 100644
--- a/docs/source/fr/quicktour.md
+++ b/docs/source/fr/quicktour.md
@@ -73,7 +73,7 @@ Commencez par créer une instance de [`pipeline`] et spécifiez la tâche pour l
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-Le [`pipeline`] télécharge et stocke en cache un [modèle pré-entraîné](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) et un tokenizer par défaut pour l'analyse des sentiments. Vous pouvez maintenant utiliser le `classifier` sur le texte de votre choix :
+Le [`pipeline`] télécharge et stocke en cache un [modèle pré-entraîné](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) et un tokenizer par défaut pour l'analyse des sentiments. Vous pouvez maintenant utiliser le `classifier` sur le texte de votre choix :
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
@@ -378,7 +378,7 @@ Commencez par importer [`AutoConfig`], puis chargez le modèle pré-entraîné q
 ```py
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
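To show what the customized configuration is for, here is a brief sketch using `AutoModel.from_config`, which builds a randomly initialized model from a configuration instead of loading pretrained weights:

```python
from transformers import AutoModel

# The architecture follows my_config; the weights are random, not pretrained
custom_model = AutoModel.from_config(my_config)
print(custom_model.config.n_heads)  # 12
```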
 
 
@@ -415,7 +415,7 @@ En fonction de votre tâche, vous passerez généralement les paramètres suivan
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. [`TrainingArguments`] contient les hyperparamètres du modèle que vous pouvez changer comme le taux d'apprentissage, la taille de l'échantillon, et le nombre d'époques pour s'entraîner. Les valeurs par défaut sont utilisées si vous ne spécifiez pas d'hyperparamètres d'apprentissage :
@@ -437,7 +437,7 @@ En fonction de votre tâche, vous passerez généralement les paramètres suivan
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 4. Chargez un jeu de données :
@@ -509,7 +509,7 @@ Tous les modèles sont des modèles standard [`tf.keras.Model`](https://www.tens
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. Une classe de prétraitement comme un tokenizer, un processeur d'images ou un extracteur de caractéristiques :
@@ -517,7 +517,7 @@ Tous les modèles sont des modèles standard [`tf.keras.Model`](https://www.tens
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. Créez une fonction qui transforme le texte du jeu de données en token :
diff --git a/docs/source/hi/pipeline_tutorial.md b/docs/source/hi/pipeline_tutorial.md
index eb18027095bfa2..5f3cd680480d63 100644
--- a/docs/source/hi/pipeline_tutorial.md
+++ b/docs/source/hi/pipeline_tutorial.md
@@ -185,7 +185,7 @@ def data():
         yield f"My example {i}"
 
 
-pipe = pipeline(model="gpt2", device=0)
+pipe = pipeline(model="openai-community/gpt2", device=0)
 generated_characters = 0
 for out in pipe(data()):
     generated_characters += len(out[0]["generated_text"])
diff --git a/docs/source/it/autoclass_tutorial.md b/docs/source/it/autoclass_tutorial.md
index 51621d098302bc..edb96528e705ea 100644
--- a/docs/source/it/autoclass_tutorial.md
+++ b/docs/source/it/autoclass_tutorial.md
@@ -20,7 +20,7 @@ Con così tante architetture Transformer differenti, può essere sfidante crearn
 
 
 
-Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/bert-base-uncased) è un'architettura, mentre `bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint.
+Ricorda, con architettura ci si riferisce allo scheletro del modello e con checkpoint ai pesi di una determinata architettura. Per esempio, [BERT](https://huggingface.co/google-bert/bert-base-uncased) è un'architettura, mentre `google-bert/bert-base-uncased` è un checkpoint. Modello è un termine generale che può significare sia architettura che checkpoint.
 
 
 
@@ -40,7 +40,7 @@ Carica un tokenizer con [`AutoTokenizer.from_pretrained`]:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
 ```
 
 Poi tokenizza il tuo input come mostrato in seguito:
@@ -87,7 +87,7 @@ Infine, le classi `AutoModelFor` ti permettono di caricare un modello pre-allena
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente:
@@ -95,7 +95,7 @@ Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `AutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning.
@@ -107,7 +107,7 @@ Infine, le classi `TFAutoModelFor` ti permettono di caricare un modello pre-alle
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un task differente:
@@ -115,7 +115,7 @@ Semplicemente utilizza lo stesso checkpoint per caricare un'architettura per un
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Generalmente, raccomandiamo di utilizzare la classe `AutoTokenizer` e la classe `TFAutoModelFor` per caricare istanze pre-allenate dei modelli. Questo ti assicurerà di aver caricato la corretta architettura ogni volta. Nel prossimo [tutorial](preprocessing), imparerai come utilizzare il tokenizer, il feature extractor e il processore per elaborare un dataset per il fine-tuning.
diff --git a/docs/source/it/big_models.md b/docs/source/it/big_models.md
index cd0fd9017d9d3d..6a5c346dec890f 100644
--- a/docs/source/it/big_models.md
+++ b/docs/source/it/big_models.md
@@ -42,7 +42,7 @@ Puoi controllare la dimensione massima dopo la frammentazione con il parametro `
 ```py
 from transformers import AutoModel
 
-model = AutoModel.from_pretrained("bert-base-cased")
+model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 ```
 
 Se tu salvi usando [`~PreTrainedModel.save_pretrained`], avrai una nuova cartella con due file: il config del modello e i suoi pesi:
diff --git a/docs/source/it/community.md b/docs/source/it/community.md
index f9f177189e3b76..92f6698a9a89bb 100644
--- a/docs/source/it/community.md
+++ b/docs/source/it/community.md
@@ -42,8 +42,8 @@ Questa pagina raggruppa le risorse sviluppate dalla comunità riguardo 🤗 Tran
 |[Fine-tuning di Roberta per l'analisi di sentimenti](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | Come effettuare il fine-tuning di un modello Roberta per l'analisi di sentimenti. | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
 |[Valutazione di modelli che generano domande](https://github.com/flexudy-pipe/qugeev) | Quanto sono accurate le risposte alle domande generate dal tuo modello transformer seq2seq? | [Pascal Zoleko](https://github.com/zolekode) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
 |[Classificazione di testo con DistilBERT e Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | Come effettuare il fine-tuning di DistilBERT per la classificazione di testo in TensorFlow. | [Peter Bayerle](https://github.com/peterbayerle) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
-|[Utilizzo di BERT per riassumere testi con un modello Encoder-Decoder su CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* attraverso l'utilizzo di un checkpoint *bert-base-uncased* per riassumere testi su CNN/Dailymail. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
-|[Utilizzo di RoBERTa per riassumere testi con un modello Encoder-Decoder su BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* (condiviso) attraverso l'utilizzo di un checkpoint *roberta-base* per riassumere testi su BBC/XSum. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
+|[Utilizzo di BERT per riassumere testi con un modello Encoder-Decoder su CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* attraverso l'utilizzo di un checkpoint *google-bert/bert-base-uncased* per riassumere testi su CNN/Dailymail. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Aprilo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
+|[Utilizzo di RoBERTa per riassumere testi con un modello Encoder-Decoder su BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | Come avviare "a caldo" un *EncoderDecoderModel* (condiviso) attraverso l'utilizzo di un checkpoint *FacebookAI/roberta-base* per riassumere testi su BBC/XSum. | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
 |[Fine-tuning di TAPAS su Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | Come effettuare il fine-tuning di un modello *TapasForQuestionAnswering* attraverso l'utilizzo di un checkpoint *tapas-base* sul dataset Sequential Question Answering (SQA). | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
 |[Valutazione di TAPAS su Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | Come valutare un modello *TapasForSequenceClassification* - fine-tuned con un checkpoint *tapas-base-finetuned-tabfact* - usando una combinazione delle librerie 🤗 datasets e 🤗 transformers. | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
 |[Fine-tuning di mBART per la traduzione](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Come effettuare il fine-tuning di mBART usando Seq2SeqTrainer per la traduzione da hindi a inglese.| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
diff --git a/docs/source/it/converting_tensorflow_models.md b/docs/source/it/converting_tensorflow_models.md
index f6326daa735fbe..b1de0113388254 100644
--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@@ -96,8 +96,8 @@ transformers-cli convert --model_type gpt \
 Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allenato (vedi [qui](https://github.com/openai/gpt-2)):
 
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
 transformers-cli convert --model_type gpt2 \
   --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
   --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
   [--config OPENAI_GPT2_CONFIG] \
diff --git a/docs/source/it/create_a_model.md b/docs/source/it/create_a_model.md
index 75055beb92718c..caacf4fadc5db6 100644
--- a/docs/source/it/create_a_model.md
+++ b/docs/source/it/create_a_model.md
@@ -86,7 +86,7 @@ DistilBertConfig {
 Nella funzione [`~PretrainedConfig.from_pretrained`] possono essere modificati gli attributi del modello pre-allenato:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 Quando la configurazione del modello ti soddisfa, la puoi salvare con [`~PretrainedConfig.save_pretrained`]. Il file della tua configurazione è memorizzato come file JSON nella save directory specificata:
@@ -127,13 +127,13 @@ Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in
 Crea un modello pre-allenato con [`~PreTrainedModel.from_pretrained`]:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -152,13 +152,13 @@ Questo crea modelli con valori casuali invece di pesi pre-allenati. Non sarai in
 Crea un modello pre-allenato con [`~TFPreTrainedModel.from_pretrained`]:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Quando carichi pesi pre-allenati, la configurazione del modello predefinito è automaticamente caricata se il modello è fornito da 🤗 Transformers. Tuttavia, puoi ancora sostituire gli attributi - alcuni o tutti - di configurazione del modello predefinito con i tuoi se lo desideri:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -175,7 +175,7 @@ Per esempio, [`DistilBertForSequenceClassification`] è un modello DistilBERT ba
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head differente. Per un'attività di risposta alle domande, utilizzerai il model head [`DistilBertForQuestionAnswering`]. La head per il question answering è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese).
@@ -183,7 +183,7 @@ Riutilizza facilmente questo checkpoint per un'altra attività passando ad un mo
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -192,7 +192,7 @@ Per esempio, [`TFDistilBertForSequenceClassification`] è un modello DistilBERT
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Riutilizza facilmente questo checkpoint per un'altra attività passando ad un model head diverso. Per un'attività di risposta alle domande, utilizzerai il model head [`TFDistilBertForQuestionAnswering`]. La head di question answering è simile alla head di classificazione di sequenze, tranne per il fatto che è uno strato lineare sopra l'output degli stati nascosti (hidden states in inglese).
@@ -200,7 +200,7 @@ Riutilizza facilmente questo checkpoint per un altra attività passando ad un mo
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -233,7 +233,7 @@ Se hai addestrato il tuo tokenizer, puoi crearne uno dal tuo file *vocabolario*:
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]:
@@ -241,7 +241,7 @@ Crea un tokenizer veloce con la classe [`DistilBertTokenizerFast`]:
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
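
An illustrative comparison of the two tokenizers, assuming both are built from the same renamed checkpoint; the input text is arbitrary:

```py
from transformers import DistilBertTokenizer, DistilBertTokenizerFast

slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")

# Both implementations should produce the same ids for plain text
text = "Ciao mondo!"
print(slow_tokenizer(text)["input_ids"] == fast_tokenizer(text)["input_ids"])  # True
```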
 
 
diff --git a/docs/source/it/installation.md b/docs/source/it/installation.md
index ee63ad94d12bdd..2f45f4182d24c9 100644
--- a/docs/source/it/installation.md
+++ b/docs/source/it/installation.md
@@ -163,14 +163,14 @@ Aggiungi [🤗 Datasets](https://huggingface.co/docs/datasets/) al tuo flusso di
 Ad esempio, in genere si esegue un programma su una rete normale, protetta da firewall per le istanze esterne, con il seguente comando:
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 Esegui lo stesso programma in un'istanza offline con:
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 Lo script viene ora eseguito senza bloccarsi o attendere il timeout, perché sa di dover cercare solo file locali.
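
The same offline behaviour can be approximated in Python with `local_files_only=True`, assuming the checkpoint was already downloaded to the local cache; a minimal sketch:

```py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Only look at files cached on disk; never contact the Hub
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", local_files_only=True)
```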
diff --git a/docs/source/it/migration.md b/docs/source/it/migration.md
index bc22e5930b64bb..9a5f4d005505e8 100644
--- a/docs/source/it/migration.md
+++ b/docs/source/it/migration.md
@@ -42,13 +42,13 @@ Nella versione `v3.x`:
 ```py
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 per ottenere lo stesso nella versione `v4.x`:
 ```py
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased", use_fast=False)
 ```
 
 #### 2. SentencePiece è stato rimosso dalle dipendenze richieste
@@ -112,17 +112,17 @@ Per ottenere lo stesso comportamento della versione `v3.x`, specifica l'argoment
 
 Nella versione `v3.x`:
 ```bash
-model = BertModel.from_pretrained("bert-base-cased")
+model = BertModel.from_pretrained("google-bert/bert-base-cased")
 outputs = model(**inputs)
 ```
 per ottenere lo stesso nella versione `v4.x`:
 ```bash
-model = BertModel.from_pretrained("bert-base-cased")
+model = BertModel.from_pretrained("google-bert/bert-base-cased")
 outputs = model(**inputs, return_dict=False)
 ```
 o
 ```bash
-model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+model = BertModel.from_pretrained("google-bert/bert-base-cased", return_dict=False)
 outputs = model(**inputs)
 ```
 
@@ -207,7 +207,7 @@ Ecco un esempio di conversione da `pytorch-pretrained-bert`
 
 ```python
 # Carichiamo il nostro modello
-model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 
 # Se usavi questa riga in pytorch-pretrained-bert :
 loss = model(input_ids, labels=labels)
@@ -220,7 +220,7 @@ loss = outputs[0]
 loss, logits = outputs[:2]
 
 # Ed anche agli attention weight se configuri il modello per restituirli (e anche altri output, vedi le docstring e la documentazione)
-model = BertForSequenceClassification.from_pretrained(" bert-base-uncased", output_attentions=True)
+model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
 ```
@@ -239,8 +239,8 @@ Ecco un esempio:
 
 ```python
 ### Carichiamo un modello e un tokenizer
-model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
-tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 ### Facciamo fare alcune cose al nostro modello e tokenizer
 # Es: aggiungiamo nuovi token al vocabolario e agli embedding del nostro modello
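 # Sketch (the token strings are illustrative): add new tokens, then resize the
 # embedding matrix so the model matches the enlarged vocabulary
 tokenizer.add_tokens(["new_tok1", "new_tok2"])
 model.resize_token_embeddings(len(tokenizer))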
diff --git a/docs/source/it/model_sharing.md b/docs/source/it/model_sharing.md
index 351cf57bf96bb5..81257717ed9a70 100644
--- a/docs/source/it/model_sharing.md
+++ b/docs/source/it/model_sharing.md
@@ -235,4 +235,4 @@ Per assicurarti che chiunque possa comprendere le abilità, limitazioni, i poten
 * Creando manualmente e caricando un file `README.md`.
 * Premendo sul pulsante **Edit model card** nel repository del tuo modello.
 
-Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards).
+Dai un'occhiata alla [scheda del modello](https://huggingface.co/distilbert/distilbert-base-uncased) di DistilBert per avere un buon esempio del tipo di informazioni che una scheda di un modello deve includere. Per maggiori dettagli legati ad altre opzioni che puoi controllare nel file `README.md`, come l'impatto ambientale o widget di esempio, fai riferimento alla documentazione [qui](https://huggingface.co/docs/hub/models-cards).
diff --git a/docs/source/it/multilingual.md b/docs/source/it/multilingual.md
index 889c620ab29d9d..e9e85beec1d966 100644
--- a/docs/source/it/multilingual.md
+++ b/docs/source/it/multilingual.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza.
+Ci sono diversi modelli multilingue in 🤗 Transformers, e il loro utilizzo per l'inferenza differisce da quello dei modelli monolingua. Non *tutti* gli utilizzi dei modelli multilingue sono però diversi. Alcuni modelli, come [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased), possono essere usati come un modello monolingua. Questa guida ti mostrerà come utilizzare modelli multilingue che utilizzano un modo diverso per fare l'inferenza.
 
 ## XLM
 
@@ -28,24 +28,24 @@ XLM ha dieci diversi checkpoint, di cui solo uno è monolingua. I nove checkpoin
 
 I seguenti modelli XLM utilizzano gli embeddings linguistici per specificare la lingua utilizzata per l'inferenza:
 
-- `xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco)
-- `xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese)
-- `xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno)
-- `xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI)
-- `xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI)
-- `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese)
-- `xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco)
+- `FacebookAI/xlm-mlm-ende-1024` (Modellazione mascherata del linguaggio (Masked language modeling, in inglese), Inglese-Tedesco)
+- `FacebookAI/xlm-mlm-enfr-1024` (Modellazione mascherata del linguaggio, Inglese-Francese)
+- `FacebookAI/xlm-mlm-enro-1024` (Modellazione mascherata del linguaggio, Inglese-Rumeno)
+- `FacebookAI/xlm-mlm-xnli15-1024` (Modellazione mascherata del linguaggio, lingue XNLI)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (Modellazione mascherata del linguaggio + traduzione, lingue XNLI)
+- `FacebookAI/xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese)
+- `FacebookAI/xlm-clm-ende-1024` (Modellazione causale del linguaggio, Inglese-Tedesco)
 
 Gli embeddings linguistici sono rappresentati come un tensore delle stesse dimensioni dell' `input_ids` passato al modello. I valori in questi tensori dipendono dal linguaggio usato e sono identificati dagli attributi `lang2id` e `id2lang` del tokenizer.
 
-In questo esempio, carica il checkpoint `xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese):
+In questo esempio, carica il checkpoint `FacebookAI/xlm-clm-enfr-1024` (Modellazione causale del linguaggio, Inglese-Francese):
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 L'attributo `lang2id` del tokenizer mostra il linguaggio del modello e il suo ids:
@@ -83,8 +83,8 @@ Lo script [run_generation.py](https://github.com/huggingface/transformers/tree/m
 
 I seguenti modelli XLM non richiedono l'utilizzo dei language embeddings per fare inferenza:
 
-- `xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue)
-- `xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue)
+- `FacebookAI/xlm-mlm-17-1280` (Modellazione mascherata del linguaggio, 17 lingue)
+- `FacebookAI/xlm-mlm-100-1280` (Modellazione mascherata del linguaggio, 100 lingue)
 
 Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differenza dei precedenti checkpoint XLM.
 
@@ -92,8 +92,8 @@ Questi modelli sono utilizzati per rappresentazioni generiche di frasi, a differ
 
 Il seguente modello BERT può essere usato per compiti multilingue:
 
-- `bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue)
-- `bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue)
+- `google-bert/bert-base-multilingual-uncased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 102 lingue)
+- `google-bert/bert-base-multilingual-cased` (Modellazione mascherata del linguaggio + Previsione della prossima frase, 104 lingue)
 
 Questi modelli non richiedono language embeddings per fare inferenza. Riescono ad identificare il linguaggio dal contesto e inferire di conseguenza.
 
@@ -101,8 +101,8 @@ Questi modelli non richiedono language embeddings per fare inferenza. Riescono a
 
 Il seguente modello XLM-RoBERTa può essere usato per compiti multilingue:
 
-- `xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue)
-- `xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue)
+- `FacebookAI/xlm-roberta-base` (Modellazione mascherata del linguaggio, 100 lingue)
+- `FacebookAI/xlm-roberta-large` (Modellazione mascherata del linguaggio, 100 lingue)
 
 XLM-RoBERTa è stato addestrato su 2.5TB di dati CommonCrawl appena creati e puliti in 100 lingue. Offre notevoli vantaggi rispetto ai modelli multilingue rilasciati in precedenza, come mBERT o XLM, in compiti come la classificazione, l'etichettatura delle sequenze e la risposta alle domande.
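
A brief fill-mask sketch with one of the renamed XLM-RoBERTa checkpoints; the masked sentence is illustrative only:

```py
from transformers import pipeline

unmasker = pipeline("fill-mask", model="FacebookAI/xlm-roberta-base")
# XLM-RoBERTa uses <mask> as its mask token
print(unmasker("Bonjour, je suis un modèle <mask>.")[0]["token_str"])
```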
 
diff --git a/docs/source/it/perf_hardware.md b/docs/source/it/perf_hardware.md
index 79e41c0b7e7d14..946dcb3238d057 100644
--- a/docs/source/it/perf_hardware.md
+++ b/docs/source/it/perf_hardware.md
@@ -116,7 +116,7 @@ Ogni nuova generazione fornisce una larghezza di banda più veloce, ad es. ecco
 
 Quindi più `X` si ottiene nel rapporto di `NVX` nell'output di `nvidia-smi topo -m`, meglio è. La generazione dipenderà dall'architettura della tua GPU.
 
-Confrontiamo l'esecuzione di un training del modello di linguaggio gpt2 su un piccolo campione di wikitext
+Confrontiamo l'esecuzione di un training del modello di linguaggio openai-community/gpt2 su un piccolo campione di wikitext
 
 I risultati sono:
 
@@ -135,7 +135,7 @@ Ecco il codice benchmark completo e gli output:
 # DDP w/ NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -144,7 +144,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 # DDP w/o NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/it/perf_train_cpu.md b/docs/source/it/perf_train_cpu.md
index c91baeec88005a..ff71d10d5c9d6c 100644
--- a/docs/source/it/perf_train_cpu.md
+++ b/docs/source/it/perf_train_cpu.md
@@ -51,7 +51,7 @@ Vedi un sempio di un caso d'uso [Transformers question-answering](https://github
 - Training with IPEX using BF16 auto mixed precision on CPU:
 
 
 python run_qa.py \
---model_name_or_path bert-base-uncased \
+--model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
 --do_eval \
diff --git a/docs/source/it/perf_train_cpu_many.md b/docs/source/it/perf_train_cpu_many.md
index 2fb10ee4ba499d..c1f8833829ac3b 100644
--- a/docs/source/it/perf_train_cpu_many.md
+++ b/docs/source/it/perf_train_cpu_many.md
@@ -91,7 +91,7 @@ Il seguente comando abilita due processi sul nodo Xeon, con un processo in esecu
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -124,7 +124,7 @@ A questo punto, esegui il seguente comando nel nodo0 e **4DDP** sarà abilitato
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
diff --git a/docs/source/it/pipeline_tutorial.md b/docs/source/it/pipeline_tutorial.md
index 056282b164ed70..87f3166623b05a 100644
--- a/docs/source/it/pipeline_tutorial.md
+++ b/docs/source/it/pipeline_tutorial.md
@@ -76,8 +76,8 @@ La [`pipeline`] accetta qualsiasi modello dal [Model Hub](https://huggingface.co
 ```py
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 Crea una [`pipeline`] per il tuo compito, specificando il modello e il tokenizer che hai caricato:
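
A self-contained sketch of that step, assuming the renamed `distilbert/distilgpt2` checkpoint from the previous snippet; the prompt is arbitrary:

```py
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
print(generator("Le tre leggi della robotica")[0]["generated_text"])
```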
diff --git a/docs/source/it/preprocessing.md b/docs/source/it/preprocessing.md
index 626a44182eaaaa..6d7bc5b2e3df7e 100644
--- a/docs/source/it/preprocessing.md
+++ b/docs/source/it/preprocessing.md
@@ -45,7 +45,7 @@ Carica un tokenizer preaddestrato con [`AutoTokenizer.from_pretrained`]:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 Poi inserisci le tue frasi nel tokenizer:
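
A minimal sketch of that step; the sentence is arbitrary:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
encoded_input = tokenizer("Non immischiarti negli affari dei maghi.")
# The encoding also contains token_type_ids and attention_mask
print(encoded_input["input_ids"])
```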
diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md
index c376ff32c2a884..7fc3fb6c6ac67a 100644
--- a/docs/source/it/run_scripts.md
+++ b/docs/source/it/run_scripts.md
@@ -87,11 +87,11 @@ pip install -r requirements.txt
 
 
 
-Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization.
+Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/google-t5/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization.
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -105,11 +105,11 @@ python examples/pytorch/summarization/run_summarization.py \
 ```
 
 
-Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization.
+Lo script di esempio scarica e pre-processa un dataset dalla libreria 🤗 [Datasets](https://huggingface.co/docs/datasets/). Successivamente, lo script esegue il fine-tuning su un dataset usando Keras su un'architettura che supporta la summarization. Il seguente esempio mostra come eseguire il fine-tuning di [T5-small](https://huggingface.co/google-t5/t5-small) sul dataset [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). Il modello T5 richiede un parametro addizionale `source_prefix` a causa del modo in cui è stato addestrato. Questo prefisso permette a T5 di sapere che si tratta di un task di summarization.
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -133,7 +133,7 @@ Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supp
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -157,7 +157,7 @@ Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestaz
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -176,7 +176,7 @@ Le Tensor Processing Units (TPU) sono state progettate per migliorare le prestaz
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -214,7 +214,7 @@ Ora sei pronto per avviare l'addestramento:
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -233,7 +233,7 @@ Uno script di summarization usando un dataset personalizzato sarebbe simile a qu
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -258,7 +258,7 @@ python examples/pytorch/summarization/run_summarization.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -288,7 +288,7 @@ Il primo metodo usa l'argomento `output_dir previous_output_dir` per riavviare l
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -305,7 +305,7 @@ Il secondo metodo usa l'argomento `resume_from_checkpoint path_to_specific_check
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -335,7 +335,7 @@ Il seguente esempio mostra come caricare un modello specificando il nome del rep
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/it/serialization.md b/docs/source/it/serialization.md
index 1212250749ccb7..974aee0d81cae0 100644
--- a/docs/source/it/serialization.md
+++ b/docs/source/it/serialization.md
@@ -122,7 +122,7 @@ optional arguments:
 L'esportazione di un checkpoint utilizzando una configurazione già pronta può essere eseguita come segue:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 che dovrebbe mostrare i seguenti log:
@@ -137,7 +137,7 @@ All good, model saved at: onnx/model.onnx
 ```
 
 Questo esporta un grafico ONNX del checkpoint definito dall'argomento `--model`.
-In questo esempio è `distilbert-base-uncased`, ma può essere qualsiasi checkpoint
+In questo esempio è `distilbert/distilbert-base-uncased`, ma può essere qualsiasi checkpoint
 Hugging Face Hub o uno memorizzato localmente.
 
 Il file risultante `model.onnx` può quindi essere eseguito su uno dei [tanti
@@ -149,7 +149,7 @@ Runtime](https://onnxruntime.ai/) come segue:
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
@@ -187,8 +187,8 @@ checkpoint come segue:
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 >>> # Load tokenizer and PyTorch weights form the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-pt-checkpoint")
 >>> pt_model.save_pretrained("local-pt-checkpoint")
@@ -206,8 +206,8 @@ python -m transformers.onnx --model=local-pt-checkpoint onnx/
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
 
 >>> # Load tokenizer and TensorFlow weights from the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-tf-checkpoint")
 >>> tf_model.save_pretrained("local-tf-checkpoint")
@@ -254,7 +254,7 @@ pacchetto `transformers.onnx`. Ad esempio, per esportare un modello di classific
 possiamo scegliere un modello ottimizzato dall'Hub ed eseguire:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased-finetuned-sst-2-english \
                             --feature=sequence-classification onnx/
 ```
 
@@ -271,7 +271,7 @@ All good, model saved at: onnx/model.onnx
 
 Puoi notare che in questo caso, i nomi di output del modello ottimizzato sono
 `logits` invece di `last_hidden_state` che abbiamo visto con il
-checkpoint `distilbert-base-uncased` precedente. Questo è previsto dal
+checkpoint `distilbert/distilbert-base-uncased` precedente. Questo è previsto dal
 modello ottimizzato visto che ha una testa di classificazione di sequenze.
 
 
@@ -354,7 +354,7 @@ fornendo alla configurazione del modello base come segue:
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config = DistilBertOnnxConfig(config)
 ```
 
@@ -386,7 +386,7 @@ usare:
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification")
 >>> print(onnx_config_for_seq_clf.outputs)
 OrderedDict([('logits', {0: 'batch'})])
@@ -413,7 +413,7 @@ con il modello base e il tokenizer e il percorso per salvare il file esportato:
 >>> from transformers import AutoTokenizer, AutoModel
 
 >>> onnx_path = Path("model.onnx")
->>> model_ckpt = "distilbert-base-uncased"
+>>> model_ckpt = "distilbert/distilbert-base-uncased"
 >>> base_model = AutoModel.from_pretrained(model_ckpt)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
 
@@ -549,7 +549,7 @@ una classe `BertConfig` e quindi salvato su disco con il nome del file `traced_b
 from transformers import BertModel, BertTokenizer, BertConfig
 import torch
 
-enc = BertTokenizer.from_pretrained("bert-base-uncased")
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 # Tokenizing input text
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -584,7 +584,7 @@ model = BertModel(config)
 model.eval()
 
 # If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
-model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
 
 # Creating the trace
 traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
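 
 # Sketch: persist the trace under the file name mentioned in the surrounding text
 torch.jit.save(traced_model, "traced_bert.pt")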
diff --git a/docs/source/it/training.md b/docs/source/it/training.md
index 503a43321799e1..2a64cfca375f69 100644
--- a/docs/source/it/training.md
+++ b/docs/source/it/training.md
@@ -48,7 +48,7 @@ Come già sai, hai bisogno di un tokenizer per processare il testo e includere u
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -80,7 +80,7 @@ Inizia caricando il tuo modello e specificando il numero di etichette (labels) a
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -200,7 +200,7 @@ Carica un modello TensorFlow col numero atteso di etichette:
 >>> import tensorflow as tf
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 Poi compila e fai il fine-tuning del tuo modello usando [`fit`](https://keras.io/api/models/model_training_apis/) come faresti con qualsiasi altro modello di Keras:
@@ -279,7 +279,7 @@ Carica il tuo modello con il numero atteso di etichette:
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Ottimizzatore e learning rate scheduler
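
A brief sketch of a typical optimizer and learning-rate scheduler setup for the model loaded above; every hyperparameter value here is illustrative:

```py
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
optimizer = AdamW(model.parameters(), lr=5e-5)
# Linear decay with no warmup over an illustrative number of steps
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=1000)
```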
diff --git a/docs/source/ja/add_tensorflow_model.md b/docs/source/ja/add_tensorflow_model.md
index 578a06997c4cd8..8bc7ed0d9ee740 100644
--- a/docs/source/ja/add_tensorflow_model.md
+++ b/docs/source/ja/add_tensorflow_model.md
@@ -41,7 +41,7 @@ PyTorchをTensorFlowモデルの重みに変換する手順、およびMLフレ
  
 
 選択したモデルの`config.json`の`model_type`フィールドをチェックしてみてください
-([例](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14))。
+([例](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14))。
 🤗 Transformersの該当するモデルフォルダに、名前が"modeling_tf"で始まるファイルがある場合、それは対応するTensorFlow
 アーキテクチャを持っていることを意味します([例](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))。
 
diff --git a/docs/source/ja/autoclass_tutorial.md b/docs/source/ja/autoclass_tutorial.md
index dda7604c498585..f8fbeaa221f6aa 100644
--- a/docs/source/ja/autoclass_tutorial.md
+++ b/docs/source/ja/autoclass_tutorial.md
@@ -26,7 +26,7 @@ http://www.apache.org/licenses/LICENSE-2.0
 
 
 アーキテクチャはモデルの骨格を指し、チェックポイントは特定のアーキテクチャの重みです。
-たとえば、[BERT](https://huggingface.co/bert-base-uncased)はアーキテクチャであり、`bert-base-uncased`はチェックポイントです。
+たとえば、[BERT](https://huggingface.co/google-bert/bert-base-uncased)はアーキテクチャであり、`google-bert/bert-base-uncased`はチェックポイントです。
 モデルはアーキテクチャまたはチェックポイントのどちらを指す一般的な用語です。
 
 
@@ -48,7 +48,7 @@ http://www.apache.org/licenses/LICENSE-2.0
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 
@@ -110,7 +110,7 @@ http://www.apache.org/licenses/LICENSE-2.0
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 同じチェックポイントを再利用して異なるタスクのアーキテクチャをロードできます:
@@ -118,7 +118,7 @@ http://www.apache.org/licenses/LICENSE-2.0
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -143,7 +143,7 @@ TensorFlowおよびFlaxのチェックポイントには影響がなく、`from_
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 同じチェックポイントを再利用して異なるタスクのアーキテクチャをロードできます:
@@ -151,7 +151,7 @@ TensorFlowおよびFlaxのチェックポイントには影響がなく、`from_
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 一般的には、事前学習済みモデルのインスタンスをロードするために`AutoTokenizer`クラスと`TFAutoModelFor`クラスの使用をお勧めします。
diff --git a/docs/source/ja/benchmarks.md b/docs/source/ja/benchmarks.md
index ce4d1a38341471..7312aae8ce5b7c 100644
--- a/docs/source/ja/benchmarks.md
+++ b/docs/source/ja/benchmarks.md
@@ -49,7 +49,7 @@ Hugging Faceのベンチマークツールは非推奨であり、Transformerモ
 ```py
 >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
 
->>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
 >>> benchmark = PyTorchBenchmark(args)
 ```
 
@@ -58,7 +58,7 @@ Hugging Faceのベンチマークツールは非推奨であり、Transformerモ
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
 
 >>> args = TensorFlowBenchmarkArguments(
-...     models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+...     models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
 ... )
 >>> benchmark = TensorFlowBenchmark(args)
 ```
@@ -92,20 +92,20 @@ python examples/pytorch/benchmarking/run_benchmark.py --help
 --------------------------------------------------------------------------------
 Model Name             Batch Size     Seq Length     Time in s                  
 --------------------------------------------------------------------------------
-bert-base-uncased          8               8             0.006     
-bert-base-uncased          8               32            0.006     
-bert-base-uncased          8              128            0.018     
-bert-base-uncased          8              512            0.088     
+google-bert/bert-base-uncased          8               8             0.006     
+google-bert/bert-base-uncased          8               32            0.006     
+google-bert/bert-base-uncased          8              128            0.018     
+google-bert/bert-base-uncased          8              512            0.088     
 --------------------------------------------------------------------------------
 
 ====================      INFERENCE - MEMORY - RESULT       ====================
 --------------------------------------------------------------------------------
 Model Name             Batch Size     Seq Length    Memory in MB 
 --------------------------------------------------------------------------------
-bert-base-uncased          8               8             1227
-bert-base-uncased          8               32            1281
-bert-base-uncased          8              128            1307
-bert-base-uncased          8              512            1539
+google-bert/bert-base-uncased          8               8             1227
+google-bert/bert-base-uncased          8               32            1281
+google-bert/bert-base-uncased          8              128            1307
+google-bert/bert-base-uncased          8              512            1539
 --------------------------------------------------------------------------------
 
 ====================        ENVIRONMENT INFORMATION         ====================
@@ -151,20 +151,20 @@ python examples/tensorflow/benchmarking/run_benchmark_tf.py --help
 --------------------------------------------------------------------------------
 Model Name             Batch Size     Seq Length     Time in s                  
 --------------------------------------------------------------------------------
-bert-base-uncased          8               8             0.005
-bert-base-uncased          8               32            0.008
-bert-base-uncased          8              128            0.022
-bert-base-uncased          8              512            0.105
+google-bert/bert-base-uncased          8               8             0.005
+google-bert/bert-base-uncased          8               32            0.008
+google-bert/bert-base-uncased          8              128            0.022
+google-bert/bert-base-uncased          8              512            0.105
 --------------------------------------------------------------------------------
 
 ====================      INFERENCE - MEMORY - RESULT       ====================
 --------------------------------------------------------------------------------
 Model Name             Batch Size     Seq Length    Memory in MB 
 --------------------------------------------------------------------------------
-bert-base-uncased          8               8             1330
-bert-base-uncased          8               32            1330
-bert-base-uncased          8              128            1330
-bert-base-uncased          8              512            1770
+google-bert/bert-base-uncased          8               8             1330
+google-bert/bert-base-uncased          8               32            1330
+google-bert/bert-base-uncased          8              128            1330
+google-bert/bert-base-uncased          8              512            1770
 --------------------------------------------------------------------------------
 
 ====================        ENVIRONMENT INFORMATION         ====================
@@ -202,7 +202,7 @@ bert-base-uncased          8              512            1770
 を追加することで、オプションで _.csv_ ファイルに保存することができます。この場合、各セクションは別々の _.csv_ ファイルに保存されます。_.csv_ 
 ファイルへのパスは、データクラスの引数を使用してオプションで定義できます。
 
-モデル識別子、例えば `bert-base-uncased` を使用して事前学習済みモデルをベンチマークする代わりに、利用可能な任意のモデルクラスの任意の設定をベンチマークすることもできます。この場合、ベンチマーク引数と共に設定の `list` を挿入する必要があります。
+モデル識別子、例えば `google-bert/bert-base-uncased` を使用して事前学習済みモデルをベンチマークする代わりに、利用可能な任意のモデルクラスの任意の設定をベンチマークすることもできます。この場合、ベンチマーク引数と共に設定の `list` を挿入する必要があります。
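
A short sketch of benchmarking arbitrary configurations instead of a named checkpoint (the benchmark utilities are deprecated, and the configuration values are illustrative):

```py
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig

args = PyTorchBenchmarkArguments(models=["bert-base", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32])
config_base = BertConfig()
config_6_lay = BertConfig(num_hidden_layers=6)

# One config per entry in `models`
benchmark = PyTorchBenchmark(args, configs=[config_base, config_6_lay])
results = benchmark.run()
```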
 
 
 
diff --git a/docs/source/ja/big_models.md b/docs/source/ja/big_models.md
index 5f670646a28447..78852dc4374cce 100644
--- a/docs/source/ja/big_models.md
+++ b/docs/source/ja/big_models.md
@@ -42,7 +42,7 @@ rendered properly in your Markdown viewer.
 ```py
 from transformers import AutoModel
 
-model = AutoModel.from_pretrained("bert-base-cased")
+model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 ```
 
 もし[`~PreTrainedModel.save_pretrained`]を使用して保存する場合、新しいフォルダが2つのファイルを含む形で作成されます: モデルの設定情報とその重み情報です。
diff --git a/docs/source/ja/community.md b/docs/source/ja/community.md
index 7fa893fa8d21eb..ffe28d042d237e 100644
--- a/docs/source/ja/community.md
+++ b/docs/source/ja/community.md
@@ -43,8 +43,8 @@ rendered properly in your Markdown viewer.
 |[RoBERTaを感情分析のためにファインチューニング](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) |RoBERTaモデルを感情分析のためにファインチューニングする方法|[Dhaval Taunk](https://github.com/DhavalTaunk08) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
 |[質問生成モデルの評価](https://github.com/flexudy-pipe/qugeev) | seq2seqトランスフォーマーモデルによって生成された質問の回答の正確さを評価する方法 | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
 |[DistilBERTとTensorflowを使用してテキストを分類](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | TensorFlowでテキスト分類のためにDistilBERTをファインチューニングする方法 | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
-|[CNN/Dailymailでのエンコーダーデコーダー要約にBERTを活用](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | *bert-base-uncased* チェックポイントを使用してCNN/Dailymailの要約のために *EncoderDecoderModel* をウォームスタートする方法 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
-|[BBC XSumでのエンコーダーデコーダー要約にRoBERTaを活用](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | *roberta-base* チェックポイントを使用してBBC/XSumの要約のための共有 *EncoderDecoderModel* をウォームスタートする方法 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
+|[CNN/Dailymailでのエンコーダーデコーダー要約にBERTを活用](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | *google-bert/bert-base-uncased* チェックポイントを使用してCNN/Dailymailの要約のために *EncoderDecoderModel* をウォームスタートする方法 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
+|[BBC XSumでのエンコーダーデコーダー要約にRoBERTaを活用](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | *FacebookAI/roberta-base* チェックポイントを使用してBBC/XSumの要約のための共有 *EncoderDecoderModel* をウォームスタートする方法 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
 |[TAPASをシーケンシャル質問応答(SQA)でファインチューニング](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | シーケンシャル質問応答(SQA)データセットで *tapas-base* チェックポイントを使用して *TapasForQuestionAnswering* をファインチューニングする方法 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
 |[TabFactでTAPASを評価](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | *tapas-base-finetuned-tabfact* チェックポイントを使用してファインチューニングされた *TapasForSequenceClassification* を評価する方法、🤗 datasets と 🤗 transformers ライブラリを組み合わせて使用 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
 |[翻訳のためのmBARTをファインチューニング](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | Seq2SeqTrainerを使用してHindiからEnglishへの翻訳のためにmBARTをファインチューニングする方法 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
diff --git a/docs/source/ja/create_a_model.md b/docs/source/ja/create_a_model.md
index 08610873341912..fdb23f98e7b107 100644
--- a/docs/source/ja/create_a_model.md
+++ b/docs/source/ja/create_a_model.md
@@ -89,7 +89,7 @@ DistilBertConfig {
 事前学習済みモデルの属性は、[`~PretrainedConfig.from_pretrained`] 関数で変更できます:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 Once you are satisfied with your model configuration, you can save it with [`PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory.
@@ -136,13 +136,13 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 事前学習済みの重みをロードする際、モデルが🤗 Transformersによって提供されている場合、デフォルトのモデル設定が自動的にロードされます。ただし、必要に応じてデフォルトのモデル設定属性の一部またはすべてを独自のもので置き換えることができます。
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -163,13 +163,13 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 事前学習済みの重みをロードする際、モデルが🤗 Transformersによって提供されている場合、デフォルトのモデル構成が自動的にロードされます。ただし、必要であればデフォルトのモデル構成属性の一部またはすべてを独自のもので置き換えることもできます:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -186,7 +186,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 新しいタスクにこのチェックポイントを簡単に再利用するには、異なるモデルヘッドに切り替えます。
@@ -196,7 +196,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -206,7 +206,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 別のタスクにこのチェックポイントを簡単に再利用することができ、異なるモデルヘッドに切り替えるだけです。
@@ -217,7 +217,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -257,7 +257,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 [`DistilBertTokenizerFast`]クラスを使用して高速なトークナイザを作成します:
@@ -265,7 +265,7 @@ Once you are satisfied with your model configuration, you can save it with [`Pre
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
diff --git a/docs/source/ja/custom_tools.md b/docs/source/ja/custom_tools.md
index 6a9b1f58e5d5c2..8c51ebaeb9d1ca 100644
--- a/docs/source/ja/custom_tools.md
+++ b/docs/source/ja/custom_tools.md
@@ -566,7 +566,7 @@ model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
 print(model.id)
 ```
 
-タスク `text-classification` の場合、これは `'facebook/bart-large-mnli'` を返します。`translation` の場合、`'t5-base'` を返します。
+タスク `text-classification` の場合、これは `'facebook/bart-large-mnli'` を返します。`translation` の場合、`'google-t5/t5-base'` を返します。
 
 これをエージェントが利用できるツールに変換する方法は何でしょうか?すべてのツールは、主要な属性を保持するスーパークラス `Tool` に依存しています。私たちは、それを継承したクラスを作成します:
 
diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md
index 83d6e42bb8cb3b..01a8cf203f7fb1 100644
--- a/docs/source/ja/generation_strategies.md
+++ b/docs/source/ja/generation_strategies.md
@@ -41,7 +41,7 @@ generateメソッドへの入力は、モデルのモダリティに依存しま
 ```python
 >>> from transformers import AutoModelForCausalLM
 
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 >>> model.generation_config
 GenerationConfig {
     "bos_token_id": 50256,
@@ -94,8 +94,8 @@ GenerationConfig {
 ```python
 >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
 
->>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
->>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
 
 >>> translation_generation_config = GenerationConfig(
 ...     num_beams=4,
@@ -132,8 +132,8 @@ GenerationConfig {
 ```python
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 
->>> tok = AutoTokenizer.from_pretrained("gpt2")
->>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+>>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
 >>> streamer = TextStreamer(tok)
 
@@ -157,7 +157,7 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 
 >>> prompt = "I look forward to"
->>> checkpoint = "distilgpt2"
+>>> checkpoint = "distilbert/distilgpt2"
 
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -177,7 +177,7 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te
 ```python
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> checkpoint = "gpt2-large"
+>>> checkpoint = "openai-community/gpt2-large"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
 
@@ -201,7 +201,7 @@ products or services, feel free to contact us at any time. We look forward to he
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 >>> set_seed(0)  # For reproducibility
 
->>> checkpoint = "gpt2-large"
+>>> checkpoint = "openai-community/gpt2-large"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
 
@@ -226,7 +226,7 @@ that\'s a terrible feeling."']
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 
 >>> prompt = "It is astonishing how one can"
->>> checkpoint = "gpt2-medium"
+>>> checkpoint = "openai-community/gpt2-medium"
 
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -248,7 +248,7 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l
 >>> set_seed(0)  # For reproducibility
 
 >>> prompt = "translate English to German: The house is wonderful."
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> inputs = tokenizer(prompt, return_tensors="pt")
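
For illustration, a minimal sketch that ties the pieces from this file together: a task-specific `GenerationConfig` passed explicitly to `generate`. The prompt mirrors the hunk above; `max_new_tokens=40` is an illustrative choice.

```py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Task-specific settings kept separate from the model's default generation config.
translation_config = GenerationConfig(num_beams=4, max_new_tokens=40)

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=translation_config)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```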
diff --git a/docs/source/ja/glossary.md b/docs/source/ja/glossary.md
index 1c5b965104a62b..39148f5d0f48c1 100644
--- a/docs/source/ja/glossary.md
+++ b/docs/source/ja/glossary.md
@@ -33,7 +33,7 @@ rendered properly in your Markdown viewer.
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 >>> sequence_a = "This is a short sequence."
 >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
@@ -147,7 +147,7 @@ The encoded versions have different lengths:
 ### feed forward chunking
 
 トランスフォーマー内の各残差注意ブロックでは、通常、自己注意層の後に2つのフィードフォワード層が続きます。
-フィードフォワード層の中間埋め込みサイズは、モデルの隠れたサイズよりも大きいことがよくあります(たとえば、`bert-base-uncased`の場合)。
+フィードフォワード層の中間埋め込みサイズは、モデルの隠れたサイズよりも大きいことがよくあります(たとえば、`google-bert/bert-base-uncased`の場合)。
 
 入力サイズが `[batch_size、sequence_length]` の場合、中間フィードフォワード埋め込み `[batch_size、sequence_length、config.intermediate_size]` を保存するために必要なメモリは、メモリの大部分を占めることがあります。[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451)の著者は、計算が `sequence_length` 次元に依存しないため、両方のフィードフォワード層の出力埋め込み `[batch_size、config.hidden_size]_0、...、[batch_size、config.hidden_size]_n` を個別に計算し、後で `[batch_size、sequence_length、config.hidden_size]` に連結することは数学的に等価であると気付きました。これにより、増加した計算時間とメモリ使用量のトレードオフが生じますが、数学的に等価な結果が得られます。
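
A minimal sketch of that trade-off using the `apply_chunking_to_forward` helper from 🤗 Transformers; the layer sizes and chunk size below are illustrative assumptions.

```py
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

hidden_states = torch.randn(2, 128, 768)  # [batch_size, sequence_length, hidden_size]
feed_forward = torch.nn.Sequential(
    torch.nn.Linear(768, 3072), torch.nn.GELU(), torch.nn.Linear(3072, 768)
)

# chunk_size=32 slices the sequence dimension (dim 1) so the [..., 3072] intermediate
# activation is only ever materialized for 32 positions at a time.
output = apply_chunking_to_forward(lambda x: feed_forward(x), 32, 1, hidden_states)
print(output.shape)  # torch.Size([2, 128, 768])
```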
 
@@ -191,7 +191,7 @@ The encoded versions have different lengths:
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 >>> sequence = "A Titan RTX has 24GB of VRAM"
 ```
@@ -400,7 +400,7 @@ The encoded versions have different lengths:
 ```python
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 >>> sequence_a = "HuggingFace is based in NYC"
 >>> sequence_b = "Where is HuggingFace based?"
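
For illustration, a minimal sketch of how such a sequence pair is encoded so that `token_type_ids` distinguishes the two segments:

```py
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"

encoded = tokenizer(sequence_a, sequence_b)
print(tokenizer.decode(encoded["input_ids"]))  # [CLS] ... [SEP] ... [SEP]
print(encoded["token_type_ids"])  # 0 for the first segment, 1 for the second
```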
 
diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md
index 8991030efe00ac..915984a91c860e 100644
--- a/docs/source/ja/installation.md
+++ b/docs/source/ja/installation.md
@@ -168,14 +168,14 @@ conda install conda-forge::transformers
 例えば、外部インスタンスに対してファイアウォールで保護された通常のネットワーク上でプログラムを実行する場合、通常以下のようなコマンドで実行することになります:
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 オフラインインスタンスでこの同じプログラムを実行します:
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 このスクリプトは、ローカルファイルのみを検索することが分かっているので、ハングアップしたりタイムアウトを待ったりすることなく実行されるはずです。
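
The same offline behaviour can be requested per call in library code; a minimal sketch, reusing the checkpoint from the commands above and assuming it is already cached locally:

```py
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Equivalent in spirit to TRANSFORMERS_OFFLINE=1: only the local cache is consulted,
# so the call fails immediately instead of hanging on a network timeout.
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small", local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", local_files_only=True)
```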
diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md
index 96624971104d2b..baeefd06abb01b 100644
--- a/docs/source/ja/internal/generation_utils.md
+++ b/docs/source/ja/internal/generation_utils.md
@@ -38,8 +38,8 @@ rendered properly in your Markdown viewer.
 ```python
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained("gpt2")
+tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
 
 inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
 generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
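
A minimal sketch of inspecting the richer return value that `return_dict_in_generate=True` and `output_scores=True` produce:

```py
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)

print(generation_output.sequences.shape)  # generated token ids, prompt included
print(len(generation_output.scores))      # one logits tensor per generated step
print(tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True))
```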
diff --git a/docs/source/ja/main_classes/deepspeed.md b/docs/source/ja/main_classes/deepspeed.md
index b2ba2bead912ea..4406ce4a34e21e 100644
--- a/docs/source/ja/main_classes/deepspeed.md
+++ b/docs/source/ja/main_classes/deepspeed.md
@@ -187,7 +187,7 @@ deepspeed --num_gpus=2 your_program.py  --deepspeed ds_config.js
 ```bash
 deepspeed examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero3.json \
---model_name_or_path t5-small --per_device_train_batch_size 1 \
+--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
 --output_dir output_dir --overwrite_output_dir --fp16 \
 --do_train --max_train_samples 500 --num_train_epochs 1 \
 --dataset_name wmt16 --dataset_config "ro-en" \
@@ -210,7 +210,7 @@ DeepSpeed 関連の引数が 2 つありますが、簡単にするためであ
 ```bash
 deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero2.json \
---model_name_or_path t5-small --per_device_train_batch_size 1 \
+--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
 --output_dir output_dir --overwrite_output_dir --fp16 \
 --do_train --max_train_samples 500 --num_train_epochs 1 \
 --dataset_name wmt16 --dataset_config "ro-en" \
@@ -1748,7 +1748,7 @@ from transformers import T5ForConditionalGeneration, T5Config
 import deepspeed
 
 with deepspeed.zero.Init():
-    config = T5Config.from_pretrained("t5-small")
+    config = T5Config.from_pretrained("google-t5/t5-small")
     model = T5ForConditionalGeneration(config)
 ```
 
@@ -1764,7 +1764,7 @@ with deepspeed.zero.Init():
 from transformers import AutoModel, Trainer, TrainingArguments
 
 training_args = TrainingArguments(..., deepspeed=ds_config)
-model = AutoModel.from_pretrained("t5-small")
+model = AutoModel.from_pretrained("google-t5/t5-small")
 trainer = Trainer(model=model, args=training_args, ...)
 ```
 
@@ -1822,7 +1822,7 @@ ZeRO-3 のみがパラメーターのシャーディングを実行するのに
 ```bash
 deepspeed examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero3.json \
---model_name_or_path t5-small --output_dir output_dir \
+--model_name_or_path google-t5/t5-small --output_dir output_dir \
 --do_eval --max_eval_samples 50 --warmup_steps 50  \
 --max_source_length 128 --val_max_target_length 128 \
 --overwrite_output_dir --per_device_eval_batch_size 4 \
@@ -2022,7 +2022,7 @@ import deepspeed
 ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model to detect zero 3
 dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
-model = AutoModel.from_pretrained("gpt2")
+model = AutoModel.from_pretrained("openai-community/gpt2")
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 ```
 
@@ -2037,7 +2037,7 @@ import deepspeed
 ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model to detect zero 3
 dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
-config = AutoConfig.from_pretrained("gpt2")
+config = AutoConfig.from_pretrained("openai-community/gpt2")
 model = AutoModel.from_config(config)
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 ```
diff --git a/docs/source/ja/main_classes/output.md b/docs/source/ja/main_classes/output.md
index 7f906544a8f88c..beb9dcbb442355 100644
--- a/docs/source/ja/main_classes/output.md
+++ b/docs/source/ja/main_classes/output.md
@@ -26,8 +26,8 @@ rendered properly in your Markdown viewer.
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 
-tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 
 inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
 labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
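
For illustration, a minimal sketch of the forward pass this snippet prepares and of the fields on the returned output object:

```py
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

outputs = model(**inputs, labels=labels)
print(outputs.loss)    # present because labels were supplied
print(outputs.logits)  # shape [batch_size, num_labels]
```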
diff --git a/docs/source/ja/main_classes/pipelines.md b/docs/source/ja/main_classes/pipelines.md
index 90eb17c0c44387..8e3f61130bdcaa 100644
--- a/docs/source/ja/main_classes/pipelines.md
+++ b/docs/source/ja/main_classes/pipelines.md
@@ -44,7 +44,7 @@ Recognition、Masked Language Modeling、Sentiment Analysis、Feature Extraction
 ハブはすでにそれを定義しています。
 
 ```python
->>> pipe = pipeline(model="roberta-large-mnli")
+>>> pipe = pipeline(model="FacebookAI/roberta-large-mnli")
 >>> pipe("This restaurant is awesome")
 [{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
 ```
diff --git a/docs/source/ja/main_classes/trainer.md b/docs/source/ja/main_classes/trainer.md
index e8d0ca96f58731..61872996ab5938 100644
--- a/docs/source/ja/main_classes/trainer.md
+++ b/docs/source/ja/main_classes/trainer.md
@@ -525,7 +525,7 @@ MacOS マシンに PyTorch >= 1.13 (執筆時点ではナイトリー バージ
 export TASK_NAME=mrpc
 
 python examples/pytorch/text-classification/run_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
   --do_train \
   --do_eval \
@@ -660,7 +660,7 @@ cd transformers
 
 accelerate launch \
 ./examples/pytorch/text-classification/run_glue.py \
---model_name_or_path bert-base-cased \
+--model_name_or_path google-bert/bert-base-cased \
 --task_name $TASK_NAME \
 --do_train \
 --do_eval \
@@ -685,7 +685,7 @@ accelerate launch --num_processes=2 \
 --fsdp_sharding_strategy=1 \
 --fsdp_state_dict_type=FULL_STATE_DICT \
 ./examples/pytorch/text-classification/run_glue.py
---model_name_or_path bert-base-cased \
+--model_name_or_path google-bert/bert-base-cased \
 --task_name $TASK_NAME \
 --do_train \
 --do_eval \
diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md
index c6775493baae39..d4baaf70e6fd48 100644
--- a/docs/source/ja/model_doc/auto.md
+++ b/docs/source/ja/model_doc/auto.md
@@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 [`AutoConfig`]、[`AutoModel`]、[`AutoTokenizer`]のいずれかをインスタンス化すると、関連するアーキテクチャのクラスが直接作成されます。例えば、
 
 ```python
-model = AutoModel.from_pretrained("bert-base-cased")
+model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 ```
 
 これは[`BertModel`]のインスタンスであるモデルを作成します。
diff --git a/docs/source/ja/model_doc/bert-generation.md b/docs/source/ja/model_doc/bert-generation.md
index 4a25ff5d9bc662..d2c93a4644d943 100644
--- a/docs/source/ja/model_doc/bert-generation.md
+++ b/docs/source/ja/model_doc/bert-generation.md
@@ -41,15 +41,15 @@ GPT-2 および RoBERTa チェックポイントを使用し、モデルの初
 ```python
 >>> # leverage checkpoints for Bert2Bert model...
 >>> # use BERT's cls token as BOS token and sep token as EOS token
->>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+>>> encoder = BertGenerationEncoder.from_pretrained("google-bert/bert-large-uncased", bos_token_id=101, eos_token_id=102)
 >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
 >>> decoder = BertGenerationDecoder.from_pretrained(
-...     "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
+...     "google-bert/bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
 ... )
 >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
 
 >>> # create tokenizer...
->>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
 
 >>> input_ids = tokenizer(
 ...     "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
diff --git a/docs/source/ja/model_doc/cpm.md b/docs/source/ja/model_doc/cpm.md
index 9776f676844e7d..afac35823e641a 100644
--- a/docs/source/ja/model_doc/cpm.md
+++ b/docs/source/ja/model_doc/cpm.md
@@ -40,7 +40,7 @@ GPT-3 の言語は主に英語であり、パラメーターは公開されて
 
 
 
-CPM のアーキテクチャは、トークン化方法を除いて GPT-2 と同じです。詳細については、[GPT-2 ドキュメント](gpt2) を参照してください。
+CPM のアーキテクチャは、トークン化方法を除いて GPT-2 と同じです。詳細については、[GPT-2 ドキュメント](openai-community/gpt2) を参照してください。
 API リファレンス情報。
 
 
diff --git a/docs/source/ja/model_doc/ctrl.md b/docs/source/ja/model_doc/ctrl.md
index 9d3a620ffe42d1..f93345d30e79bc 100644
--- a/docs/source/ja/model_doc/ctrl.md
+++ b/docs/source/ja/model_doc/ctrl.md
@@ -17,7 +17,7 @@ rendered properly in your Markdown viewer.
 # CTRL
 
 
-
+
 Models
@@ -42,12 +42,12 @@ CTRL モデルは、Nitish Shirish Keskar*、Bryan McCann*、Lav R. Varshney、C
 モデルベースのソース帰属を介して。*
 このモデルは、[keskarnitishr](https://huggingface.co/keskarnitishr) によって提供されました。元のコードが見つかる
-[こちら](https://github.com/salesforce/ctrl)。
+[こちら](https://github.com/salesforce/Salesforce/ctrl)。
 
 ## Usage tips
 
 - CTRL は制御コードを利用してテキストを生成します。生成を特定の単語や文で開始する必要があります。
-  またはリンクして一貫したテキストを生成します。 [元の実装](https://github.com/salesforce/ctrl) を参照してください。
+  またはリンクして一貫したテキストを生成します。 [元の実装](https://github.com/salesforce/Salesforce/ctrl) を参照してください。
   詳しくは。
 - CTRL は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
   左。
diff --git a/docs/source/ja/model_doc/dialogpt.md b/docs/source/ja/model_doc/dialogpt.md
index 82d6f8481afb47..22ce0c9a099f75 100644
--- a/docs/source/ja/model_doc/dialogpt.md
+++ b/docs/source/ja/model_doc/dialogpt.md
@@ -52,6 +52,6 @@ OpenAI GPT-2に従って、マルチターン対話セッションを長いテ
 
-DialoGPT のアーキテクチャは GPT2 モデルに基づいています。API リファレンスと例については、[GPT2 のドキュメント ページ](gpt2) を参照してください。
+DialoGPT のアーキテクチャは GPT2 モデルに基づいています。API リファレンスと例については、[GPT2 のドキュメント ページ](openai-community/gpt2) を参照してください。
 
diff --git a/docs/source/ja/model_memory_anatomy.md b/docs/source/ja/model_memory_anatomy.md
index 52374d58f983a5..5f09489b7f79aa 100644
--- a/docs/source/ja/model_memory_anatomy.md
+++ b/docs/source/ja/model_memory_anatomy.md
@@ -88,14 +88,14 @@ GPU memory occupied: 1343 MB.
 
 ## Load Model
 
-まず、`bert-large-uncased` モデルを読み込みます。モデルの重みを直接GPUに読み込むことで、重みだけがどれだけのスペースを使用しているかを確認できます。
+まず、`google-bert/bert-large-uncased` モデルを読み込みます。モデルの重みを直接GPUに読み込むことで、重みだけがどれだけのスペースを使用しているかを確認できます。
 
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda")
 >>> print_gpu_utilization()
 GPU memory occupied: 2631 MB.
 ```
diff --git a/docs/source/ja/model_sharing.md b/docs/source/ja/model_sharing.md
index 14e0c2e7a857c0..aa8f7a3d1e3327 100644
--- a/docs/source/ja/model_sharing.md
+++ b/docs/source/ja/model_sharing.md
@@ -254,7 +254,7 @@ Hugging Faceプロフィールに移動すると、新しく作成したモデ
 * 手動で`README.md`ファイルを作成およびアップロードする。
 * モデルリポジトリ内の**Edit model card**ボタンをクリックする。
 
-モデルカードに含めるべき情報の例については、DistilBert [モデルカード](https://huggingface.co/distilbert-base-uncased)をご覧ください。`README.md`ファイルで制御できる他のオプション、例えばモデルの炭素フットプリントやウィジェットの例などについての詳細は、[こちらのドキュメンテーション](https://huggingface.co/docs/hub/models-cards)を参照してください。
+モデルカードに含めるべき情報の例については、DistilBert [モデルカード](https://huggingface.co/distilbert/distilbert-base-uncased)をご覧ください。`README.md`ファイルで制御できる他のオプション、例えばモデルの炭素フットプリントやウィジェットの例などについての詳細は、[こちらのドキュメンテーション](https://huggingface.co/docs/hub/models-cards)を参照してください。
diff --git a/docs/source/ja/multilingual.md b/docs/source/ja/multilingual.md
index 86dabb94633c8b..39524195f88810 100644
--- a/docs/source/ja/multilingual.md
+++ b/docs/source/ja/multilingual.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-🤗 Transformers にはいくつかの多言語モデルがあり、それらの推論の使用方法は単一言語モデルとは異なります。ただし、多言語モデルの使用方法がすべて異なるわけではありません。 [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) などの一部のモデルは、単一言語モデルと同様に使用できます。 このガイドでは、推論のために使用方法が異なる多言語モデルをどのように使うかを示します。
+🤗 Transformers にはいくつかの多言語モデルがあり、それらの推論の使用方法は単一言語モデルとは異なります。ただし、多言語モデルの使用方法がすべて異なるわけではありません。 [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased) などの一部のモデルは、単一言語モデルと同様に使用できます。 このガイドでは、推論のために使用方法が異なる多言語モデルをどのように使うかを示します。
 
 ## XLM
 
@@ -28,24 +28,24 @@ XLM には10の異なるチェックポイントがあり、そのうちの1つ
 
 次の XLM モデルは、言語の埋め込みを使用して、推論で使用される言語を指定します。
 
-- `xlm-mlm-ende-1024` (マスク化された言語モデリング、英語-ドイツ語)
-- `xlm-mlm-enfr-1024` (マスク化された言語モデリング、英語-フランス語)
-- `xlm-mlm-enro-1024` (マスク化された言語モデリング、英語-ルーマニア語)
-- `xlm-mlm-xnli15-1024` (マスク化された言語モデリング、XNLI 言語)
-- `xlm-mlm-tlm-xnli15-1024` (マスク化された言語モデリング + 翻訳 + XNLI 言語)
-- `xlm-clm-enfr-1024` (因果言語モデリング、英語-フランス語)
-- `xlm-clm-ende-1024` (因果言語モデリング、英語-ドイツ語)
+- `FacebookAI/xlm-mlm-ende-1024` (マスク化された言語モデリング、英語-ドイツ語)
+- `FacebookAI/xlm-mlm-enfr-1024` (マスク化された言語モデリング、英語-フランス語)
+- `FacebookAI/xlm-mlm-enro-1024` (マスク化された言語モデリング、英語-ルーマニア語)
+- `FacebookAI/xlm-mlm-xnli15-1024` (マスク化された言語モデリング、XNLI 言語)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (マスク化された言語モデリング + 翻訳 + XNLI 言語)
+- `FacebookAI/xlm-clm-enfr-1024` (因果言語モデリング、英語-フランス語)
+- `FacebookAI/xlm-clm-ende-1024` (因果言語モデリング、英語-ドイツ語)
 
 言語の埋め込みは、モデルに渡される `input_ids` と同じ形状のテンソルとして表されます。 これらのテンソルの値は、使用される言語に依存し、トークナイザーの `lang2id` および `id2lang` 属性によって識別されます。
 
-この例では、`xlm-clm-enfr-1024` チェックポイントをロードします (因果言語モデリング、英語-フランス語)。
+この例では、`FacebookAI/xlm-clm-enfr-1024` チェックポイントをロードします (因果言語モデリング、英語-フランス語)。
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 トークナイザーの `lang2id` 属性は、このモデルの言語とその ID を表示します。
 
@@ -83,8 +83,8 @@ XLM には10の異なるチェックポイントがあり、そのうちの1つ
 
 次の XLM モデルは、推論中に言語の埋め込みを必要としません。
 
-- `xlm-mlm-17-1280` (マスク化された言語モデリング、17の言語)
-- `xlm-mlm-100-1280` (マスク化された言語モデリング、100の言語)
+- `FacebookAI/xlm-mlm-17-1280` (マスク化された言語モデリング、17の言語)
+- `FacebookAI/xlm-mlm-100-1280` (マスク化された言語モデリング、100の言語)
 
 これらのモデルは、以前の XLM チェックポイントとは異なり、一般的な文の表現に使用されます。
 
@@ -92,8 +92,8 @@ XLM には10の異なるチェックポイントがあり、そのうちの1つ
 
 以下の BERT モデルは、多言語タスクに使用できます。
 
-- `bert-base-multilingual-uncased` (マスク化された言語モデリング + 次の文の予測、102の言語)
-- `bert-base-multilingual-cased` (マスク化された言語モデリング + 次の文の予測、104の言語)
+- `google-bert/bert-base-multilingual-uncased` (マスク化された言語モデリング + 次の文の予測、102の言語)
+- `google-bert/bert-base-multilingual-cased` (マスク化された言語モデリング + 次の文の予測、104の言語)
 
 これらのモデルは、推論中に言語の埋め込みを必要としません。 文脈から言語を識別し、それに応じて推測する必要があります。
 
@@ -101,8 +101,8 @@ XLM には10の異なるチェックポイントがあり、そのうちの1つ
 
 次の XLM-RoBERTa モデルは、多言語タスクに使用できます。
 
-- `xlm-roberta-base` (マスク化された言語モデリング、100の言語)
-- `xlm-roberta-large` (マスク化された言語モデリング、100の言語)
+- `FacebookAI/xlm-roberta-base` (マスク化された言語モデリング、100の言語)
+- `FacebookAI/xlm-roberta-large` (マスク化された言語モデリング、100の言語)
 
 XLM-RoBERTa は、100の言語で新しく作成およびクリーニングされた2.5 TB の CommonCrawl データでトレーニングされました。 これは、分類、シーケンスのラベル付け、質問応答などのダウンストリームタスクで、mBERT や XLM などの以前にリリースされた多言語モデルを大幅に改善します。
diff --git a/docs/source/ja/perf_hardware.md b/docs/source/ja/perf_hardware.md
index 2ebc0eef9b68c0..0d104ed3ddb0b3 100644
--- a/docs/source/ja/perf_hardware.md
+++ b/docs/source/ja/perf_hardware.md
@@ -140,7 +140,7 @@ NVLinkを使用すると、トレーニングが約23%速く完了すること
 
 # DDP w/ NVLink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -149,7 +149,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 
 # DDP w/o NVLink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/ja/perf_train_cpu.md b/docs/source/ja/perf_train_cpu.md
index b22d7b96aa191c..bf623d131363b5 100644
--- a/docs/source/ja/perf_train_cpu.md
+++ b/docs/source/ja/perf_train_cpu.md
@@ -49,7 +49,7 @@ TrainerでIPEXの自動混合精度を有効にするには、ユーザーはト
 
 - CPU上でBF16自動混合精度を使用してIPEXでトレーニングを行う場合:
 python run_qa.py \
---model_name_or_path bert-base-uncased \
+--model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
 --do_eval \
diff --git a/docs/source/ja/perf_train_cpu_many.md b/docs/source/ja/perf_train_cpu_many.md
index a15cb5d4900a61..26da32f577251f 100644
--- a/docs/source/ja/perf_train_cpu_many.md
+++ b/docs/source/ja/perf_train_cpu_many.md
@@ -100,7 +100,7 @@ IPEXは、Float32およびBFloat16の両方でCPUトレーニングのパフォ
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -134,7 +134,7 @@ node0では、各ノードのIPアドレスを含む構成ファイルを作成
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
diff --git a/docs/source/ja/perf_train_gpu_many.md b/docs/source/ja/perf_train_gpu_many.md
index 44186bba7963c3..d85165d0c547a0 100644
--- a/docs/source/ja/perf_train_gpu_many.md
+++ b/docs/source/ja/perf_train_gpu_many.md
@@ -136,7 +136,7 @@ DPとDDPの他にも違いがありますが、この議論には関係ありま
 # DP
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 python examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
@@ -144,7 +144,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 # DDP w/ NVlink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
@@ -152,7 +152,7 @@ torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 # DDP w/o NVlink
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
diff --git a/docs/source/ja/perf_train_gpu_one.md b/docs/source/ja/perf_train_gpu_one.md
index 215c0914d1f309..2c2bc540e48384 100644
--- a/docs/source/ja/perf_train_gpu_one.md
+++ b/docs/source/ja/perf_train_gpu_one.md
@@ -193,7 +193,7 @@ AdamWオプティマイザの代替手段について詳しく見てみましょ
 1. [`Trainer`]で使用可能な`adafactor`
 2. Trainerで使用可能な`adamw_bnb_8bit`は、デモンストレーション用に以下でサードパーティの統合が提供されています。
 
-比較のため、3Bパラメータモデル(例:「t5-3b」)の場合:
+比較のため、3Bパラメータモデル(例:「google-t5/t5-3b」)の場合:
 * 標準のAdamWオプティマイザは、各パラメータに8バイトを使用するため、24GBのGPUメモリが必要です(8 * 3 => 24GB)。
 * Adafactorオプティマイザは12GB以上必要です。各パラメータにわずか4バイト以上を使用するため、4 * 3と少し余分になります。
 * 8ビットのBNB量子化オプティマイザは、すべてのオプティマイザの状態が量子化されている場合、わずか6GBしか使用しません。
diff --git a/docs/source/ja/perplexity.md b/docs/source/ja/perplexity.md
index aa88a7a212f1f2..368a301ec3ab4a 100644
--- a/docs/source/ja/perplexity.md
+++ b/docs/source/ja/perplexity.md
@@ -56,7 +56,7 @@ GPT-2を使用してこのプロセスをデモンストレーションしてみ
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 
 device = "cuda"
-model_id = "gpt2-large"
+model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 ```
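
A minimal, single-window sketch of the perplexity computation this section builds toward (no sliding window; the sample text is an assumption):

```py
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai-community/gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

text = "Perplexity measures how well a model predicts a sample of text."
input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    # With labels=input_ids the model returns the mean negative log-likelihood.
    neg_log_likelihood = model(input_ids, labels=input_ids).loss

print(torch.exp(neg_log_likelihood))  # perplexity of this single window
```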
diff --git a/docs/source/ja/pipeline_tutorial.md b/docs/source/ja/pipeline_tutorial.md
index 8892a7c4b87687..354e2a2be38022 100644
--- a/docs/source/ja/pipeline_tutorial.md
+++ b/docs/source/ja/pipeline_tutorial.md
@@ -165,7 +165,7 @@ def data():
         yield f"My example {i}"
 
 
-pipe = pipeline(model="gpt2", device=0)
+pipe = pipeline(model="openai-community/gpt2", device=0)
 generated_characters = 0
 for out in pipe(data()):
     generated_characters += len(out[0]["generated_text"])
diff --git a/docs/source/ja/pipeline_webserver.md b/docs/source/ja/pipeline_webserver.md
index c7dd3363748feb..3b35a01490d409 100644
--- a/docs/source/ja/pipeline_webserver.md
+++ b/docs/source/ja/pipeline_webserver.md
@@ -36,7 +36,7 @@ async def homepage(request):
 
 
 async def server_loop(q):
-    pipe = pipeline(model="bert-base-uncased")
+    pipe = pipeline(model="google-bert/bert-base-uncased")
     while True:
         (string, response_q) = await q.get()
         out = pipe(string)
diff --git a/docs/source/ja/preprocessing.md b/docs/source/ja/preprocessing.md
index b8fad2a0d21b36..ea0b98df028031 100644
--- a/docs/source/ja/preprocessing.md
+++ b/docs/source/ja/preprocessing.md
@@ -59,7 +59,7 @@ pip install datasets
 ```python
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 次に、テキストをトークナイザに渡します:
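
For illustration, a minimal sketch of that next step (the sample sentence is illustrative); the tokenizer returns `input_ids`, `token_type_ids`, and `attention_mask`:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
```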
diff --git a/docs/source/ja/quicktour.md b/docs/source/ja/quicktour.md
index e16b2272c26f53..3bec2f827a47ee 100644
--- a/docs/source/ja/quicktour.md
+++ b/docs/source/ja/quicktour.md
@@ -83,7 +83,7 @@ pip install tensorflow
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-[`pipeline`]は、感情分析のためのデフォルトの[事前学習済みモデル](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)とトークナイザをダウンロードしてキャッシュし、使用できるようになります。
+[`pipeline`]は、感情分析のためのデフォルトの[事前学習済みモデル](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english)とトークナイザをダウンロードしてキャッシュし、使用できるようになります。
 これで、`classifier`を対象のテキストに使用できます:
 
 ```python
@@ -411,7 +411,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
 ```python
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
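
A minimal sketch of the step that usually follows: instantiating a randomly initialized model from the customized configuration with `from_config`:

```py
from transformers import AutoConfig, AutoModel

my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
# from_config builds the architecture with randomly initialized weights,
# so the custom attributes take effect without loading pretrained weights.
my_model = AutoModel.from_config(my_config)
```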
 
 
@@ -452,7 +452,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
     ```py
     >>> from transformers import AutoModelForSequenceClassification
 
-    >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+    >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
     ```
 
 2. [`TrainingArguments`]には、変更できるモデルのハイパーパラメータが含まれており、学習率、バッチサイズ、トレーニングエポック数などが変更できます。指定しない場合、デフォルト値が使用されます:
@@ -474,7 +474,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
     ```
 
 4. データセットをロードする:
@@ -547,7 +547,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. トークナイザ、画像プロセッサ、特徴量抽出器、またはプロセッサのような前処理クラスをロードします:
@@ -555,7 +555,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. データセットをトークナイズするための関数を作成します:
diff --git a/docs/source/ja/run_scripts.md b/docs/source/ja/run_scripts.md
index a7cc89d1348491..af99d1c6da9702 100644
--- a/docs/source/ja/run_scripts.md
+++ b/docs/source/ja/run_scripts.md
@@ -92,12 +92,12 @@ pip install -r requirements.txt
 
 
 
-この例のスクリプトは、🤗 [Datasets](https://huggingface.co/docs/datasets/) ライブラリからデータセットをダウンロードし、前処理を行います。次に、[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) を使用して要約をサポートするアーキテクチャ上でデータセットをファインチューニングします。以下の例では、[CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) データセット上で [T5-small](https://huggingface.co/t5-small) をファインチューニングする方法が示されています。T5 モデルは、そのトレーニング方法に起因して追加の `source_prefix` 引数が必要です。このプロンプトにより、T5 はこれが要約タスクであることを知ることができます。
+この例のスクリプトは、🤗 [Datasets](https://huggingface.co/docs/datasets/) ライブラリからデータセットをダウンロードし、前処理を行います。次に、[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) を使用して要約をサポートするアーキテクチャ上でデータセットをファインチューニングします。以下の例では、[CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) データセット上で [T5-small](https://huggingface.co/google-t5/t5-small) をファインチューニングする方法が示されています。T5 モデルは、そのトレーニング方法に起因して追加の `source_prefix` 引数が必要です。このプロンプトにより、T5 はこれが要約タスクであることを知ることができます。
 
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -112,12 +112,12 @@ python examples/pytorch/summarization/run_summarization.py \
 
 
 
-この例のスクリプトは、🤗 [Datasets](https://huggingface.co/docs/datasets/) ライブラリからデータセットをダウンロードして前処理します。その後、スクリプトは要約をサポートするアーキテクチャ上で Keras を使用してデータセットをファインチューニングします。以下の例では、[T5-small](https://huggingface.co/t5-small) を [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) データセットでファインチューニングする方法を示しています。T5 モデルは、そのトレーニング方法に起因して追加の `source_prefix` 引数が必要です。このプロンプトは、T5 にこれが要約タスクであることを知らせます。
+この例のスクリプトは、🤗 [Datasets](https://huggingface.co/docs/datasets/) ライブラリからデータセットをダウンロードして前処理します。その後、スクリプトは要約をサポートするアーキテクチャ上で Keras を使用してデータセットをファインチューニングします。以下の例では、[T5-small](https://huggingface.co/google-t5/t5-small) を [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) データセットでファインチューニングする方法を示しています。T5 モデルは、そのトレーニング方法に起因して追加の `source_prefix` 引数が必要です。このプロンプトは、T5 にこれが要約タスクであることを知らせます。
 
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -143,7 +143,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -167,7 +167,7 @@ Tensor Processing Units (TPUs)は、パフォーマンスを加速させるた
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -186,7 +186,7 @@ python xla_spawn.py --num_cores 8 \
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -226,7 +226,7 @@ Now you are ready to launch the training:
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -245,7 +245,7 @@ accelerate launch run_summarization_no_trainer.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -270,7 +270,7 @@ python examples/pytorch/summarization/run_summarization.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -300,7 +300,7 @@ examples/pytorch/summarization/run_summarization.py -h
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -318,7 +318,7 @@ python examples/pytorch/summarization/run_summarization.py
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -350,7 +350,7 @@ huggingface-cli login
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/ja/serialization.md b/docs/source/ja/serialization.md
index da23b63e6528e7..3e9d81180de046 100644
--- a/docs/source/ja/serialization.md
+++ b/docs/source/ja/serialization.md
@@ -57,10 +57,10 @@ pip install optimum[exporters]
 optimum-cli export onnx --help
 ```
 
-🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `distilbert-base-uncased-distilled-squad` を使いたい場合、以下のコマンドを実行してください:
+🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `distilbert/distilbert-base-uncased-distilled-squad` を使いたい場合、以下のコマンドを実行してください:
 
 ```bash
-optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
 ```
 
 進行状況を示し、結果の `model.onnx` が保存される場所を表示するログは、以下のように表示されるはずです:
@@ -147,7 +147,7 @@ pip install transformers[onnx]
 `transformers.onnx`パッケージをPythonモジュールとして使用して、事前に用意された設定を使用してチェックポイントをエクスポートする方法は以下の通りです:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 この方法は、`--model`引数で定義されたチェックポイントのONNXグラフをエクスポートします。🤗 Hubのいずれかのチェックポイントまたはローカルに保存されたチェックポイントを渡すことができます。エクスポートされた`model.onnx`ファイルは、ONNX標準をサポートする多くのアクセラレータで実行できます。例えば、ONNX Runtimeを使用してモデルを読み込んで実行する方法は以下の通りです:
@@ -157,7 +157,7 @@ python -m transformers.onnx --model=distilbert-base-uncased onnx/
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
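
As a hedged continuation of that snippet, the session can then be run on the NumPy inputs; the output name `last_hidden_state` is an assumption that matches the default feature of this export:

```py
from transformers import AutoTokenizer
from onnxruntime import InferenceSession

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
session = InferenceSession("onnx/model.onnx")

inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
# ONNX Runtime returns one NumPy array per requested output name.
outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
print(outputs[0].shape)
```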
diff --git a/docs/source/ja/task_summary.md b/docs/source/ja/task_summary.md
index 74c3f1436412d0..0069f6afaf3205 100644
--- a/docs/source/ja/task_summary.md
+++ b/docs/source/ja/task_summary.md
@@ -281,7 +281,7 @@ score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
 >>> from transformers import pipeline
 
 >>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
->>> translator = pipeline(task="translation", model="t5-small")
+>>> translator = pipeline(task="translation", model="google-t5/t5-small")
 >>> translator(text)
 [{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
 ```
diff --git a/docs/source/ja/tasks/language_modeling.md b/docs/source/ja/tasks/language_modeling.md
index b7ad65c6c4a210..835a0d54ea4ffd 100644
--- a/docs/source/ja/tasks/language_modeling.md
+++ b/docs/source/ja/tasks/language_modeling.md
@@ -32,7 +32,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. [ELI5](https:/) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilGPT2](https://huggingface.co/distilgpt2) を微調整します。 /huggingface.co/datasets/eli5) データセット。
+1. [ELI5](https:/) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilGPT2](https://huggingface.co/distilbert/distilgpt2) を微調整します。 /huggingface.co/datasets/eli5) データセット。
 2. 微調整したモデルを推論に使用します。
 
 
@@ -112,7 +112,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 ```
 
 上の例からわかるように、`text`フィールドは実際には`answers`内にネストされています。つまり、次のことが必要になります。
@@ -234,7 +234,7 @@ Apply the `group_texts` function over the entire dataset:
 ```py
 >>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
 
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 この時点で残っている手順は次の 3 つだけです。
@@ -298,7 +298,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 ```py
 >>> from transformers import TFAutoModelForCausalLM
 
->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
diff --git a/docs/source/ja/tasks/masked_language_modeling.md b/docs/source/ja/tasks/masked_language_modeling.md
index 3cf6db70f2e9d6..b0fff72f9b0e26 100644
--- a/docs/source/ja/tasks/masked_language_modeling.md
+++ b/docs/source/ja/tasks/masked_language_modeling.md
@@ -26,7 +26,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. [ELI5](https://huggingface.co/distilroberta-base) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilRoBERTa](https://huggingface.co/distilroberta-base) を微調整します。 ://huggingface.co/datasets/eli5) データセット。
+1. [ELI5](https://huggingface.co/distilbert/distilroberta-base) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) を微調整します。 ://huggingface.co/datasets/eli5) データセット。
 2. 微調整したモデルを推論に使用します。
 
 
@@ -101,7 +101,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
 ```
 
 上の例からわかるように、`text`フィールドは実際には`answers`内にネストされています。これは、次のことを行う必要があることを意味します
@@ -219,7 +219,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoModelForMaskedLM
 
->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 この時点で残っている手順は次の 3 つだけです。
@@ -287,7 +287,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 ```py
 >>> from transformers import TFAutoModelForMaskedLM
 
->>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
diff --git a/docs/source/ja/tasks/multiple_choice.md b/docs/source/ja/tasks/multiple_choice.md
index 6b634710550be6..bfe5f388cb4ab6 100644
--- a/docs/source/ja/tasks/multiple_choice.md
+++ b/docs/source/ja/tasks/multiple_choice.md
@@ -22,7 +22,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. [SWAG](https://huggingface.co/datasets/swag) データセットの「通常」構成で [BERT](https://huggingface.co/bert-base-uncased) を微調整して、最適なデータセットを選択します複数の選択肢と何らかのコンテキストを考慮して回答します。
+1. [SWAG](https://huggingface.co/datasets/swag) データセットの「通常」構成で [BERT](https://huggingface.co/google-bert/bert-base-uncased) を微調整して、最適なデータセットを選択します複数の選択肢と何らかのコンテキストを考慮して回答します。
 2. 微調整したモデルを推論に使用します。
 
 
@@ -90,7 +90,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 作成する前処理関数は次のことを行う必要があります。
@@ -254,7 +254,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True)
 ```py
 >>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
 
->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 この時点で残っている手順は次の 3 つだけです。
@@ -318,7 +318,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 ```py
 >>> from transformers import TFAutoModelForMultipleChoice
 
->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
diff --git a/docs/source/ja/tasks/prompting.md b/docs/source/ja/tasks/prompting.md
index 1c85bd7a20a087..bd66e751ee61d6 100644
--- a/docs/source/ja/tasks/prompting.md
+++ b/docs/source/ja/tasks/prompting.md
@@ -76,7 +76,7 @@ Falcon、LLaMA などの大規模言語モデルは、事前にトレーニン
 
 >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
 
->>> generator = pipeline('text-generation', model = 'gpt2')
+>>> generator = pipeline('text-generation', model = 'openai-community/gpt2')
 >>> prompt = "Hello, I'm a language model"
 
 >>> generator(prompt, max_length = 30)
diff --git a/docs/source/ja/tasks/question_answering.md b/docs/source/ja/tasks/question_answering.md
index 9c2ca869ffc5d6..54df687c2f047f 100644
--- a/docs/source/ja/tasks/question_answering.md
+++ b/docs/source/ja/tasks/question_answering.md
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. 抽出的質問応答用に [SQuAD](https://huggingface.co/datasets/squad) データセット上の [DistilBERT](https://huggingface.co/distilbert-base-uncased) を微調整します。
+1. 抽出的質問応答用に [SQuAD](https://huggingface.co/datasets/squad) データセット上の [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) を微調整します。
 2. 微調整したモデルを推論に使用します。
 
 
@@ -102,7 +102,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 質問応答タスクに特有の、注意すべき前処理手順がいくつかあります。
@@ -208,7 +208,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
 
->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 この時点で残っている手順は次の 3 つだけです。
@@ -276,7 +276,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 ```py
 >>> from transformers import TFAutoModelForQuestionAnswering
 
->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
+>>> model = TFAutoModelForQuestionAnswering("distilbert/distilbert-base-uncased")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md
index 47b04888d4865f..a4b012d712f2e7 100644
--- a/docs/source/ja/tasks/summarization.md
+++ b/docs/source/ja/tasks/summarization.md
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. 抽象的な要約のために、[BillSum](https://huggingface.co/datasets/billsum) データセットのカリフォルニア州請求書サブセットで [T5](https://huggingface.co/t5-small) を微調整します。
+1. 抽象的な要約のために、[BillSum](https://huggingface.co/datasets/billsum) データセットのカリフォルニア州請求書サブセットで [T5](https://huggingface.co/google-t5/t5-small) を微調整します。
 2. 微調整したモデルを推論に使用します。
 
 
@@ -92,7 +92,7 @@ pip install transformers datasets evaluate rouge_score
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/ja/tasks/token_classification.md b/docs/source/ja/tasks/token_classification.md
index a4b759d6b5b3b7..2b650c4a844d84 100644
--- a/docs/source/ja/tasks/token_classification.md
+++ b/docs/source/ja/tasks/token_classification.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. [WNUT 17](https://huggingface.co/datasets/wnut_17) データセットで [DistilBERT](https://huggingface.co/distilbert-base-uncased) を微調整して、新しいエンティティを検出します。
+1. [WNUT 17](https://huggingface.co/datasets/wnut_17) データセットで [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) を微調整して、新しいエンティティを検出します。
 2. 微調整されたモデルを推論に使用します。
 
 
@@ -107,7 +107,7 @@ pip install transformers datasets evaluate seqeval
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 上の `tokens`フィールドの例で見たように、入力はすでにトークン化されているようです。しかし、実際には入力はまだトークン化されていないため、単語をサブワードにトークン化するには`is_split_into_words=True` を設定する必要があります。例えば:
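
A minimal sketch of that call (the token list is a hypothetical record in the WNUT format used by this guide):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

example_tokens = ["@paulwalk", "It", "'s", "the", "view", "from", "where", "I", "'m", "living"]
tokenized_input = tokenizer(example_tokens, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)  # sub-word pieces plus [CLS]/[SEP]; word_ids() maps them back to the original words
```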
@@ -270,7 +270,7 @@ pip install transformers datasets evaluate seqeval
 >>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
 
 >>> model = AutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 
@@ -340,7 +340,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 >>> from transformers import TFAutoModelForTokenClassification
 
 >>> model = TFAutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md
index 9004a87fcbfff6..fb2c89f3856d49 100644
--- a/docs/source/ja/tasks/translation.md
+++ b/docs/source/ja/tasks/translation.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 このガイドでは、次の方法を説明します。
 
-1. [OPUS Books](https://huggingface.co/datasets/opus_books) データセットの英語-フランス語サブセットの [T5](https://huggingface.co/t5-small) を微調整して、英語のテキストを次の形式に翻訳します。フランス語。
+1. [OPUS Books](https://huggingface.co/datasets/opus_books) データセットの英語-フランス語サブセットの [T5](https://huggingface.co/google-t5/t5-small) を微調整して、英語のテキストを次の形式に翻訳します。フランス語。
 2. 微調整されたモデルを推論に使用します。
 
 
@@ -88,7 +88,7 @@ pip install transformers datasets evaluate sacrebleu
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/ja/tf_xla.md b/docs/source/ja/tf_xla.md
index d5d83725372766..1f5a2af1a5a288 100644
--- a/docs/source/ja/tf_xla.md
+++ b/docs/source/ja/tf_xla.md
@@ -88,8 +88,8 @@ from transformers.utils import check_min_version
 check_min_version("4.21.0")
 
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 # One line to create an XLA generation function
@@ -118,8 +118,8 @@ XLAを有効にした関数(上記の`xla_generate()`など)を初めて実
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 xla_generate = tf.function(model.generate, jit_compile=True)
@@ -139,8 +139,8 @@ import time
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
 xla_generate = tf.function(model.generate, jit_compile=True)
 
diff --git a/docs/source/ja/tflite.md b/docs/source/ja/tflite.md
index 8ef20a27bebcfb..ad3e9a3f484e2c 100644
--- a/docs/source/ja/tflite.md
+++ b/docs/source/ja/tflite.md
@@ -34,10 +34,10 @@ pip install optimum[exporters-tf]
 optimum-cli export tflite --help
 ```
 
-🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `bert-base-uncased` を使用する場合、次のコマンドを実行します:
+🤗 Hubからモデルのチェックポイントをエクスポートするには、例えば `google-bert/bert-base-uncased` を使用する場合、次のコマンドを実行します:
 
 ```bash
-optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
 ```
 
 進行状況を示すログが表示され、生成された `model.tflite` が保存された場所も表示されるはずです:
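
Once the export finishes, the generated file can be sanity-checked with the TensorFlow Lite interpreter. A minimal sketch, assuming the `bert_tflite/model.tflite` path produced by the command above:

```py
# Minimal sketch (assumption: the optimum-cli command above wrote bert_tflite/model.tflite).
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="bert_tflite/model.tflite")
interpreter.allocate_tensors()
print(interpreter.get_input_details())   # expected input tensors (ids, attention mask, token type ids)
print(interpreter.get_output_details())
```
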
diff --git a/docs/source/ja/tokenizer_summary.md b/docs/source/ja/tokenizer_summary.md
index e17201d7972e3a..448ad9c871aaa3 100644
--- a/docs/source/ja/tokenizer_summary.md
+++ b/docs/source/ja/tokenizer_summary.md
@@ -76,7 +76,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 >>> tokenizer.tokenize("I have a new GPU!")
 ["i", "have", "a", "new", "gp", "##u", "!"]
 ```
@@ -88,7 +88,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import XLNetTokenizer
 
->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased")
 >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
 ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
 ```
diff --git a/docs/source/ja/torchscript.md b/docs/source/ja/torchscript.md
index 99926a0dae8960..27d64a625c8c42 100644
--- a/docs/source/ja/torchscript.md
+++ b/docs/source/ja/torchscript.md
@@ -71,7 +71,7 @@ TorchScriptで`BertModel`をエクスポートするには、`BertConfig`クラ
 from transformers import BertModel, BertTokenizer, BertConfig
 import torch
 
-enc = BertTokenizer.from_pretrained("bert-base-uncased")
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 # Tokenizing input text
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -106,7 +106,7 @@ model = BertModel(config)
 model.eval()
 
 # If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
-model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
 
 # Creating the trace
 traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
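
After tracing, the usual next step is to serialize the graph and load it back independently of the Python class. A minimal sketch reusing the objects defined above:

```py
# Minimal sketch (assumption: `traced_model`, `tokens_tensor` and `segments_tensors` come from the code above).
torch.jit.save(traced_model, "traced_bert.pt")

loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()

all_encoder_layers, pooled_output = loaded_model(tokens_tensor, segments_tensors)
```
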
diff --git a/docs/source/ja/training.md b/docs/source/ja/training.md
index 4e5dbaa77aefad..79fbb1b7fb2571 100644
--- a/docs/source/ja/training.md
+++ b/docs/source/ja/training.md
@@ -55,7 +55,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 >>> def tokenize_function(examples):
 ...     return tokenizer(examples["text"], padding="max_length", truncation=True)
@@ -91,7 +91,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -194,7 +194,7 @@ dataset = dataset["train"]  # 今のところトレーニング分割のみを
 ```python
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
 # トークナイザはBatchEncodingを返しますが、それをKeras用に辞書に変換します
 tokenized_data = dict(tokenized_data)
@@ -210,7 +210,7 @@ from transformers import TFAutoModelForSequenceClassification
 from tensorflow.keras.optimizers import Adam
 
 # モデルをロードしてコンパイルする
-model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
 # ファインチューニングには通常、学習率を下げると良いです
 model.compile(optimizer=Adam(3e-5))  # 損失関数の指定は不要です!
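
With the model compiled, training is a single `fit` call on the tokenized arrays. A minimal sketch, assuming `dataset` is the GLUE CoLA split prepared above:

```py
# Minimal sketch (assumption: `dataset` and `tokenized_data` were prepared as above).
import numpy as np

labels = np.array(dataset["label"])  # already an array of 0s and 1s
model.fit(tokenized_data, labels)
```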
 
@@ -332,7 +332,7 @@ torch.cuda.empty_cache()
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Optimizer and learning rate scheduler
diff --git a/docs/source/ja/troubleshooting.md b/docs/source/ja/troubleshooting.md
index ece688d46a7bf5..b13b5993171a0a 100644
--- a/docs/source/ja/troubleshooting.md
+++ b/docs/source/ja/troubleshooting.md
@@ -132,7 +132,7 @@ GPUからより良いトレースバックを取得する別のオプション
 >>> from transformers import AutoModelForSequenceClassification
 >>> import torch
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 >>> model.config.pad_token_id
 0
 ```
@@ -188,8 +188,8 @@ tensor([[ 0.0082, -0.2307],
 ```py
 >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
 
->>> processor = AutoProcessor.from_pretrained("gpt2-medium")
->>> model = AutoModelForQuestionAnswering.from_pretrained("gpt2-medium")
+>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium")
 ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForQuestionAnswering.
 Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ...
 ```
diff --git a/docs/source/ko/add_tensorflow_model.md b/docs/source/ko/add_tensorflow_model.md
index 378f2163b5dba2..22980b1320c55b 100644
--- a/docs/source/ko/add_tensorflow_model.md
+++ b/docs/source/ko/add_tensorflow_model.md
@@ -33,7 +33,7 @@ rendered properly in your Markdown viewer.
 
 사용하려는 모델이 이미 해당하는 TensorFlow 아키텍처가 있는지 확실하지 않나요?
 
-선택한 모델([예](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14))의 `config.json`의 `model_type` 필드를 확인해보세요. 🤗 Transformers의 해당 모델 폴더에는 "modeling_tf"로 시작하는 파일이 있는 경우, 해당 모델에는 해당 TensorFlow 아키텍처([예](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))가 있다는 의미입니다.
+선택한 모델([예](https://huggingface.co/google-bert/bert-base-uncased/blob/main/config.json#L14))의 `config.json`의 `model_type` 필드를 확인해보세요. 🤗 Transformers의 해당 모델 폴더에는 "modeling_tf"로 시작하는 파일이 있는 경우, 해당 모델에는 해당 TensorFlow 아키텍처([예](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert))가 있다는 의미입니다.
 
 
 
diff --git a/docs/source/ko/autoclass_tutorial.md b/docs/source/ko/autoclass_tutorial.md
index 9ecfd9c2015d1e..e41a2acc7b486b 100644
--- a/docs/source/ko/autoclass_tutorial.md
+++ b/docs/source/ko/autoclass_tutorial.md
@@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 
 
 
-아키텍처는 모델의 골격을 의미하며 체크포인트는 주어진 아키텍처에 대한 가중치입니다. 예를 들어, [BERT](https://huggingface.co/bert-base-uncased)는 아키텍처이고, `bert-base-uncased`는 체크포인트입니다. 모델은 아키텍처 또는 체크포인트를 의미할 수 있는 일반적인 용어입니다.
+아키텍처는 모델의 골격을 의미하며 체크포인트는 주어진 아키텍처에 대한 가중치입니다. 예를 들어, [BERT](https://huggingface.co/google-bert/bert-base-uncased)는 아키텍처이고, `google-bert/bert-base-uncased`는 체크포인트입니다. 모델은 아키텍처 또는 체크포인트를 의미할 수 있는 일반적인 용어입니다.
 
 
 
@@ -41,7 +41,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 그리고 아래와 같이 입력을 토큰화합니다:
@@ -100,7 +100,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 동일한 체크포인트를 쉽게 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다:
@@ -108,7 +108,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -128,7 +128,7 @@ PyTorch모델의 경우 `from_pretrained()` 메서드는 내부적으로 피클
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 쉽게 동일한 체크포인트를 재사용하여 다른 작업에 아키텍처를 로드할 수 있습니다:
@@ -136,7 +136,7 @@ PyTorch모델의 경우 `from_pretrained()` 메서드는 내부적으로 피클
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 일반적으로, `AutoTokenizer`클래스와 `TFAutoModelFor` 클래스를 사용하여 미리 학습된 모델 인스턴스를 로드하는 것이 좋습니다. 이렇게 하면 매번 올바른 아키텍처를 로드할 수 있습니다. 다음 [튜토리얼](preprocessing)에서는 새롭게 로드한 토크나이저, 이미지 프로세서, 특징 추출기를 사용하여 미세 튜닝용 데이터 세트를 전처리하는 방법에 대해 알아봅니다.
diff --git a/docs/source/ko/big_models.md b/docs/source/ko/big_models.md
index 17b3d8db61e8c9..3180b51117a97b 100644
--- a/docs/source/ko/big_models.md
+++ b/docs/source/ko/big_models.md
@@ -41,7 +41,7 @@ rendered properly in your Markdown viewer.
 ```py
 from transformers import AutoModel
 
-model = AutoModel.from_pretrained("bert-base-cased")
+model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 ```
 
 [`~PreTrainedModel.save_pretrained`]을 사용하여 모델을 저장하면, 모델의 구성과 가중치가 들어있는 두 개의 파일이 있는 새 폴더가 생성됩니다:
diff --git a/docs/source/ko/community.md b/docs/source/ko/community.md
index 2d12e9de4a280d..d50168d7548620 100644
--- a/docs/source/ko/community.md
+++ b/docs/source/ko/community.md
@@ -43,8 +43,8 @@ rendered properly in your Markdown viewer.
 |[감정 분석을 위해 Roberta 미세 조정하기](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | 감정 분석을 위해 Roberta 모델을 미세 조정하는 방법 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
 |[질문 생성 모델 평가하기](https://github.com/flexudy-pipe/qugeev) | seq2seq 트랜스포머 모델이 생성한 질문과 이에 대한 답변이 얼마나 정확한가요? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
 |[DistilBERT와 Tensorflow로 텍스트 분류하기](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | 텍스트 분류를 위해 TensorFlow로  DistilBERT를 미세 조정하는 방법 | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
-|[CNN/Dailail 요약을 위해 인코더-디코더 모델에 BERT 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | CNN/Dailail 요약을 위해 *bert-base-uncased* 체크포인트를 활용하여 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
-|[BBC XSum 요약을 위해 인코더-디코더 모델에 RoBERTa 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | BBC/XSum 요약을 위해 *roberta-base* 체크포인트를 활용하여 공유 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
+|[CNN/Dailymail 요약을 위해 인코더-디코더 모델에 BERT 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | CNN/Dailymail 요약을 위해 *google-bert/bert-base-uncased* 체크포인트를 활용하여 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
+|[BBC XSum 요약을 위해 인코더-디코더 모델에 RoBERTa 활용하기](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | BBC/XSum 요약을 위해 *FacebookAI/roberta-base* 체크포인트를 활용하여 공유 *EncoderDecoderModel*을 워밍업하는 방법 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
 |[순차적 질문 답변(SQA)을 위해 TAPAS 미세 조정하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | *tapas-base* 체크포인트를 활용하여 순차적 질문 답변(SQA) 데이터 세트로 *TapasForQuestionAnswering*을 미세 조정하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
 |[표 사실 검사(TabFact)로 TAPAS 평가하기](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | 🤗 Datasets와 🤗 Transformer 라이브러리를 함께 사용하여 *tapas-base-finetuned-tabfact* 체크포인트로 미세 조정된 *TapasForSequenceClassification*을 평가하는 방법 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
 |[번역을 위해 mBART 미세 조정하기](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | 힌디어에서 영어로 번역하기 위해 Seq2SeqTrainer를 사용하여 mBART를 미세 조정하는 방법 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
diff --git a/docs/source/ko/create_a_model.md b/docs/source/ko/create_a_model.md
index 62a118563f1c0d..b911669bb174b9 100644
--- a/docs/source/ko/create_a_model.md
+++ b/docs/source/ko/create_a_model.md
@@ -87,7 +87,7 @@ DistilBertConfig {
 사전 학습된 모델 속성은 [`~PretrainedConfig.from_pretrained`] 함수에서 수정할 수 있습니다:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 모델 구성이 만족스러우면 [`~PretrainedConfig.save_pretrained`]로 저장할 수 있습니다. 설정 파일은 지정된 작업 경로에 JSON 파일로 저장됩니다:
@@ -128,13 +128,13 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 사전 학습된 모델을 [`~PreTrainedModel.from_pretrained`]로 생성합니다:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -152,13 +152,13 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 사전 학습된 모델을 [`~TFPreTrainedModel.from_pretrained`]로 생성합니다:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 🤗 Transformers에서 제공한 모델의 사전 학습된 가중치를 사용하는 경우 기본 모델 configuration을 자동으로 불러옵니다. 그러나 원하는 경우 기본 모델 configuration 속성의 일부 또는 전부를 사용자 지정으로 바꿀 수 있습니다:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -174,7 +174,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`DistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다.
@@ -182,7 +182,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -191,7 +191,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 다른 모델 헤드로 전환하여 이 체크포인트를 다른 작업에 쉽게 재사용할 수 있습니다. 질의응답 작업의 경우, [`TFDistilBertForQuestionAnswering`] 모델 헤드를 사용할 수 있습니다. 질의응답 헤드는 숨겨진 상태 출력 위에 선형 레이어가 있다는 점을 제외하면 시퀀스 분류 헤드와 유사합니다.
@@ -199,7 +199,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -231,7 +231,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 [`DistilBertTokenizerFast`] 클래스로 빠른 토크나이저를 생성합니다:
@@ -239,7 +239,7 @@ configuration 파일을 딕셔너리로 저장하거나 사용자 정의 configu
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
diff --git a/docs/source/ko/custom_tools.md b/docs/source/ko/custom_tools.md
index 6e07ccf86c5601..853d69187f6aaa 100644
--- a/docs/source/ko/custom_tools.md
+++ b/docs/source/ko/custom_tools.md
@@ -548,7 +548,7 @@ task = "text-classification"
 model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
 print(model.id)
 ```
-`text-classification`(텍스트 분류) 작업의 경우 `'facebook/bart-large-mnli'`를 반환하고, `translation`(번역) 작업의 경우 `'t5-base'`를 반환합니다.
+`text-classification`(텍스트 분류) 작업의 경우 `'facebook/bart-large-mnli'`를 반환하고, `translation`(번역) 작업의 경우 `'google-t5/t5-base'`를 반환합니다.
 
 이를 에이전트가 활용할 수 있는 도구로 변환하려면 어떻게 해야 할까요? 
 모든 도구는 필요한 주요 속성을 보유하는 슈퍼클래스 `Tool`에 의존합니다. 이를 상속하는 클래스를 만들어 보겠습니다:
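
A sketch of such a subclass, following the shape used by the custom tools API of this era (the tool name and description here are illustrative, not a fixed contract):

```py
# Illustrative sketch of a Tool subclass wrapping the snippet above.
from transformers import Tool
from huggingface_hub import list_models


class HFModelDownloadsTool(Tool):
    name = "model_download_counter"
    description = "Returns the checkpoint with the most downloads on the Hub for a given task."
    inputs = ["text"]
    outputs = ["text"]

    def __call__(self, task: str):
        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
        return model.id
```
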
diff --git a/docs/source/ko/installation.md b/docs/source/ko/installation.md
index f7995aa487da0d..062184e5b3ba6c 100644
--- a/docs/source/ko/installation.md
+++ b/docs/source/ko/installation.md
@@ -168,14 +168,14 @@ conda install conda-forge::transformers
 예를 들어 외부 기기 사이에 방화벽을 둔 일반 네트워크에서 평소처럼 프로그램을 다음과 같이 실행할 수 있습니다.
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 오프라인 기기에서 동일한 프로그램을 다음과 같이 실행할 수 있습니다.
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 이제 스크립트는 로컬 파일에 한해서만 검색할 것이므로, 스크립트가 중단되거나 시간이 초과될 때까지 멈춰있지 않고 잘 실행될 것입니다.
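
The same offline behavior can also be requested per call rather than through environment variables: `from_pretrained` accepts `local_files_only=True`, which only consults the local cache. A minimal sketch, assuming the checkpoint was downloaded beforehand:

```py
# Minimal sketch (assumption: google-t5/t5-small is already present in the local cache).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", local_files_only=True)
```
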
diff --git a/docs/source/ko/model_memory_anatomy.md b/docs/source/ko/model_memory_anatomy.md
index 351cbebe0285b8..5701e19aaa085d 100644
--- a/docs/source/ko/model_memory_anatomy.md
+++ b/docs/source/ko/model_memory_anatomy.md
@@ -85,14 +85,14 @@ GPU memory occupied: 1343 MB.
 
 ## 모델 로드 [[load-model]]
 
-우선, `bert-large-uncased` 모델을 로드합니다. 모델의 가중치를 직접 GPU에 로드해서 가중치만이 얼마나 많은 공간을 차지하는지 확인할 수 있습니다.
+우선, `google-bert/bert-large-uncased` 모델을 로드합니다. 모델의 가중치를 직접 GPU에 로드해서 가중치만이 얼마나 많은 공간을 차지하는지 확인할 수 있습니다.
 
 
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda")
 >>> print_gpu_utilization()
 GPU memory occupied: 2631 MB.
 ```
diff --git a/docs/source/ko/model_sharing.md b/docs/source/ko/model_sharing.md
index ed6836e8de568d..868cc3b231de93 100644
--- a/docs/source/ko/model_sharing.md
+++ b/docs/source/ko/model_sharing.md
@@ -229,4 +229,4 @@ Flax에서 모델을 사용하는 경우, PyTorch에서 Flax로 체크포인트
 * `README.md` 파일을 수동으로 생성하여 업로드합니다.
 * 모델 저장소에서 **Edit model card** 버튼을 클릭합니다.
 
-모델 카드에 포함할 정보 유형에 대한 좋은 예는 DistilBert [모델 카드](https://huggingface.co/distilbert-base-uncased)를 참조하세요. 모델의 탄소 발자국이나 위젯 예시 등 `README.md` 파일에서 제어할 수 있는 다른 옵션에 대한 자세한 내용은 [여기](https://huggingface.co/docs/hub/models-cards) 문서를 참조하세요.
+모델 카드에 포함할 정보 유형에 대한 좋은 예는 DistilBert [모델 카드](https://huggingface.co/distilbert/distilbert-base-uncased)를 참조하세요. 모델의 탄소 발자국이나 위젯 예시 등 `README.md` 파일에서 제어할 수 있는 다른 옵션에 대한 자세한 내용은 [여기](https://huggingface.co/docs/hub/models-cards) 문서를 참조하세요.
diff --git a/docs/source/ko/multilingual.md b/docs/source/ko/multilingual.md
index 2862bd98388706..c0eee024358f3e 100644
--- a/docs/source/ko/multilingual.md
+++ b/docs/source/ko/multilingual.md
@@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 🤗 Transformers에는 여러 종류의 다국어(multilingual) 모델이 있으며, 단일 언어(monolingual) 모델과 추론 시 사용법이 다릅니다.
 그렇다고 해서 *모든* 다국어 모델의 사용법이 다른 것은 아닙니다.
 
-[bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased)와 같은 몇몇 모델은 단일 언어 모델처럼 사용할 수 있습니다.
+[google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)와 같은 몇몇 모델은 단일 언어 모델처럼 사용할 수 있습니다.
 이번 가이드에서 다국어 모델의 추론 시 사용 방법을 알아볼 것입니다.
 
 ## XLM[[xlm]]
@@ -33,25 +33,25 @@ XLM에는 10가지 체크포인트(checkpoint)가 있는데, 이 중 하나만 
 
 다음 XLM 모델은 추론 시에 언어 임베딩을 사용합니다:
 
-- `xlm-mlm-ende-1024` (마스킹된 언어 모델링, 영어-독일어)
-- `xlm-mlm-enfr-1024` (마스킹된 언어 모델링, 영어-프랑스어)
-- `xlm-mlm-enro-1024` (마스킹된 언어 모델링, 영어-루마니아어)
-- `xlm-mlm-xnli15-1024` (마스킹된 언어 모델링, XNLI 데이터 세트에서 제공하는 15개 국어)
-- `xlm-mlm-tlm-xnli15-1024` (마스킹된 언어 모델링 + 번역, XNLI 데이터 세트에서 제공하는 15개 국어)
-- `xlm-clm-enfr-1024` (Causal language modeling, 영어-프랑스어)
-- `xlm-clm-ende-1024` (Causal language modeling, 영어-독일어)
+- `FacebookAI/xlm-mlm-ende-1024` (마스킹된 언어 모델링, 영어-독일어)
+- `FacebookAI/xlm-mlm-enfr-1024` (마스킹된 언어 모델링, 영어-프랑스어)
+- `FacebookAI/xlm-mlm-enro-1024` (마스킹된 언어 모델링, 영어-루마니아어)
+- `FacebookAI/xlm-mlm-xnli15-1024` (마스킹된 언어 모델링, XNLI 데이터 세트에서 제공하는 15개 국어)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (마스킹된 언어 모델링 + 번역, XNLI 데이터 세트에서 제공하는 15개 국어)
+- `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, 영어-프랑스어)
+- `FacebookAI/xlm-clm-ende-1024` (Causal language modeling, 영어-독일어)
 
 언어 임베딩은 모델에 전달된 `input_ids`와 동일한 shape의 텐서로 표현됩니다.
 이러한 텐서의 값은 사용된 언어에 따라 다르며 토크나이저의 `lang2id` 및 `id2lang` 속성에 의해 식별됩니다.
 
-다음 예제에서는 `xlm-clm-enfr-1024` 체크포인트(코잘 언어 모델링(causal language modeling), 영어-프랑스어)를 가져옵니다:
+다음 예제에서는 `FacebookAI/xlm-clm-enfr-1024` 체크포인트(코잘 언어 모델링(causal language modeling), 영어-프랑스어)를 가져옵니다:
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 토크나이저의 `lang2id` 속성은 모델의 언어와 해당 ID를 표시합니다:
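
A minimal sketch of that step: read `lang2id`, build a `langs` tensor with the same shape as `input_ids`, and pass it to the model (the sample sentence is illustrative):

```py
# Minimal sketch (assumption: `tokenizer` and `model` are the FacebookAI/xlm-clm-enfr-1024 pair loaded above).
import torch

print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])
language_id = tokenizer.lang2id["en"]
langs = torch.full_like(input_ids, language_id)  # one language id per input token

outputs = model(input_ids, langs=langs)
```
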
@@ -91,8 +91,8 @@ XLM에는 10가지 체크포인트(checkpoint)가 있는데, 이 중 하나만 
 
 다음 XLM 모델은 추론 시에 언어 임베딩이 필요하지 않습니다:
 
-- `xlm-mlm-17-1280` (마스킹된 언어 모델링, 17개 국어)
-- `xlm-mlm-100-1280` (마스킹된 언어 모델링, 100개 국어)
+- `FacebookAI/xlm-mlm-17-1280` (마스킹된 언어 모델링, 17개 국어)
+- `FacebookAI/xlm-mlm-100-1280` (마스킹된 언어 모델링, 100개 국어)
 
 이전의 XLM 체크포인트와 달리 이 모델은 일반 문장 표현에 사용됩니다.
 
@@ -100,8 +100,8 @@ XLM에는 10가지 체크포인트(checkpoint)가 있는데, 이 중 하나만 
 
 다음 BERT 모델은 다국어 태스크에 사용할 수 있습니다:
 
-- `bert-base-multilingual-uncased` (마스킹된 언어 모델링 + 다음 문장 예측, 102개 국어)
-- `bert-base-multilingual-cased` (마스킹된 언어 모델링 + 다음 문장 예측, 104개 국어)
+- `google-bert/bert-base-multilingual-uncased` (마스킹된 언어 모델링 + 다음 문장 예측, 102개 국어)
+- `google-bert/bert-base-multilingual-cased` (마스킹된 언어 모델링 + 다음 문장 예측, 104개 국어)
 
 이러한 모델은 추론 시에 언어 임베딩이 필요하지 않습니다. 
 문맥에서 언어를 식별하고, 식별된 언어로 추론합니다.
@@ -110,8 +110,8 @@ XLM에는 10가지 체크포인트(checkpoint)가 있는데, 이 중 하나만 
 
 다음 XLM-RoBERTa 또한 다국어 태스크에 사용할 수 있습니다:
 
-- `xlm-roberta-base` (마스킹된 언어 모델링, 100개 국어)
-- `xlm-roberta-large` (마스킹된 언어 모델링, 100개 국어)
+- `FacebookAI/xlm-roberta-base` (마스킹된 언어 모델링, 100개 국어)
+- `FacebookAI/xlm-roberta-large` (마스킹된 언어 모델링, 100개 국어)
 
 XLM-RoBERTa는 100개 국어에 대해 새로 생성되고 정제된 2.5TB 규모의 CommonCrawl 데이터로 학습되었습니다.
 이전에 공개된 mBERT나 XLM과 같은 다국어 모델에 비해 분류, 시퀀스 라벨링, 질의 응답과 같은 다운스트림(downstream) 작업에서 이점이 있습니다.
diff --git a/docs/source/ko/perf_hardware.md b/docs/source/ko/perf_hardware.md
index dedb9a60ed1abc..01282a0c711147 100644
--- a/docs/source/ko/perf_hardware.md
+++ b/docs/source/ko/perf_hardware.md
@@ -117,7 +117,7 @@ GPU1    PHB      X      0-11            N/A
 
 따라서 `nvidia-smi topo -m`의 결과에서 `NVX`의 값이 높을수록 더 좋습니다. 세대는 GPU 아키텍처에 따라 다를 수 있습니다.
 
-그렇다면, gpt2를 작은 wikitext 샘플로 학습시키는 예제를 통해, NVLink가 훈련에 어떤 영향을 미치는지 살펴보겠습니다.
+그렇다면, openai-community/gpt2를 작은 wikitext 샘플로 학습시키는 예제를 통해, NVLink가 훈련에 어떤 영향을 미치는지 살펴보겠습니다.
 
 결과는 다음과 같습니다:
 
@@ -136,7 +136,7 @@ NVLink 사용 시 훈련이 약 23% 더 빠르게 완료됨을 확인할 수 있
 # DDP w/ NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -145,7 +145,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 # DDP w/o NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/ko/perf_train_cpu.md b/docs/source/ko/perf_train_cpu.md
index f0398aaa262728..1a6c58b25afae1 100644
--- a/docs/source/ko/perf_train_cpu.md
+++ b/docs/source/ko/perf_train_cpu.md
@@ -49,7 +49,7 @@ Trainer에서 IPEX의 자동 혼합 정밀도를 활성화하려면 사용자는
 
 - CPU에서 BF16 자동 혼합 정밀도를 사용하여 IPEX로 훈련하기:
 
 python run_qa.py \
---model_name_or_path bert-base-uncased \
+--model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
 --do_eval \
diff --git a/docs/source/ko/perf_train_cpu_many.md b/docs/source/ko/perf_train_cpu_many.md
index 9ff4cfbfa6eb80..e7a68971a7dc54 100644
--- a/docs/source/ko/perf_train_cpu_many.md
+++ b/docs/source/ko/perf_train_cpu_many.md
@@ -88,7 +88,7 @@ Trainer에서 ccl 백엔드를 사용하여 멀티 CPU 분산 훈련을 활성
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
@@ -117,7 +117,7 @@ Trainer에서 ccl 백엔드를 사용하여 멀티 CPU 분산 훈련을 활성
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
  python3 run_qa.py \
- --model_name_or_path bert-large-uncased \
+ --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
diff --git a/docs/source/ko/perf_train_gpu_many.md b/docs/source/ko/perf_train_gpu_many.md
index 1fc6ce8e1cc53b..c2a80505ef7659 100644
--- a/docs/source/ko/perf_train_gpu_many.md
+++ b/docs/source/ko/perf_train_gpu_many.md
@@ -138,7 +138,7 @@ DP와 DDP 사이에는 다른 차이점이 있지만, 이 토론과는 관련이
 # DP
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 python examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
@@ -146,7 +146,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 # DDP w/ NVlink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
@@ -154,7 +154,7 @@ torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 # DDP w/o NVlink
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
 torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
---model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
 {'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
diff --git a/docs/source/ko/perplexity.md b/docs/source/ko/perplexity.md
index 72eee0643c33ad..9de84a5f289b94 100644
--- a/docs/source/ko/perplexity.md
+++ b/docs/source/ko/perplexity.md
@@ -72,7 +72,7 @@ $$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{<i}) } \right\}$$
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 그 다음으로 텍스트를 토크나이저에 넣어주세요:
diff --git a/docs/source/ko/quicktour.md b/docs/source/ko/quicktour.md
index a456c4e0017a92..c92279fa916bae 100644
--- a/docs/source/ko/quicktour.md
+++ b/docs/source/ko/quicktour.md
@@ -81,7 +81,7 @@ pip install tensorflow
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-[`pipeline`]은 감정 분석을 위한 [사전 훈련된 모델](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)과 토크나이저를 자동으로 다운로드하고 캐시합니다. 이제 `classifier`를 대상 텍스트에 사용할 수 있습니다:
+[`pipeline`]은 감정 분석을 위한 [사전 훈련된 모델](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english)과 토크나이저를 자동으로 다운로드하고 캐시합니다. 이제 `classifier`를 대상 텍스트에 사용할 수 있습니다:
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
@@ -385,7 +385,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
 ```py
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
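
A customized configuration on its own only describes an architecture; a model with freshly initialized weights can then be built from it. A minimal sketch:

```py
# Minimal sketch: build an untrained model from the customized configuration above.
from transformers import AutoModel

my_model = AutoModel.from_config(my_config)
```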
 
 
@@ -422,7 +422,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. [`TrainingArguments`]는 학습률, 배치 크기, 훈련할 에포크 수와 같은 모델 하이퍼파라미터를 포함합니다. 훈련 인자를 지정하지 않으면 기본값이 사용됩니다:
@@ -444,7 +444,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 4. 데이터셋을 로드하세요:
@@ -516,7 +516,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. 토크나이저, 이미지 프로세서, 특징 추출기(feature extractor) 또는 프로세서와 같은 전처리 클래스를 로드하세요:
@@ -524,7 +524,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. 데이터셋을 토큰화하는 함수를 생성하세요:
diff --git a/docs/source/ko/run_scripts.md b/docs/source/ko/run_scripts.md
index f88e8e8252f970..715a949dde4280 100644
--- a/docs/source/ko/run_scripts.md
+++ b/docs/source/ko/run_scripts.md
@@ -94,12 +94,12 @@ pip install -r requirements.txt
 
 예제 스크립트는 🤗 [Datasets](https://huggingface.co/docs/datasets/) 라이브러리에서 데이터 세트를 다운로드하고 전처리합니다.
 그런 다음 스크립트는 요약 기능을 지원하는 아키텍처에서 [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer)를 사용하여 데이터 세트를 미세 조정합니다.
-다음 예는 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 데이터 세트에서 [T5-small](https://huggingface.co/t5-small)을 미세 조정합니다.
+다음 예는 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 데이터 세트에서 [T5-small](https://huggingface.co/google-t5/t5-small)을 미세 조정합니다.
 T5 모델은 훈련 방식에 따라 추가 `source_prefix` 인수가 필요하며, 이 프롬프트는 요약 작업임을 T5에 알려줍니다.
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -115,11 +115,11 @@ python examples/pytorch/summarization/run_summarization.py \
 
 예제 스크립트는 🤗 [Datasets](https://huggingface.co/docs/datasets/) 라이브러리에서 데이터 세트를 다운로드하고 전처리합니다.
 그런 다음 스크립트는 요약 기능을 지원하는 아키텍처에서 Keras를 사용하여 데이터 세트를 미세 조정합니다. 
-다음 예는 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 데이터 세트에서 [T5-small](https://huggingface.co/t5-small)을 미세 조정합니다.
+다음 예는 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 데이터 세트에서 [T5-small](https://huggingface.co/google-t5/t5-small)을 미세 조정합니다.
 T5 모델은 훈련 방식에 따라 추가 `source_prefix` 인수가 필요하며, 이 프롬프트는 요약 작업임을 T5에 알려줍니다.
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -144,7 +144,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -171,7 +171,7 @@ TPU를 사용하려면 `xla_spawn.py` 스크립트를 실행하고 `num_cores` 
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -192,7 +192,7 @@ TPU를 사용하려면 TPU 리소스의 이름을 `tpu` 인수에 전달합니
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -232,7 +232,7 @@ accelerate test
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -252,7 +252,7 @@ accelerate launch run_summarization_no_trainer.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -278,7 +278,7 @@ python examples/pytorch/summarization/run_summarization.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -311,7 +311,7 @@ examples/pytorch/summarization/run_summarization.py -h
 이 경우 `overwrite_output_dir`을 제거해야 합니다:
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -328,7 +328,7 @@ python examples/pytorch/summarization/run_summarization.py
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -359,7 +359,7 @@ huggingface-cli login
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/ko/serialization.md b/docs/source/ko/serialization.md
index 0cbcf005e3aca0..2e521e2b7b4af8 100644
--- a/docs/source/ko/serialization.md
+++ b/docs/source/ko/serialization.md
@@ -56,10 +56,10 @@ pip install optimum[exporters]
 optimum-cli export onnx --help
 ```
 
-예를 들어, 🤗 Hub에서 `distilbert-base-uncased-distilled-squad`와 같은 모델의 체크포인트를 내보내려면 다음 명령을 실행하세요:
+예를 들어, 🤗 Hub에서 `distilbert/distilbert-base-uncased-distilled-squad`와 같은 모델의 체크포인트를 내보내려면 다음 명령을 실행하세요:
 
 ```bash
-optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
 ```
 
 위와 같이 진행 상황을 나타내는 로그가 표시되고 결과인 `model.onnx`가 저장된 위치가 표시됩니다.
@@ -141,7 +141,7 @@ pip install transformers[onnx]
 `transformers.onnx` 패키지를 Python 모듈로 사용하여 준비된 구성을 사용하여 체크포인트를 내보냅니다:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 이렇게 하면 `--model` 인수에 정의된 체크포인트의 ONNX 그래프가 내보내집니다. 🤗 Hub에서 제공하는 체크포인트나 로컬에 저장된 체크포인트를 전달할 수 있습니다. 결과로 생성된 `model.onnx` 파일은 ONNX 표준을 지원하는 많은 가속기 중 하나에서 실행할 수 있습니다. 예를 들어, 다음과 같이 ONNX Runtime을 사용하여 모델을 로드하고 실행할 수 있습니다:
@@ -150,7 +150,7 @@ python -m transformers.onnx --model=distilbert-base-uncased onnx/
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
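
Running the session then only needs the output name of the exported graph. A minimal sketch, assuming the default DistilBERT export, which exposes `last_hidden_state`:

```py
# Minimal sketch (assumption: `session` and `inputs` are the objects created above,
# and the exported graph has a "last_hidden_state" output, as the default DistilBERT export does).
outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
print(outputs[0].shape)  # (batch_size, sequence_length, hidden_size)
```
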
diff --git a/docs/source/ko/task_summary.md b/docs/source/ko/task_summary.md
index dbebf38760a67c..a0e60c60924b99 100644
--- a/docs/source/ko/task_summary.md
+++ b/docs/source/ko/task_summary.md
@@ -296,7 +296,7 @@ score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
 >>> from transformers import pipeline
 
 >>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
->>> translator = pipeline(task="translation", model="t5-small")
+>>> translator = pipeline(task="translation", model="google-t5/t5-small")
 >>> translator(text)
 [{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
 ```
diff --git a/docs/source/ko/tasks/language_modeling.md b/docs/source/ko/tasks/language_modeling.md
index bf10660c61c188..ee1d11c1d09daf 100644
--- a/docs/source/ko/tasks/language_modeling.md
+++ b/docs/source/ko/tasks/language_modeling.md
@@ -29,7 +29,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드에서는 다음 작업을 수행하는 방법을 안내합니다:
 
-1. [DistilGPT2](https://huggingface.co/distilgpt2) 모델을 [ELI5](https://huggingface.co/datasets/eli5) 데이터 세트의 [r/askscience](https://www.reddit.com/r/askscience/) 하위 집합으로 미세 조정
+1. [DistilGPT2](https://huggingface.co/distilbert/distilgpt2) 모델을 [ELI5](https://huggingface.co/datasets/eli5) 데이터 세트의 [r/askscience](https://www.reddit.com/r/askscience/) 하위 집합으로 미세 조정
 2. 미세 조정된 모델을 추론에 사용
 
 
@@ -104,7 +104,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 ```
 
 위의 예제에서 알 수 있듯이, `text` 필드는 `answers` 아래에 중첩되어 있습니다. 따라서 [`flatten`](https://huggingface.co/docs/datasets/process#flatten) 메소드를 사용하여 중첩 구조에서 `text` 하위 필드를 추출해야 합니다.
@@ -221,7 +221,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
 
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 여기까지 진행하면 세 단계만 남았습니다:
@@ -285,7 +285,7 @@ TensorFlow에서 모델을 미세 조정하려면, 먼저 옵티마이저 함수
 ```py
 >>> from transformers import TFAutoModelForCausalLM
 
->>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터 세트를 `tf.data.Dataset` 형식으로 변환하세요:
diff --git a/docs/source/ko/tasks/masked_language_modeling.md b/docs/source/ko/tasks/masked_language_modeling.md
index ee835d13ebc0b4..3aafdf1cb9eebe 100644
--- a/docs/source/ko/tasks/masked_language_modeling.md
+++ b/docs/source/ko/tasks/masked_language_modeling.md
@@ -26,7 +26,7 @@ rendered properly in your Markdown viewer.
 
 이번 가이드에서 다룰 내용은 다음과 같습니다:
 
-1. [ELI5](https://huggingface.co/datasets/eli5) 데이터 세트에서 [r/askscience](https://www.reddit.com/r/askscience/) 부분을 사용해 [DistilRoBERTa](https://huggingface.co/distilroberta-base) 모델을 미세 조정합니다.
+1. [ELI5](https://huggingface.co/datasets/eli5) 데이터 세트에서 [r/askscience](https://www.reddit.com/r/askscience/) 부분을 사용해 [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) 모델을 미세 조정합니다.
 2. 추론 시에 직접 미세 조정한 모델을 사용합니다.
 
 
@@ -103,7 +103,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티와
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
 ```
 
 위의 예제에서와 마찬가지로, `text` 필드는 `answers` 안에 중첩되어 있습니다. 
@@ -224,7 +224,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티와
 ```py
 >>> from transformers import AutoModelForMaskedLM
 
->>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 이제 세 단계가 남았습니다:
@@ -289,7 +289,7 @@ TensorFlow로 모델을 미세 조정하기 위해서는 옵티마이저(optimiz
 ```py
 >>> from transformers import TFAutoModelForMaskedLM
 
->>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base")
+>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] 메소드를 사용해 데이터 세트를 `tf.data.Dataset` 형식으로 변환하세요:
diff --git a/docs/source/ko/tasks/multiple_choice.md b/docs/source/ko/tasks/multiple_choice.md
index c174ca632f69a6..4e02f7fabe504f 100644
--- a/docs/source/ko/tasks/multiple_choice.md
+++ b/docs/source/ko/tasks/multiple_choice.md
@@ -22,7 +22,7 @@ rendered properly in your Markdown viewer.
 
 진행하는 방법은 아래와 같습니다:
 
-1. [SWAG](https://huggingface.co/datasets/swag) 데이터 세트의 'regular' 구성으로 [BERT](https://huggingface.co/bert-base-uncased)를 미세 조정하여 여러 옵션과 일부 컨텍스트가 주어졌을 때 가장 적합한 답을 선택합니다.
+1. [SWAG](https://huggingface.co/datasets/swag) 데이터 세트의 'regular' 구성으로 [BERT](https://huggingface.co/google-bert/bert-base-uncased)를 미세 조정하여 여러 옵션과 일부 컨텍스트가 주어졌을 때 가장 적합한 답을 선택합니다.
 2. 추론에 미세 조정된 모델을 사용합니다.
 
 
@@ -90,7 +90,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 생성하려는 전처리 함수는 다음과 같아야 합니다:
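
A sketch of that preprocessing function, mirroring the approach the guide takes for SWAG (each example is expanded into four context/ending pairs, tokenized, then re-grouped):

```py
# Sketch of the SWAG preprocessing: four (context, ending) pairs per example.
ending_names = ["ending0", "ending1", "ending2", "ending3"]


def preprocess_function(examples):
    # Repeat each context four times and pair it with every candidate ending.
    first_sentences = [[context] * 4 for context in examples["sent1"]]
    second_sentences = [
        [f"{header} {examples[end][i]}" for end in ending_names]
        for i, header in enumerate(examples["sent2"])
    ]

    # Flatten, tokenize, then re-group into chunks of four per example.
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    tokenized = tokenizer(first_sentences, second_sentences, truncation=True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized.items()}
```
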
@@ -253,7 +253,7 @@ tokenized_swag = swag.map(preprocess_function, batched=True)
 ```py
 >>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
 
->>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 이제 세 단계만 남았습니다:
@@ -317,7 +317,7 @@ TensorFlow에서 모델을 미세 조정하려면 최적화 함수, 학습률 
 ```py
 >>> from transformers import TFAutoModelForMultipleChoice
 
->>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다:
diff --git a/docs/source/ko/tasks/question_answering.md b/docs/source/ko/tasks/question_answering.md
index 4b218ccce214dc..9539b9a403030e 100644
--- a/docs/source/ko/tasks/question_answering.md
+++ b/docs/source/ko/tasks/question_answering.md
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드는 다음과 같은 방법들을 보여줍니다.
 
-1. 추출적 질의 응답을 하기 위해 [SQuAD](https://huggingface.co/datasets/squad) 데이터 세트에서 [DistilBERT](https://huggingface.co/distilbert-base-uncased) 미세 조정하기
+1. 추출적 질의 응답을 하기 위해 [SQuAD](https://huggingface.co/datasets/squad) 데이터 세트에서 [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) 미세 조정하기
 2. 추론에 미세 조정된 모델 사용하기
 
 
@@ -99,7 +99,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 질의 응답 태스크와 관련해서 특히 유의해야할 몇 가지 전처리 단계가 있습니다:
@@ -203,7 +203,7 @@ pip install transformers datasets evaluate
 ```py
 >>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
 
->>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 이제 세 단계만 남았습니다:
@@ -268,7 +268,7 @@ TensorFlow를 이용한 모델을 미세 조정하려면 옵티마이저 함수,
 ```py
 >>> from transformers import TFAutoModelForQuestionAnswering
 
->>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
+>>> model = TFAutoModelForQuestionAnswering("distilbert/distilbert-base-uncased")
 ```
 
 [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용해서 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다:
diff --git a/docs/source/ko/tasks/sequence_classification.md b/docs/source/ko/tasks/sequence_classification.md
index bc364d3199e238..a1a5da50e9f614 100644
--- a/docs/source/ko/tasks/sequence_classification.md
+++ b/docs/source/ko/tasks/sequence_classification.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드에서 학습할 내용은:
 
-1. [IMDb](https://huggingface.co/datasets/imdb) 데이터셋에서 [DistilBERT](https://huggingface.co/distilbert-base-uncased)를 파인 튜닝하여 영화 리뷰가 긍정적인지 부정적인지 판단합니다.
+1. [IMDb](https://huggingface.co/datasets/imdb) 데이터셋에서 [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased)를 파인 튜닝하여 영화 리뷰가 긍정적인지 부정적인지 판단합니다.
 2. 추론을 위해 파인 튜닝 모델을 사용합니다.
 
 
@@ -85,7 +85,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 `text`를 토큰화하고 시퀀스가 DistilBERT의 최대 입력 길이보다 길지 않도록 자르기 위한 전처리 함수를 생성하세요:
@@ -167,7 +167,7 @@ tokenized_imdb = imdb.map(preprocess_function, batched=True)
 >>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
 >>> model = AutoModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
 ... )
 ```
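
The `id2label`/`label2id` arguments above are the two-way mapping the guide defines for binary sentiment labels. A minimal sketch:

```py
# Minimal sketch: the binary sentiment mappings assumed by the from_pretrained call above.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
```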
 
@@ -241,7 +241,7 @@ TensorFlow에서 모델을 파인 튜닝하려면, 먼저 옵티마이저 함수
 >>> from transformers import TFAutoModelForSequenceClassification
 
 >>> model = TFAutoModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
 ... )
 ```
 
diff --git a/docs/source/ko/tasks/summarization.md b/docs/source/ko/tasks/summarization.md
index 5ca5f63a27c91e..43eae25d79f0aa 100644
--- a/docs/source/ko/tasks/summarization.md
+++ b/docs/source/ko/tasks/summarization.md
@@ -29,7 +29,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드에서 소개할 내용은 아래와 같습니다:
 
-1. 생성 요약을 위한 [BillSum](https://huggingface.co/datasets/billsum) 데이터셋 중 캘리포니아 주 법안 하위 집합으로 [T5](https://huggingface.co/t5-small)를 파인튜닝합니다.
+1. 생성 요약을 위한 [BillSum](https://huggingface.co/datasets/billsum) 데이터셋 중 캘리포니아 주 법안 하위 집합으로 [T5](https://huggingface.co/google-t5/t5-small)를 파인튜닝합니다.
 2. 파인튜닝된 모델을 사용하여 추론합니다.
 
 
@@ -95,7 +95,7 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/ko/tasks/token_classification.md b/docs/source/ko/tasks/token_classification.md
index b09c2c8078aa37..1e49d79a0d7235 100644
--- a/docs/source/ko/tasks/token_classification.md
+++ b/docs/source/ko/tasks/token_classification.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드에서 학습할 내용은:
 
-1. [WNUT 17](https://huggingface.co/datasets/wnut_17) 데이터 세트에서 [DistilBERT](https://huggingface.co/distilbert-base-uncased)를 파인 튜닝하여 새로운 개체를 탐지합니다.
+1. [WNUT 17](https://huggingface.co/datasets/wnut_17) 데이터 세트에서 [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased)를 파인 튜닝하여 새로운 개체를 탐지합니다.
 2. 추론을 위해 파인 튜닝 모델을 사용합니다.
 
 
@@ -109,7 +109,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 위의 예제 `tokens` 필드를 보면 입력이 이미 토큰화된 것처럼 보입니다. 그러나 실제로 입력은 아직 토큰화되지 않았으므로 단어를 하위 단어로 토큰화하기 위해 `is_split_into_words=True`를 설정해야 합니다. 예제로 확인합니다: 
@@ -270,7 +270,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
 >>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
 
 >>> model = AutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 
@@ -341,7 +341,7 @@ TensorFlow에서 모델을 파인 튜닝하려면, 먼저 옵티마이저 함수
 >>> from transformers import TFAutoModelForTokenClassification
 
 >>> model = TFAutoModelForTokenClassification.from_pretrained(
-...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+...     "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
 ... )
 ```
 
diff --git a/docs/source/ko/tasks/translation.md b/docs/source/ko/tasks/translation.md
index fa7dc348fce38f..6de275f7d04c80 100644
--- a/docs/source/ko/tasks/translation.md
+++ b/docs/source/ko/tasks/translation.md
@@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 
 이 가이드에서 학습할 내용은:
 
-1. 영어 텍스트를 프랑스어로 번역하기 위해 [T5](https://huggingface.co/t5-small) 모델을 OPUS Books 데이터세트의 영어-프랑스어 하위 집합으로 파인튜닝하는 방법과
+1. 영어 텍스트를 프랑스어로 번역하기 위해 [T5](https://huggingface.co/google-t5/t5-small) 모델을 OPUS Books 데이터세트의 영어-프랑스어 하위 집합으로 파인튜닝하는 방법과
 2. 파인튜닝된 모델을 추론에 사용하는 방법입니다.
 
 
@@ -88,7 +88,7 @@ pip install transformers datasets evaluate sacrebleu
 ```py
 >>> from transformers import AutoTokenizer
 
->>> checkpoint = "t5-small"
+>>> checkpoint = "google-t5/t5-small"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 ```
 
diff --git a/docs/source/ko/tf_xla.md b/docs/source/ko/tf_xla.md
index 66d30abb2e9816..0b47d6fbad89d6 100644
--- a/docs/source/ko/tf_xla.md
+++ b/docs/source/ko/tf_xla.md
@@ -85,8 +85,8 @@ from transformers.utils import check_min_version
 check_min_version("4.21.0")
 
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 # XLA 생성 함수를 만들기 위한 한 줄
@@ -114,8 +114,8 @@ XLA 활성화 함수(`xla_generate()`와 같은)를 처음 실행할 때 내부
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 xla_generate = tf.function(model.generate, jit_compile=True)
@@ -135,8 +135,8 @@ import time
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
 xla_generate = tf.function(model.generate, jit_compile=True)
 
diff --git a/docs/source/ko/tflite.md b/docs/source/ko/tflite.md
index 5d08ea4078549d..464106a6b7c261 100644
--- a/docs/source/ko/tflite.md
+++ b/docs/source/ko/tflite.md
@@ -38,10 +38,10 @@ pip install optimum[exporters-tf]
 optimum-cli export tflite --help
 ```
 
-예를 들어 🤗 Hub에서의 `bert-base-uncased` 모델 체크포인트를 내보내려면, 다음 명령을 실행하세요:
+예를 들어 🤗 Hub에서의 `google-bert/bert-base-uncased` 모델 체크포인트를 내보내려면, 다음 명령을 실행하세요:
 
 ```bash
-optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
 ```
 
 다음과 같이 진행 상황을 나타내는 로그와 결과물인 `model.tflite`가 저장된 위치를 보여주는 로그가 표시됩니다:
diff --git a/docs/source/ko/tokenizer_summary.md b/docs/source/ko/tokenizer_summary.md
index 5c6b9a6b73ca5f..0a4ece29a476d9 100644
--- a/docs/source/ko/tokenizer_summary.md
+++ b/docs/source/ko/tokenizer_summary.md
@@ -97,7 +97,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 >>> tokenizer.tokenize("I have a new GPU!")
 ["i", "have", "a", "new", "gp", "##u", "!"]
 ```
@@ -111,7 +111,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import XLNetTokenizer
 
->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased")
 >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
 ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
 ```
diff --git a/docs/source/ko/torchscript.md b/docs/source/ko/torchscript.md
index 297479caf2c0b6..28e198c5ec9306 100644
--- a/docs/source/ko/torchscript.md
+++ b/docs/source/ko/torchscript.md
@@ -82,7 +82,7 @@ TorchScript는 묶인 가중치를 가진 모델을 내보낼 수 없으므로,
 from transformers import BertModel, BertTokenizer, BertConfig
 import torch
 
-enc = BertTokenizer.from_pretrained("bert-base-uncased")
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
 # 입력 텍스트 토큰화하기
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -117,7 +117,7 @@ model = BertModel(config)
 model.eval()
 
 # 만약 *from_pretrained*를 사용하여 모델을 인스턴스화하는 경우, TorchScript 플래그를 쉽게 설정할 수 있습니다
-model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
 
 # 추적 생성하기
 traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
diff --git a/docs/source/ko/training.md b/docs/source/ko/training.md
index f4ab1332294363..fa6d56bdc36696 100644
--- a/docs/source/ko/training.md
+++ b/docs/source/ko/training.md
@@ -48,7 +48,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -84,7 +84,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -187,7 +187,7 @@ dataset = dataset["train"]  # Just take the training split for now
 ```py
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
 # Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
 tokenized_data = dict(tokenized_data)
@@ -202,7 +202,7 @@ from transformers import TFAutoModelForSequenceClassification
 from tensorflow.keras.optimizers import Adam
 
 # Load and compile our model
-model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
 # Lower learning rates are often better for fine-tuning transformers
 model.compile(optimizer=Adam(3e-5))
 
@@ -329,7 +329,7 @@ torch.cuda.empty_cache()
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### 옵티마이저 및 학습 속도 스케줄러[[optimizer-and-learning-rate-scheduler]]
diff --git a/docs/source/ko/troubleshooting.md b/docs/source/ko/troubleshooting.md
index 5eef788e09939c..263d693c23da65 100644
--- a/docs/source/ko/troubleshooting.md
+++ b/docs/source/ko/troubleshooting.md
@@ -134,7 +134,7 @@ RuntimeError: CUDA error: device-side assert triggered
 >>> from transformers import AutoModelForSequenceClassification
 >>> import torch
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 >>> model.config.pad_token_id
 0
 ```
@@ -191,8 +191,8 @@ tensor([[ 0.0082, -0.2307],
 ```py
 >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
 
->>> processor = AutoProcessor.from_pretrained("gpt2-medium")
->>> model = AutoModelForQuestionAnswering.from_pretrained("gpt2-medium")
+>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium")
+>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium")
 ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForQuestionAnswering.
 Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ...
 ```
diff --git a/docs/source/pt/converting_tensorflow_models.md b/docs/source/pt/converting_tensorflow_models.md
index 97767b2ad420db..190c1aec5b22bf 100644
--- a/docs/source/pt/converting_tensorflow_models.md
+++ b/docs/source/pt/converting_tensorflow_models.md
@@ -100,9 +100,9 @@ transformers-cli convert --model_type gpt \
 Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré-treinado (consulte [aqui](https://github.com/openai/gpt-2))
 
 ```bash
-export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
 
-transformers-cli convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
   --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
   --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
   [--config OPENAI_GPT2_CONFIG] \
diff --git a/docs/source/pt/create_a_model.md b/docs/source/pt/create_a_model.md
index fd1e9c8f39ad22..dd71963236f4fa 100644
--- a/docs/source/pt/create_a_model.md
+++ b/docs/source/pt/create_a_model.md
@@ -86,7 +86,7 @@ DistilBertConfig {
 Atributos de um modelo pré-treinado podem ser modificados na função [`~PretrainedConfig.from_pretrained`]:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 Uma vez que você está satisfeito com as configurações do seu modelo, você consegue salvar elas com [`~PretrainedConfig.save_pretrained`]. Seu arquivo de configurações está salvo como um arquivo JSON no diretório especificado:
@@ -127,13 +127,13 @@ Isso cria um modelo com valores aleatórios ao invés de pré-treinar os pesos.
 Criar um modelo pré-treinado com [`~PreTrainedModel.from_pretrained`]:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Quando você carregar os pesos pré-treinados, a configuração padrão do modelo é automaticamente carregada se o modelo é provido pelo 🤗 Transformers. No entanto, você ainda consegue mudar - alguns ou todos - os atributos padrões de configuração do modelo com os seus próprio atributos, se você preferir: 
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -151,13 +151,13 @@ Isso cria um modelo com valores aleatórios ao invés de pré-treinar os pesos.
 Criar um modelo pré-treinado com [`~TFPreTrainedModel.from_pretrained`]:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Quando você carregar os pesos pré-treinados, a configuração padrão do modelo é automaticamente carregada se o modelo é provido pelo 🤗 Transformers. No entanto, você ainda consegue mudar - alguns ou todos - os atributos padrões de configuração do modelo com os seus próprio atributos, se você preferir: 
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -173,7 +173,7 @@ Por exemplo, [`DistilBertForSequenceClassification`] é um modelo DistilBERT bas
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Reutilize facilmente esse ponto de parada para outra tarefe mudando para uma head de modelo diferente. Para uma tarefe de responder questões, você usaria a head do modelo [`DistilBertForQuestionAnswering`]. A head de responder questões é similar com a de classificação de sequências exceto o fato de que ela é uma camada no topo dos estados das saídas ocultas.
@@ -181,7 +181,7 @@ Reutilize facilmente esse ponto de parada para outra tarefe mudando para uma hea
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -190,7 +190,7 @@ Por exemplo, [`TFDistilBertForSequenceClassification`] é um modelo DistilBERT b
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Reutilize facilmente esse ponto de parada para outra tarefe mudando para uma head de modelo diferente. Para uma tarefe de responder questões, você usaria a head do modelo [`TFDistilBertForQuestionAnswering`]. A head de responder questões é similar com a de classificação de sequências exceto o fato de que ela é uma camada no topo dos estados das saídas ocultas.
@@ -198,7 +198,7 @@ Reutilize facilmente esse ponto de parada para outra tarefe mudando para uma hea
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -231,7 +231,7 @@ Se você treinou seu prórpio tokenizer, você pode criar um a partir do seu arq
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Criando um 'fast tokenizer' com a classe [`DistilBertTokenizerFast`]:
@@ -239,7 +239,7 @@ Criando um 'fast tokenizer' com a classe [`DistilBertTokenizerFast`]:
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
diff --git a/docs/source/pt/installation.md b/docs/source/pt/installation.md
index 574d34ee560ad2..7eeefd883d6ec3 100644
--- a/docs/source/pt/installation.md
+++ b/docs/source/pt/installation.md
@@ -185,14 +185,14 @@ Você pode adicionar o [🤗 Datasets](https://huggingface.co/docs/datasets/) ao
 Segue um exemplo de execução do programa numa rede padrão com firewall para instâncias externas, usando o seguinte comando:
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 Execute esse mesmo programa numa instância offline com o seguinte comando:
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 O script agora deve ser executado sem travar ou expirar, pois procurará apenas por arquivos locais.
diff --git a/docs/source/pt/multilingual.md b/docs/source/pt/multilingual.md
index b6366b8c2289fb..5515c6a922a701 100644
--- a/docs/source/pt/multilingual.md
+++ b/docs/source/pt/multilingual.md
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
 
 Existem vários modelos multilinguísticos no 🤗 Transformers e seus usos para inferência diferem dos modelos monolíngues.
 No entanto, nem *todos* os usos dos modelos multilíngues são tão diferentes.
-Alguns modelos, como o [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased),
+Alguns modelos, como o [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased),
 podem ser usados como se fossem monolíngues. Este guia irá te ajudar a usar modelos multilíngues cujo uso difere
 para o propósito de inferência.
 
@@ -34,25 +34,25 @@ checkpoints que usam de language embeddings e os que não.
 
 Os seguintes modelos de XLM usam language embeddings para especificar a linguagem utilizada para a inferência.
 
-- `xlm-mlm-ende-1024` (Masked language modeling, English-German)
-- `xlm-mlm-enfr-1024` (Masked language modeling, English-French)
-- `xlm-mlm-enro-1024` (Masked language modeling, English-Romanian)
-- `xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages)
-- `xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages)
-- `xlm-clm-enfr-1024` (Causal language modeling, English-French)
-- `xlm-clm-ende-1024` (Causal language modeling, English-German)
+- `FacebookAI/xlm-mlm-ende-1024` (Masked language modeling, English-German)
+- `FacebookAI/xlm-mlm-enfr-1024` (Masked language modeling, English-French)
+- `FacebookAI/xlm-mlm-enro-1024` (Masked language modeling, English-Romanian)
+- `FacebookAI/xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages)
+- `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French)
+- `FacebookAI/xlm-clm-ende-1024` (Causal language modeling, English-German)
 
 Os language embeddings são representados por um tensor de mesma dimensão que os `input_ids` passados ao modelo.
 Os valores destes tensores dependem do idioma utilizado e se identificam pelos atributos `lang2id` e `id2lang` do tokenizador.
 
-Neste exemplo, carregamos o checkpoint `xlm-clm-enfr-1024`(Causal language modeling, English-French):
+Neste exemplo, carregamos o checkpoint `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French):
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 O atributo `lang2id` do tokenizador mostra os idiomas deste modelo e seus ids:
@@ -92,8 +92,8 @@ O script [run_generation.py](https://github.com/huggingface/transformers/tree/ma
 
 Os seguintes modelos XLM não requerem o uso de language embeddings durante a inferência:
 
-- `xlm-mlm-17-1280` (Modelagem de linguagem com máscara, 17 idiomas)
-- `xlm-mlm-100-1280` (Modelagem de linguagem com máscara, 100 idiomas)
+- `FacebookAI/xlm-mlm-17-1280` (Modelagem de linguagem com máscara, 17 idiomas)
+- `FacebookAI/xlm-mlm-100-1280` (Modelagem de linguagem com máscara, 100 idiomas)
 
 Estes modelos são utilizados para representações genéricas de frase diferentemente dos checkpoints XLM anteriores.
 
@@ -101,8 +101,8 @@ Estes modelos são utilizados para representações genéricas de frase diferent
 
 Os seguintes modelos do BERT podem ser utilizados para tarefas multilinguísticas:
 
-- `bert-base-multilingual-uncased` (Modelagem de linguagem com máscara + Previsão de frases, 102 idiomas)
-- `bert-base-multilingual-cased` (Modelagem de linguagem com máscara + Previsão de frases, 104 idiomas)
+- `google-bert/bert-base-multilingual-uncased` (Modelagem de linguagem com máscara + Previsão de frases, 102 idiomas)
+- `google-bert/bert-base-multilingual-cased` (Modelagem de linguagem com máscara + Previsão de frases, 104 idiomas)
 
 Estes modelos não requerem language embeddings durante a inferência. Devem identificar a linguagem a partir
 do contexto e realizar a inferência em sequência.
@@ -111,8 +111,8 @@ do contexto e realizar a inferência em sequência.
 
 Os seguintes modelos do XLM-RoBERTa podem ser utilizados para tarefas multilinguísticas:
 
-- `xlm-roberta-base` (Modelagem de linguagem com máscara, 100 idiomas)
-- `xlm-roberta-large` Modelagem de linguagem com máscara, 100 idiomas)
+- `FacebookAI/xlm-roberta-base` (Modelagem de linguagem com máscara, 100 idiomas)
+- `FacebookAI/xlm-roberta-large` (Modelagem de linguagem com máscara, 100 idiomas)
 
 O XLM-RoBERTa foi treinado com 2,5 TB de dados do CommonCrawl recém-criados e testados em 100 idiomas.
 Proporciona fortes vantagens sobre os modelos multilinguísticos publicados anteriormente como o mBERT e o XLM em tarefas
diff --git a/docs/source/pt/pipeline_tutorial.md b/docs/source/pt/pipeline_tutorial.md
index b2294863013601..9c0cb3567e72e3 100644
--- a/docs/source/pt/pipeline_tutorial.md
+++ b/docs/source/pt/pipeline_tutorial.md
@@ -85,8 +85,8 @@ para uma tarefa de modelagem de linguagem causal:
 ```py
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
->>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
 ```
 
 Crie uma [`pipeline`] para a sua tarefa e especifíque o modelo e o tokenizador que foram carregados:
diff --git a/docs/source/pt/quicktour.md b/docs/source/pt/quicktour.md
index 67c511169e34d0..d34480ee23a880 100644
--- a/docs/source/pt/quicktour.md
+++ b/docs/source/pt/quicktour.md
@@ -87,7 +87,7 @@ Importe [`pipeline`] e especifique a tarefa que deseja completar:
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-A pipeline baixa and armazena um [modelo pré-treinado](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) padrão e tokenizer para análise sentimental. Agora você pode usar `classifier` no texto alvo: 
+A pipeline baixa e armazena um [modelo pré-treinado](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) padrão e tokenizer para análise sentimental. Agora você pode usar `classifier` no texto alvo: 
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
diff --git a/docs/source/pt/run_scripts.md b/docs/source/pt/run_scripts.md
index ff3110817e8ae7..a64ad72f1dbc61 100644
--- a/docs/source/pt/run_scripts.md
+++ b/docs/source/pt/run_scripts.md
@@ -88,11 +88,11 @@ pip install -r requirements.txt
 
 
 
-O script de exemplo baixa e pré-processa um conjunto de dados da biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Em seguida, o script ajusta um conjunto de dados com o [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) em uma arquitetura que oferece suporte à sumarização. O exemplo a seguir mostra como ajustar [T5-small](https://huggingface.co/t5-small) no conjunto de dados [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). O modelo T5 requer um argumento `source_prefix` adicional devido à forma como foi treinado. Este prompt informa ao T5 que esta é uma tarefa de sumarização.
+O script de exemplo baixa e pré-processa um conjunto de dados da biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Em seguida, o script ajusta um conjunto de dados com o [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) em uma arquitetura que oferece suporte à sumarização. O exemplo a seguir mostra como ajustar [T5-small](https://huggingface.co/google-t5/t5-small) no conjunto de dados [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). O modelo T5 requer um argumento `source_prefix` adicional devido à forma como foi treinado. Este prompt informa ao T5 que esta é uma tarefa de sumarização.
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -106,11 +106,11 @@ python examples/pytorch/summarization/run_summarization.py \
 ```
 
 
-Este outro script de exemplo baixa e pré-processa um conjunto de dados da biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Em seguida, o script ajusta um conjunto de dados usando Keras em uma arquitetura que oferece suporte à sumarização. O exemplo a seguir mostra como ajustar [T5-small](https://huggingface.co/t5-small) no conjunto de dados [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). O modelo T5 requer um argumento `source_prefix` adicional devido à forma como foi treinado. Este prompt informa ao T5 que esta é uma tarefa de sumarização.
+Este outro script de exemplo baixa e pré-processa um conjunto de dados da biblioteca 🤗 [Datasets](https://huggingface.co/docs/datasets/). Em seguida, o script ajusta um conjunto de dados usando Keras em uma arquitetura que oferece suporte à sumarização. O exemplo a seguir mostra como ajustar [T5-small](https://huggingface.co/google-t5/t5-small) no conjunto de dados [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail). O modelo T5 requer um argumento `source_prefix` adicional devido à forma como foi treinado. Este prompt informa ao T5 que esta é uma tarefa de sumarização.
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -134,7 +134,7 @@ O [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) ofere
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -158,7 +158,7 @@ As Unidades de Processamento de Tensor (TPUs) são projetadas especificamente pa
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -178,7 +178,7 @@ As Unidades de Processamento de Tensor (TPUs) são projetadas especificamente pa
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -217,7 +217,7 @@ Agora você está pronto para iniciar o treinamento:
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -236,7 +236,7 @@ Um script para sumarização usando um conjunto de dados customizado ficaria ass
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -261,7 +261,7 @@ Geralmente, é uma boa ideia executar seu script em um número menor de exemplos
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -291,7 +291,7 @@ O primeiro método usa o argumento `output_dir previous_output_dir` para retomar
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -308,7 +308,7 @@ O segundo método usa o argumento `resume_from_checkpoint path_to_specific_check
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -338,7 +338,7 @@ O exemplo a seguir mostra como fazer upload de um modelo com um nome de reposit
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/pt/serialization.md b/docs/source/pt/serialization.md
index d5a21c7f890d53..9e390f07bde41d 100644
--- a/docs/source/pt/serialization.md
+++ b/docs/source/pt/serialization.md
@@ -146,7 +146,7 @@ optional arguments:
 A exportação de um checkpoint usando uma configuração pronta pode ser feita da seguinte forma:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 Você deve ver os seguintes logs:
@@ -161,7 +161,7 @@ All good, model saved at: onnx/model.onnx
 ```
 
 Isso exporta um grafo ONNX do ponto de verificação definido pelo argumento `--model`. Nisso
-Por exemplo, é `distilbert-base-uncased`, mas pode ser qualquer checkpoint no Hugging
+Por exemplo, é `distilbert/distilbert-base-uncased`, mas pode ser qualquer checkpoint no Hugging
 Face Hub ou um armazenado localmente.
 
 O arquivo `model.onnx` resultante pode ser executado em um dos [muitos
@@ -173,7 +173,7 @@ Tempo de execução](https://onnxruntime.ai/) da seguinte forma:
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
@@ -207,8 +207,8 @@ arquivos tokenizer armazenados em um diretório. Por exemplo, podemos carregar e
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 >>> # Load tokenizer and PyTorch weights form the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-pt-checkpoint")
 >>> pt_model.save_pretrained("local-pt-checkpoint")
@@ -225,8 +225,8 @@ python -m transformers.onnx --model=local-pt-checkpoint onnx/
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
 
 >>> # Load tokenizer and TensorFlow weights from the Hub
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 >>> # Save to disk
 >>> tokenizer.save_pretrained("local-tf-checkpoint")
 >>> tf_model.save_pretrained("local-tf-checkpoint")
@@ -271,7 +271,7 @@ pacote `transformers.onnx`. Por exemplo, para exportar um modelo de classificaç
 escolher um modelo ajustado no Hub e executar:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased-finetuned-sst-2-english \
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased-finetuned-sst-2-english \
                             --feature=sequence-classification onnx/
 ```
 
@@ -287,7 +287,7 @@ All good, model saved at: onnx/model.onnx
 ```
 
 Observe que, neste caso, os nomes de saída do modelo ajustado são `logits`
-em vez do `last_hidden_state` que vimos com o checkpoint `distilbert-base-uncased`
+em vez do `last_hidden_state` que vimos com o checkpoint `distilbert/distilbert-base-uncased`
 mais cedo. Isso é esperado, pois o modelo ajustado (fine-tuned) possui uma cabeça de classificação de sequência.
 
 
@@ -379,7 +379,7 @@ configuração do modelo base da seguinte forma:
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config = DistilBertOnnxConfig(config)
 ```
 
@@ -410,7 +410,7 @@ de classificação, poderíamos usar:
 ```python
 >>> from transformers import AutoConfig
 
->>> config = AutoConfig.from_pretrained("distilbert-base-uncased")
+>>> config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased")
 >>> onnx_config_for_seq_clf = DistilBertOnnxConfig(config, task="sequence-classification")
 >>> print(onnx_config_for_seq_clf.outputs)
 OrderedDict([('logits', {0: 'batch'})])
@@ -437,7 +437,7 @@ e o caminho para salvar o arquivo exportado:
 >>> from transformers import AutoTokenizer, AutoModel
 
 >>> onnx_path = Path("model.onnx")
->>> model_ckpt = "distilbert-base-uncased"
+>>> model_ckpt = "distilbert/distilbert-base-uncased"
 >>> base_model = AutoModel.from_pretrained(model_ckpt)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
 
diff --git a/docs/source/pt/tasks/sequence_classification.md b/docs/source/pt/tasks/sequence_classification.md
index 02647f68f8866f..e7776894f874cb 100644
--- a/docs/source/pt/tasks/sequence_classification.md
+++ b/docs/source/pt/tasks/sequence_classification.md
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
 
 A classificação de texto é uma tarefa comum de NLP que atribui um rótulo ou classe a um texto. Existem muitas aplicações práticas de classificação de texto amplamente utilizadas em produção por algumas das maiores empresas da atualidade. Uma das formas mais populares de classificação de texto é a análise de sentimento, que atribui um rótulo como positivo, negativo ou neutro a um texto.
 
-Este guia mostrará como realizar o fine-tuning do [DistilBERT](https://huggingface.co/distilbert-base-uncased) no conjunto de dados [IMDb](https://huggingface.co/datasets/imdb) para determinar se a crítica de filme é positiva ou negativa.
+Este guia mostrará como realizar o fine-tuning do [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) no conjunto de dados [IMDb](https://huggingface.co/datasets/imdb) para determinar se a crítica de filme é positiva ou negativa.
 
 
 
@@ -60,7 +60,7 @@ Carregue o tokenizador do DistilBERT para processar o campo `text`:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Crie uma função de pré-processamento para tokenizar o campo `text` e truncar as sequências para que não sejam maiores que o comprimento máximo de entrada do DistilBERT:
@@ -104,7 +104,7 @@ Carregue o DistilBERT com [`AutoModelForSequenceClassification`] junto com o nú
 ```py
 >>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)
 ```
 
 
@@ -190,7 +190,7 @@ Carregue o DistilBERT com [`TFAutoModelForSequenceClassification`] junto com o n
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)
 ```
 
 Configure o modelo para treinamento com o método [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/pt/tasks/token_classification.md b/docs/source/pt/tasks/token_classification.md
index 316d6a8102180a..3465680dcc2046 100644
--- a/docs/source/pt/tasks/token_classification.md
+++ b/docs/source/pt/tasks/token_classification.md
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
 
 A classificação de tokens atribui um rótulo a tokens individuais em uma frase. Uma das tarefas de classificação de tokens mais comuns é o Reconhecimento de Entidade Nomeada, também chamada de NER (sigla em inglês para Named Entity Recognition). O NER tenta encontrar um rótulo para cada entidade em uma frase, como uma pessoa, local ou organização.
 
-Este guia mostrará como realizar o fine-tuning do [DistilBERT](https://huggingface.co/distilbert-base-uncased) no conjunto de dados [WNUT 17](https://huggingface.co/datasets/wnut_17) para detectar novas entidades.
+Este guia mostrará como realizar o fine-tuning do [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) no conjunto de dados [WNUT 17](https://huggingface.co/datasets/wnut_17) para detectar novas entidades.
 
 
 
@@ -85,7 +85,7 @@ Carregue o tokenizer do DistilBERT para processar os `tokens`:
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 Como a entrada já foi dividida em palavras, defina `is_split_into_words=True` para tokenizar as palavras em subpalavras:
@@ -162,7 +162,7 @@ Carregue o DistilBERT com o [`AutoModelForTokenClassification`] junto com o núm
 ```py
 >>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=14)
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=14)
 ```
 
 
@@ -246,7 +246,7 @@ Carregue o DistilBERT com o [`TFAutoModelForTokenClassification`] junto com o n
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)
 ```
 
 Configure o modelo para treinamento com o método [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
diff --git a/docs/source/pt/training.md b/docs/source/pt/training.md
index 6e39a46b16432d..49f57dead24233 100644
--- a/docs/source/pt/training.md
+++ b/docs/source/pt/training.md
@@ -58,7 +58,7 @@ todo o dataset.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -93,7 +93,7 @@ sabemos ter 5 labels usamos o seguinte código:
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -232,7 +232,7 @@ Carregue um modelo do TensorFlow com o número esperado de rótulos:
 >>> import tensorflow as tf
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 A seguir, compile e ajuste o fine-tuning a seu modelo com [`fit`](https://keras.io/api/models/model_training_apis/) como
@@ -311,7 +311,7 @@ Carregue seu modelo com o número de labels esperados:
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Otimização e configuração do Learning Rate
diff --git a/docs/source/te/quicktour.md b/docs/source/te/quicktour.md
index 862ec416da821d..75efa841128605 100644
--- a/docs/source/te/quicktour.md
+++ b/docs/source/te/quicktour.md
@@ -81,7 +81,7 @@ Here is the translation in Telugu:
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-సెంటిమెంట్ విశ్లేషణ కోసం [`pipeline`] డిఫాల్ట్ [ప్రీట్రైన్డ్ మోడల్](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) మరియు టోకెనైజర్‌ని డౌన్‌లోడ్ చేస్తుంది మరియు కాష్ చేస్తుంది. ఇప్పుడు మీరు మీ లక్ష్య వచనంలో `classifier`ని ఉపయోగించవచ్చు:
+సెంటిమెంట్ విశ్లేషణ కోసం [`pipeline`] డిఫాల్ట్ [ప్రీట్రైన్డ్ మోడల్](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) మరియు టోకెనైజర్‌ని డౌన్‌లోడ్ చేస్తుంది మరియు కాష్ చేస్తుంది. ఇప్పుడు మీరు మీ లక్ష్య వచనంలో `classifier`ని ఉపయోగించవచ్చు:
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
@@ -389,7 +389,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
 ```py
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
 
 
@@ -425,7 +425,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. [`TrainingArguments`] మీరు నేర్చుకునే రేటు, బ్యాచ్ పరిమాణం మరియు శిక్షణ పొందవలసిన యుగాల సంఖ్య వంటి మార్చగల మోడల్ హైపర్‌పారామీటర్‌లను కలిగి ఉంది. మీరు ఎలాంటి శిక్షణా వాదనలను పేర్కొనకుంటే డిఫాల్ట్ విలువలు ఉపయోగించబడతాయి:
@@ -446,7 +446,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 4. డేటాసెట్‌ను లోడ్ చేయండి:
@@ -517,7 +517,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. టోకెనైజర్, ఇమేజ్ ప్రాసెసర్, ఫీచర్ ఎక్స్‌ట్రాక్టర్ లేదా ప్రాసెసర్ వంటి ప్రీప్రాసెసింగ్ క్లాస్‌ని లోడ్ చేయండి:
@@ -525,7 +525,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. డేటాసెట్‌ను టోకనైజ్ చేయడానికి ఒక ఫంక్షన్‌ను సృష్టించండి:
diff --git a/docs/source/zh/autoclass_tutorial.md b/docs/source/zh/autoclass_tutorial.md
index 936080a83153d4..7205aa0872d161 100644
--- a/docs/source/zh/autoclass_tutorial.md
+++ b/docs/source/zh/autoclass_tutorial.md
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
 
 
 
-请记住,架构指的是模型的结构,而checkpoints是给定架构的权重。例如,[BERT](https://huggingface.co/bert-base-uncased)是一种架构,而`bert-base-uncased`是一个checkpoint。模型是一个通用术语,可以指代架构或checkpoint。
+请记住,架构指的是模型的结构,而checkpoints是给定架构的权重。例如,[BERT](https://huggingface.co/google-bert/bert-base-uncased)是一种架构,而`google-bert/bert-base-uncased`是一个checkpoint。模型是一个通用术语,可以指代架构或checkpoint。
 
 
 
@@ -43,7 +43,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 ```
 
 然后按照如下方式对输入进行分词:
@@ -104,7 +104,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 轻松地重复使用相同的checkpoint来为不同任务加载模型架构:
@@ -113,7 +113,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForTokenClassification
 
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -133,7 +133,7 @@ TensorFlow和Flax的checkpoints不受影响,并且可以在PyTorch架构中使
 ```py
 >>> from transformers import TFAutoModelForSequenceClassification
 
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 轻松地重复使用相同的checkpoint来为不同任务加载模型架构:
@@ -141,7 +141,7 @@ TensorFlow和Flax的checkpoints不受影响,并且可以在PyTorch架构中使
 ```py
 >>> from transformers import TFAutoModelForTokenClassification
 
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 一般来说,我们推荐使用`AutoTokenizer`类和`TFAutoModelFor`类来加载模型的预训练实例。这样可以确保每次加载正确的架构。在下一个[教程](preprocessing)中,学习如何使用新加载的`tokenizer`, `image processor`, `feature extractor`和`processor`对数据集进行预处理以进行微调。
 
diff --git a/docs/source/zh/big_models.md b/docs/source/zh/big_models.md
index ccb8b7ecbba3c2..2215c706618206 100644
--- a/docs/source/zh/big_models.md
+++ b/docs/source/zh/big_models.md
@@ -42,7 +42,7 @@ rendered properly in your Markdown viewer.
 ```py
 from transformers import AutoModel
 
-model = AutoModel.from_pretrained("bert-base-cased")
+model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 ```
 
 如果您使用 [`PreTrainedModel.save_pretrained`](模型预训练保存) 进行保存,您将得到一个新的文件夹,其中包含两个文件:模型的配置和权重:
diff --git a/docs/source/zh/create_a_model.md b/docs/source/zh/create_a_model.md
index 9b36d5397626a4..fd07497e7abf3a 100644
--- a/docs/source/zh/create_a_model.md
+++ b/docs/source/zh/create_a_model.md
@@ -87,7 +87,7 @@ DistilBertConfig {
 预训练模型的属性可以在 [`~PretrainedConfig.from_pretrained`] 函数中进行修改:
 
 ```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
 ```
 
 当你对模型配置满意时,可以使用 [`~PretrainedConfig.save_pretrained`] 来保存配置。你的配置文件将以 JSON 文件的形式存储在指定的保存目录中:
@@ -128,13 +128,13 @@ DistilBertConfig {
 使用 [`~PreTrainedModel.from_pretrained`] 创建预训练模型:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 当加载预训练权重时,如果模型是由 🤗 Transformers 提供的,将自动加载默认模型配置。然而,如果你愿意,仍然可以将默认模型配置的某些或者所有属性替换成你自己的配置:
 
 ```py
->>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -152,13 +152,13 @@ DistilBertConfig {
 使用 [`~TFPreTrainedModel.from_pretrained`] 创建预训练模型:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 当加载预训练权重时,如果模型是由 🤗 Transformers 提供的,将自动加载默认模型配置。然而,如果你愿意,仍然可以将默认模型配置的某些或者所有属性替换成自己的配置:
 
 ```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
 ```
 
 
@@ -174,7 +174,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import DistilBertForSequenceClassification
 
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 通过切换到不同的模型头,可以轻松地将此检查点重复用于其他任务。对于问答任务,你可以使用 [`DistilBertForQuestionAnswering`] 模型头。问答头(question answering head)与序列分类头类似,不同点在于它是隐藏状态输出之上的线性层。
@@ -182,7 +182,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import DistilBertForQuestionAnswering
 
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -191,7 +191,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import TFDistilBertForSequenceClassification
 
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 通过切换到不同的模型头,可以轻松地将此检查点重复用于其他任务。对于问答任务,你可以使用 [`TFDistilBertForQuestionAnswering`] 模型头。问答头(question answering head)与序列分类头类似,不同点在于它是隐藏状态输出之上的线性层。
@@ -199,7 +199,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import TFDistilBertForQuestionAnswering
 
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
@@ -232,7 +232,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import DistilBertTokenizer
 
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 使用 [`DistilBertTokenizerFast`] 类创建快速分词器:
@@ -240,7 +240,7 @@ DistilBertConfig {
 ```py
 >>> from transformers import DistilBertTokenizerFast
 
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
 ```
 
 
diff --git a/docs/source/zh/installation.md b/docs/source/zh/installation.md
index 0ce10ba5290647..91e09dc904bd7e 100644
--- a/docs/source/zh/installation.md
+++ b/docs/source/zh/installation.md
@@ -180,14 +180,14 @@ conda install conda-forge::transformers
 例如,你通常会使用以下命令对外部实例进行防火墙保护的的普通网络上运行程序:
 
 ```bash
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 在离线环境中运行相同的程序:
 
 ```bash
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 
 现在脚本可以应该正常运行,而无需挂起或等待超时,因为它知道只应查找本地文件。
diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md
index a8e191f1ca9978..34e9bf2f787ef1 100644
--- a/docs/source/zh/internal/generation_utils.md
+++ b/docs/source/zh/internal/generation_utils.md
@@ -36,8 +36,8 @@ rendered properly in your Markdown viewer.
 ```python
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained("gpt2")
+tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
 
 inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
 generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
diff --git a/docs/source/zh/main_classes/deepspeed.md b/docs/source/zh/main_classes/deepspeed.md
index 85c5d017ef3c4f..75a0a13df75e24 100644
--- a/docs/source/zh/main_classes/deepspeed.md
+++ b/docs/source/zh/main_classes/deepspeed.md
@@ -178,7 +178,7 @@ deepspeed --num_gpus=2 your_program.py  --deepspeed ds_config.js
 ```bash
 deepspeed examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero3.json \
---model_name_or_path t5-small --per_device_train_batch_size 1 \
+--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
 --output_dir output_dir --overwrite_output_dir --fp16 \
 --do_train --max_train_samples 500 --num_train_epochs 1 \
 --dataset_name wmt16 --dataset_config "ro-en" \
@@ -201,7 +201,7 @@ deepspeed examples/pytorch/translation/run_translation.py \
 ```bash
 deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero2.json \
---model_name_or_path t5-small --per_device_train_batch_size 1 \
+--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
 --output_dir output_dir --overwrite_output_dir --fp16 \
 --do_train --max_train_samples 500 --num_train_epochs 1 \
 --dataset_name wmt16 --dataset_config "ro-en" \
@@ -1628,7 +1628,7 @@ from transformers import T5ForConditionalGeneration, T5Config
 import deepspeed
 
 with deepspeed.zero.Init():
-    config = T5Config.from_pretrained("t5-small")
+    config = T5Config.from_pretrained("google-t5/t5-small")
     model = T5ForConditionalGeneration(config)
 ```
 
@@ -1640,7 +1640,7 @@ with deepspeed.zero.Init():
 from transformers import AutoModel, Trainer, TrainingArguments
 
 training_args = TrainingArguments(..., deepspeed=ds_config)
-model = AutoModel.from_pretrained("t5-small")
+model = AutoModel.from_pretrained("google-t5/t5-small")
 trainer = Trainer(model=model, args=training_args, ...)
 ```
 
@@ -1690,7 +1690,7 @@ deepspeed --num_gpus=2 your_program.py  --do_eval --deepspeed ds
 ```bash
 deepspeed examples/pytorch/translation/run_translation.py \
 --deepspeed tests/deepspeed/ds_config_zero3.json \
---model_name_or_path t5-small --output_dir output_dir \
+--model_name_or_path google-t5/t5-small --output_dir output_dir \
 --do_eval --max_eval_samples 50 --warmup_steps 50  \
 --max_source_length 128 --val_max_target_length 128 \
 --overwrite_output_dir --per_device_eval_batch_size 4 \
@@ -1870,7 +1870,7 @@ import deepspeed
 ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model to detect zero 3
 dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
-model = AutoModel.from_pretrained("gpt2")
+model = AutoModel.from_pretrained("openai-community/gpt2")
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 ```
 
@@ -1884,7 +1884,7 @@ import deepspeed
 ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model to detect zero 3
 dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
-config = AutoConfig.from_pretrained("gpt2")
+config = AutoConfig.from_pretrained("openai-community/gpt2")
 model = AutoModel.from_config(config)
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 ```
diff --git a/docs/source/zh/main_classes/output.md b/docs/source/zh/main_classes/output.md
index 1619e27219d834..f4d5c3c6941d51 100644
--- a/docs/source/zh/main_classes/output.md
+++ b/docs/source/zh/main_classes/output.md
@@ -24,8 +24,8 @@ rendered properly in your Markdown viewer.
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 
-tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
 
 inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
 labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
diff --git a/docs/source/zh/main_classes/pipelines.md b/docs/source/zh/main_classes/pipelines.md
index 82d6de8e7161a4..3cef40478c39a9 100644
--- a/docs/source/zh/main_classes/pipelines.md
+++ b/docs/source/zh/main_classes/pipelines.md
@@ -39,7 +39,7 @@ pipelines是使用模型进行推理的一种简单方法。这些pipelines是
 如果您想使用 [hub](https://huggingface.co) 上的特定模型,可以忽略任务,如果hub上的模型已经定义了该任务:
 
 ```python
->>> pipe = pipeline(model="roberta-large-mnli")
+>>> pipe = pipeline(model="FacebookAI/roberta-large-mnli")
 >>> pipe("This restaurant is awesome")
 [{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
 ```
diff --git a/docs/source/zh/main_classes/trainer.md b/docs/source/zh/main_classes/trainer.md
index 049a3724114bd2..cb0262140cb22d 100644
--- a/docs/source/zh/main_classes/trainer.md
+++ b/docs/source/zh/main_classes/trainer.md
@@ -462,7 +462,7 @@ sudo ln -s /usr/bin/g++-7  /usr/local/cuda-10.2/bin/g++
 export TASK_NAME=mrpc
 
 python examples/pytorch/text-classification/run_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
   --do_train \
   --do_eval \
@@ -597,7 +597,7 @@ cd transformers
 
 accelerate launch \
 ./examples/pytorch/text-classification/run_glue.py \
---model_name_or_path bert-base-cased \
+--model_name_or_path google-bert/bert-base-cased \
 --task_name $TASK_NAME \
 --do_train \
 --do_eval \
@@ -622,7 +622,7 @@ accelerate launch --num_processes=2 \
 --fsdp_sharding_strategy=1 \
 --fsdp_state_dict_type=FULL_STATE_DICT \
 ./examples/pytorch/text-classification/run_glue.py
---model_name_or_path bert-base-cased \
+--model_name_or_path google-bert/bert-base-cased \
 --task_name $TASK_NAME \
 --do_train \
 --do_eval \
diff --git a/docs/source/zh/model_sharing.md b/docs/source/zh/model_sharing.md
index fbea41a90398ee..e28a000c11535e 100644
--- a/docs/source/zh/model_sharing.md
+++ b/docs/source/zh/model_sharing.md
@@ -235,4 +235,4 @@ pip install huggingface_hub
 * 手动创建并上传一个`README.md`文件。
 * 在你的模型仓库中点击**编辑模型卡片**按钮。
 
-可以参考DistilBert的[模型卡片](https://huggingface.co/distilbert-base-uncased)来了解模型卡片应该包含的信息类型。有关您可以在`README.md`文件中控制的更多选项的细节,例如模型的碳足迹或小部件示例,请参考文档[这里](https://huggingface.co/docs/hub/models-cards)。
\ No newline at end of file
+可以参考DistilBert的[模型卡片](https://huggingface.co/distilbert/distilbert-base-uncased)来了解模型卡片应该包含的信息类型。有关您可以在`README.md`文件中控制的更多选项的细节,例如模型的碳足迹或小部件示例,请参考文档[这里](https://huggingface.co/docs/hub/models-cards)。
\ No newline at end of file
diff --git a/docs/source/zh/multilingual.md b/docs/source/zh/multilingual.md
index 7e8ab1336d9933..9c27bd5f335ba0 100644
--- a/docs/source/zh/multilingual.md
+++ b/docs/source/zh/multilingual.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-🤗 Transformers 中有多种多语言模型,它们的推理用法与单语言模型不同。但是,并非*所有*的多语言模型用法都不同。一些模型,例如 [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) 就可以像单语言模型一样使用。本指南将向您展示如何使用不同用途的多语言模型进行推理。
+🤗 Transformers 中有多种多语言模型,它们的推理用法与单语言模型不同。但是,并非*所有*的多语言模型用法都不同。一些模型,例如 [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased) 就可以像单语言模型一样使用。本指南将向您展示如何使用不同用途的多语言模型进行推理。
 
 ## XLM
 
@@ -28,24 +28,24 @@ XLM 有十个不同的检查点,其中只有一个是单语言的。剩下的
 
 以下 XLM 模型使用语言嵌入来指定推理中使用的语言:
 
-- `xlm-mlm-ende-1024` (掩码语言建模,英语-德语)
-- `xlm-mlm-enfr-1024` (掩码语言建模,英语-法语)
-- `xlm-mlm-enro-1024` (掩码语言建模,英语-罗马尼亚语)
-- `xlm-mlm-xnli15-1024` (掩码语言建模,XNLI 数据集语言)
-- `xlm-mlm-tlm-xnli15-1024` (掩码语言建模+翻译,XNLI 数据集语言)
-- `xlm-clm-enfr-1024` (因果语言建模,英语-法语)
-- `xlm-clm-ende-1024` (因果语言建模,英语-德语)
+- `FacebookAI/xlm-mlm-ende-1024` (掩码语言建模,英语-德语)
+- `FacebookAI/xlm-mlm-enfr-1024` (掩码语言建模,英语-法语)
+- `FacebookAI/xlm-mlm-enro-1024` (掩码语言建模,英语-罗马尼亚语)
+- `FacebookAI/xlm-mlm-xnli15-1024` (掩码语言建模,XNLI 数据集语言)
+- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (掩码语言建模+翻译,XNLI 数据集语言)
+- `FacebookAI/xlm-clm-enfr-1024` (因果语言建模,英语-法语)
+- `FacebookAI/xlm-clm-ende-1024` (因果语言建模,英语-德语)
 
 语言嵌入被表示为一个张量,其形状与传递给模型的 `input_ids` 相同。这些张量中的值取决于所使用的语言,并由分词器的 `lang2id` 和 `id2lang` 属性识别。
 
-在此示例中,加载 `xlm-clm-enfr-1024` 检查点(因果语言建模,英语-法语):
+在此示例中,加载 `FacebookAI/xlm-clm-enfr-1024` 检查点(因果语言建模,英语-法语):
 
 ```py
 >>> import torch
 >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
->>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
->>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
 ```
 
 分词器的 `lang2id` 属性显示了该模型的语言及其对应的id:
@@ -83,8 +83,8 @@ XLM 有十个不同的检查点,其中只有一个是单语言的。剩下的
 
 以下 XLM 模型在推理时不需要语言嵌入:
 
-- `xlm-mlm-17-1280` (掩码语言建模,支持 17 种语言)
-- `xlm-mlm-100-1280` (掩码语言建模,支持 100 种语言)
+- `FacebookAI/xlm-mlm-17-1280` (掩码语言建模,支持 17 种语言)
+- `FacebookAI/xlm-mlm-100-1280` (掩码语言建模,支持 100 种语言)
 
 与之前的 XLM 检查点不同,这些模型用于通用句子表示。
 
@@ -92,8 +92,8 @@ XLM 有十个不同的检查点,其中只有一个是单语言的。剩下的
 
 以下 BERT 模型可用于多语言任务:
 
-- `bert-base-multilingual-uncased` (掩码语言建模 + 下一句预测,支持 102 种语言)
-- `bert-base-multilingual-cased` (掩码语言建模 + 下一句预测,支持 104 种语言)
+- `google-bert/bert-base-multilingual-uncased` (掩码语言建模 + 下一句预测,支持 102 种语言)
+- `google-bert/bert-base-multilingual-cased` (掩码语言建模 + 下一句预测,支持 104 种语言)
 
 这些模型在推理时不需要语言嵌入。它们应该能够从上下文中识别语言并进行相应的推理。
 
@@ -101,8 +101,8 @@ XLM 有十个不同的检查点,其中只有一个是单语言的。剩下的
 
 以下 XLM-RoBERTa 模型可用于多语言任务:
 
-- `xlm-roberta-base` (掩码语言建模,支持 100 种语言)
-- `xlm-roberta-large` (掩码语言建模,支持 100 种语言)
+- `FacebookAI/xlm-roberta-base` (掩码语言建模,支持 100 种语言)
+- `FacebookAI/xlm-roberta-large` (掩码语言建模,支持 100 种语言)
 
 XLM-RoBERTa 使用 100 种语言的 2.5TB 新创建和清理的 CommonCrawl 数据进行了训练。与之前发布的 mBERT 或 XLM 等多语言模型相比,它在分类、序列标记和问答等下游任务上提供了更强大的优势。
 
diff --git a/docs/source/zh/perf_hardware.md b/docs/source/zh/perf_hardware.md
index e193e09cd8cb71..95a09eaab4e103 100644
--- a/docs/source/zh/perf_hardware.md
+++ b/docs/source/zh/perf_hardware.md
@@ -136,7 +136,7 @@ GPU1    PHB      X      0-11            N/A
 # DDP w/ NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -145,7 +145,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 # DDP w/o NVLink
 
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/zh/pipeline_tutorial.md b/docs/source/zh/pipeline_tutorial.md
index 01e621840cd3c8..568f8bb63603c2 100644
--- a/docs/source/zh/pipeline_tutorial.md
+++ b/docs/source/zh/pipeline_tutorial.md
@@ -175,7 +175,7 @@ def data():
         yield f"My example {i}"
 
 
-pipe = pipeline(model="gpt2", device=0)
+pipe = pipeline(model="openai-community/gpt2", device=0)
 generated_characters = 0
 for out in pipe(data()):
     generated_characters += len(out[0]["generated_text"])
diff --git a/docs/source/zh/preprocessing.md b/docs/source/zh/preprocessing.md
index 266cf0e6b9ef3c..b90c89b36d1567 100644
--- a/docs/source/zh/preprocessing.md
+++ b/docs/source/zh/preprocessing.md
@@ -56,7 +56,7 @@ pip install datasets
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 ```
 
 然后将您的文本传递给`tokenizer`:
diff --git a/docs/source/zh/quicktour.md b/docs/source/zh/quicktour.md
index 75b5f398e9463e..c23a38ab5f0004 100644
--- a/docs/source/zh/quicktour.md
+++ b/docs/source/zh/quicktour.md
@@ -73,7 +73,7 @@ pip install tensorflow
 >>> classifier = pipeline("sentiment-analysis")
 ```
 
-[`pipeline`] 会下载并缓存一个用于情感分析的默认的[预训练模型](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)和分词器。现在你可以在目标文本上使用 `classifier` 了:
+[`pipeline`] 会下载并缓存一个用于情感分析的默认的[预训练模型](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english)和分词器。现在你可以在目标文本上使用 `classifier` 了:
 
 ```py
 >>> classifier("We are very happy to show you the 🤗 Transformers library.")
@@ -379,7 +379,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
 ```py
 >>> from transformers import AutoConfig
 
->>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
 ```
 
 
@@ -416,7 +416,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. [`TrainingArguments`] 含有你可以修改的模型超参数,比如学习率,批次大小和训练时的迭代次数。如果你没有指定训练参数,那么它会使用默认值:
@@ -438,7 +438,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 4. 加载一个数据集:
@@ -506,7 +506,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import TFAutoModelForSequenceClassification
 
-   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 2. 一个预处理类,比如分词器,特征提取器或者处理器:
@@ -514,7 +514,7 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
    ```py
    >>> from transformers import AutoTokenizer
 
-   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    ```
 
 3. 创建一个给数据集分词的函数
diff --git a/docs/source/zh/run_scripts.md b/docs/source/zh/run_scripts.md
index 0a0121c32f0b27..b6e9c8ea6a2d89 100644
--- a/docs/source/zh/run_scripts.md
+++ b/docs/source/zh/run_scripts.md
@@ -88,11 +88,11 @@ pip install -r requirements.txt
 
 
 
-示例脚本从🤗 [Datasets](https://huggingface.co/docs/datasets/)库下载并预处理数据集。然后,脚本通过[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer)使用支持摘要任务的架构对数据集进行微调。以下示例展示了如何在[CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail)数据集上微调[T5-small](https://huggingface.co/t5-small)。由于T5模型的训练方式,它需要一个额外的`source_prefix`参数。这个提示让T5知道这是一个摘要任务。
+示例脚本从🤗 [Datasets](https://huggingface.co/docs/datasets/)库下载并预处理数据集。然后,脚本通过[Trainer](https://huggingface.co/docs/transformers/main_classes/trainer)使用支持摘要任务的架构对数据集进行微调。以下示例展示了如何在[CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail)数据集上微调[T5-small](https://huggingface.co/google-t5/t5-small)。由于T5模型的训练方式,它需要一个额外的`source_prefix`参数。这个提示让T5知道这是一个摘要任务。
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -107,11 +107,11 @@ python examples/pytorch/summarization/run_summarization.py \
 
 
 
-示例脚本从  🤗 [Datasets](https://huggingface.co/docs/datasets/) 库下载并预处理数据集。然后,脚本使用 Keras 在支持摘要的架构上微调数据集。以下示例展示了如何在 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 数据集上微调 [T5-small](https://huggingface.co/t5-small)。T5 模型由于训练方式需要额外的 `source_prefix` 参数。这个提示让 T5 知道这是一个摘要任务。
+示例脚本从  🤗 [Datasets](https://huggingface.co/docs/datasets/) 库下载并预处理数据集。然后,脚本使用 Keras 在支持摘要的架构上微调数据集。以下示例展示了如何在 [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) 数据集上微调 [T5-small](https://huggingface.co/google-t5/t5-small)。T5 模型由于训练方式需要额外的 `source_prefix` 参数。这个提示让 T5 知道这是一个摘要任务。
 
 ```bash
 python examples/tensorflow/summarization/run_summarization.py  \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -136,7 +136,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -161,7 +161,7 @@ TensorFlow脚本使用[`MirroredStrategy`](https://www.tensorflow.org/guide/dist
 ```bash
 python xla_spawn.py --num_cores 8 \
     summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -181,7 +181,7 @@ python xla_spawn.py --num_cores 8 \
 ```bash
 python run_summarization.py  \
     --tpu name_of_tpu_resource \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --output_dir /tmp/tst-summarization  \
@@ -219,7 +219,7 @@ accelerate test
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -238,7 +238,7 @@ accelerate launch run_summarization_no_trainer.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -264,7 +264,7 @@ python examples/pytorch/summarization/run_summarization.py \
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --max_train_samples 50 \
     --max_eval_samples 50 \
     --max_predict_samples 50 \
@@ -294,7 +294,7 @@ examples/pytorch/summarization/run_summarization.py -h
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -312,7 +312,7 @@ python examples/pytorch/summarization/run_summarization.py
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -343,7 +343,7 @@ huggingface-cli login
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
diff --git a/docs/source/zh/serialization.md b/docs/source/zh/serialization.md
index 584befebe2d76b..b9cc74e5849d63 100644
--- a/docs/source/zh/serialization.md
+++ b/docs/source/zh/serialization.md
@@ -56,10 +56,10 @@ pip install optimum[exporters]
 optimum-cli export onnx --help
 ```
 
-运行以下命令,以从 🤗 Hub 导出模型的检查点(checkpoint),以 `distilbert-base-uncased-distilled-squad` 为例:
+运行以下命令,以从 🤗 Hub 导出模型的检查点(checkpoint),以 `distilbert/distilbert-base-uncased-distilled-squad` 为例:
 
 ```bash
-optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
 ```
 
 你应该能在日志中看到导出进度以及生成的 `model.onnx` 文件的保存位置,如下所示:
@@ -141,7 +141,7 @@ pip install transformers[onnx]
 将 `transformers.onnx` 包作为 Python 模块使用,以使用现成的配置导出检查点:
 
 ```bash
-python -m transformers.onnx --model=distilbert-base-uncased onnx/
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
 ```
 
 以上代码将导出由 `--model` 参数定义的检查点的 ONNX 图。可以传入 🤗 Hub 上或本地存储的任何检查点。生成的 `model.onnx` 文件可以在支持 ONNX 标准的众多加速引擎上运行。例如,使用 ONNX Runtime 加载并运行模型,如下所示:
@@ -150,7 +150,7 @@ python -m transformers.onnx --model=distilbert-base-uncased onnx/
 >>> from transformers import AutoTokenizer
 >>> from onnxruntime import InferenceSession
 
->>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
 >>> session = InferenceSession("onnx/model.onnx")
 >>> # ONNX Runtime expects NumPy arrays as input
 >>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
diff --git a/docs/source/zh/task_summary.md b/docs/source/zh/task_summary.md
index da60f4a080a2e9..8d088bfa71b2d0 100644
--- a/docs/source/zh/task_summary.md
+++ b/docs/source/zh/task_summary.md
@@ -272,7 +272,7 @@ score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
 >>> from transformers import pipeline
 
 >>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
->>> translator = pipeline(task="translation", model="t5-small")
+>>> translator = pipeline(task="translation", model="google-t5/t5-small")
 >>> translator(text)
 [{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
 ```
diff --git a/docs/source/zh/tf_xla.md b/docs/source/zh/tf_xla.md
index da8d13d8d04bac..2e5b444d876c0a 100644
--- a/docs/source/zh/tf_xla.md
+++ b/docs/source/zh/tf_xla.md
@@ -86,8 +86,8 @@ from transformers.utils import check_min_version
 check_min_version("4.21.0")
 
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 # One line to create an XLA generation function
@@ -115,8 +115,8 @@ print(f"Generated -- {decoded_text}")
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 input_string = ["TensorFlow is"]
 
 xla_generate = tf.function(model.generate, jit_compile=True)
@@ -136,8 +136,8 @@ import time
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left", pad_token="")
-model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
 xla_generate = tf.function(model.generate, jit_compile=True)
 
diff --git a/docs/source/zh/tflite.md b/docs/source/zh/tflite.md
index bf47d411447a0a..f0280156def431 100644
--- a/docs/source/zh/tflite.md
+++ b/docs/source/zh/tflite.md
@@ -32,10 +32,10 @@ pip install optimum[exporters-tf]
 optimum-cli export tflite --help
 ```
 
-运行以下命令,以从 🤗 Hub 导出模型的检查点(checkpoint),以 `bert-base-uncased` 为例:
+运行以下命令,以从 🤗 Hub 导出模型的检查点(checkpoint),以 `google-bert/bert-base-uncased` 为例:
 
 ```bash
-optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/
+optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/
 ```
 
 你应该能在日志中看到导出进度以及生成的 `model.tflite` 文件的保存位置,如下所示:
diff --git a/docs/source/zh/tokenizer_summary.md b/docs/source/zh/tokenizer_summary.md
index d3a4cf7a33058e..c349154f961218 100644
--- a/docs/source/zh/tokenizer_summary.md
+++ b/docs/source/zh/tokenizer_summary.md
@@ -92,7 +92,7 @@ and [SentencePiece](#sentencepiece),并且给出了示例,哪个模型用到
 ```py
 >>> from transformers import BertTokenizer
 
->>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 >>> tokenizer.tokenize("I have a new GPU!")
 ["i", "have", "a", "new", "gp", "##u", "!"]
 ```
@@ -106,7 +106,7 @@ token应该附着在前面那个token的后面,不带空格的附着(分词
 ```py
 >>> from transformers import XLNetTokenizer
 
->>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased")
 >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
 ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
 ```
diff --git a/docs/source/zh/training.md b/docs/source/zh/training.md
index 89908130fe303a..773c58181c31e9 100644
--- a/docs/source/zh/training.md
+++ b/docs/source/zh/training.md
@@ -48,7 +48,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 
 
 >>> def tokenize_function(examples):
@@ -85,7 +85,7 @@ rendered properly in your Markdown viewer.
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 
@@ -180,7 +180,7 @@ dataset = dataset["train"]  # Just take the training split for now
 ```py
 from transformers import AutoTokenizer
 
-tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
 # Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
 tokenized_data = dict(tokenized_data)
@@ -194,7 +194,7 @@ from transformers import TFAutoModelForSequenceClassification
 from tensorflow.keras.optimizers import Adam
 
 # Load and compile our model
-model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
 # Lower learning rates are often better for fine-tuning transformers
 model.compile(optimizer=Adam(3e-5))  # No loss argument!
 
@@ -306,7 +306,7 @@ torch.cuda.empty_cache()
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
->>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
 ```
 
 ### Optimizer and learning rate scheduler
diff --git a/examples/README.md b/examples/README.md
index 3a18950064bfdb..a38b4576b35fd3 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -118,8 +118,8 @@ pip install runhouse
 # For an on-demand V100 with whichever cloud provider you have configured:
 python run_on_remote.py \
     --example pytorch/text-generation/run_generation.py \
     --model_type=gpt2 \
-    --model_name_or_path=gpt2 \
+    --model_name_or_path=openai-community/gpt2 \
     --prompt "I am a language model and"
 
 # For byo (bring your own) cluster:
diff --git a/examples/flax/image-captioning/README.md b/examples/flax/image-captioning/README.md
index b76dc4cd057f66..dd2b420639258f 100644
--- a/examples/flax/image-captioning/README.md
+++ b/examples/flax/image-captioning/README.md
@@ -34,7 +34,7 @@ Next, we create a [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/tr
 python3 create_model_from_encoder_decoder_models.py \
     --output_dir model \
     --encoder_model_name_or_path google/vit-base-patch16-224-in21k \
-    --decoder_model_name_or_path gpt2
+    --decoder_model_name_or_path openai-community/gpt2
 ```
 
 ### Train the model
diff --git a/examples/flax/language-modeling/README.md b/examples/flax/language-modeling/README.md
index e687c76a9cc20d..cb8671147ff98c 100644
--- a/examples/flax/language-modeling/README.md
+++ b/examples/flax/language-modeling/README.md
@@ -28,7 +28,7 @@ way which enables simple and efficient model parallelism.
 In the following, we demonstrate how to train a bi-directional transformer model 
 using masked language modeling objective as introduced in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
 More specifically, we demonstrate how JAX/Flax can be leveraged 
-to pre-train [**`roberta-base`**](https://huggingface.co/roberta-base)
+to pre-train [**`FacebookAI/roberta-base`**](https://huggingface.co/FacebookAI/roberta-base)
 in Norwegian on a single TPUv3-8 pod.
 
 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
@@ -76,13 +76,13 @@ tokenizer.save("./norwegian-roberta-base/tokenizer.json")
 ### Create configuration
 
 Next, we create the model's configuration file. This is as simple 
-as loading and storing [`**roberta-base**`](https://huggingface.co/roberta-base)
+as loading and storing [`**FacebookAI/roberta-base**`](https://huggingface.co/FacebookAI/roberta-base)
 in the local model folder:
 
 ```python
 from transformers import RobertaConfig
 
-config = RobertaConfig.from_pretrained("roberta-base", vocab_size=50265)
+config = RobertaConfig.from_pretrained("FacebookAI/roberta-base", vocab_size=50265)
 config.save_pretrained("./norwegian-roberta-base")
 ```
 
@@ -129,8 +129,8 @@ look at [this](https://colab.research.google.com/github/huggingface/notebooks/bl
 
 In the following, we demonstrate how to train an auto-regressive causal transformer model 
 in JAX/Flax.
-More specifically, we pretrain a randomly initialized [**`gpt2`**](https://huggingface.co/gpt2) model in Norwegian on a single TPUv3-8.
-to pre-train 124M [**`gpt2`**](https://huggingface.co/gpt2)
+More specifically, we pretrain a randomly initialized 124M-parameter [**`openai-community/gpt2`**](https://huggingface.co/openai-community/gpt2) model
 in Norwegian on a single TPUv3-8 pod.
 
 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
@@ -179,13 +179,13 @@ tokenizer.save("./norwegian-gpt2/tokenizer.json")
 ### Create configuration
 
 Next, we create the model's configuration file. This is as simple 
-as loading and storing [`**gpt2**`](https://huggingface.co/gpt2)
+as loading and storing [`**openai-community/gpt2**`](https://huggingface.co/openai-community/gpt2)
 in the local model folder:
 
 ```python
 from transformers import GPT2Config
 
-config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50257)
+config = GPT2Config.from_pretrained("openai-community/gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50257)
 config.save_pretrained("./norwegian-gpt2")
 ```
 
@@ -199,7 +199,7 @@ Finally, we can run the example script to pretrain the model:
 ```bash
 python run_clm_flax.py \
     --output_dir="./norwegian-gpt2" \
-    --model_type="gpt2" \
+    --model_type="openai-community/gpt2" \
     --config_name="./norwegian-gpt2" \
     --tokenizer_name="./norwegian-gpt2" \
     --dataset_name="oscar" \
diff --git a/examples/flax/question-answering/README.md b/examples/flax/question-answering/README.md
index 822342a99e2168..2f6caa984d4bc1 100644
--- a/examples/flax/question-answering/README.md
+++ b/examples/flax/question-answering/README.md
@@ -29,7 +29,7 @@ The following example fine-tunes BERT on SQuAD:
 
 ```bash
 python run_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --do_train   \
   --do_eval   \
@@ -67,7 +67,7 @@ Here is an example training on 4 TITAN RTX GPUs and Bert Whole Word Masking unca
 ```bash
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python run_qa.py   \
---model_name_or_path bert-large-uncased-whole-word-masking   \
+--model_name_or_path google-bert/bert-large-uncased-whole-word-masking   \
 --dataset_name squad   \
 --do_train   \
 --do_eval   \
diff --git a/examples/flax/test_flax_examples.py b/examples/flax/test_flax_examples.py
index 47ac66de118aaa..9fc424c1a7532c 100644
--- a/examples/flax/test_flax_examples.py
+++ b/examples/flax/test_flax_examples.py
@@ -78,7 +78,7 @@ def test_run_glue(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_glue.py
-            --model_name_or_path distilbert-base-uncased
+            --model_name_or_path distilbert/distilbert-base-uncased
             --output_dir {tmp_dir}
             --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
             --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
@@ -101,7 +101,7 @@ def test_run_clm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_clm_flax.py
-            --model_name_or_path distilgpt2
+            --model_name_or_path distilbert/distilgpt2
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --do_train
@@ -125,7 +125,7 @@ def test_run_summarization(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_summarization.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --train_file tests/fixtures/tests_samples/xsum/sample.json
             --validation_file tests/fixtures/tests_samples/xsum/sample.json
             --test_file tests/fixtures/tests_samples/xsum/sample.json
@@ -155,7 +155,7 @@ def test_run_mlm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_mlm.py
-            --model_name_or_path distilroberta-base
+            --model_name_or_path distilbert/distilroberta-base
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --output_dir {tmp_dir}
@@ -179,7 +179,7 @@ def test_run_t5_mlm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_t5_mlm_flax.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --do_train
@@ -206,7 +206,7 @@ def test_run_ner(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_flax_ner.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/conll/sample.json
             --validation_file tests/fixtures/tests_samples/conll/sample.json
             --output_dir {tmp_dir}
@@ -233,7 +233,7 @@ def test_run_qa(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_qa.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --version_2_with_negative
             --train_file tests/fixtures/tests_samples/SQUAD/sample.json
             --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
diff --git a/examples/flax/text-classification/README.md b/examples/flax/text-classification/README.md
index 8d43ab7725a241..65e50a075b78d5 100644
--- a/examples/flax/text-classification/README.md
+++ b/examples/flax/text-classification/README.md
@@ -31,7 +31,7 @@ GLUE is made up of a total of 9 different tasks. Here is how to run the script o
 export TASK_NAME=mrpc
 
 python run_flax_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name ${TASK_NAME} \
   --max_seq_length 128 \
   --learning_rate 2e-5 \
diff --git a/examples/flax/token-classification/README.md b/examples/flax/token-classification/README.md
index 915cf6ae20ff93..1f8175072148bb 100644
--- a/examples/flax/token-classification/README.md
+++ b/examples/flax/token-classification/README.md
@@ -25,7 +25,7 @@ The following example fine-tunes BERT on CoNLL-2003:
 
 ```bash
 python run_flax_ner.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name conll2003 \
   --max_seq_length 128 \
   --learning_rate 2e-5 \
diff --git a/examples/legacy/benchmarking/README.md b/examples/legacy/benchmarking/README.md
index 7099ed9f6b3d3d..03e174770d1077 100644
--- a/examples/legacy/benchmarking/README.md
+++ b/examples/legacy/benchmarking/README.md
@@ -22,5 +22,5 @@ If you would like to list benchmark results on your favorite models of the [mode
 
 | Benchmark description | Results | Environment info |      Author      |
 |:----------|:-------------|:-------------|------:|
-| PyTorch Benchmark on inference for `bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Partick von Platen](https://github.com/patrickvonplaten) | 
-| PyTorch Benchmark on inference for `bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Partick von Platen](https://github.com/patrickvonplaten) | 
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
diff --git a/examples/legacy/question-answering/README.md b/examples/legacy/question-answering/README.md
index 905fabf35bdf6c..339837c94f5d86 100644
--- a/examples/legacy/question-answering/README.md
+++ b/examples/legacy/question-answering/README.md
@@ -1,7 +1,7 @@
 #### Fine-tuning BERT on SQuAD1.0 with relative position embeddings
 
 The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model 
-`bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained 
+`google-bert/bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained 
 models which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as in the BERT model 
 training, but with different relative position embeddings. 
 
@@ -10,7 +10,7 @@ Shaw et al., [Self-Attention with Relative Position Representations](https://arx
 * `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4 
 in Huang et al. [Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)
 * `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from model 
-`bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al. 
+`google-bert/bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al. 
 [Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)
 
 
@@ -61,7 +61,7 @@ torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
     --gradient_accumulation_steps 3
 ```
 Training with the above command leads to the f1 score of 93.52, which is slightly better than the f1 score of 93.15 for 
-`bert-large-uncased-whole-word-masking`.
+`google-bert/bert-large-uncased-whole-word-masking`.
 
 #### Distributed training
 
@@ -69,7 +69,7 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word
 
 ```bash
 torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
     --dataset_name squad \
     --do_train \
     --do_eval \
@@ -90,7 +90,7 @@ exact_match = 86.91
 ```
 
 This fine-tuned model is available as a checkpoint under the reference
-[`bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).
+[`google-bert/bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
 
 ## Results
 
diff --git a/examples/legacy/run_camembert.py b/examples/legacy/run_camembert.py
index 9651570b39e1e8..67e04babe1043e 100755
--- a/examples/legacy/run_camembert.py
+++ b/examples/legacy/run_camembert.py
@@ -39,8 +39,8 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
     return topk_filled_outputs
 
 
-tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
-model = CamembertForMaskedLM.from_pretrained("camembert-base")
+tokenizer = CamembertTokenizer.from_pretrained("almanach/camembert-base")
+model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")
 model.eval()
 
 masked_input = "Le camembert est  :)"
diff --git a/examples/legacy/run_openai_gpt.py b/examples/legacy/run_openai_gpt.py
index 03031f205768ff..d0c21aba27eaca 100755
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@@ -20,7 +20,7 @@
 
     This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset:
         python run_openai_gpt.py \
-          --model_name openai-gpt \
+          --model_name openai-community/openai-gpt \
           --do_train \
           --do_eval \
           --train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
@@ -104,7 +104,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
+    parser.add_argument("--model_name", type=str, default="openai-community/openai-gpt", help="pretrained model name")
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
     parser.add_argument(
diff --git a/examples/legacy/run_transfo_xl.py b/examples/legacy/run_transfo_xl.py
index 7ee941150852e1..1c48974f39c77a 100755
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@@ -40,7 +40,7 @@
 
 def main():
     parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
-    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument("--model_name", type=str, default="transfo-xl/transfo-xl-wt103", help="pretrained model name")
     parser.add_argument(
         "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
     )
diff --git a/examples/legacy/seq2seq/README.md b/examples/legacy/seq2seq/README.md
index e6e3e20dcf8a96..f574ccabda2c4a 100644
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@@ -170,7 +170,7 @@ If 'translation' is in your task name, the computed metric will be BLEU. Otherwi
 For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
 ```bash
 export DATA_DIR=wmt_en_ro
-./run_eval.py t5-base \
+./run_eval.py google-t5/t5-base \
     $DATA_DIR/val.source t5_val_generations.txt \
     --reference_path $DATA_DIR/val.target \
     --score_path enro_bleu.json \
diff --git a/examples/legacy/seq2seq/old_test_datasets.py b/examples/legacy/seq2seq/old_test_datasets.py
index 0b907b1ed9fbb6..be108f7645f8a9 100644
--- a/examples/legacy/seq2seq/old_test_datasets.py
+++ b/examples/legacy/seq2seq/old_test_datasets.py
@@ -28,7 +28,7 @@
 from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
 
 
-BERT_BASE_CASED = "bert-base-cased"
+BERT_BASE_CASED = "google-bert/bert-base-cased"
 PEGASUS_XSUM = "google/pegasus-xsum"
 ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
 SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
diff --git a/examples/legacy/seq2seq/pack_dataset.py b/examples/legacy/seq2seq/pack_dataset.py
index 8b069e452a7177..5c13c74f412df6 100755
--- a/examples/legacy/seq2seq/pack_dataset.py
+++ b/examples/legacy/seq2seq/pack_dataset.py
@@ -74,7 +74,7 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
 
 def packer_cli():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--tok_name", type=str, help="like facebook/bart-large-cnn,t5-base, etc.")
+    parser.add_argument("--tok_name", type=str, help="like facebook/bart-large-cnn,google-t5/t5-base, etc.")
     parser.add_argument("--max_seq_len", type=int, default=128)
     parser.add_argument("--data_dir", type=str)
     parser.add_argument("--save_path", type=str)
diff --git a/examples/legacy/seq2seq/run_distributed_eval.py b/examples/legacy/seq2seq/run_distributed_eval.py
index 4e8283727750b5..40a946f81c5e15 100755
--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
@@ -124,7 +124,7 @@ def run_generate():
     parser.add_argument(
         "--model_name",
         type=str,
-        help="like facebook/bart-large-cnn,t5-base, etc.",
+        help="like facebook/bart-large-cnn,google-t5/t5-base, etc.",
         default="sshleifer/distilbart-xsum-12-3",
     )
     parser.add_argument("--save_dir", type=str, help="where to save", default="tmp_gen")
diff --git a/examples/legacy/seq2seq/run_eval.py b/examples/legacy/seq2seq/run_eval.py
index cc9ceae6f83828..f69e5d51264c78 100755
--- a/examples/legacy/seq2seq/run_eval.py
+++ b/examples/legacy/seq2seq/run_eval.py
@@ -100,7 +100,7 @@ def run_generate(verbose=True):
     """
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,t5-base, etc.")
+    parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,google-t5/t5-base, etc.")
     parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
     parser.add_argument("save_path", type=str, help="where to save summaries")
     parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test.target")
diff --git a/examples/legacy/token-classification/README.md b/examples/legacy/token-classification/README.md
index c2fa6eec7282b2..fbf17f84d2d7ee 100644
--- a/examples/legacy/token-classification/README.md
+++ b/examples/legacy/token-classification/README.md
@@ -34,7 +34,7 @@ Let's define some variables that we need for further pre-processing steps and tr
 
 ```bash
 export MAX_LENGTH=128
-export BERT_MODEL=bert-base-multilingual-cased
+export BERT_MODEL=google-bert/bert-base-multilingual-cased
 ```
 
 Run the pre-processing script on training, dev and test datasets:
@@ -92,7 +92,7 @@ Instead of passing all parameters via commandline arguments, the `run_ner.py` sc
 {
     "data_dir": ".",
     "labels": "./labels.txt",
-    "model_name_or_path": "bert-base-multilingual-cased",
+    "model_name_or_path": "google-bert/bert-base-multilingual-cased",
     "output_dir": "germeval-model",
     "max_seq_length": 128,
     "num_train_epochs": 3,
@@ -222,7 +222,7 @@ Let's define some variables that we need for further pre-processing steps:
 
 ```bash
 export MAX_LENGTH=128
-export BERT_MODEL=bert-large-cased
+export BERT_MODEL=google-bert/bert-large-cased
 ```
 
 Here we use the English BERT large model for fine-tuning.
@@ -250,7 +250,7 @@ This configuration file looks like:
 {
     "data_dir": "./data_wnut_17",
     "labels": "./data_wnut_17/labels.txt",
-    "model_name_or_path": "bert-large-cased",
+    "model_name_or_path": "google-bert/bert-large-cased",
     "output_dir": "wnut-17-model-1",
     "max_seq_length": 128,
     "num_train_epochs": 3,
diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py
index 2b54c7c4a49159..e7e3a157e30516 100644
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -113,7 +113,7 @@ def convert_examples_to_features(
             for word, label in zip(example.words, example.labels):
                 word_tokens = tokenizer.tokenize(word)
 
-                # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
+                # google-bert/bert-base-multilingual-cased sometimes outputs nothing ([]) when calling tokenize with just a space.
                 if len(word_tokens) > 0:
                     tokens.extend(word_tokens)
                     # Use the real label id for the first token of the word, and padding ids for the remaining tokens
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index be3c9c52a07984..63a56a06e8d5a4 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -109,7 +109,7 @@ classification MNLI task using the `run_glue` script, with 8 GPUs:
 ```bash
 torchrun \
     --nproc_per_node 8 pytorch/text-classification/run_glue.py \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
     --task_name mnli \
     --do_train \
     --do_eval \
@@ -153,7 +153,7 @@ classification MNLI task using the `run_glue` script, with 8 TPUs (from this fol
 ```bash
 python xla_spawn.py --num_cores 8 \
     text-classification/run_glue.py \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
     --task_name mnli \
     --do_train \
     --do_eval \
diff --git a/examples/pytorch/contrastive-image-text/README.md b/examples/pytorch/contrastive-image-text/README.md
index f22f2c82dce2dd..c39f17a138a632 100644
--- a/examples/pytorch/contrastive-image-text/README.md
+++ b/examples/pytorch/contrastive-image-text/README.md
@@ -64,10 +64,10 @@ from transformers import (
 )
 
 model = VisionTextDualEncoderModel.from_vision_text_pretrained(
-    "openai/clip-vit-base-patch32", "roberta-base"
+    "openai/clip-vit-base-patch32", "FacebookAI/roberta-base"
 )
 
-tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
 image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
 processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
 
diff --git a/examples/pytorch/language-modeling/README.md b/examples/pytorch/language-modeling/README.md
index 3069fe9eb974c1..23c0bc2c79aeb4 100644
--- a/examples/pytorch/language-modeling/README.md
+++ b/examples/pytorch/language-modeling/README.md
@@ -36,7 +36,7 @@ the tokenization). The loss here is that of causal language modeling.
 
 ```bash
 python run_clm.py \
-    --model_name_or_path gpt2 \
+    --model_name_or_path openai-community/gpt2 \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -53,7 +53,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_clm.py \
-    --model_name_or_path gpt2 \
+    --model_name_or_path openai-community/gpt2 \
     --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -69,7 +69,7 @@ This uses the built in HuggingFace `Trainer` for training. If you want to use a
 python run_clm_no_trainer.py \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path gpt2 \
+    --model_name_or_path openai-community/gpt2 \
     --output_dir /tmp/test-clm
 ```
 
@@ -84,7 +84,7 @@ converge slightly slower (over-fitting takes more epochs).
 
 ```bash
 python run_mlm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -98,7 +98,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_mlm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -117,7 +117,7 @@ This uses the built in HuggingFace `Trainer` for training. If you want to use a
 python run_mlm_no_trainer.py \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --output_dir /tmp/test-mlm
 ```
 
@@ -144,7 +144,7 @@ Here is how to fine-tune XLNet on wikitext-2:
 
 ```bash
 python run_plm.py \
-    --model_name_or_path=xlnet-base-cased \
+    --model_name_or_path=xlnet/xlnet-base-cased \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -158,7 +158,7 @@ To fine-tune it on your own training and validation file, run:
 
 ```bash
 python run_plm.py \
-    --model_name_or_path=xlnet-base-cased \
+    --model_name_or_path=xlnet/xlnet-base-cased \
     --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -188,7 +188,7 @@ When training a model from scratch, configuration values may be overridden with
 
 
 ```bash
-python run_clm.py --model_type gpt2 --tokenizer_name gpt2 \ --config_overrides="n_embd=1024,n_head=16,n_layer=48,n_positions=102" \
+python run_clm.py --model_type gpt2 --tokenizer_name openai-community/gpt2 \
+    --config_overrides="n_embd=1024,n_head=16,n_layer=48,n_positions=102" \
 [...]
 ```
 
diff --git a/examples/pytorch/multiple-choice/README.md b/examples/pytorch/multiple-choice/README.md
index 8d56ccfe3dbd7e..118234002c88a3 100644
--- a/examples/pytorch/multiple-choice/README.md
+++ b/examples/pytorch/multiple-choice/README.md
@@ -22,7 +22,7 @@ limitations under the License.
 
 ```bash
 python examples/multiple-choice/run_swag.py \
---model_name_or_path roberta-base \
+--model_name_or_path FacebookAI/roberta-base \
 --do_train \
 --do_eval \
 --learning_rate 5e-5 \
@@ -62,7 +62,7 @@ then
 export DATASET_NAME=swag
 
 python run_swag_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name $DATASET_NAME \
   --max_seq_length 128 \
   --per_device_train_batch_size 32 \
@@ -89,7 +89,7 @@ that will check everything is ready for training. Finally, you can launch traini
 export DATASET_NAME=swag
 
 accelerate launch run_swag_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name $DATASET_NAME \
   --max_seq_length 128 \
   --per_device_train_batch_size 32 \
diff --git a/examples/pytorch/old_test_xla_examples.py b/examples/pytorch/old_test_xla_examples.py
index 4a29ce3beea64a..2f24035d72377b 100644
--- a/examples/pytorch/old_test_xla_examples.py
+++ b/examples/pytorch/old_test_xla_examples.py
@@ -54,7 +54,7 @@ def test_run_glue(self):
             ./examples/pytorch/text-classification/run_glue.py
             --num_cores=8
             ./examples/pytorch/text-classification/run_glue.py
-            --model_name_or_path distilbert-base-uncased
+            --model_name_or_path distilbert/distilbert-base-uncased
             --output_dir {tmp_dir}
             --overwrite_output_dir
             --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
diff --git a/examples/pytorch/question-answering/README.md b/examples/pytorch/question-answering/README.md
index 6b86a4effa9508..9fac0b30385093 100644
--- a/examples/pytorch/question-answering/README.md
+++ b/examples/pytorch/question-answering/README.md
@@ -40,7 +40,7 @@ on a single tesla V100 16GB.
 
 ```bash
 python run_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --do_train \
   --do_eval \
@@ -67,7 +67,7 @@ The [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/m
 
 ```bash
 python run_qa_beam_search.py \
-    --model_name_or_path xlnet-large-cased \
+    --model_name_or_path xlnet/xlnet-large-cased \
     --dataset_name squad \
     --do_train \
     --do_eval \
@@ -87,7 +87,7 @@ python run_qa_beam_search.py \
 export SQUAD_DIR=/path/to/SQUAD
 
 python run_qa_beam_search.py \
-    --model_name_or_path xlnet-large-cased \
+    --model_name_or_path xlnet/xlnet-large-cased \
     --dataset_name squad_v2 \
     --do_train \
     --do_eval \
@@ -111,7 +111,7 @@ This example code fine-tunes T5 on the SQuAD2.0 dataset.
 
 ```bash
 python run_seq2seq_qa.py \
-  --model_name_or_path t5-small \
+  --model_name_or_path google-t5/t5-small \
   --dataset_name squad_v2 \
   --context_column context \
   --question_column question \
@@ -143,7 +143,7 @@ then
 
 ```bash
 python run_qa_no_trainer.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --max_seq_length 384 \
   --doc_stride 128 \
@@ -166,7 +166,7 @@ that will check everything is ready for training. Finally, you can launch traini
 
 ```bash
 accelerate launch run_qa_no_trainer.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --max_seq_length 384 \
   --doc_stride 128 \
diff --git a/examples/pytorch/summarization/README.md b/examples/pytorch/summarization/README.md
index 027119681de020..93c0bbccef6c06 100644
--- a/examples/pytorch/summarization/README.md
+++ b/examples/pytorch/summarization/README.md
@@ -41,7 +41,7 @@ and you also will find examples of these below.
 Here is an example on a summarization task:
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --dataset_name cnn_dailymail \
@@ -54,9 +54,9 @@ python examples/pytorch/summarization/run_summarization.py \
     --predict_with_generate
 ```
 
-Only T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "summarize: "`.
+Only T5 models `google-t5/t5-small`, `google-t5/t5-base`, `google-t5/t5-large`, `google-t5/t5-3b` and `google-t5/t5-11b` must use an additional argument: `--source_prefix "summarize: "`.
 
-We used CNN/DailyMail dataset in this example as `t5-small` was trained on it and one can get good scores even when pre-training with a very small sample.
+We used the CNN/DailyMail dataset in this example as `google-t5/t5-small` was trained on it and one can get good scores even when pre-training with a very small sample.
 
 Extreme Summarization (XSum) Dataset is another commonly used dataset for the task of summarization. To use it replace `--dataset_name cnn_dailymail --dataset_config "3.0.0"` with  `--dataset_name xsum`.
 
@@ -65,7 +65,7 @@ And here is how you would use it on your own files, after adjusting the values f
 
 ```bash
 python examples/pytorch/summarization/run_summarization.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --train_file path_to_csv_or_jsonlines_file \
@@ -156,7 +156,7 @@ then
 
 ```bash
 python run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
@@ -179,7 +179,7 @@ that will check everything is ready for training. Finally, you can launch traini
 
 ```bash
 accelerate launch run_summarization_no_trainer.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --dataset_name cnn_dailymail \
     --dataset_config "3.0.0" \
     --source_prefix "summarize: " \
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 92f59cb2c80381..793917264a7648 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -368,11 +368,11 @@ def main():
     logger.info(f"Training/evaluation parameters {training_args}")
 
     if data_args.source_prefix is None and model_args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
+        "google-t5/t5-small",
+        "google-t5/t5-base",
+        "google-t5/t5-large",
+        "google-t5/t5-3b",
+        "google-t5/t5-11b",
     ]:
         logger.warning(
             "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 5432e508d6f9ee..1cd9f3865df377 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -339,11 +339,11 @@ def main():
 
     accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
     if args.source_prefix is None and args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
+        "google-t5/t5-small",
+        "google-t5/t5-base",
+        "google-t5/t5-large",
+        "google-t5/t5-3b",
+        "google-t5/t5-11b",
     ]:
         logger.warning(
             "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index fc485cf59a2ebb..918167635e854b 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -80,7 +80,7 @@ def test_run_glue_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/text-classification/run_glue_no_trainer.py
-            --model_name_or_path distilbert-base-uncased
+            --model_name_or_path distilbert/distilbert-base-uncased
             --output_dir {tmp_dir}
             --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
             --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
@@ -105,7 +105,7 @@ def test_run_clm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/language-modeling/run_clm_no_trainer.py
-            --model_name_or_path distilgpt2
+            --model_name_or_path distilbert/distilgpt2
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --block_size 128
@@ -133,7 +133,7 @@ def test_run_mlm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/language-modeling/run_mlm_no_trainer.py
-            --model_name_or_path distilroberta-base
+            --model_name_or_path distilbert/distilroberta-base
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --output_dir {tmp_dir}
@@ -156,7 +156,7 @@ def test_run_ner_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/token-classification/run_ner_no_trainer.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/conll/sample.json
             --validation_file tests/fixtures/tests_samples/conll/sample.json
             --output_dir {tmp_dir}
@@ -181,7 +181,7 @@ def test_run_squad_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/question-answering/run_qa_no_trainer.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --version_2_with_negative
             --train_file tests/fixtures/tests_samples/SQUAD/sample.json
             --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
@@ -209,7 +209,7 @@ def test_run_swag_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/multiple-choice/run_swag_no_trainer.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/swag/sample.json
             --validation_file tests/fixtures/tests_samples/swag/sample.json
             --output_dir {tmp_dir}
@@ -232,7 +232,7 @@ def test_run_summarization_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             {self.examples_dir}/pytorch/summarization/run_summarization_no_trainer.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --train_file tests/fixtures/tests_samples/xsum/sample.json
             --validation_file tests/fixtures/tests_samples/xsum/sample.json
             --output_dir {tmp_dir}
diff --git a/examples/pytorch/test_pytorch_examples.py b/examples/pytorch/test_pytorch_examples.py
index 0aabbb4bcb881c..36045e2f5ee133 100644
--- a/examples/pytorch/test_pytorch_examples.py
+++ b/examples/pytorch/test_pytorch_examples.py
@@ -99,7 +99,7 @@ def test_run_glue(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_glue.py
-            --model_name_or_path distilbert-base-uncased
+            --model_name_or_path distilbert/distilbert-base-uncased
             --output_dir {tmp_dir}
             --overwrite_output_dir
             --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
@@ -127,7 +127,7 @@ def test_run_clm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_clm.py
-            --model_name_or_path distilgpt2
+            --model_name_or_path distilbert/distilgpt2
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --do_train
@@ -159,8 +159,8 @@ def test_run_clm_config_overrides(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_clm.py
             --model_type gpt2
-            --tokenizer_name gpt2
+            --tokenizer_name openai-community/gpt2
             --train_file ./tests/fixtures/sample_text.txt
             --output_dir {tmp_dir}
             --config_overrides n_embd=10,n_head=2
@@ -181,7 +181,7 @@ def test_run_mlm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_mlm.py
-            --model_name_or_path distilroberta-base
+            --model_name_or_path distilbert/distilroberta-base
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --output_dir {tmp_dir}
@@ -207,7 +207,7 @@ def test_run_ner(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_ner.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/conll/sample.json
             --validation_file tests/fixtures/tests_samples/conll/sample.json
             --output_dir {tmp_dir}
@@ -235,7 +235,7 @@ def test_run_squad(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_qa.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --version_2_with_negative
             --train_file tests/fixtures/tests_samples/SQUAD/sample.json
             --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
@@ -260,7 +260,7 @@ def test_run_squad_seq2seq(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_seq2seq_qa.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --context_column context
             --question_column question
             --answer_column answers
@@ -289,7 +289,7 @@ def test_run_swag(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_swag.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/swag/sample.json
             --validation_file tests/fixtures/tests_samples/swag/sample.json
             --output_dir {tmp_dir}
@@ -315,7 +315,7 @@ def test_generation(self):
             testargs.append("--fp16")
 
         model_type, model_name = (
             "--model_type=gpt2",
             "--model_name_or_path=sshleifer/tiny-gpt2",
         )
         with patch.object(sys, "argv", testargs + [model_type, model_name]):
@@ -327,7 +327,7 @@ def test_run_summarization(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_summarization.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --train_file tests/fixtures/tests_samples/xsum/sample.json
             --validation_file tests/fixtures/tests_samples/xsum/sample.json
             --output_dir {tmp_dir}
diff --git a/examples/pytorch/text-classification/README.md b/examples/pytorch/text-classification/README.md
index 95116bcfd6e62b..6eae65e7c4bc51 100644
--- a/examples/pytorch/text-classification/README.md
+++ b/examples/pytorch/text-classification/README.md
@@ -31,7 +31,7 @@ GLUE is made up of a total of 9 different tasks. Here is how to run the script o
 export TASK_NAME=mrpc
 
 python run_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
   --do_train \
   --do_eval \
@@ -68,7 +68,7 @@ The following example fine-tunes BERT on the `imdb` dataset hosted on our [hub](
 
 ```bash
 python run_glue.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name imdb  \
   --do_train \
   --do_predict \
@@ -90,7 +90,7 @@ We can specify the metric, the label column and also choose which text columns to
 dataset="amazon_reviews_multi"
 subset="en"
 python run_classification.py \
-    --model_name_or_path  bert-base-uncased \
+    --model_name_or_path  google-bert/bert-base-uncased \
     --dataset_name ${dataset} \
     --dataset_config_name ${subset} \
     --shuffle_train_dataset \
@@ -113,7 +113,7 @@ The following is a multi-label classification example. It fine-tunes BERT on the
 dataset="reuters21578"
 subset="ModApte"
 python run_classification.py \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --dataset_name ${dataset} \
     --dataset_config_name ${subset} \
     --shuffle_train_dataset \
@@ -175,7 +175,7 @@ then
 export TASK_NAME=mrpc
 
 python run_glue_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
   --max_length 128 \
   --per_device_train_batch_size 32 \
@@ -202,7 +202,7 @@ that will check everything is ready for training. Finally, you can launch traini
 export TASK_NAME=mrpc
 
 accelerate launch run_glue_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --task_name $TASK_NAME \
   --max_length 128 \
   --per_device_train_batch_size 32 \
@@ -232,7 +232,7 @@ This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It
 
 ```bash
 python run_xnli.py \
-  --model_name_or_path bert-base-multilingual-cased \
+  --model_name_or_path google-bert/bert-base-multilingual-cased \
   --language de \
   --train_language en \
   --do_train \
diff --git a/examples/pytorch/text-generation/README.md b/examples/pytorch/text-generation/README.md
index cc914754adcdf3..e619c25e162d52 100644
--- a/examples/pytorch/text-generation/README.md
+++ b/examples/pytorch/text-generation/README.md
@@ -26,6 +26,6 @@ Example usage:
 
 ```bash
 python run_generation.py \
     --model_type=gpt2 \
-    --model_name_or_path=gpt2
+    --model_name_or_path=openai-community/gpt2
 ```
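
For quick experiments, roughly the same thing can be done from Python with the `pipeline` API; this is a sketch, not part of `run_generation.py`, and the sampling settings are arbitrary.

```python
# Sketch: minimal Python counterpart of the command above.
from transformers import pipeline

generator = pipeline("text-generation", model="openai-community/gpt2")
out = generator("Once upon a time,", max_new_tokens=30, do_sample=True)
print(out[0]["generated_text"])
```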
diff --git a/examples/pytorch/text-generation/run_generation_contrastive_search.py b/examples/pytorch/text-generation/run_generation_contrastive_search.py
index 91781f05185f58..a48529fb30dd4b 100755
--- a/examples/pytorch/text-generation/run_generation_contrastive_search.py
+++ b/examples/pytorch/text-generation/run_generation_contrastive_search.py
@@ -16,7 +16,7 @@
 """ The examples of running contrastive search on the auto-APIs;
 
 Running this example:
-python run_generation_contrastive_search.py --model_name_or_path=gpt2-large --penalty_alpha=0.6 --k=4 --length=256
+python run_generation_contrastive_search.py --model_name_or_path=openai-community/gpt2-large --penalty_alpha=0.6 --k=4 --length=256
 """
 
 
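Contrastive search is also available directly on `generate()`: setting `penalty_alpha > 0` together with `top_k > 1` activates it, and the script's `--k` flag corresponds to `top_k`. A minimal sketch, using the smaller `openai-community/gpt2` checkpoint to keep it light:

```python
# Sketch: contrastive search through the plain generate() API.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("DeepMind Company is", return_tensors="pt")
# penalty_alpha together with top_k triggers contrastive search.
outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```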
diff --git a/examples/pytorch/token-classification/README.md b/examples/pytorch/token-classification/README.md
index 496722cf6b9a14..568e5242fee3ff 100644
--- a/examples/pytorch/token-classification/README.md
+++ b/examples/pytorch/token-classification/README.md
@@ -29,7 +29,7 @@ The following example fine-tunes BERT on CoNLL-2003:
 
 ```bash
 python run_ner.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name conll2003 \
   --output_dir /tmp/test-ner \
   --do_train \
@@ -42,7 +42,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_ner.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --train_file path_to_train_file \
   --validation_file path_to_validation_file \
   --output_dir /tmp/test-ner \
@@ -84,7 +84,7 @@ then
 export TASK_NAME=ner
 
 python run_ner_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name conll2003 \
   --task_name $TASK_NAME \
   --max_length 128 \
@@ -112,7 +112,7 @@ that will check everything is ready for training. Finally, you can launch traini
 export TASK_NAME=ner
 
 accelerate launch run_ner_no_trainer.py \
-  --model_name_or_path bert-base-cased \
+  --model_name_or_path google-bert/bert-base-cased \
   --dataset_name conll2003 \
   --task_name $TASK_NAME \
   --max_length 128 \
diff --git a/examples/pytorch/translation/README.md b/examples/pytorch/translation/README.md
index bd95e3a552150c..74ca16ccb0bf63 100644
--- a/examples/pytorch/translation/README.md
+++ b/examples/pytorch/translation/README.md
@@ -59,11 +59,11 @@ python examples/pytorch/translation/run_translation.py \
 
 MBart and some T5 models require special handling.
 
-T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
+T5 models `google-t5/t5-small`, `google-t5/t5-base`, `google-t5/t5-large`, `google-t5/t5-3b` and `google-t5/t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
 
 ```bash
 python examples/pytorch/translation/run_translation.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --source_lang en \
@@ -105,7 +105,7 @@ values for the arguments `--train_file`, `--validation_file` to match your setup
 
 ```bash
 python examples/pytorch/translation/run_translation.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --source_lang en \
@@ -134,7 +134,7 @@ If you want to use a pre-processed dataset that leads to high BLEU scores, but f
 
 ```bash
 python examples/pytorch/translation/run_translation.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --source_lang en \
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 807311531f9a1f..f2718c1122acae 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -317,11 +317,11 @@ def main():
     logger.info(f"Training/evaluation parameters {training_args}")
 
     if data_args.source_prefix is None and model_args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
+        "google-t5/t5-small",
+        "google-t5/t5-base",
+        "google-t5/t5-large",
+        "google-t5/t5-3b",
+        "google-t5/t5-11b",
     ]:
         logger.warning(
             "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with "
diff --git a/examples/research_projects/bert-loses-patience/README.md b/examples/research_projects/bert-loses-patience/README.md
index d1e5baa92e90bb..b405e8a9488750 100755
--- a/examples/research_projects/bert-loses-patience/README.md
+++ b/examples/research_projects/bert-loses-patience/README.md
@@ -15,7 +15,7 @@ export TASK_NAME=MRPC
 
 python ./run_glue_with_pabee.py \
   --model_type albert \
-  --model_name_or_path bert-base-uncased/albert-base-v2 \
+  --model_name_or_path albert/albert-base-v2 \
   --task_name $TASK_NAME \
   --do_train \
   --do_eval \
diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
index 57b649ec067bc3..6881bf8d184e8c 100644
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -276,8 +276,8 @@ def forward(
                 from torch import nn
                 import torch
 
-                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
+                tokenizer = AlbertTokenizer.from_pretrained('albert/albert-base-v2')
+                model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert/albert-base-v2')
                 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
                 labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
                 outputs = model(input_ids, labels=labels)
diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
index b32f47d0c30020..dfa78585a64489 100644
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -300,8 +300,8 @@ def forward(
             from torch import nn
             import torch
 
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
+            tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
+            model = BertForSequenceClassificationWithPabee.from_pretrained('google-bert/bert-base-uncased')
 
             input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
             labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
diff --git a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
index 6a084d0741d5f5..5516924f0f2fb7 100644
--- a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
@@ -29,7 +29,7 @@ def test_run_glue(self):
         testargs = f"""
             run_glue_with_pabee.py
             --model_type albert
-            --model_name_or_path albert-base-v2
+            --model_name_or_path albert/albert-base-v2
             --data_dir ./tests/fixtures/tests_samples/MRPC/
             --output_dir {tmp_dir}
             --overwrite_output_dir
diff --git a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
index 53ba3829b15030..b6f5d1775150cf 100644
--- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -107,7 +107,7 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
     # ----------------------------------
 
     logging.info("Make sure that the models' outputs are identical")
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
     # prepare the model inputs
     encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
diff --git a/examples/research_projects/bertabs/modeling_bertabs.py b/examples/research_projects/bertabs/modeling_bertabs.py
index 19e62804ef08ea..2ebce466561393 100644
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -128,7 +128,7 @@ class Bert(nn.Module):
 
     def __init__(self):
         super().__init__()
-        config = BertConfig.from_pretrained("bert-base-uncased")
+        config = BertConfig.from_pretrained("google-bert/bert-base-uncased")
         self.model = BertModel(config)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, **kwargs):
diff --git a/examples/research_projects/bertabs/run_summarization.py b/examples/research_projects/bertabs/run_summarization.py
index 82ef8ab39ea9b7..1f969f117baaf2 100644
--- a/examples/research_projects/bertabs/run_summarization.py
+++ b/examples/research_projects/bertabs/run_summarization.py
@@ -29,7 +29,7 @@
 
 
 def evaluate(args):
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", do_lower_case=True)
     model = BertAbs.from_pretrained("remi/bertabs-finetuned-extractive-abstractive-summarization")
     model.to(args.device)
     model.eval()
diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index 3259041ba5404a..f0af3d144f781a 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -79,7 +79,7 @@ python scripts/pretokenizing.py \
 Before training a new model for code we create a new tokenizer that is efficient at code tokenization. To train the tokenizer you can run the following command: 
 ```bash
 python scripts/bpe_training.py \
-    --base_tokenizer gpt2 \
+    --base_tokenizer openai-community/gpt2 \
     --dataset_name codeparrot/codeparrot-clean-train
 ```
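
As a rough illustration of what this tokenizer-training step amounts to (a sketch, not the actual `bpe_training.py`; the `content` column name and the vocabulary size are assumptions):

```python
# Sketch: train a new BPE tokenizer from the GPT-2 tokenizer on streamed code data.
from datasets import load_dataset
from transformers import AutoTokenizer

base = AutoTokenizer.from_pretrained("openai-community/gpt2")
ds = load_dataset("codeparrot/codeparrot-clean-train", split="train", streaming=True)

def batch_iterator(batch_size=1000, n_batches=100):
    it = iter(ds)
    for _ in range(n_batches):
        # "content" is assumed to be the text column of the dataset.
        yield [next(it)["content"] for _ in range(batch_size)]

new_tokenizer = base.train_new_from_iterator(batch_iterator(), vocab_size=32768)
new_tokenizer.save_pretrained("./code-tokenizer")
```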
 
@@ -90,12 +90,12 @@ The models are randomly initialized and trained from scratch. To initialize a ne
 
 ```bash
 python scripts/initialize_model.py \
---config_name gpt2-large \
+--config_name openai-community/gpt2-large \
 --tokenizer_name codeparrot/codeparrot \
 --model_name codeparrot \
 --push_to_hub True
 ```
-This will initialize a new model with the architecture and configuration of `gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initilaized model is pushed the hub.
+This will initialize a new model with the architecture and configuration of `openai-community/gpt2-large` and use the tokenizer to appropriately size the input embeddings. Finally, the initialized model is pushed to the hub.
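
Under the hood this boils down to building a config with the larger architecture but the new tokenizer's vocabulary size, then instantiating random weights from it. A sketch, assuming the names from the command above (the real script may differ in details):

```python
# Sketch: fresh model with gpt2-large's architecture, sized for the code tokenizer.
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot")
config = AutoConfig.from_pretrained("openai-community/gpt2-large", vocab_size=len(tokenizer))

model = AutoModelForCausalLM.from_config(config)  # randomly initialized weights
model.save_pretrained("codeparrot")
# model.push_to_hub("codeparrot")  # requires a Hub login
```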
 
 We can either pass the name of a text dataset or a pretokenized dataset which speeds up training a bit.
 Now that the tokenizer and model are also ready we can start training the model. The main training script is built with `accelerate` to scale across a wide range of platforms and infrastructure scales. We train two models with [110M](https://huggingface.co/codeparrot/codeparrot-small/) and [1.5B](https://huggingface.co/codeparrot/codeparrot/) parameters for 25-30B tokens on a 16xA100 (40GB) machine which takes 1 day and 1 week, respectively.
diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py
index 4def9ac3b854ec..5fee05eb04c50a 100644
--- a/examples/research_projects/codeparrot/scripts/arguments.py
+++ b/examples/research_projects/codeparrot/scripts/arguments.py
@@ -172,7 +172,7 @@ class TokenizerTrainingArguments:
     """
 
     base_tokenizer: Optional[str] = field(
-        default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
+        default="openai-community/gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
     )
     dataset_name: Optional[str] = field(
         default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
@@ -211,7 +211,7 @@ class InitializationArguments:
     """
 
     config_name: Optional[str] = field(
-        default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
+        default="openai-community/gpt2-large", metadata={"help": "Configuration to use for model initialization."}
     )
     tokenizer_name: Optional[str] = field(
         default="codeparrot/codeparrot", metadata={"help": "Tokenizer attached to model."}
diff --git a/examples/research_projects/deebert/test_glue_deebert.py b/examples/research_projects/deebert/test_glue_deebert.py
index 775c4d70b6523e..7a5f059c8cedff 100644
--- a/examples/research_projects/deebert/test_glue_deebert.py
+++ b/examples/research_projects/deebert/test_glue_deebert.py
@@ -48,7 +48,7 @@ def run_and_check(self, args):
     def test_glue_deebert_train(self):
         train_args = """
             --model_type roberta
-            --model_name_or_path roberta-base
+            --model_name_or_path FacebookAI/roberta-base
             --task_name MRPC
             --do_train
             --do_eval
@@ -61,7 +61,7 @@ def test_glue_deebert_train(self):
             --num_train_epochs 3
             --overwrite_output_dir
             --seed 42
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
             --plot_data_dir ./examples/deebert/results/
             --save_steps 0
             --overwrite_cache
@@ -71,12 +71,12 @@ def test_glue_deebert_train(self):
 
         eval_args = """
             --model_type roberta
-            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
             --task_name MRPC
             --do_eval
             --do_lower_case
             --data_dir ./tests/fixtures/tests_samples/MRPC/
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
             --plot_data_dir ./examples/deebert/results/
             --max_seq_length 128
             --eval_each_highway
@@ -88,12 +88,12 @@ def test_glue_deebert_train(self):
 
         entropy_eval_args = """
             --model_type roberta
-            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --model_name_or_path ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
             --task_name MRPC
             --do_eval
             --do_lower_case
             --data_dir ./tests/fixtures/tests_samples/MRPC/
-            --output_dir ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
+            --output_dir ./examples/deebert/saved_models/FacebookAI/roberta-base/MRPC/two_stage
             --plot_data_dir ./examples/deebert/results/
             --max_seq_length 128
             --early_exit_entropy 0.1
diff --git a/examples/research_projects/information-gain-filtration/README.md b/examples/research_projects/information-gain-filtration/README.md
index cba7a808947372..f685a512509f0d 100644
--- a/examples/research_projects/information-gain-filtration/README.md
+++ b/examples/research_projects/information-gain-filtration/README.md
@@ -64,7 +64,7 @@ To fine-tune a transformer model with IGF on a language modeling task, use the f
   
 ```python
 python run_clm_igf.py\
---model_name_or_path "gpt2" \
+--model_name_or_path "openai-community/gpt2" \
 --data_file="data/tokenized_stories_train_wikitext103" \
 --igf_data_file="data/IGF_values" \
 --context_len 32 \
diff --git a/examples/research_projects/information-gain-filtration/igf/igf.py b/examples/research_projects/information-gain-filtration/igf/igf.py
index 6861467a33592a..4c5aefd9584e16 100644
--- a/examples/research_projects/information-gain-filtration/igf/igf.py
+++ b/examples/research_projects/information-gain-filtration/igf/igf.py
@@ -69,9 +69,9 @@ def compute_perplexity(model, test_data, context_len):
     return perplexity
 
 
-def load_gpt2(model_name="gpt2"):
+def load_gpt2(model_name="openai-community/gpt2"):
     """
-    load original gpt2 and save off for quicker loading
+    load original openai-community/gpt2 and save off for quicker loading
 
     Args:
         model_name: GPT-2
diff --git a/examples/research_projects/information-gain-filtration/run_clm_igf.py b/examples/research_projects/information-gain-filtration/run_clm_igf.py
index 26b72072784f8a..74973309c4e16b 100644
--- a/examples/research_projects/information-gain-filtration/run_clm_igf.py
+++ b/examples/research_projects/information-gain-filtration/run_clm_igf.py
@@ -84,7 +84,7 @@ def generate_n_pairs(
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
     # load pretrained model
-    model = load_gpt2("gpt2").to(device)
+    model = load_gpt2("openai-community/gpt2").to(device)
     print("computing perplexity on objective set")
     orig_perp = compute_perplexity(model, objective_set, context_len).item()
     print("perplexity on objective set:", orig_perp)
@@ -121,7 +121,7 @@ def training_secondary_learner(
     set_seed(42)
 
     # Load pre-trained model
-    model = GPT2LMHeadModel.from_pretrained("gpt2")
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
 
     # Initialize secondary learner to use embedding weights of model
     secondary_learner = SecondaryLearner(model)
@@ -153,7 +153,7 @@ def finetune(
     recopy_model=recopy_gpt2,
     secondary_learner=None,
     eval_interval=10,
-    finetuned_model_name="gpt2_finetuned.pt",
+    finetuned_model_name="openai-community/gpt2_finetuned.pt",
 ):
     """
     fine-tune with IGF if secondary_learner is not None, else standard fine-tuning
@@ -346,7 +346,10 @@ def main():
     )
 
     parser.add_argument(
-        "--batch_size", default=16, type=int, help="batch size of training data of language model(gpt2) "
+        "--batch_size",
+        default=16,
+        type=int,
+        help="batch size of training data of language model(openai-community/gpt2) ",
     )
 
     parser.add_argument(
@@ -383,7 +386,9 @@ def main():
         ),
     )
 
-    parser.add_argument("--finetuned_model_name", default="gpt2_finetuned.pt", type=str, help="finetuned_model_name")
+    parser.add_argument(
+        "--finetuned_model_name", default="openai-community/gpt2_finetuned.pt", type=str, help="finetuned_model_name"
+    )
 
     parser.add_argument(
         "--recopy_model",
@@ -416,16 +421,16 @@ def main():
         igf_model_path="igf_model.pt",
     )
 
-    # load pretrained gpt2 model
-    model = GPT2LMHeadModel.from_pretrained("gpt2")
+    # load pretrained openai-community/gpt2 model
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
     set_seed(42)
 
-    # Generate train and test data to train and evaluate gpt2 model
+    # Generate train and test data to train and evaluate openai-community/gpt2 model
     train_dataset, test_dataset = generate_datasets(
         context_len=32, file="data/tokenized_stories_train_wikitext103.jbl", number=100, min_len=1026, trim=True
     )
 
-    # fine-tuning of the gpt2 model using igf (Information Gain Filtration)
+    # fine-tuning of the openai-community/gpt2 model using igf (Information Gain Filtration)
     finetune(
         model,
         train_dataset,
@@ -437,7 +442,7 @@ def main():
         recopy_model=recopy_gpt2,
         secondary_learner=secondary_learner,
         eval_interval=10,
-        finetuned_model_name="gpt2_finetuned.pt",
+        finetuned_model_name="openai-community/gpt2_finetuned.pt",
     )
 
 
diff --git a/examples/research_projects/jax-projects/README.md b/examples/research_projects/jax-projects/README.md
index cb670a0a520c6e..88d8d7f9eba926 100644
--- a/examples/research_projects/jax-projects/README.md
+++ b/examples/research_projects/jax-projects/README.md
@@ -159,13 +159,13 @@ to be used, but that everybody in team is on the same page on what type of model
 To give an example, a well-defined project would be the following:
 
 - task: summarization
-- model: [t5-small](https://huggingface.co/t5-small)
+- model: [google-t5/t5-small](https://huggingface.co/google-t5/t5-small)
 - dataset: [CNN/Daily mail](https://huggingface.co/datasets/cnn_dailymail)
 - training script: [run_summarization_flax.py](https://github.com/huggingface/transformers/blob/main/examples/flax/summarization/run_summarization_flax.py)
 - outcome: t5 model that can summarize news
-- work flow: adapt `run_summarization_flax.py` to work with `t5-small`.
+- workflow: adapt `run_summarization_flax.py` to work with `google-t5/t5-small`.
 
-This example is a very easy and not the most interesting project since a `t5-small`
+This example is very easy and not the most interesting project since a `google-t5/t5-small`
 summarization model exists already for CNN/Daily mail and pretty much no code has to be 
 written. 
 A well-defined project does not need to have the dataset be part of 
@@ -335,7 +335,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str
 
 dummy_input = next(iter(dataset))["text"]
 
-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]
 
 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -492,7 +492,7 @@ dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', str
 
 dummy_input = next(iter(dataset))["text"]
 
-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 input_ids = tokenizer(dummy_input, return_tensors="np").input_ids[:, :10]
 
 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -518,7 +518,7 @@ be available in a couple of days.
 - [BigBird](https://github.com/huggingface/transformers/blob/main/src/transformers/models/big_bird/modeling_flax_big_bird.py)
 - [CLIP](https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_flax_clip.py)
 - [ELECTRA](https://github.com/huggingface/transformers/blob/main/src/transformers/models/electra/modeling_flax_electra.py)
 - [GPT2](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_flax_gpt2.py)
 - [(TODO) MBART](https://github.com/huggingface/transformers/blob/main/src/transformers/models/mbart/modeling_flax_mbart.py)
 - [RoBERTa](https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/modeling_flax_roberta.py)
 - [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_flax_t5.py)
@@ -729,7 +729,7 @@ Let's use the base `FlaxRobertaModel` without any heads as an example.
 from transformers import FlaxRobertaModel, RobertaTokenizerFast
 import jax
 
-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
 inputs = tokenizer("JAX/Flax is amazing ", padding="max_length", max_length=128, return_tensors="np")
 
 model = FlaxRobertaModel.from_pretrained("julien-c/dummy-unknown")
@@ -1011,7 +1011,7 @@ and run the following commands in a Python shell to save a config.
 ```python
 from transformers import RobertaConfig
 
-config = RobertaConfig.from_pretrained("roberta-base")
+config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
 config.save_pretrained("./")
 ```
 
@@ -1193,12 +1193,12 @@ All the widgets are open sourced in the `huggingface_hub` [repo](https://github.
 **NLP**
 * **Conversational:** To have the best conversations!. [Example](https://huggingface.co/microsoft/DialoGPT-large?).
 * **Feature Extraction:** Retrieve the input embeddings. [Example](https://huggingface.co/sentence-transformers/distilbert-base-nli-mean-tokens?text=test).
-* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/bert-base-uncased?).
-* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).
+* **Fill Mask:** Predict potential words for a mask token. [Example](https://huggingface.co/google-bert/bert-base-uncased?).
+* **Question Answering:** Given a context and a question, predict the answer. [Example](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
 * **Sentence Simmilarity:** Predict how similar a set of sentences are. Useful for Sentence Transformers.
 * **Summarization:** Given a text, output a summary of it. [Example](https://huggingface.co/sshleifer/distilbart-cnn-12-6).
 * **Table Question Answering:** Given a table and a question, predict the answer. [Example](https://huggingface.co/google/tapas-base-finetuned-wtq).
-* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/gpt2)
+* **Text Generation:** Generate text based on a prompt. [Example](https://huggingface.co/openai-community/gpt2)
 * **Token Classification:** Useful for tasks such as Named Entity Recognition and Part of Speech. [Example](https://huggingface.co/dslim/bert-base-NER).
 * **Zero-Shot Classification:** Too cool to explain with words. Here is an [example](https://huggingface.co/typeform/distilbert-base-uncased-mnli)
 * ([WIP](https://github.com/huggingface/huggingface_hub/issues/99)) **Table to Text Generation**.
diff --git a/examples/research_projects/jax-projects/dataset-streaming/README.md b/examples/research_projects/jax-projects/dataset-streaming/README.md
index bbb58037443a2f..bdb6629e509c6f 100644
--- a/examples/research_projects/jax-projects/dataset-streaming/README.md
+++ b/examples/research_projects/jax-projects/dataset-streaming/README.md
@@ -31,7 +31,7 @@ without ever having to download the full dataset.
 In the following, we demonstrate how to train a bi-directional transformer model 
 using masked language modeling objective as introduced in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
 More specifically, we demonstrate how JAX/Flax and dataset streaming can be leveraged 
-to pre-train [**`roberta-base`**](https://huggingface.co/roberta-base)
+to pre-train [**`FacebookAI/roberta-base`**](https://huggingface.co/FacebookAI/roberta-base)
 in English on a single TPUv3-8 pod for 10000 update steps.
 
 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
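
Dataset streaming itself is a one-argument change in 🤗 Datasets; a minimal sketch, reusing the OSCAR configuration that appears elsewhere in this document:

```python
# Sketch: stream a dataset lazily instead of downloading it in full.
from datasets import load_dataset

dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
print(next(iter(dataset))["text"][:200])  # examples are fetched on demand
```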
@@ -80,8 +80,8 @@ from transformers import RobertaTokenizerFast, RobertaConfig
 
 model_dir = "./english-roberta-base-dummy"
 
-tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-config = RobertaConfig.from_pretrained("roberta-base")
+tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
+config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
 
 tokenizer.save_pretrained(model_dir)
 config.save_pretrained(model_dir)
diff --git a/examples/research_projects/jax-projects/hybrid_clip/README.md b/examples/research_projects/jax-projects/hybrid_clip/README.md
index 76df92e463c40b..72d3db1935895f 100644
--- a/examples/research_projects/jax-projects/hybrid_clip/README.md
+++ b/examples/research_projects/jax-projects/hybrid_clip/README.md
@@ -32,7 +32,7 @@ Models written in JAX/Flax are **immutable** and updated in a purely functional
 way which enables simple and efficient model parallelism.
 
 In this example we will use the vision model from [CLIP](https://huggingface.co/models?filter=clip)
-as the image encoder and [`roberta-base`](https://huggingface.co/roberta-base) as the text encoder.
+as the image encoder and [`FacebookAI/roberta-base`](https://huggingface.co/FacebookAI/roberta-base) as the text encoder.
 Note that one can also use the [ViT](https://huggingface.co/models?filter=vit) model as image encoder and any other BERT or ROBERTa model as text encoder.
 To train the model on languages other than English one should choose a text encoder trained on the desired
 language and a image-text dataset in that language. One such dataset is [WIT](https://github.com/google-research-datasets/wit).	
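
Pairing a non-English text encoder with the CLIP vision tower follows the same pattern as the example below; here is a minimal sketch, assuming the `dbmdz/bert-base-german-cased` checkpoint as the text side (a PyTorch checkpoint, hence `text_from_pt=True`):

```python
# Sketch: swap in a German text encoder; the projection layers are still randomly initialized.
from modeling_hybrid_clip import FlaxHybridCLIP

model = FlaxHybridCLIP.from_text_vision_pretrained(
    "dbmdz/bert-base-german-cased",  # assumed German BERT checkpoint, stored as PyTorch weights
    "openai/clip-vit-base-patch32",
    text_from_pt=True,
)
model.save_pretrained("german-bert-clip")
```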
@@ -76,7 +76,7 @@ Here is an example of how to load the model using pre-trained text and vision mo
 ```python
 from modeling_hybrid_clip import FlaxHybridCLIP
 
-model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32")
+model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32")
 
 # save the model
 model.save_pretrained("bert-clip")
@@ -89,7 +89,7 @@ If the checkpoints are in PyTorch then one could pass `text_from_pt=True` and `v
 PyTorch checkpoints convert them to flax and load the model.
 
 ```python
-model = FlaxHybridCLIP.from_text_vision_pretrained("bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
+model = FlaxHybridCLIP.from_text_vision_pretrained("google-bert/bert-base-uncased", "openai/clip-vit-base-patch32", text_from_pt=True, vision_from_pt=True)
 ```
 
 This loads both the text and vision encoders using pre-trained weights, the projection layers are randomly
@@ -154,9 +154,9 @@ Next we can run the example script to train the model:
 ```bash
 python run_hybrid_clip.py \
     --output_dir ${MODEL_DIR} \
-    --text_model_name_or_path="roberta-base" \
+    --text_model_name_or_path="FacebookAI/roberta-base" \
     --vision_model_name_or_path="openai/clip-vit-base-patch32" \
-    --tokenizer_name="roberta-base" \
+    --tokenizer_name="FacebookAI/roberta-base" \
     --train_file="coco_dataset/train_dataset.json" \
     --validation_file="coco_dataset/validation_dataset.json" \
     --do_train --do_eval \
diff --git a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
index e60f07bdd06325..08cb3bd0b3412e 100644
--- a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
+++ b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
@@ -314,8 +314,6 @@ def from_text_vision_pretrained(
                 Information necessary to initiate the text model. Can be either:
 
                     - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                     - A path to a `directory` containing model weights saved using
                       :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                     - A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
@@ -327,8 +325,6 @@ def from_text_vision_pretrained(
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
                     - A path to a `directory` containing model weights saved using
                       :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                     - A path or url to a `PyTorch checkpoint folder` (e.g, ``./pt_model``). In
@@ -354,7 +350,7 @@ def from_text_vision_pretrained(
             >>> from transformers import FlaxHybridCLIP
             >>> # initialize a model from pretrained BERT and CLIP models. Note that the projection layers will be randomly initialized.
             >>> # If using CLIP's vision model the vision projection layer will be initialized using pre-trained weights
-            >>> model = FlaxHybridCLIP.from_text_vision_pretrained('bert-base-uncased', 'openai/clip-vit-base-patch32')
+            >>> model = FlaxHybridCLIP.from_text_vision_pretrained('google-bert/bert-base-uncased', 'openai/clip-vit-base-patch32')
             >>> # saving model after fine-tuning
             >>> model.save_pretrained("./bert-clip")
             >>> # load fine-tuned model
diff --git a/examples/research_projects/jax-projects/model_parallel/README.md b/examples/research_projects/jax-projects/model_parallel/README.md
index 97f3cdb047741a..393c9e89375085 100644
--- a/examples/research_projects/jax-projects/model_parallel/README.md
+++ b/examples/research_projects/jax-projects/model_parallel/README.md
@@ -54,7 +54,7 @@ model.save_pretrained("gpt-neo-1.3B")
 ```bash
 python run_clm_mp.py \
     --model_name_or_path gpt-neo-1.3B  \
-    --tokenizer_name gpt2 \
+    --tokenizer_name openai-community/gpt2 \
     --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
     --do_train  --do_eval \
     --block_size 1024 \
diff --git a/examples/research_projects/longform-qa/eli5_app.py b/examples/research_projects/longform-qa/eli5_app.py
index ae8d8f91568d58..6b1b15cc9cbba3 100644
--- a/examples/research_projects/longform-qa/eli5_app.py
+++ b/examples/research_projects/longform-qa/eli5_app.py
@@ -36,7 +36,7 @@ def load_models():
         _ = s2s_model.eval()
     else:
         s2s_tokenizer, s2s_model = make_qa_s2s_model(
-            model_name="t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
+            model_name="google-t5/t5-small", from_file="seq2seq_models/eli5_t5_model_1024_4.pth", device="cuda:0"
         )
     return (qar_tokenizer, qar_model, s2s_tokenizer, s2s_model)
 
diff --git a/examples/research_projects/mlm_wwm/README.md b/examples/research_projects/mlm_wwm/README.md
index 0144b1ad309206..bf5aa9410826ed 100644
--- a/examples/research_projects/mlm_wwm/README.md
+++ b/examples/research_projects/mlm_wwm/README.md
@@ -32,7 +32,7 @@ to that word). This technique has been refined for Chinese in [this paper](https
 To fine-tune a model using whole word masking, use the following script:
 ```bash
 python run_mlm_wwm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --do_train \
@@ -83,7 +83,7 @@ export VALIDATION_REF_FILE=/path/to/validation/chinese_ref/file
 export OUTPUT_DIR=/tmp/test-mlm-wwm
 
 python run_mlm_wwm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --train_file $TRAIN_FILE \
     --validation_file $VALIDATION_FILE \
     --train_ref_file $TRAIN_REF_FILE \
diff --git a/examples/research_projects/mm-imdb/README.md b/examples/research_projects/mm-imdb/README.md
index 73e77aeb962c41..68b2f15159ec23 100644
--- a/examples/research_projects/mm-imdb/README.md
+++ b/examples/research_projects/mm-imdb/README.md
@@ -10,7 +10,7 @@ Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformer
 python run_mmimdb.py \
     --data_dir /path/to/mmimdb/dataset/ \
     --model_type bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --output_dir /path/to/save/dir/ \
     --do_train \
     --do_eval \
diff --git a/examples/research_projects/movement-pruning/README.md b/examples/research_projects/movement-pruning/README.md
index c2f74d6dcddbbd..575ec1a9b49287 100644
--- a/examples/research_projects/movement-pruning/README.md
+++ b/examples/research_projects/movement-pruning/README.md
@@ -61,7 +61,7 @@ python examples/movement-pruning/masked_run_squad.py \
     --predict_file dev-v1.1.json \
     --do_train --do_eval --do_lower_case \
     --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --per_gpu_train_batch_size 16 \
     --warmup_steps 5400 \
     --num_train_epochs 10 \
@@ -84,7 +84,7 @@ python examples/movement-pruning/masked_run_squad.py \
     --predict_file dev-v1.1.json \
     --do_train --do_eval --do_lower_case \
     --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --per_gpu_train_batch_size 16 \
     --warmup_steps 5400 \
     --num_train_epochs 10 \
@@ -104,7 +104,7 @@ python examples/movement-pruning/masked_run_squad.py \
     --predict_file dev-v1.1.json \
     --do_train --do_eval --do_lower_case \
     --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --per_gpu_train_batch_size 16 \
     --warmup_steps 5400 \
     --num_train_epochs 10 \
@@ -124,7 +124,7 @@ python examples/movement-pruning/masked_run_squad.py \
     --predict_file dev-v1.1.json \
     --do_train --do_eval --do_lower_case \
     --model_type masked_bert \
-    --model_name_or_path bert-base-uncased \
+    --model_name_or_path google-bert/bert-base-uncased \
     --per_gpu_train_batch_size 16 \
     --warmup_steps 5400 \
     --num_train_epochs 10 \
diff --git a/examples/research_projects/performer/README.md b/examples/research_projects/performer/README.md
index 42cb6fa358f95f..fa847268b0c8b3 100644
--- a/examples/research_projects/performer/README.md
+++ b/examples/research_projects/performer/README.md
@@ -10,8 +10,8 @@ Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyo
 
 ## Examples
 
-`sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
-`full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
+`sanity_script.sh` will launch performer fine-tuning from the google-bert/bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
+`full_script.sh` will launch performer fine-tuning from the google-bert/bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
 
 Here are a few key arguments:
 - Remove the `--performer` argument to use a standard Bert model.
diff --git a/examples/research_projects/pplm/run_pplm.py b/examples/research_projects/pplm/run_pplm.py
index 54008d56c14cba..cc49b7fa83c4c3 100644
--- a/examples/research_projects/pplm/run_pplm.py
+++ b/examples/research_projects/pplm/run_pplm.py
@@ -61,7 +61,7 @@
         "embed_size": 1024,
         "class_vocab": {"non_clickbait": 0, "clickbait": 1},
         "default_class": 1,
-        "pretrained_model": "gpt2-medium",
+        "pretrained_model": "openai-community/gpt2-medium",
     },
     "sentiment": {
         "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
@@ -69,7 +69,7 @@
         "embed_size": 1024,
         "class_vocab": {"very_positive": 2, "very_negative": 3},
         "default_class": 3,
-        "pretrained_model": "gpt2-medium",
+        "pretrained_model": "openai-community/gpt2-medium",
     },
 }
 
@@ -585,7 +585,7 @@ def set_generic_model_params(discrim_weights, discrim_meta):
 
 
 def run_pplm_example(
-    pretrained_model="gpt2-medium",
+    pretrained_model="openai-community/gpt2-medium",
     cond_text="",
     uncond=False,
     num_samples=1,
@@ -738,7 +738,7 @@ def run_pplm_example(
         "--pretrained_model",
         "-M",
         type=str,
-        default="gpt2-medium",
+        default="openai-community/gpt2-medium",
         help="pretrained model name or path to local checkpoint",
     )
     parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
diff --git a/examples/research_projects/pplm/run_pplm_discrim_train.py b/examples/research_projects/pplm/run_pplm_discrim_train.py
index 4ac603a33bc842..43ec5823e37764 100644
--- a/examples/research_projects/pplm/run_pplm_discrim_train.py
+++ b/examples/research_projects/pplm/run_pplm_discrim_train.py
@@ -45,7 +45,7 @@
 class Discriminator(nn.Module):
     """Transformer encoder followed by a Classification Head"""
 
-    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
+    def __init__(self, class_size, pretrained_model="openai-community/gpt2-medium", cached_mode=False, device="cpu"):
         super().__init__()
         self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
         self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
@@ -218,7 +218,7 @@ def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, de
 def train_discriminator(
     dataset,
     dataset_fp=None,
-    pretrained_model="gpt2-medium",
+    pretrained_model="openai-community/gpt2-medium",
     epochs=10,
     batch_size=64,
     log_interval=10,
@@ -502,7 +502,10 @@ def train_discriminator(
         help="File path of the dataset to use. Needed only in case of generic datadset",
     )
     parser.add_argument(
-        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
+        "--pretrained_model",
+        type=str,
+        default="openai-community/gpt2-medium",
+        help="Pretrained model to use as encoder",
     )
     parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
     parser.add_argument(
diff --git a/examples/research_projects/quantization-qdqbert/README.md b/examples/research_projects/quantization-qdqbert/README.md
index 4d459c4c715289..2cc2d5e5f98c71 100644
--- a/examples/research_projects/quantization-qdqbert/README.md
+++ b/examples/research_projects/quantization-qdqbert/README.md
@@ -50,11 +50,11 @@ Calibrate the pretrained model and finetune with quantization awared:
 
 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --max_seq_length 128 \
   --doc_stride 32 \
-  --output_dir calib/bert-base-uncased \
+  --output_dir calib/google-bert/bert-base-uncased \
   --do_calib \
   --calibrator percentile \
   --percentile 99.99
@@ -62,7 +62,7 @@ python3 run_quant_qa.py \
 
 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path calib/bert-base-uncased \
+  --model_name_or_path calib/google-bert/bert-base-uncased \
   --dataset_name squad \
   --do_train \
   --do_eval \
@@ -71,8 +71,8 @@ python3 run_quant_qa.py \
   --num_train_epochs 2 \
   --max_seq_length 128 \
   --doc_stride 32 \
-  --output_dir finetuned_int8/bert-base-uncased \
-  --tokenizer_name bert-base-uncased \
+  --output_dir finetuned_int8/google-bert/bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
   --save_steps 0
 ```
 
@@ -82,14 +82,14 @@ To export the QAT model finetuned above:
 
 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path finetuned_int8/bert-base-uncased \
+  --model_name_or_path finetuned_int8/google-bert/bert-base-uncased \
   --output_dir ./ \
   --save_onnx \
   --per_device_eval_batch_size 1 \
   --max_seq_length 128 \
   --doc_stride 32 \
   --dataset_name squad \
-  --tokenizer_name bert-base-uncased
+  --tokenizer_name google-bert/bert-base-uncased
 ```
 
 Use `--recalibrate-weights` to calibrate the weight ranges according to the quantizer axis. Use `--quant-per-tensor` for per tensor quantization (default is per channel).
@@ -117,7 +117,7 @@ python3 evaluate-hf-trt-qa.py \
   --max_seq_length 128 \
   --doc_stride 32 \
   --dataset_name squad \
-  --tokenizer_name bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
   --int8 \
   --seed 42
 ```
@@ -128,14 +128,14 @@ Finetune a fp32 precision model with [transformers/examples/pytorch/question-ans
 
 ```bash
 python3 ../../pytorch/question-answering/run_qa.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name squad \
   --per_device_train_batch_size 12 \
   --learning_rate 3e-5 \
   --num_train_epochs 2 \
   --max_seq_length 128 \
   --doc_stride 32 \
-  --output_dir ./finetuned_fp32/bert-base-uncased \
+  --output_dir ./finetuned_fp32/google-bert/bert-base-uncased \
   --save_steps 0 \
   --do_train \
   --do_eval
@@ -147,13 +147,13 @@ python3 ../../pytorch/question-answering/run_qa.py \
 
 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path ./finetuned_fp32/bert-base-uncased \
+  --model_name_or_path ./finetuned_fp32/google-bert/bert-base-uncased \
   --dataset_name squad \
   --calibrator percentile \
   --percentile 99.99 \
   --max_seq_length 128 \
   --doc_stride 32 \
-  --output_dir ./calib/bert-base-uncased \
+  --output_dir ./calib/google-bert/bert-base-uncased \
   --save_steps 0 \
   --do_calib \
   --do_eval
@@ -163,14 +163,14 @@ python3 run_quant_qa.py \
 
 ```bash
 python3 run_quant_qa.py \
-  --model_name_or_path ./calib/bert-base-uncased \
+  --model_name_or_path ./calib/google-bert/bert-base-uncased \
   --output_dir ./ \
   --save_onnx \
   --per_device_eval_batch_size 1 \
   --max_seq_length 128 \
   --doc_stride 32 \
   --dataset_name squad \
-  --tokenizer_name bert-base-uncased
+  --tokenizer_name google-bert/bert-base-uncased
 ```
 
 ### Evaluate the INT8 PTQ ONNX model inference with TensorRT
@@ -183,7 +183,7 @@ python3 evaluate-hf-trt-qa.py \
   --max_seq_length 128 \
   --doc_stride 32 \
   --dataset_name squad \
-  --tokenizer_name bert-base-uncased \
+  --tokenizer_name google-bert/bert-base-uncased \
   --int8 \
   --seed 42
 ```
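
For reviewers who want to sanity-check the renamed identifiers above, here is a minimal sketch (it assumes `transformers` is installed and the Hub is reachable; the legacy root-level aliases generally still redirect to the same repositories):

```python
from transformers import AutoTokenizer

# The org-prefixed ID used throughout this README points at the same checkpoint
# that the old root-level alias ("bert-base-uncased") referred to.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(tokenizer.vocab_size)  # 30522 for the uncased BERT vocabulary
```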
diff --git a/examples/tensorflow/benchmarking/README.md b/examples/tensorflow/benchmarking/README.md
index 7099ed9f6b3d3d..03e174770d1077 100644
--- a/examples/tensorflow/benchmarking/README.md
+++ b/examples/tensorflow/benchmarking/README.md
@@ -22,5 +22,5 @@ If you would like to list benchmark results on your favorite models of the [mode
 
 | Benchmark description | Results | Environment info |      Author      |
 |:----------|:-------------|:-------------|------:|
-| PyTorch Benchmark on inference for `bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Partick von Platen](https://github.com/patrickvonplaten) | 
-| PyTorch Benchmark on inference for `bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Partick von Platen](https://github.com/patrickvonplaten) | 
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
diff --git a/examples/tensorflow/contrastive-image-text/README.md b/examples/tensorflow/contrastive-image-text/README.md
index 9e3a011fcb33c4..29d9b897734cb2 100644
--- a/examples/tensorflow/contrastive-image-text/README.md
+++ b/examples/tensorflow/contrastive-image-text/README.md
@@ -65,7 +65,7 @@ Finally, we can run the example script to train the model:
 python examples/tensorflow/contrastive-image-text/run_clip.py \
     --output_dir ./clip-roberta-finetuned \
     --vision_model_name_or_path openai/clip-vit-base-patch32 \
-    --text_model_name_or_path roberta-base \
+    --text_model_name_or_path FacebookAI/roberta-base \
     --data_dir $PWD/data \
     --dataset_name ydshieh/coco_dataset_script \
     --dataset_config_name=2017 \
diff --git a/examples/tensorflow/language-modeling-tpu/run_mlm.py b/examples/tensorflow/language-modeling-tpu/run_mlm.py
index 544bca716addc6..7ed111ab12712b 100644
--- a/examples/tensorflow/language-modeling-tpu/run_mlm.py
+++ b/examples/tensorflow/language-modeling-tpu/run_mlm.py
@@ -57,7 +57,7 @@ def parse_args():
     parser.add_argument(
         "--pretrained_model_config",
         type=str,
-        default="roberta-base",
+        default="FacebookAI/roberta-base",
         help="The model config to use. Note that we don't copy the model's weights, only the config!",
     )
     parser.add_argument(
diff --git a/examples/tensorflow/language-modeling/README.md b/examples/tensorflow/language-modeling/README.md
index e91639adb00554..ed4f507d4e82ce 100644
--- a/examples/tensorflow/language-modeling/README.md
+++ b/examples/tensorflow/language-modeling/README.md
@@ -43,7 +43,7 @@ This script trains a masked language model.
 ### Example command
 ```bash
 python run_mlm.py \
---model_name_or_path distilbert-base-cased \
+--model_name_or_path distilbert/distilbert-base-cased \
 --output_dir output \
 --dataset_name wikitext \
 --dataset_config_name wikitext-103-raw-v1
@@ -52,7 +52,7 @@ python run_mlm.py \
 When using a custom dataset, the validation file can be separately passed as an input argument. Otherwise some split (customizable) of training data is used as validation.
 ```bash
 python run_mlm.py \
---model_name_or_path distilbert-base-cased \
+--model_name_or_path distilbert/distilbert-base-cased \
 --output_dir output \
 --train_file train_file_path
 ```
@@ -64,7 +64,7 @@ This script trains a causal language model.
 ### Example command
 ```bash
 python run_clm.py \
---model_name_or_path distilgpt2 \
+--model_name_or_path distilbert/distilgpt2 \
 --output_dir output \
 --dataset_name wikitext \
 --dataset_config_name wikitext-103-raw-v1
@@ -74,7 +74,7 @@ When using a custom dataset, the validation file can be separately passed as an
 
 ```bash
 python run_clm.py \
---model_name_or_path distilgpt2 \
+--model_name_or_path distilbert/distilgpt2 \
 --output_dir output \
 --train_file train_file_path
 ```
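
As a reference for the custom-dataset variant above, a small sketch of preparing a plain-text `--train_file` and invoking the script (the file name and corpus are illustrative; csv/json files are also accepted):

```python
import subprocess
from pathlib import Path

# A tiny plain-text corpus; one piece of text per line is enough for a smoke test.
Path("train.txt").write_text("Hello world.\nA second line of training text.\n")

# Same invocation as the README example, pointed at the local file.
subprocess.run(
    [
        "python", "run_clm.py",
        "--model_name_or_path", "distilbert/distilgpt2",
        "--output_dir", "output",
        "--train_file", "train.txt",
    ],
    check=True,
)
```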
diff --git a/examples/tensorflow/multiple-choice/README.md b/examples/tensorflow/multiple-choice/README.md
index 01e33fb62dbe23..a7f499963ec678 100644
--- a/examples/tensorflow/multiple-choice/README.md
+++ b/examples/tensorflow/multiple-choice/README.md
@@ -36,7 +36,7 @@ README, but for more information you can see the 'Input Datasets' section of
 ### Example command
 ```bash
 python run_swag.py \
- --model_name_or_path distilbert-base-cased \
+ --model_name_or_path distilbert/distilbert-base-cased \
  --output_dir output \
  --do_eval \
  --do_train
diff --git a/examples/tensorflow/question-answering/README.md b/examples/tensorflow/question-answering/README.md
index b347ffad81ae88..41cc8b7ef30c69 100644
--- a/examples/tensorflow/question-answering/README.md
+++ b/examples/tensorflow/question-answering/README.md
@@ -47,7 +47,7 @@ README, but for more information you can see the 'Input Datasets' section of
 ### Example command
 ```bash
 python run_qa.py \
---model_name_or_path distilbert-base-cased \
+--model_name_or_path distilbert/distilbert-base-cased \
 --output_dir output \
 --dataset_name squad \
 --do_train \
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index 92c2f11d59812d..d4430227860a9f 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -334,11 +334,11 @@ def main():
 
     # region T5 special-casing
     if data_args.source_prefix is None and model_args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
+        "google-t5/t5-small",
+        "google-t5/t5-base",
+        "google-t5/t5-large",
+        "google-t5/t5-3b",
+        "google-t5/t5-11b",
     ]:
         logger.warning(
             "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py
index b07d5f7df89174..914ea767d0f08e 100644
--- a/examples/tensorflow/test_tensorflow_examples.py
+++ b/examples/tensorflow/test_tensorflow_examples.py
@@ -107,7 +107,7 @@ def test_run_text_classification(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_text_classification.py
-            --model_name_or_path distilbert-base-uncased
+            --model_name_or_path distilbert/distilbert-base-uncased
             --output_dir {tmp_dir}
             --overwrite_output_dir
             --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
@@ -137,7 +137,7 @@ def test_run_clm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_clm.py
-            --model_name_or_path distilgpt2
+            --model_name_or_path distilbert/distilgpt2
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --do_train
@@ -163,7 +163,7 @@ def test_run_mlm(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_mlm.py
-            --model_name_or_path distilroberta-base
+            --model_name_or_path distilbert/distilroberta-base
             --train_file ./tests/fixtures/sample_text.txt
             --validation_file ./tests/fixtures/sample_text.txt
             --max_seq_length 64
@@ -188,7 +188,7 @@ def test_run_ner(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_ner.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/conll/sample.json
             --validation_file tests/fixtures/tests_samples/conll/sample.json
             --output_dir {tmp_dir}
@@ -212,7 +212,7 @@ def test_run_squad(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_qa.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --version_2_with_negative
             --train_file tests/fixtures/tests_samples/SQUAD/sample.json
             --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
@@ -237,7 +237,7 @@ def test_run_swag(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_swag.py
-            --model_name_or_path bert-base-uncased
+            --model_name_or_path google-bert/bert-base-uncased
             --train_file tests/fixtures/tests_samples/swag/sample.json
             --validation_file tests/fixtures/tests_samples/swag/sample.json
             --output_dir {tmp_dir}
@@ -261,7 +261,7 @@ def test_run_summarization(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
             run_summarization.py
-            --model_name_or_path t5-small
+            --model_name_or_path google-t5/t5-small
             --train_file tests/fixtures/tests_samples/xsum/sample.json
             --validation_file tests/fixtures/tests_samples/xsum/sample.json
             --output_dir {tmp_dir}
diff --git a/examples/tensorflow/text-classification/README.md b/examples/tensorflow/text-classification/README.md
index 39ce91530348d8..b8bc0b367c4d82 100644
--- a/examples/tensorflow/text-classification/README.md
+++ b/examples/tensorflow/text-classification/README.md
@@ -71,7 +71,7 @@ README, but for more information you can see the 'Input Datasets' section of
 ### Example command
 ```bash
 python run_text_classification.py \
---model_name_or_path distilbert-base-cased \
+--model_name_or_path distilbert/distilbert-base-cased \
 --train_file training_data.json \
 --validation_file validation_data.json \
 --output_dir output/ \
@@ -103,7 +103,7 @@ README, but for more information you can see the 'Input Datasets' section of
 ### Example command
 ```bash
 python run_glue.py \
---model_name_or_path distilbert-base-cased \
+--model_name_or_path distilbert/distilbert-base-cased \
 --task_name mnli \
 --do_train \
 --do_eval \
diff --git a/examples/tensorflow/token-classification/README.md b/examples/tensorflow/token-classification/README.md
index 0e5ec84528f8f2..6c8a15c00e813a 100644
--- a/examples/tensorflow/token-classification/README.md
+++ b/examples/tensorflow/token-classification/README.md
@@ -27,7 +27,7 @@ The following example fine-tunes BERT on CoNLL-2003:
 
 ```bash
 python run_ner.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --dataset_name conll2003 \
   --output_dir /tmp/test-ner
 ```
@@ -36,7 +36,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_ner.py \
-  --model_name_or_path bert-base-uncased \
+  --model_name_or_path google-bert/bert-base-uncased \
   --train_file path_to_train_file \
   --validation_file path_to_validation_file \
   --output_dir /tmp/test-ner
diff --git a/examples/tensorflow/translation/README.md b/examples/tensorflow/translation/README.md
index df5ee9c1ae36ba..bbe6e27e9c78a4 100644
--- a/examples/tensorflow/translation/README.md
+++ b/examples/tensorflow/translation/README.md
@@ -29,11 +29,11 @@ can also be used by passing the name of the TPU resource with the `--tpu` argume
 
 MBart and some T5 models require special handling.
 
-T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
+T5 models `google-t5/t5-small`, `google-t5/t5-base`, `google-t5/t5-large`, `google-t5/t5-3b` and `google-t5/t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
 
 ```bash
 python run_translation.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path google-t5/t5-small \
     --do_train \
     --do_eval \
     --source_lang en \
diff --git a/hubconf.py b/hubconf.py
index f2ef70b73db786..412cb27f6380df 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -41,12 +41,12 @@ def config(*args, **kwargs):
                 # Using torch.hub !
                 import torch
 
-                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from huggingface.co and cache.
+                config = torch.hub.load('huggingface/transformers', 'config', 'google-bert/bert-base-uncased')  # Download configuration from huggingface.co and cache.
                 config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
                 config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False)
+                config = torch.hub.load('huggingface/transformers', 'config', 'google-bert/bert-base-uncased', output_attentions=True, foo=False)
                 assert config.output_attentions == True
-                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'google-bert/bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
                 assert config.output_attentions == True
                 assert unused_kwargs == {'foo': False}
 
@@ -61,7 +61,7 @@ def tokenizer(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from huggingface.co and cache.
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'google-bert/bert-base-uncased')    # Download vocabulary from huggingface.co and cache.
         tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
 
     """
@@ -75,9 +75,9 @@ def model(*args, **kwargs):
             # Using torch.hub !
             import torch
 
-            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
+            model = torch.hub.load('huggingface/transformers', 'model', 'google-bert/bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
             model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'model', 'google-bert/bert-base-uncased', output_attentions=True)  # Update configuration during loading
             assert model.config.output_attentions == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
@@ -94,9 +94,9 @@ def modelForCausalLM(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2')    # Download model and configuration from huggingface.co and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'openai-community/gpt2')    # Download model and configuration from huggingface.co and cache.
         model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2', output_attentions=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'openai-community/gpt2', output_attentions=True)  # Update configuration during loading
         assert model.config.output_attentions == True
         # Loading from a TF checkpoint file instead of a PyTorch model (slower)
         config = AutoConfig.from_pretrained('./tf_model/gpt_tf_model_config.json')
@@ -112,9 +112,9 @@ def modelForMaskedLM(*args, **kwargs):
             # Using torch.hub !
             import torch
 
-            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'google-bert/bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
             model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'google-bert/bert-base-uncased', output_attentions=True)  # Update configuration during loading
             assert model.config.output_attentions == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
@@ -131,9 +131,9 @@ def modelForSequenceClassification(*args, **kwargs):
             # Using torch.hub !
             import torch
 
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'google-bert/bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
             model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'google-bert/bert-base-uncased', output_attentions=True)  # Update configuration during loading
             assert model.config.output_attentions == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
@@ -150,9 +150,9 @@ def modelForQuestionAnswering(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'google-bert/bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
         model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'google-bert/bert-base-uncased', output_attentions=True)  # Update configuration during loading
         assert model.config.output_attentions == True
         # Loading from a TF checkpoint file instead of a PyTorch model (slower)
         config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
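
The entry points documented above can be exercised end to end with the updated identifiers; a sketch mirroring the docstrings (assumes `torch` and network access):

```python
import torch

# Both calls mirror the hubconf docstrings, using the org-prefixed checkpoint ID.
config = torch.hub.load("huggingface/transformers", "config", "google-bert/bert-base-uncased")
model = torch.hub.load("huggingface/transformers", "model", "google-bert/bert-base-uncased", output_attentions=True)
assert model.config.output_attentions is True
```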
diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py
index 903b4e0dd6d500..9eab3f638d7f21 100755
--- a/scripts/benchmark/trainer-benchmark.py
+++ b/scripts/benchmark/trainer-benchmark.py
@@ -54,7 +54,7 @@
 #
 # CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
 # --base-cmd \
-# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \
+# ' examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small \
 # --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
 # --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
 # --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
diff --git a/src/transformers/benchmark/benchmark_args_utils.py b/src/transformers/benchmark/benchmark_args_utils.py
index 48fcb311b43722..b63d792986c619 100644
--- a/src/transformers/benchmark/benchmark_args_utils.py
+++ b/src/transformers/benchmark/benchmark_args_utils.py
@@ -151,7 +151,7 @@ def model_names(self) -> List[str]:
         if len(self.models) <= 0:
             raise ValueError(
                 "Please make sure you provide at least one model name / model identifier, *e.g.* `--models"
-                " bert-base-cased` or `args.models = ['bert-base-cased']."
+                " google-bert/bert-base-cased` or `args.models = ['google-bert/bert-base-cased']`."
             )
         return self.models
 
diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py
index df86a22799a510..3b7fcdf19f869f 100644
--- a/src/transformers/commands/add_new_model_like.py
+++ b/src/transformers/commands/add_new_model_like.py
@@ -1674,7 +1674,7 @@ def get_user_input():
         "What will be the name of the config class for this model? ", default_value=f"{model_camel_cased}Config"
     )
     checkpoint = get_user_field(
-        "Please give a checkpoint identifier (on the model Hub) for this new model (e.g. facebook/roberta-base): "
+        "Please give a checkpoint identifier (on the model Hub) for this new model (e.g. facebook/FacebookAI/roberta-base): "
     )
 
     old_processing_classes = [
diff --git a/src/transformers/commands/train.py b/src/transformers/commands/train.py
index bdcbae9e01ba78..5c264dbb068604 100644
--- a/src/transformers/commands/train.py
+++ b/src/transformers/commands/train.py
@@ -82,7 +82,7 @@ def register_subcommand(parser: ArgumentParser):
             "--task", type=str, default="text_classification", help="Task to train the model on."
         )
         train_parser.add_argument(
-            "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
+            "--model", type=str, default="google-bert/bert-base-uncased", help="Model's name or path to stored model."
         )
         train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
         train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index bd7c5b0c7fe668..819fe5fcf288be 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -527,8 +527,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a configuration file saved using the
                   [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
                 - a path or url to a saved configuration JSON *file*, e.g., `./my_model_directory/configuration.json`.
@@ -581,16 +580,16 @@ def from_pretrained(
         # We can't instantiate directly the base class *PretrainedConfig* so let's show the examples on a
         # derived class: BertConfig
         config = BertConfig.from_pretrained(
-            "bert-base-uncased"
+            "google-bert/bert-base-uncased"
         )  # Download configuration from huggingface.co and cache.
         config = BertConfig.from_pretrained(
             "./test/saved_model/"
         )  # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
         config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
-        config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
+        config = BertConfig.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False)
         assert config.output_attentions == True
         config, unused_kwargs = BertConfig.from_pretrained(
-            "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+            "google-bert/bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
         )
         assert config.output_attentions == True
         assert unused_kwargs == {"foo": False}
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index 4538f381f2eacd..e3270bb9debe50 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -61,9 +61,9 @@ def __init__(self):
             "--model",
             type=str,
             required=True,
-            help="Model's id or path (ex: bert-base-cased)",
+            help="Model's id or path (ex: google-bert/bert-base-cased)",
         )
-        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
+        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: google-bert/bert-base-cased)")
         self.add_argument(
             "--framework",
             type=str,
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
index 26b19a4e81f41b..12f89ff2e57f23 100755
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -148,19 +148,19 @@
         BertForPreTraining,
         BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "bert-large-uncased-whole-word-masking-finetuned-squad": (
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
         BertConfig,
         TFBertForQuestionAnswering,
         BertForQuestionAnswering,
         BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "bert-large-cased-whole-word-masking-finetuned-squad": (
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
         BertConfig,
         TFBertForQuestionAnswering,
         BertForQuestionAnswering,
         BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "bert-base-cased-finetuned-mrpc": (
+    "google-bert/bert-base-cased-finetuned-mrpc": (
         BertConfig,
         TFBertForSequenceClassification,
         BertForSequenceClassification,
@@ -178,7 +178,7 @@
         DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
         DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
     ),
-    "gpt2": (
+    "openai-community/gpt2": (
         GPT2Config,
         TFGPT2LMHeadModel,
         GPT2LMHeadModel,
@@ -208,7 +208,7 @@
         TransfoXLLMHeadModel,
         TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "openai-gpt": (
+    "openai-community/openai-gpt": (
         OpenAIGPTConfig,
         TFOpenAIGPTLMHeadModel,
         OpenAIGPTLMHeadModel,
@@ -227,7 +227,7 @@
         LayoutLMForMaskedLM,
         LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     ),
-    "roberta-large-mnli": (
+    "FacebookAI/roberta-large-mnli": (
         RobertaConfig,
         TFRobertaForSequenceClassification,
         RobertaForSequenceClassification,
@@ -269,7 +269,7 @@
         LxmertVisualFeatureEncoder,
         LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
-    "ctrl": (
+    "Salesforce/ctrl": (
         CTRLConfig,
         TFCTRLLMHeadModel,
         CTRLLMHeadModel,
diff --git a/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py b/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py
index 9be405f47195d8..2b003d4bc48000 100755
--- a/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py
+++ b/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py
@@ -33,7 +33,7 @@
 def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
     # Initialise PyTorch model
     bert_config = BertConfig.from_pretrained(
-        "bert-large-cased",
+        "google-bert/bert-large-cased",
         vocab_size=vocab_size,
         max_position_embeddings=512,
         is_decoder=True,
diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py
index 7cdc0ad93d5268..2236b30f778c99 100644
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -224,8 +224,7 @@ def get_cached_module_file(
             This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
@@ -401,6 +400,8 @@ def get_class_from_dynamic_module(
 
     
 
+
+
     Args:
         class_reference (`str`):
             The full name of the class to load, including its module and optionally its repo.
@@ -408,8 +409,7 @@ def get_class_from_dynamic_module(
             This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index fe1f7a78c93f74..bed343e48d6238 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -281,8 +281,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a feature extractor file saved using the
                   [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index 4c3cdc12a44993..ad8cfd796b4b35 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -636,8 +636,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a configuration file saved using the
                   [`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
             config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`):
@@ -691,7 +690,7 @@ def from_pretrained(
         >>> from transformers import GenerationConfig
 
         >>> # Download configuration from huggingface.co and cache.
-        >>> generation_config = GenerationConfig.from_pretrained("gpt2")
+        >>> generation_config = GenerationConfig.from_pretrained("openai-community/gpt2")
 
         >>> # E.g. config was saved using *save_pretrained('./test/saved_model/')*
         >>> generation_config.save_pretrained("./test/saved_model/")
@@ -704,7 +703,7 @@ def from_pretrained(
         >>> # If you'd like to try a minor variation to an existing configuration, you can also pass generation
         >>> # arguments to `.from_pretrained()`. Be mindful that typos and unused arguments will be ignored
         >>> generation_config, unused_kwargs = GenerationConfig.from_pretrained(
-        ...     "gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True
+        ...     "openai-community/gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True
         ... )
         >>> generation_config.top_k
         1
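
A configuration loaded this way is typically handed straight to `generate`; a short sketch with the renamed checkpoint (the sampled output will vary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Load the generation defaults from the Hub and override a few of them, as in the docstring above.
generation_config = GenerationConfig.from_pretrained(
    "openai-community/gpt2", max_new_tokens=20, do_sample=True, top_k=10
)

inputs = tokenizer("Today is", return_tensors="pt")
out = model.generate(**inputs, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```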
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index 04120e39fbd27c..aa773f3bc6a382 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -246,8 +246,8 @@ class TemperatureLogitsWarper(LogitsWarper):
 
     >>> set_seed(0)  # for reproducibility
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
     >>> model.config.pad_token_id = model.config.eos_token_id
     >>> inputs = tokenizer(["Hugging Face Company is"], return_tensors="pt")
 
@@ -306,8 +306,8 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
     >>> # Initializing the model and tokenizer for it
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
     >>> inputs = tokenizer(["I'm not going to"], return_tensors="pt")
 
     >>> # This shows a normal generate without any specific parameters
@@ -414,8 +414,8 @@ class TopPLogitsWarper(LogitsWarper):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
     >>> set_seed(0)
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
 
@@ -478,8 +478,8 @@ class TopKLogitsWarper(LogitsWarper):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
     >>> set_seed(0)
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: A, B, C, D", return_tensors="pt")
 
@@ -619,8 +619,8 @@ class EpsilonLogitsWarper(LogitsWarper):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
     >>> set_seed(0)
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
 
@@ -696,8 +696,8 @@ class EtaLogitsWarper(LogitsWarper):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
     >>> set_seed(0)
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
 
@@ -840,8 +840,8 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
     ```py
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
     >>> inputs = tokenizer(["Today I"], return_tensors="pt")
 
     >>> output = model.generate(**inputs)
@@ -967,8 +967,8 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     >>> inputs = tokenizer(["The full name of Donald is Donald"], return_tensors="pt")
 
     >>> summary_ids = model.generate(inputs["input_ids"], max_new_tokens=4)
@@ -976,7 +976,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
     The full name of Donald is Donald J. Trump Jr
 
     >>> # Now let's control generation through a bias. Please note that the tokenizer is initialized differently!
-    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("gpt2", add_prefix_space=True)
+    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
 
 
     >>> def get_tokens_as_tuple(word):
@@ -1112,8 +1112,8 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     >>> inputs = tokenizer(["In a word, the cake is a"], return_tensors="pt")
 
     >>> output_ids = model.generate(inputs["input_ids"], max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
@@ -1121,7 +1121,7 @@ class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
     In a word, the cake is a bit of a mess.
 
     >>> # Now let's take the bad words out. Please note that the tokenizer is initialized differently
-    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("gpt2", add_prefix_space=True)
+    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
 
 
     >>> def get_tokens_as_list(word_list):
@@ -1272,8 +1272,8 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
     >>> import torch
 
     >>> # Initialize the model and tokenizer
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-    >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
     >>> # A long text about the solar system
     >>> text = (
@@ -1436,8 +1436,8 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
 
@@ -1511,8 +1511,8 @@ class ExponentialDecayLengthPenalty(LogitsProcessor):
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
-    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
     >>> text = "Just wanted to let you know, I"
     >>> inputs = tokenizer(text, return_tensors="pt")
@@ -1595,8 +1595,8 @@ class LogitNormalization(LogitsProcessor, LogitsWarper):
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
     >>> import torch
 
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
 
     >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
 
@@ -2083,8 +2083,8 @@ class UnbatchedClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     >>> inputs = tokenizer(["Today, a dragon flew over Paris, France,"], return_tensors="pt")
     >>> out = model.generate(inputs["input_ids"], guidance_scale=1.5)
     >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
diff --git a/src/transformers/generation/streamers.py b/src/transformers/generation/streamers.py
index 4b299db5da6982..c75b43466af7a8 100644
--- a/src/transformers/generation/streamers.py
+++ b/src/transformers/generation/streamers.py
@@ -58,8 +58,8 @@ class TextStreamer(BaseStreamer):
         ```python
         >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 
-        >>> tok = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
         >>> streamer = TextStreamer(tok)
 
@@ -185,8 +185,8 @@ class TextIteratorStreamer(TextStreamer):
         >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
         >>> from threading import Thread
 
-        >>> tok = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
         >>> streamer = TextIteratorStreamer(tok)
 
diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py
index 7e015d718e7b7e..3021e1e55945f0 100644
--- a/src/transformers/generation/tf_utils.py
+++ b/src/transformers/generation/tf_utils.py
@@ -511,8 +511,8 @@ def compute_transition_scores(
         >>> from transformers import GPT2Tokenizer, TFAutoModelForCausalLM
         >>> import numpy as np
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         >>> tokenizer.pad_token_id = tokenizer.eos_token_id
         >>> inputs = tokenizer(["Today is"], return_tensors="tf")
 
@@ -1583,8 +1583,8 @@ def greedy_search(
         ...     TFMinLengthLogitsProcessor,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
         >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
@@ -1857,8 +1857,8 @@ def sample(
         ...     TFTemperatureLogitsWarper,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
         >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
@@ -2180,8 +2180,8 @@ def beam_search(
         ... )
         >>> import tensorflow as tf
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         >>> encoder_input_str = "translate English to German: How old are you?"
         >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="tf").input_ids
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 0bbdd643421996..e41b7c509133e1 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -976,7 +976,7 @@ def compute_transition_scores(
         >>> import numpy as np
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         >>> tokenizer.pad_token_id = tokenizer.eos_token_id
         >>> inputs = tokenizer(["Today is"], return_tensors="pt")
 
@@ -2263,8 +2263,8 @@ def greedy_search(
         ...     MaxLengthCriteria,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
         >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
@@ -2530,8 +2530,8 @@ def sample(
         ... )
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
 
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
         >>> model.config.pad_token_id = model.config.eos_token_id
@@ -2838,8 +2838,8 @@ def beam_search(
         ... )
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         >>> encoder_input_str = "translate English to German: How old are you?"
         >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
@@ -2959,7 +2959,16 @@ def beam_search(
             if sequential:
                 if any(
                     model_name in self.__class__.__name__.lower()
-                    for model_name in ["fsmt", "reformer", "bloom", "ctrl", "gpt_bigcode", "transo_xl", "xlnet", "cpm"]
+                    for model_name in [
+                        "fsmt",
+                        "reformer",
+                        "bloom",
+                        "ctrl",
+                        "gpt_bigcode",
+                        "transo_xl",
+                        "xlnet",
+                        "cpm",
+                    ]
                 ):
                     raise RuntimeError(
                         f"Currently generation for {self.__class__.__name__} is not supported "
@@ -3203,8 +3212,8 @@ def beam_sample(
         ... )
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         >>> encoder_input_str = "translate English to German: How old are you?"
         >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
@@ -3535,8 +3544,8 @@ def group_beam_search(
         ... )
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         >>> encoder_input_str = "translate English to German: How old are you?"
         >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
@@ -3925,8 +3934,8 @@ def constrained_beam_search(
         ... )
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         >>> encoder_input_str = "translate English to German: How old are you?"
         >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
@@ -4277,9 +4286,9 @@ def assisted_decoding(
         ...     MaxLengthCriteria,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-        >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
         >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
         >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
         >>> input_prompt = "It might be possible to"
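
The same model/assistant pairing can also be driven through the public `generate` API rather than the low-level method shown in this docstring; a sketch (generated text will vary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
assistant = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

inputs = tokenizer("It might be possible to", return_tensors="pt")
out = model.generate(
    **inputs,
    assistant_model=assistant,  # enables assisted decoding with the smaller draft model
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```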
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 4a7b06621a4b27..a2004a8b55931e 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -111,8 +111,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a image processor file saved using the
                   [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index 43aeaf6708d045..d58e749f824547 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -76,7 +76,7 @@ class `Int8Params` from `bitsandbytes`.
             else:
                 new_value = torch.tensor(value, device="cpu")
 
-            # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
+            # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
             # Since weights are saved in the correct "orientation", we skip transposing when loading.
             if issubclass(module.source_cls, Conv1D) and not prequantized_loading:
                 new_value = new_value.T
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 9e8f2becae002b..4776737a3746e3 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -131,8 +131,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
             pretrained_model_name_or_path: either:
 
                 - a string, the *model id* of a pretrained model card hosted inside a model repo on huggingface.co.
-                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                  user or organization name, like `dbmdz/bert-base-german-cased`.
                 - a path to a *directory* containing a model card file saved using the [`~ModelCard.save_pretrained`]
                   method, e.g.: `./my_model_directory/`.
                 - a path or url to a saved model card JSON *file*, e.g.: `./my_model_directory/modelcard.json`.
@@ -163,11 +161,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
 
         ```python
         # Download model card from huggingface.co and cache.
-        modelcard = ModelCard.from_pretrained("bert-base-uncased")
+        modelcard = ModelCard.from_pretrained("google-bert/bert-base-uncased")
         # Model card was saved using *save_pretrained('./test/saved_model/')*
         modelcard = ModelCard.from_pretrained("./test/saved_model/")
         modelcard = ModelCard.from_pretrained("./test/saved_model/modelcard.json")
-        modelcard = ModelCard.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
+        modelcard = ModelCard.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False)
         ```"""
         cache_dir = kwargs.pop("cache_dir", None)
         proxies = kwargs.pop("proxies", None)
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index b57458c0826b81..eaf5410bc2f27d 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -347,14 +347,14 @@ def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from transformers import FlaxBertModel
 
         >>> # load model
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
         >>> model.params = model.to_bf16(model.params)
        >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
 
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> flat_params = traverse_util.flatten_dict(model.params)
         >>> mask = {
         ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
@@ -383,7 +383,7 @@ def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from transformers import FlaxBertModel
 
         >>> # Download model and configuration from huggingface.co
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> # By default, the model params will be in fp32, to illustrate the use of this method,
         >>> # we'll first cast to fp16 and back to fp32
        >>> model.params = model.to_fp16(model.params)
@@ -413,14 +413,14 @@ def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from transformers import FlaxBertModel
 
         >>> # load model
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> # By default, the model params will be in fp32, to cast these to float16
         >>> model.params = model.to_fp16(model.params)
        >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
 
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> flat_params = traverse_util.flatten_dict(model.params)
         >>> mask = {
         ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
@@ -545,8 +545,6 @@ def from_pretrained(
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this case,
@@ -639,7 +637,7 @@ def from_pretrained(
         >>> from transformers import BertConfig, FlaxBertModel
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
+        >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
         >>> model = FlaxBertModel.from_pretrained("./test/saved_model/")
         >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
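For completeness, a minimal hedged sketch of loading the renamed checkpoint into Flax, including on-the-fly conversion from PyTorch weights (the `from_pt=True` path requires `torch` to be installed):

```python
from transformers import FlaxBertModel

# Load directly from the Hub (Flax weights if available).
model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")

# Or convert PyTorch weights on the fly.
model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased", from_pt=True)
```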
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index f8b1122d467df9..92f713a970680c 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -2493,8 +2493,6 @@ def from_pretrained(
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
@@ -2592,11 +2590,11 @@ def from_pretrained(
         >>> from transformers import BertConfig, TFBertModel
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = TFBertModel.from_pretrained("bert-base-uncased")
+        >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
         >>> model = TFBertModel.from_pretrained("./test/saved_model/")
         >>> # Update configuration during loading.
-        >>> model = TFBertModel.from_pretrained("bert-base-uncased", output_attentions=True)
+        >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
         >>> assert model.config.output_attentions == True
        >>> # Loading from a PyTorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
         >>> config = BertConfig.from_json_file("./pt_model/my_pt_model_config.json")
@@ -3075,7 +3073,7 @@ def push_to_hub(
         ```python
         from transformers import TFAutoModel
 
-        model = TFAutoModel.from_pretrained("bert-base-cased")
+        model = TFAutoModel.from_pretrained("google-bert/bert-base-cased")
 
         # Push the model to your namespace with the name "my-finetuned-bert".
         model.push_to_hub("my-finetuned-bert")
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index a6dc313fbaa172..668805d981421d 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1251,7 +1251,7 @@ def add_model_tags(self, tags: Union[List[str], str]) -> None:
         ```python
         from transformers import AutoModel
 
-        model = AutoModel.from_pretrained("bert-base-cased")
+        model = AutoModel.from_pretrained("google-bert/bert-base-cased")
 
         model.add_model_tags(["custom", "custom-bert"])
 
@@ -2608,8 +2608,6 @@ def from_pretrained(
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -2788,17 +2786,17 @@ def from_pretrained(
         >>> from transformers import BertConfig, BertModel
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BertModel.from_pretrained("bert-base-uncased")
+        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
         >>> model = BertModel.from_pretrained("./test/saved_model/")
         >>> # Update configuration during loading.
-        >>> model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
+        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
         >>> assert model.config.output_attentions == True
         >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
         >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
         >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
         >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
-        >>> model = BertModel.from_pretrained("bert-base-uncased", from_flax=True)
+        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
         ```
 
         * `low_cpu_mem_usage` algorithm:
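The description of that algorithm falls outside this hunk; as a minimal, illustrative sketch, the option is simply passed to `from_pretrained`:

```python
from transformers import BertModel

# Materialize weights tensor-by-tensor instead of building a full fp32 copy first,
# keeping peak CPU RAM close to the checkpoint size.
model = BertModel.from_pretrained("google-bert/bert-base-uncased", low_cpu_mem_usage=True)
```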
diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py
index cacc0499035c19..690be7fbbf2c0c 100644
--- a/src/transformers/models/albert/configuration_albert.py
+++ b/src/transformers/models/albert/configuration_albert.py
@@ -22,14 +22,14 @@
 
 
 ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "albert-base-v1": "https://huggingface.co/albert-base-v1/resolve/main/config.json",
-    "albert-large-v1": "https://huggingface.co/albert-large-v1/resolve/main/config.json",
-    "albert-xlarge-v1": "https://huggingface.co/albert-xlarge-v1/resolve/main/config.json",
-    "albert-xxlarge-v1": "https://huggingface.co/albert-xxlarge-v1/resolve/main/config.json",
-    "albert-base-v2": "https://huggingface.co/albert-base-v2/resolve/main/config.json",
-    "albert-large-v2": "https://huggingface.co/albert-large-v2/resolve/main/config.json",
-    "albert-xlarge-v2": "https://huggingface.co/albert-xlarge-v2/resolve/main/config.json",
-    "albert-xxlarge-v2": "https://huggingface.co/albert-xxlarge-v2/resolve/main/config.json",
+    "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/config.json",
+    "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/config.json",
+    "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/config.json",
+    "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/config.json",
+    "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/config.json",
+    "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/config.json",
+    "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/config.json",
+    "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/config.json",
 }
 
 
@@ -38,7 +38,7 @@ class AlbertConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`AlbertModel`] or a [`TFAlbertModel`]. It is used
     to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating
     a configuration with the defaults will yield a similar configuration to that of the ALBERT
-    [albert-xxlarge-v2](https://huggingface.co/albert-xxlarge-v2) architecture.
+    [albert/albert-xxlarge-v2](https://huggingface.co/albert/albert-xxlarge-v2) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index fe6b3773233270..25ae832b03a00a 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -48,19 +48,19 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "albert-base-v2"
+_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
 _CONFIG_FOR_DOC = "AlbertConfig"
 
 
 ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "albert-base-v1",
-    "albert-large-v1",
-    "albert-xlarge-v1",
-    "albert-xxlarge-v1",
-    "albert-base-v2",
-    "albert-large-v2",
-    "albert-xlarge-v2",
-    "albert-xxlarge-v2",
+    "albert/albert-base-v1",
+    "albert/albert-large-v1",
+    "albert/albert-xlarge-v1",
+    "albert/albert-xxlarge-v1",
+    "albert/albert-base-v2",
+    "albert/albert-large-v2",
+    "albert/albert-xlarge-v2",
+    "albert/albert-xxlarge-v2",
     # See all ALBERT models at https://huggingface.co/models?filter=albert
 ]
 
@@ -816,8 +816,8 @@ def forward(
         >>> from transformers import AutoTokenizer, AlbertForPreTraining
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
-        >>> model = AlbertForPreTraining.from_pretrained("albert-base-v2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")
 
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
         >>> # Batch size 1
@@ -958,8 +958,8 @@ def forward(
         >>> import torch
         >>> from transformers import AutoTokenizer, AlbertForMaskedLM
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
-        >>> model = AlbertForMaskedLM.from_pretrained("albert-base-v2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
 
         >>> # add mask_token
         >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index 6333f0bd3ac204..b2c01ded3619ca 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -47,7 +47,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "albert-base-v2"
+_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
 _CONFIG_FOR_DOC = "AlbertConfig"
 
 
@@ -754,8 +754,8 @@ class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
     ```python
     >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
-    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert-base-v2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index acdc8c886c5376..1225465c5260a8 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -62,18 +62,18 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "albert-base-v2"
+_CHECKPOINT_FOR_DOC = "albert/albert-base-v2"
 _CONFIG_FOR_DOC = "AlbertConfig"
 
 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "albert-base-v1",
-    "albert-large-v1",
-    "albert-xlarge-v1",
-    "albert-xxlarge-v1",
-    "albert-base-v2",
-    "albert-large-v2",
-    "albert-xlarge-v2",
-    "albert-xxlarge-v2",
+    "albert/albert-base-v1",
+    "albert/albert-large-v1",
+    "albert/albert-xlarge-v1",
+    "albert/albert-xxlarge-v1",
+    "albert/albert-base-v2",
+    "albert/albert-large-v2",
+    "albert/albert-xlarge-v2",
+    "albert/albert-xxlarge-v2",
     # See all ALBERT models at https://huggingface.co/models?filter=albert
 ]
 
@@ -971,8 +971,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFAlbertForPreTraining
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
-        >>> model = TFAlbertForPreTraining.from_pretrained("albert-base-v2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+        >>> model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
 
         >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
         >>> # Batch size 1
@@ -1103,8 +1103,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFAlbertForMaskedLM
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
-        >>> model = TFAlbertForMaskedLM.from_pretrained("albert-base-v2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
+        >>> model = TFAlbertForMaskedLM.from_pretrained("albert/albert-base-v2")
 
         >>> # add mask_token
         >>> inputs = tokenizer(f"The capital of [MASK] is Paris.", return_tensors="tf")
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index 3ff319199522cc..7baaa0a6000e6f 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -31,26 +31,26 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "albert-base-v1": "https://huggingface.co/albert-base-v1/resolve/main/spiece.model",
-        "albert-large-v1": "https://huggingface.co/albert-large-v1/resolve/main/spiece.model",
-        "albert-xlarge-v1": "https://huggingface.co/albert-xlarge-v1/resolve/main/spiece.model",
-        "albert-xxlarge-v1": "https://huggingface.co/albert-xxlarge-v1/resolve/main/spiece.model",
-        "albert-base-v2": "https://huggingface.co/albert-base-v2/resolve/main/spiece.model",
-        "albert-large-v2": "https://huggingface.co/albert-large-v2/resolve/main/spiece.model",
-        "albert-xlarge-v2": "https://huggingface.co/albert-xlarge-v2/resolve/main/spiece.model",
-        "albert-xxlarge-v2": "https://huggingface.co/albert-xxlarge-v2/resolve/main/spiece.model",
+        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model",
+        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/spiece.model",
+        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/spiece.model",
+        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/spiece.model",
+        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/spiece.model",
+        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/spiece.model",
+        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/spiece.model",
+        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/spiece.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "albert-base-v1": 512,
-    "albert-large-v1": 512,
-    "albert-xlarge-v1": 512,
-    "albert-xxlarge-v1": 512,
-    "albert-base-v2": 512,
-    "albert-large-v2": 512,
-    "albert-xlarge-v2": 512,
-    "albert-xxlarge-v2": 512,
+    "albert/albert-base-v1": 512,
+    "albert/albert-large-v1": 512,
+    "albert/albert-xlarge-v1": 512,
+    "albert/albert-xxlarge-v1": 512,
+    "albert/albert-base-v2": 512,
+    "albert/albert-large-v2": 512,
+    "albert/albert-xlarge-v2": 512,
+    "albert/albert-xxlarge-v2": 512,
 }
 
 SPIECE_UNDERLINE = "▁"
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index 200953f8e6b9f6..91cf403d07eefd 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -34,36 +34,36 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "albert-base-v1": "https://huggingface.co/albert-base-v1/resolve/main/spiece.model",
-        "albert-large-v1": "https://huggingface.co/albert-large-v1/resolve/main/spiece.model",
-        "albert-xlarge-v1": "https://huggingface.co/albert-xlarge-v1/resolve/main/spiece.model",
-        "albert-xxlarge-v1": "https://huggingface.co/albert-xxlarge-v1/resolve/main/spiece.model",
-        "albert-base-v2": "https://huggingface.co/albert-base-v2/resolve/main/spiece.model",
-        "albert-large-v2": "https://huggingface.co/albert-large-v2/resolve/main/spiece.model",
-        "albert-xlarge-v2": "https://huggingface.co/albert-xlarge-v2/resolve/main/spiece.model",
-        "albert-xxlarge-v2": "https://huggingface.co/albert-xxlarge-v2/resolve/main/spiece.model",
+        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model",
+        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/spiece.model",
+        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/spiece.model",
+        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/spiece.model",
+        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/spiece.model",
+        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/spiece.model",
+        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/spiece.model",
+        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/spiece.model",
     },
     "tokenizer_file": {
-        "albert-base-v1": "https://huggingface.co/albert-base-v1/resolve/main/tokenizer.json",
-        "albert-large-v1": "https://huggingface.co/albert-large-v1/resolve/main/tokenizer.json",
-        "albert-xlarge-v1": "https://huggingface.co/albert-xlarge-v1/resolve/main/tokenizer.json",
-        "albert-xxlarge-v1": "https://huggingface.co/albert-xxlarge-v1/resolve/main/tokenizer.json",
-        "albert-base-v2": "https://huggingface.co/albert-base-v2/resolve/main/tokenizer.json",
-        "albert-large-v2": "https://huggingface.co/albert-large-v2/resolve/main/tokenizer.json",
-        "albert-xlarge-v2": "https://huggingface.co/albert-xlarge-v2/resolve/main/tokenizer.json",
-        "albert-xxlarge-v2": "https://huggingface.co/albert-xxlarge-v2/resolve/main/tokenizer.json",
+        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/tokenizer.json",
+        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/tokenizer.json",
+        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/tokenizer.json",
+        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/tokenizer.json",
+        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/tokenizer.json",
+        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/tokenizer.json",
+        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/tokenizer.json",
+        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/tokenizer.json",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "albert-base-v1": 512,
-    "albert-large-v1": 512,
-    "albert-xlarge-v1": 512,
-    "albert-xxlarge-v1": 512,
-    "albert-base-v2": 512,
-    "albert-large-v2": 512,
-    "albert-xlarge-v2": 512,
-    "albert-xxlarge-v2": 512,
+    "albert/albert-base-v1": 512,
+    "albert/albert-large-v1": 512,
+    "albert/albert-xlarge-v1": 512,
+    "albert/albert-xxlarge-v1": 512,
+    "albert/albert-base-v2": 512,
+    "albert/albert-large-v2": 512,
+    "albert/albert-xlarge-v2": 512,
+    "albert/albert-xxlarge-v2": 512,
 }
 
 SPIECE_UNDERLINE = "▁"
diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py
index 96e98107976904..610db8482f9162 100644
--- a/src/transformers/models/align/convert_align_tf_to_hf.py
+++ b/src/transformers/models/align/convert_align_tf_to_hf.py
@@ -78,7 +78,7 @@ def get_processor():
         include_top=False,
         resample=Image.BILINEAR,
     )
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
     tokenizer.model_max_length = 64
     processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer)
     return processor
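For context, the processor assembled here is used like any other multimodal processor; a hedged usage sketch, assuming the converted weights live in a repo such as `kakaobrain/align-base`:

```python
import requests
from PIL import Image
from transformers import AlignModel, AlignProcessor

# Repo id assumed here; any converted ALIGN checkpoint exposing this processor works.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of two cats"], images=image, return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits_per_image)  # image-text similarity scores
```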
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 0ef455fea47423..ce7884d2ef120e 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -87,8 +87,6 @@
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -194,8 +192,6 @@
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
@@ -295,8 +291,6 @@
                 Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
@@ -642,7 +636,7 @@ def insert_head_doc(docstring, head_doc=""):
     )
 
 
-def auto_class_update(cls, checkpoint_for_example="bert-base-cased", head_doc=""):
+def auto_class_update(cls, checkpoint_for_example="google-bert/bert-base-cased", head_doc=""):
     # Create a new class with the right name from the base class
     model_mapping = cls._model_mapping
     name = cls.__name__
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 682241ea4a84ec..44d435bc45aa05 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -1017,8 +1017,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 Can be either:
 
                     - A string, the *model id* of a pretrained model configuration hosted inside a model repo on
-                      huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                      namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                      huggingface.co.
                     - A path to a *directory* containing a configuration file saved using the
                       [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method,
                       e.g., `./my_model_directory/`.
@@ -1061,7 +1060,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         >>> from transformers import AutoConfig
 
         >>> # Download configuration from huggingface.co and cache.
-        >>> config = AutoConfig.from_pretrained("bert-base-uncased")
+        >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
 
         >>> # Download configuration from huggingface.co (user-uploaded) and cache.
         >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
@@ -1073,12 +1072,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
 
         >>> # Change some config attributes when loading a pretrained config.
-        >>> config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
+        >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False)
         >>> config.output_attentions
         True
 
         >>> config, unused_kwargs = AutoConfig.from_pretrained(
-        ...     "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        ...     "google-bert/bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
         ... )
         >>> config.output_attentions
         True
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index b3461e8b56a7a9..f8cb55091b02fd 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -155,8 +155,7 @@ def get_feature_extractor_config(
             This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
@@ -194,14 +193,14 @@ def get_feature_extractor_config(
 
     ```python
     # Download configuration from huggingface.co and cache.
-    tokenizer_config = get_tokenizer_config("bert-base-uncased")
+    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
     # This model does not have a tokenizer config so the result will be an empty dict.
-    tokenizer_config = get_tokenizer_config("xlm-roberta-base")
+    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
 
     # Save a pretrained tokenizer locally and you can reload its config
     from transformers import AutoTokenizer
 
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
     tokenizer.save_pretrained("tokenizer-test")
     tokenizer_config = get_tokenizer_config("tokenizer-test")
     ```"""
@@ -267,8 +266,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 This can be either:
 
                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a feature extractor file saved using the
                   [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 54675f5693c4ce..c9cd6fca69d661 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -168,8 +168,7 @@ def get_image_processor_config(
             This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
@@ -207,9 +206,9 @@ def get_image_processor_config(
 
     ```python
     # Download configuration from huggingface.co and cache.
-    image_processor_config = get_image_processor_config("bert-base-uncased")
+    image_processor_config = get_image_processor_config("google-bert/bert-base-uncased")
    # This model does not have an image processor config so the result will be an empty dict.
-    image_processor_config = get_image_processor_config("xlm-roberta-base")
+    image_processor_config = get_image_processor_config("FacebookAI/xlm-roberta-base")
 
     # Save a pretrained image processor locally and you can reload its config
     from transformers import AutoTokenizer
@@ -280,8 +279,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 This can be either:
 
                 - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                - a path to a *directory* containing an image processor file saved using the
                   [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
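A minimal usage sketch for the docstring above (the ViT repo id is purely illustrative; any repo shipping a `preprocessor_config.json` works):

```python
from transformers import AutoImageProcessor

# Download the image processor configuration from the Hub and cache it.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
print(type(image_processor).__name__)
```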
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 6aa882a5340f9a..1de0249831dbd9 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -1354,7 +1354,7 @@ class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
 AutoModelForSeq2SeqLM = auto_class_update(
     AutoModelForSeq2SeqLM,
     head_doc="sequence-to-sequence language modeling",
-    checkpoint_for_example="t5-base",
+    checkpoint_for_example="google-t5/t5-base",
 )
 
 
diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py
index 3438e1c7bc7d9f..785035b98fb74e 100644
--- a/src/transformers/models/auto/modeling_flax_auto.py
+++ b/src/transformers/models/auto/modeling_flax_auto.py
@@ -308,7 +308,9 @@ class FlaxAutoModelForSeq2SeqLM(_BaseAutoModelClass):
 
 
 FlaxAutoModelForSeq2SeqLM = auto_class_update(
-    FlaxAutoModelForSeq2SeqLM, head_doc="sequence-to-sequence language modeling", checkpoint_for_example="t5-base"
+    FlaxAutoModelForSeq2SeqLM,
+    head_doc="sequence-to-sequence language modeling",
+    checkpoint_for_example="google-t5/t5-base",
 )
 
 
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index e79922f928226d..deed743162e477 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -621,7 +621,9 @@ class TFAutoModelForSeq2SeqLM(_BaseAutoModelClass):
 
 
 TFAutoModelForSeq2SeqLM = auto_class_update(
-    TFAutoModelForSeq2SeqLM, head_doc="sequence-to-sequence language modeling", checkpoint_for_example="t5-base"
+    TFAutoModelForSeq2SeqLM,
+    head_doc="sequence-to-sequence language modeling",
+    checkpoint_for_example="google-t5/t5-base",
 )
 
 
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 2a8823fea7c0ee..e41e39e56eeea2 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -156,8 +156,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 This can be either:
 
                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                - a path to a *directory* containing processor files saved using the `save_pretrained()` method,
                   e.g., `./my_model_directory/`.
             cache_dir (`str` or `os.PathLike`, *optional*):
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index ff464c578c2ab9..7760369507bd92 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -295,7 +295,10 @@
                 ),
             ),
             ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
-            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "openai-gpt",
+                ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None),
+            ),
             ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
             ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
             ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
@@ -524,8 +527,7 @@ def get_tokenizer_config(
             This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
@@ -566,14 +568,14 @@ def get_tokenizer_config(
 
     ```python
     # Download configuration from huggingface.co and cache.
-    tokenizer_config = get_tokenizer_config("bert-base-uncased")
+    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
     # This model does not have a tokenizer config so the result will be an empty dict.
-    tokenizer_config = get_tokenizer_config("xlm-roberta-base")
+    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
 
     # Save a pretrained tokenizer locally and you can reload its config
     from transformers import AutoTokenizer
 
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
     tokenizer.save_pretrained("tokenizer-test")
     tokenizer_config = get_tokenizer_config("tokenizer-test")
     ```"""
@@ -646,8 +648,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 Can be either:
 
                     - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                       using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                     - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
@@ -697,7 +697,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         >>> from transformers import AutoTokenizer
 
         >>> # Download vocabulary from huggingface.co and cache.
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
         >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
@@ -706,7 +706,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
 
         >>> # Download vocabulary from huggingface.co and define model-specific arguments
-        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
         ```"""
         use_auth_token = kwargs.pop("use_auth_token", None)
         if use_auth_token is not None:
diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py
index b322615ae233ff..d58b89bf6f8f9b 100644
--- a/src/transformers/models/bark/processing_bark.py
+++ b/src/transformers/models/bark/processing_bark.py
@@ -73,8 +73,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                   method, e.g., `./my_model_directory/`.
             speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py
index e0db2c9f1bb222..1f79260f510ff2 100644
--- a/src/transformers/models/bert/configuration_bert.py
+++ b/src/transformers/models/bert/configuration_bert.py
@@ -25,29 +25,29 @@
 logger = logging.get_logger(__name__)
 
 BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
-    "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
-    "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
-    "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
-    "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
-    "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
-    "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
-    "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
-    "bert-large-uncased-whole-word-masking": (
-        "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json"
+    "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/config.json",
+    "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/config.json",
+    "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/config.json",
+    "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/config.json",
+    "google-bert/bert-base-multilingual-uncased": "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/config.json",
+    "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/config.json",
+    "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/config.json",
+    "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/config.json",
+    "google-bert/bert-large-uncased-whole-word-masking": (
+        "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/config.json"
     ),
-    "bert-large-cased-whole-word-masking": (
-        "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json"
+    "google-bert/bert-large-cased-whole-word-masking": (
+        "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/config.json"
     ),
-    "bert-large-uncased-whole-word-masking-finetuned-squad": (
-        "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json"
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
+        "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json"
     ),
-    "bert-large-cased-whole-word-masking-finetuned-squad": (
-        "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json"
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
+        "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json"
     ),
-    "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
-    "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
-    "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
+    "google-bert/bert-base-cased-finetuned-mrpc": "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
+    "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/config.json",
+    "google-bert/bert-base-german-dbmdz-uncased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/config.json",
     "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
     "cl-tohoku/bert-base-japanese-whole-word-masking": (
         "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json"
@@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to
     instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the BERT
-    [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
+    [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -127,10 +127,10 @@ class BertConfig(PretrainedConfig):
     ```python
     >>> from transformers import BertConfig, BertModel
 
-    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
     >>> configuration = BertConfig()
 
-    >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration
+    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
     >>> model = BertModel(configuration)
 
     >>> # Accessing the model configuration
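Beyond the default configuration shown above (the `model.config` access line falls outside this hunk), non-default architectures can be built by overriding standard `BertConfig` fields; the values below are illustrative:

```python
from transformers import BertConfig, BertModel

# Shrink the architecture by overriding a few standard config fields.
configuration = BertConfig(
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=1024,
)
model = BertModel(configuration)
print(model.config.num_hidden_layers)  # 4
```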
diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py
index 418e1f89051953..f7cb149053a3d0 100644
--- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -91,7 +91,7 @@ def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
 
 def main(raw_args=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased")
+    parser.add_argument("--model_name", type=str, required=True, help="model name e.g. google-bert/bert-base-uncased")
     parser.add_argument(
         "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model"
     )
diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
index c6764c771e7664..4c068c4d4f1d76 100755
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -54,7 +54,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "bert-base-uncased"
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
 _CONFIG_FOR_DOC = "BertConfig"
 
 # TokenClassification docstring
@@ -78,21 +78,21 @@
 
 
 BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "bert-base-uncased",
-    "bert-large-uncased",
-    "bert-base-cased",
-    "bert-large-cased",
-    "bert-base-multilingual-uncased",
-    "bert-base-multilingual-cased",
-    "bert-base-chinese",
-    "bert-base-german-cased",
-    "bert-large-uncased-whole-word-masking",
-    "bert-large-cased-whole-word-masking",
-    "bert-large-uncased-whole-word-masking-finetuned-squad",
-    "bert-large-cased-whole-word-masking-finetuned-squad",
-    "bert-base-cased-finetuned-mrpc",
-    "bert-base-german-dbmdz-cased",
-    "bert-base-german-dbmdz-uncased",
+    "google-bert/bert-base-uncased",
+    "google-bert/bert-large-uncased",
+    "google-bert/bert-base-cased",
+    "google-bert/bert-large-cased",
+    "google-bert/bert-base-multilingual-uncased",
+    "google-bert/bert-base-multilingual-cased",
+    "google-bert/bert-base-chinese",
+    "google-bert/bert-base-german-cased",
+    "google-bert/bert-large-uncased-whole-word-masking",
+    "google-bert/bert-large-cased-whole-word-masking",
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad",
+    "google-bert/bert-base-cased-finetuned-mrpc",
+    "google-bert/bert-base-german-dbmdz-cased",
+    "google-bert/bert-base-german-dbmdz-uncased",
     "cl-tohoku/bert-base-japanese",
     "cl-tohoku/bert-base-japanese-whole-word-masking",
     "cl-tohoku/bert-base-japanese-char",
@@ -1101,8 +1101,8 @@ def forward(
         >>> from transformers import AutoTokenizer, BertForPreTraining
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1453,8 +1453,8 @@ def forward(
         >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
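The rest of this example is outside the hunk; one way it typically concludes, sketched here for completeness (not taken verbatim from the patch):

```python
import torch
from transformers import AutoTokenizer, BertForNextSentencePrediction

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."

# Encode the sentence pair and score it; class 0 means "next_sentence follows prompt",
# class 1 means "random", so this unrelated pair should land in class 1.
encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)
print(outputs.logits.argmax(dim=-1))
```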
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index b32a618655e600..772ea2bf12b2ee 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -52,7 +52,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "bert-base-uncased"
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
 _CONFIG_FOR_DOC = "BertConfig"
 
 remat = nn_partitioning.remat
@@ -1114,8 +1114,8 @@ class FlaxBertForPreTraining(FlaxBertPreTrainedModel):
     ```python
     >>> from transformers import AutoTokenizer, FlaxBertForPreTraining
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    >>> model = FlaxBertForPreTraining.from_pretrained("bert-base-uncased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
@@ -1269,8 +1269,8 @@ class FlaxBertForNextSentencePrediction(FlaxBertPreTrainedModel):
     ```python
     >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
 
     >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
     >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 853ec6e6df44a1..7fe89e43e86335 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -67,7 +67,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "bert-base-uncased"
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
 _CONFIG_FOR_DOC = "BertConfig"
 
 # TokenClassification docstring
@@ -90,19 +90,19 @@
 _SEQ_CLASS_EXPECTED_LOSS = 0.01
 
 TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "bert-base-uncased",
-    "bert-large-uncased",
-    "bert-base-cased",
-    "bert-large-cased",
-    "bert-base-multilingual-uncased",
-    "bert-base-multilingual-cased",
-    "bert-base-chinese",
-    "bert-base-german-cased",
-    "bert-large-uncased-whole-word-masking",
-    "bert-large-cased-whole-word-masking",
-    "bert-large-uncased-whole-word-masking-finetuned-squad",
-    "bert-large-cased-whole-word-masking-finetuned-squad",
-    "bert-base-cased-finetuned-mrpc",
+    "google-bert/bert-base-uncased",
+    "google-bert/bert-large-uncased",
+    "google-bert/bert-base-cased",
+    "google-bert/bert-large-cased",
+    "google-bert/bert-base-multilingual-uncased",
+    "google-bert/bert-base-multilingual-cased",
+    "google-bert/bert-base-chinese",
+    "google-bert/bert-base-german-cased",
+    "google-bert/bert-large-uncased-whole-word-masking",
+    "google-bert/bert-large-cased-whole-word-masking",
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad",
+    "google-bert/bert-base-cased-finetuned-mrpc",
     "cl-tohoku/bert-base-japanese",
     "cl-tohoku/bert-base-japanese-whole-word-masking",
     "cl-tohoku/bert-base-japanese-char",
@@ -1327,8 +1327,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFBertForPreTraining
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = TFBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")
         >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf")
         >>> # Batch size 1
 
@@ -1657,8 +1657,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFBertForNextSentencePrediction
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = TFBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = TFBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index 16044973343bc5..c95e9ff0f8b43c 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -30,34 +30,34 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
-        "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
-        "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
-        "bert-base-multilingual-uncased": (
-            "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt"
+        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
+        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
+        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
+        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-multilingual-uncased": (
+            "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt"
         ),
-        "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
-        "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
-        "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt",
-        "bert-large-uncased-whole-word-masking": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
+        "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
+        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
+        "google-bert/bert-large-uncased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
         ),
-        "bert-large-cased-whole-word-masking": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
+        "google-bert/bert-large-cased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
         ),
-        "bert-large-uncased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
+        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
         ),
-        "bert-large-cased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
+        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
         ),
-        "bert-base-cased-finetuned-mrpc": (
-            "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
+        "google-bert/bert-base-cased-finetuned-mrpc": (
+            "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
         ),
-        "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
-        "bert-base-german-dbmdz-uncased": (
-            "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
+        "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-german-dbmdz-uncased": (
+            "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
         ),
         "TurkuNLP/bert-base-finnish-cased-v1": (
             "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt"
@@ -72,42 +72,42 @@
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "bert-base-uncased": 512,
-    "bert-large-uncased": 512,
-    "bert-base-cased": 512,
-    "bert-large-cased": 512,
-    "bert-base-multilingual-uncased": 512,
-    "bert-base-multilingual-cased": 512,
-    "bert-base-chinese": 512,
-    "bert-base-german-cased": 512,
-    "bert-large-uncased-whole-word-masking": 512,
-    "bert-large-cased-whole-word-masking": 512,
-    "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
-    "bert-large-cased-whole-word-masking-finetuned-squad": 512,
-    "bert-base-cased-finetuned-mrpc": 512,
-    "bert-base-german-dbmdz-cased": 512,
-    "bert-base-german-dbmdz-uncased": 512,
+    "google-bert/bert-base-uncased": 512,
+    "google-bert/bert-large-uncased": 512,
+    "google-bert/bert-base-cased": 512,
+    "google-bert/bert-large-cased": 512,
+    "google-bert/bert-base-multilingual-uncased": 512,
+    "google-bert/bert-base-multilingual-cased": 512,
+    "google-bert/bert-base-chinese": 512,
+    "google-bert/bert-base-german-cased": 512,
+    "google-bert/bert-large-uncased-whole-word-masking": 512,
+    "google-bert/bert-large-cased-whole-word-masking": 512,
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": 512,
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": 512,
+    "google-bert/bert-base-cased-finetuned-mrpc": 512,
+    "google-bert/bert-base-german-dbmdz-cased": 512,
+    "google-bert/bert-base-german-dbmdz-uncased": 512,
     "TurkuNLP/bert-base-finnish-cased-v1": 512,
     "TurkuNLP/bert-base-finnish-uncased-v1": 512,
     "wietsedv/bert-base-dutch-cased": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    "bert-base-uncased": {"do_lower_case": True},
-    "bert-large-uncased": {"do_lower_case": True},
-    "bert-base-cased": {"do_lower_case": False},
-    "bert-large-cased": {"do_lower_case": False},
-    "bert-base-multilingual-uncased": {"do_lower_case": True},
-    "bert-base-multilingual-cased": {"do_lower_case": False},
-    "bert-base-chinese": {"do_lower_case": False},
-    "bert-base-german-cased": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
-    "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
-    "bert-base-german-dbmdz-cased": {"do_lower_case": False},
-    "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-uncased": {"do_lower_case": True},
+    "google-bert/bert-large-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-cased": {"do_lower_case": False},
+    "google-bert/bert-large-cased": {"do_lower_case": False},
+    "google-bert/bert-base-multilingual-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-multilingual-cased": {"do_lower_case": False},
+    "google-bert/bert-base-chinese": {"do_lower_case": False},
+    "google-bert/bert-base-german-cased": {"do_lower_case": False},
+    "google-bert/bert-large-uncased-whole-word-masking": {"do_lower_case": True},
+    "google-bert/bert-large-cased-whole-word-masking": {"do_lower_case": False},
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
+    "google-bert/bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
+    "google-bert/bert-base-german-dbmdz-cased": {"do_lower_case": False},
+    "google-bert/bert-base-german-dbmdz-uncased": {"do_lower_case": True},
     "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
     "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
     "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py
index 80d542367dca33..e7754b2fb5a128 100644
--- a/src/transformers/models/bert/tokenization_bert_fast.py
+++ b/src/transformers/models/bert/tokenization_bert_fast.py
@@ -30,34 +30,34 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-        "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
-        "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
-        "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
-        "bert-base-multilingual-uncased": (
-            "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt"
+        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
+        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
+        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
+        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-multilingual-uncased": (
+            "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt"
         ),
-        "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
-        "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
-        "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt",
-        "bert-large-uncased-whole-word-masking": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
+        "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
+        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
+        "google-bert/bert-large-uncased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
         ),
-        "bert-large-cased-whole-word-masking": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
+        "google-bert/bert-large-cased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
         ),
-        "bert-large-uncased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
+        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
         ),
-        "bert-large-cased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
+        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
         ),
-        "bert-base-cased-finetuned-mrpc": (
-            "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
+        "google-bert/bert-base-cased-finetuned-mrpc": (
+            "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
         ),
-        "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
-        "bert-base-german-dbmdz-uncased": (
-            "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
+        "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
+        "google-bert/bert-base-german-dbmdz-uncased": (
+            "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
         ),
         "TurkuNLP/bert-base-finnish-cased-v1": (
             "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt"
@@ -70,38 +70,38 @@
         ),
     },
     "tokenizer_file": {
-        "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-        "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
-        "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json",
-        "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json",
-        "bert-base-multilingual-uncased": (
-            "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/tokenizer.json"
+        "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json",
+        "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/tokenizer.json",
+        "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json",
+        "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/tokenizer.json",
+        "google-bert/bert-base-multilingual-uncased": (
+            "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/tokenizer.json"
         ),
-        "bert-base-multilingual-cased": (
-            "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json"
+        "google-bert/bert-base-multilingual-cased": (
+            "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/tokenizer.json"
         ),
-        "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/tokenizer.json",
-        "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer.json",
-        "bert-large-uncased-whole-word-masking": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/tokenizer.json"
+        "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/tokenizer.json",
+        "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/tokenizer.json",
+        "google-bert/bert-large-uncased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/tokenizer.json"
         ),
-        "bert-large-cased-whole-word-masking": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/tokenizer.json"
+        "google-bert/bert-large-cased-whole-word-masking": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/tokenizer.json"
         ),
-        "bert-large-uncased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
+        "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
         ),
-        "bert-large-cased-whole-word-masking-finetuned-squad": (
-            "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
+        "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
+            "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
         ),
-        "bert-base-cased-finetuned-mrpc": (
-            "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/tokenizer.json"
+        "google-bert/bert-base-cased-finetuned-mrpc": (
+            "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/tokenizer.json"
         ),
-        "bert-base-german-dbmdz-cased": (
-            "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/tokenizer.json"
+        "google-bert/bert-base-german-dbmdz-cased": (
+            "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/tokenizer.json"
         ),
-        "bert-base-german-dbmdz-uncased": (
-            "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/tokenizer.json"
+        "google-bert/bert-base-german-dbmdz-uncased": (
+            "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/tokenizer.json"
         ),
         "TurkuNLP/bert-base-finnish-cased-v1": (
             "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer.json"
@@ -116,42 +116,42 @@
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "bert-base-uncased": 512,
-    "bert-large-uncased": 512,
-    "bert-base-cased": 512,
-    "bert-large-cased": 512,
-    "bert-base-multilingual-uncased": 512,
-    "bert-base-multilingual-cased": 512,
-    "bert-base-chinese": 512,
-    "bert-base-german-cased": 512,
-    "bert-large-uncased-whole-word-masking": 512,
-    "bert-large-cased-whole-word-masking": 512,
-    "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
-    "bert-large-cased-whole-word-masking-finetuned-squad": 512,
-    "bert-base-cased-finetuned-mrpc": 512,
-    "bert-base-german-dbmdz-cased": 512,
-    "bert-base-german-dbmdz-uncased": 512,
+    "google-bert/bert-base-uncased": 512,
+    "google-bert/bert-large-uncased": 512,
+    "google-bert/bert-base-cased": 512,
+    "google-bert/bert-large-cased": 512,
+    "google-bert/bert-base-multilingual-uncased": 512,
+    "google-bert/bert-base-multilingual-cased": 512,
+    "google-bert/bert-base-chinese": 512,
+    "google-bert/bert-base-german-cased": 512,
+    "google-bert/bert-large-uncased-whole-word-masking": 512,
+    "google-bert/bert-large-cased-whole-word-masking": 512,
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": 512,
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": 512,
+    "google-bert/bert-base-cased-finetuned-mrpc": 512,
+    "google-bert/bert-base-german-dbmdz-cased": 512,
+    "google-bert/bert-base-german-dbmdz-uncased": 512,
     "TurkuNLP/bert-base-finnish-cased-v1": 512,
     "TurkuNLP/bert-base-finnish-uncased-v1": 512,
     "wietsedv/bert-base-dutch-cased": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    "bert-base-uncased": {"do_lower_case": True},
-    "bert-large-uncased": {"do_lower_case": True},
-    "bert-base-cased": {"do_lower_case": False},
-    "bert-large-cased": {"do_lower_case": False},
-    "bert-base-multilingual-uncased": {"do_lower_case": True},
-    "bert-base-multilingual-cased": {"do_lower_case": False},
-    "bert-base-chinese": {"do_lower_case": False},
-    "bert-base-german-cased": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking": {"do_lower_case": False},
-    "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
-    "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
-    "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
-    "bert-base-german-dbmdz-cased": {"do_lower_case": False},
-    "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-uncased": {"do_lower_case": True},
+    "google-bert/bert-large-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-cased": {"do_lower_case": False},
+    "google-bert/bert-large-cased": {"do_lower_case": False},
+    "google-bert/bert-base-multilingual-uncased": {"do_lower_case": True},
+    "google-bert/bert-base-multilingual-cased": {"do_lower_case": False},
+    "google-bert/bert-base-chinese": {"do_lower_case": False},
+    "google-bert/bert-base-german-cased": {"do_lower_case": False},
+    "google-bert/bert-large-uncased-whole-word-masking": {"do_lower_case": True},
+    "google-bert/bert-large-cased-whole-word-masking": {"do_lower_case": False},
+    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
+    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
+    "google-bert/bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
+    "google-bert/bert-base-german-dbmdz-cased": {"do_lower_case": False},
+    "google-bert/bert-base-german-dbmdz-uncased": {"do_lower_case": True},
     "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
     "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
     "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py
index 5f3a02b54783a6..ebf88eeac9bbe8 100644
--- a/src/transformers/models/bert/tokenization_bert_tf.py
+++ b/src/transformers/models/bert/tokenization_bert_tf.py
@@ -116,7 +116,7 @@ def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs):  # noqa
         ```python
         from transformers import AutoTokenizer, TFBertTokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
         ```
         """
@@ -155,7 +155,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
         ```python
         from transformers import TFBertTokenizer
 
-        tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")
+        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
         ```
         """
         try:
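For reference, the namespaced repo id drops straight into the in-graph tokenizer workflow shown in the docstrings above; a minimal usage sketch (illustrative only, assuming the `google-bert/bert-base-uncased` repo resolves as before):

```python
import tensorflow as tf

from transformers import TFBertTokenizer

# Build the in-graph tokenizer from the namespaced checkpoint and run it on a batch of strings.
tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
batch = tf.constant(["Hello, my dog is cute"])
outputs = tf_tokenizer(batch)  # dict of input_ids, token_type_ids and attention_mask tensors
```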
diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py
index 7609b4a40e857f..714aaa1e273d1a 100644
--- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py
+++ b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py
@@ -105,7 +105,7 @@ def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None):
 
     image_size = 384
     image = load_demo_image(image_size=image_size, device="cpu")
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
     input_ids = tokenizer(["a picture of"]).input_ids
 
     out = hf_model.generate(image, input_ids)
diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py
index d712726492ae18..d904c35ad7b7a5 100644
--- a/src/transformers/models/camembert/configuration_camembert.py
+++ b/src/transformers/models/camembert/configuration_camembert.py
@@ -26,7 +26,7 @@
 logger = logging.get_logger(__name__)
 
 CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json",
+    "almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/config.json",
     "umberto-commoncrawl-cased-v1": (
         "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json"
     ),
@@ -41,7 +41,7 @@ class CamembertConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is
     used to instantiate a Camembert model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert
-    [camembert-base](https://huggingface.co/camembert-base) architecture.
+    [almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -94,10 +94,10 @@ class CamembertConfig(PretrainedConfig):
     ```python
     >>> from transformers import CamembertConfig, CamembertModel
 
-    >>> # Initializing a Camembert camembert-base style configuration
+    >>> # Initializing a Camembert almanach/camembert-base style configuration
     >>> configuration = CamembertConfig()
 
-    >>> # Initializing a model (with random weights) from the camembert-base style configuration
+    >>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration
     >>> model = CamembertModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index 50fac0efd000a1..cd0b329b6ae00d 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -48,11 +48,11 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "camembert-base"
+_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
 _CONFIG_FOR_DOC = "CamembertConfig"
 
 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "camembert-base",
+    "almanach/camembert-base",
     "Musixmatch/umberto-commoncrawl-cased-v1",
     "Musixmatch/umberto-wikipedia-uncased-v1",
     # See all CamemBERT models at https://huggingface.co/models?filter=camembert
@@ -1397,7 +1397,7 @@ def forward(
 @add_start_docstrings(
     """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
 )
-# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, roberta-base->camembert-base
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base
 class CamembertForCausalLM(CamembertPreTrainedModel):
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
@@ -1471,10 +1471,10 @@ def forward(
         >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("camembert-base")
-        >>> config = AutoConfig.from_pretrained("camembert-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
+        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
         >>> config.is_decoder = True
-        >>> model = CamembertForCausalLM.from_pretrained("camembert-base", config=config)
+        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py
index c4bb10891db994..e3e3fca4cef440 100644
--- a/src/transformers/models/camembert/modeling_tf_camembert.py
+++ b/src/transformers/models/camembert/modeling_tf_camembert.py
@@ -62,7 +62,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "camembert-base"
+_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
 _CONFIG_FOR_DOC = "CamembertConfig"
 
 TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index 40755494901791..0949db02fbb850 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -31,12 +31,12 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "camembert-base": "https://huggingface.co/camembert-base/resolve/main/sentencepiece.bpe.model",
+        "almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "camembert-base": 512,
+    "almanach/camembert-base": 512,
 }
 
 SPIECE_UNDERLINE = "▁"
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index f5720e45f2c06e..627971eb51db3e 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -36,15 +36,15 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "camembert-base": "https://huggingface.co/camembert-base/resolve/main/sentencepiece.bpe.model",
+        "almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/sentencepiece.bpe.model",
     },
     "tokenizer_file": {
-        "camembert-base": "https://huggingface.co/camembert-base/resolve/main/tokenizer.json",
+        "almanach/camembert-base": "https://huggingface.co/almanach/camembert-base/resolve/main/tokenizer.json",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "camembert-base": 512,
+    "almanach/camembert-base": 512,
 }
 
 SPIECE_UNDERLINE = "▁"
diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py
index b0dc90424bd8f4..19a6a84fc75f16 100644
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -45,7 +45,7 @@
 
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "Salesforce/ctrl"
-    # See all CTRL models at https://huggingface.co/models?filter=ctrl
+    # See all CTRL models at https://huggingface.co/models?filter=Salesforce/ctrl
 ]
 
 
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index f00b50348048d6..3aac022897d4c0 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -33,12 +33,12 @@
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
-    "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
+    "vocab_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
+    "merges_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "ctrl": 256,
+    "Salesforce/ctrl": 256,
 }
 
 CONTROL_CODES = {
diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
index 4753f593da19b2..5dc9a244c43c78 100644
--- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
@@ -277,7 +277,7 @@ def check_and_map_params(hf_param, gluon_param):
     hf_bort_model.half()
 
     # Compare output of both models
-    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
 
     input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]
 
diff --git a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
index db0cef3a650294..8dc450ce8f6c13 100644
--- a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
@@ -213,7 +213,7 @@ def forward(
 
         ```python
         # For example purposes. Not runnable.
-        transformer = BertModel.from_pretrained("bert-base-uncased")
+        transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
         encoder = ImageEncoder(args)
         mmbt = MMBTModel(config, transformer, encoder)
         ```"""
@@ -333,7 +333,7 @@ class MMBTForClassification(nn.Module):
 
     ```python
     # For example purposes. Not runnable.
-    transformer = BertModel.from_pretrained("bert-base-uncased")
+    transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
     encoder = ImageEncoder(args)
     model = MMBTForClassification(config, transformer, encoder)
     outputs = model(input_modal, input_ids, labels=labels)
diff --git a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
index 842c1643a00b26..f7d5f2f87fb1ad 100644
--- a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
@@ -22,7 +22,7 @@
 logger = logging.get_logger(__name__)
 
 TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://huggingface.co/transfo-xl-wt103/resolve/main/config.json",
+    "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/config.json",
 }
 
 
@@ -31,7 +31,7 @@ class TransfoXLConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`TransfoXLModel`] or a [`TFTransfoXLModel`]. It is
     used to instantiate a Transformer-XL model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the TransfoXL
-    [transfo-xl-wt103](https://huggingface.co/transfo-xl-wt103) architecture.
+    [transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
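The hunk above only touches the class description; with the namespaced checkpoint, the usual config-docstring pattern would read as below (a sketch for orientation, not a hunk from this patch):

```python
from transformers import TransfoXLConfig, TransfoXLModel

# Initializing a Transformer-XL configuration with defaults similar to transfo-xl/transfo-xl-wt103
configuration = TransfoXLConfig()

# Initializing a model (with random weights) from that configuration
model = TransfoXLModel(configuration)

# Accessing the model configuration
configuration = model.config
```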
diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
index c99d8346701ea8..ab2725df0c4dcf 100644
--- a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
@@ -48,11 +48,11 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "transfo-xl-wt103"
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
 _CONFIG_FOR_DOC = "TransfoXLConfig"
 
 TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "transfo-xl-wt103",
+    "transfo-xl/transfo-xl-wt103",
     # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
 ]
 
diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
index 2fa251399b1bd4..1b8f222f508a35 100644
--- a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
@@ -39,11 +39,11 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "transfo-xl-wt103"
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
 _CONFIG_FOR_DOC = "TransfoXLConfig"
 
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "transfo-xl-wt103",
+    "transfo-xl/transfo-xl-wt103",
     # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
 ]
 
diff --git a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
index cea74e76bc15a6..12d360076fba4f 100644
--- a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
@@ -57,16 +57,16 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "pretrained_vocab_file": {
-        "transfo-xl-wt103": "https://huggingface.co/transfo-xl-wt103/resolve/main/vocab.pkl",
+        "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/vocab.pkl",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "transfo-xl-wt103": None,
+    "transfo-xl/transfo-xl-wt103": None,
 }
 
 PRETRAINED_CORPUS_ARCHIVE_MAP = {
-    "transfo-xl-wt103": "https://huggingface.co/transfo-xl-wt103/resolve/main/corpus.bin",
+    "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/corpus.bin",
 }
 CORPUS_NAME = "corpus.bin"
 
@@ -451,7 +451,7 @@ def moses_pipeline(self, text: str) -> List[str]:
         Example:
 
         ```python
-        >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
         >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
         ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
         ```"""
diff --git a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py
index b4965857b55757..c11345d1eb4e46 100644
--- a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py
+++ b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py
@@ -54,7 +54,7 @@ def from_type(comp_type: str, *args, **kwargs) -> "DPRState":
 
 class DPRContextEncoderState(DPRState):
     def load_dpr_model(self):
-        model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
+        model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
         print(f"Loading DPR biencoder from {self.src_file}")
         saved_state = load_states_from_checkpoint(self.src_file)
         encoder, prefix = model.ctx_encoder, "ctx_model."
@@ -72,7 +72,7 @@ def load_dpr_model(self):
 
 class DPRQuestionEncoderState(DPRState):
     def load_dpr_model(self):
-        model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
+        model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
         print(f"Loading DPR biencoder from {self.src_file}")
         saved_state = load_states_from_checkpoint(self.src_file)
         encoder, prefix = model.question_encoder, "question_model."
@@ -90,7 +90,7 @@ def load_dpr_model(self):
 
 class DPRReaderState(DPRState):
     def load_dpr_model(self):
-        model = DPRReader(DPRConfig(**BertConfig.get_config_dict("bert-base-uncased")[0]))
+        model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0]))
         print(f"Loading DPR reader from {self.src_file}")
         saved_state = load_states_from_checkpoint(self.src_file)
         # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index 9f373ea4544286..8c0ae2771e81f1 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -45,13 +45,13 @@ class EncoderDecoderConfig(PretrainedConfig):
     ```python
     >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
 
-    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
     >>> config_encoder = BertConfig()
     >>> config_decoder = BertConfig()
 
     >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-    >>> # Initializing a Bert2Bert model (with random weights) from the bert-base-uncased style configurations
+    >>> # Initializing a Bert2Bert model (with random weights) from the google-bert/bert-base-uncased style configurations
     >>> model = EncoderDecoderModel(config=config)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 12959f8f200a0e..1a6adcee1f8386 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -403,8 +403,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -416,8 +414,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -444,7 +440,7 @@ def from_encoder_decoder_pretrained(
         >>> from transformers import EncoderDecoderModel
 
         >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
-        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2bert")
         >>> # load fine-tuned model
@@ -560,9 +556,9 @@ def forward(
         >>> from transformers import EncoderDecoderModel, BertTokenizer
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
         >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "bert-base-uncased", "bert-base-uncased"
+        ...     "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
         ... )  # initialize Bert2Bert from pre-trained checkpoints
 
         >>> # training
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 93cac0b3f657aa..beecd080328e16 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -449,9 +449,9 @@ def encode(
         >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
 
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         >>> text = "My friends are cool but they eat too many carbs."
         >>> input_ids = tokenizer.encode(text, return_tensors="np")
@@ -527,9 +527,9 @@ def decode(
         >>> import jax.numpy as jnp
 
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         >>> text = "My friends are cool but they eat too many carbs."
         >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors="np")
@@ -653,8 +653,8 @@ def __call__(
         >>> # load a fine-tuned bert2gpt2 model
         >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
         >>> # load input & output tokenizer
-        >>> tokenizer_input = BertTokenizer.from_pretrained("bert-base-cased")
-        >>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> tokenizer_input = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
         >>> singing a racist chant. SAE's national chapter suspended the students,
@@ -774,8 +774,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -783,8 +781,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -807,7 +803,7 @@ def from_encoder_decoder_pretrained(
         >>> from transformers import FlaxEncoderDecoderModel
 
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2gpt2")
         >>> # load fine-tuned model
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index b4b2503bd00124..855fb767d13d73 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -327,8 +327,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
@@ -338,8 +336,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
@@ -364,7 +360,7 @@ def from_encoder_decoder_pretrained(
         >>> from transformers import TFEncoderDecoderModel
 
         >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
-        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "openai-community/gpt2")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2gpt2")
         >>> # load fine-tuned model
@@ -486,9 +482,9 @@ def call(
         >>> from transformers import TFEncoderDecoderModel, BertTokenizer
 
         >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         >>> # forward
         >>> input_ids = tokenizer.encode(
diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py
index 318e9bfd471c7e..4786fc6d5781a7 100644
--- a/src/transformers/models/flaubert/modeling_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_flaubert.py
@@ -1143,8 +1143,8 @@ def forward(
         >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
         >>> import torch
 
-        >>> tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
-        >>> model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-en-2048")
+        >>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
+        >>> model = XLMForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")
 
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
         ...     0
diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py
index 5dde4da15e5195..4e3e8e7b317905 100644
--- a/src/transformers/models/git/convert_git_to_pytorch.py
+++ b/src/transformers/models/git/convert_git_to_pytorch.py
@@ -311,7 +311,9 @@ def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=Fal
             size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size}
         )
     )
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_input_names=["input_ids", "attention_mask"])
+    tokenizer = AutoTokenizer.from_pretrained(
+        "google-bert/bert-base-uncased", model_input_names=["input_ids", "attention_mask"]
+    )
     processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor)
 
     if is_video:
diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py
index d35a161428838e..395e2b4873fec8 100644
--- a/src/transformers/models/gpt2/configuration_gpt2.py
+++ b/src/transformers/models/gpt2/configuration_gpt2.py
@@ -26,11 +26,11 @@
 logger = logging.get_logger(__name__)
 
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
-    "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
-    "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
-    "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
-    "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
+    "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/config.json",
+    "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/config.json",
+    "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/config.json",
+    "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/config.json",
+    "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/config.json",
 }
 
 
@@ -39,7 +39,7 @@ class GPT2Config(PretrainedConfig):
     This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
     instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the GPT-2
-    [gpt2](https://huggingface.co/gpt2) architecture.
+    [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/gpt2/modeling_flax_gpt2.py b/src/transformers/models/gpt2/modeling_flax_gpt2.py
index 50cfb5e11221a8..c3ef377642a3c5 100644
--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -35,7 +35,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "gpt2"
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
 _CONFIG_FOR_DOC = "GPT2Config"
 
 
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 25c92dd2dd5bfe..e1b357cefb649c 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -51,15 +51,15 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "gpt2"
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
 _CONFIG_FOR_DOC = "GPT2Config"
 
 GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "gpt2",
-    "gpt2-medium",
-    "gpt2-large",
-    "gpt2-xl",
-    "distilgpt2",
+    "openai-community/gpt2",
+    "openai-community/gpt2-medium",
+    "openai-community/gpt2-large",
+    "openai-community/gpt2-xl",
+    "distilbert/distilgpt2",
     # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
 ]
 
@@ -619,16 +619,16 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
             have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
             following number of attention modules:
 
-                - gpt2: 12
-                - gpt2-medium: 24
-                - gpt2-large: 36
-                - gpt2-xl: 48
+                - openai-community/gpt2: 12
+                - openai-community/gpt2-medium: 24
+                - openai-community/gpt2-large: 36
+                - openai-community/gpt2-xl: 48
 
     Example:
 
     ```python
     # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
-    model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
     device_map = {
         0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
         1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
@@ -644,8 +644,8 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
     Example:
 
     ```python
-    # On a 4 GPU machine with gpt2-large:
-    model = GPT2LMHeadModel.from_pretrained("gpt2-large")
+    # On a 4 GPU machine with openai-community/gpt2-large:
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
     device_map = {
         0: [0, 1, 2, 3, 4, 5, 6, 7],
         1: [8, 9, 10, 11, 12, 13, 14, 15],
@@ -1277,8 +1277,8 @@ def forward(
         >>> import torch
         >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
         >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
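
As an illustration of the device-map docstring above, a sketch that spreads the 48 attention blocks of the renamed `openai-community/gpt2-xl` checkpoint over four GPUs; the even split is an assumption chosen for illustration, not the mapping from the docstring, and `parallelize` needs four visible CUDA devices:

```python
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")

# Assign 12 of the 48 attention modules to each of 4 GPUs (an even split, for illustration only).
device_map = {
    0: list(range(0, 12)),
    1: list(range(12, 24)),
    2: list(range(24, 36)),
    3: list(range(36, 48)),
}
model.parallelize(device_map)
# ... run training or inference ...
model.deparallelize()  # move the model back to CPU and free the GPU memory
```
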
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index fd40df97ddc6c8..2c17593e26808c 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -55,16 +55,16 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "gpt2"
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
 _CONFIG_FOR_DOC = "GPT2Config"
 
 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "gpt2",
-    "gpt2-medium",
-    "gpt2-large",
-    "gpt2-xl",
-    "distilgpt2",
-    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
+    "openai-community/gpt2",
+    "openai-community/gpt2-medium",
+    "openai-community/gpt2-large",
+    "openai-community/gpt2-xl",
+    "distilbert/distilgpt2",
+    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
 ]
 
 
@@ -1026,8 +1026,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFGPT2DoubleHeadsModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        >>> model = TFGPT2DoubleHeadsModel.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
         >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index a7b576e92defb4..801e997344a194 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -35,27 +35,27 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
-        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
-        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
-        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
-        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
+        "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json",
+        "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/vocab.json",
+        "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/vocab.json",
+        "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/vocab.json",
+        "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/vocab.json",
     },
     "merges_file": {
-        "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
-        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
-        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
-        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
-        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
+        "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/merges.txt",
+        "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/merges.txt",
+        "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/merges.txt",
+        "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/merges.txt",
+        "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "gpt2": 1024,
-    "gpt2-medium": 1024,
-    "gpt2-large": 1024,
-    "gpt2-xl": 1024,
-    "distilgpt2": 1024,
+    "openai-community/gpt2": 1024,
+    "openai-community/gpt2-medium": 1024,
+    "openai-community/gpt2-large": 1024,
+    "openai-community/gpt2-xl": 1024,
+    "distilbert/distilgpt2": 1024,
 }
 
 
@@ -108,7 +108,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     ```python
     >>> from transformers import GPT2Tokenizer
 
-    >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
     >>> tokenizer("Hello world")["input_ids"]
     [15496, 995]
 
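
A quick sanity check, assuming the Hub keeps redirecting the legacy root-level name: both identifiers resolve to the same vocabulary and merges, so the doctest output above is unchanged.

```python
from transformers import GPT2Tokenizer

legacy = GPT2Tokenizer.from_pretrained("gpt2")  # legacy root-level name
renamed = GPT2Tokenizer.from_pretrained("openai-community/gpt2")  # namespaced name

# Both point at the same repository, so the encodings match.
assert legacy("Hello world")["input_ids"] == renamed("Hello world")["input_ids"] == [15496, 995]
```
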
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index a5dcade90a0198..c4e49d23d146e4 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -32,34 +32,34 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
-        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
-        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
-        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
-        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
+        "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/vocab.json",
+        "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/vocab.json",
+        "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/vocab.json",
+        "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/vocab.json",
+        "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/vocab.json",
     },
     "merges_file": {
-        "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
-        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
-        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
-        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
-        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
+        "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/merges.txt",
+        "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/merges.txt",
+        "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/merges.txt",
+        "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/merges.txt",
+        "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/merges.txt",
     },
     "tokenizer_file": {
-        "gpt2": "https://huggingface.co/gpt2/resolve/main/tokenizer.json",
-        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/tokenizer.json",
-        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/tokenizer.json",
-        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/tokenizer.json",
-        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/tokenizer.json",
+        "openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/tokenizer.json",
+        "openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/tokenizer.json",
+        "openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/tokenizer.json",
+        "openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/tokenizer.json",
+        "distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/tokenizer.json",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "gpt2": 1024,
-    "gpt2-medium": 1024,
-    "gpt2-large": 1024,
-    "gpt2-xl": 1024,
-    "distilgpt2": 1024,
+    "openai-community/gpt2": 1024,
+    "openai-community/gpt2-medium": 1024,
+    "openai-community/gpt2-large": 1024,
+    "openai-community/gpt2-xl": 1024,
+    "distilbert/distilgpt2": 1024,
 }
 
 
@@ -74,7 +74,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
     ```python
     >>> from transformers import GPT2TokenizerFast
 
-    >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    >>> tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
     >>> tokenizer("Hello world")["input_ids"]
     [15496, 995]
 
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_tf.py b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
index 41f0874919a85e..d763eb84855015 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_tf.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -45,7 +45,7 @@ def from_tokenizer(cls, tokenizer: GPT2Tokenizer, *args, **kwargs):
         ```python
         from transformers import AutoTokenizer, TFGPT2Tokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
         tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer)
         ```
         """
@@ -65,7 +65,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
         ```python
         from transformers import TFGPT2Tokenizer
 
-        tf_tokenizer = TFGPT2Tokenizer.from_pretrained("gpt2")
+        tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")
         ```
         """
         tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
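
A short sketch of the in-graph tokenizer built from the renamed checkpoint; like the class itself, this assumes `tensorflow-text`/`keras-nlp` are installed:

```python
import tensorflow as tf

from transformers import TFGPT2Tokenizer

tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")

# The tokenizer is a Keras layer, so it can run on string tensors inside a
# tf.function or a serving signature.
outputs = tf_tokenizer(tf.constant(["Hello world"]))
print(outputs["input_ids"])
```
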
diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
index 31f8a7708adf0b..16ed6b1e753e54 100644
--- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
+++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -48,7 +48,7 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
     ```python
     >>> from transformers import GPTNeoXTokenizerFast
 
-    >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
+    >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("openai-community/gpt2")
     >>> tokenizer("Hello world")["input_ids"]
     [15496, 995]
 
diff --git a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py
index 87e8b90d6cc81a..f8b9c86cfddcd6 100644
--- a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py
+++ b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py
@@ -132,7 +132,7 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_
     """
     Copy/paste/tweak model's weights to Transformers design.
     """
-    qformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
+    qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left")
     qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
 
     if "t5" in model_name:
diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index a7c2155b0da249..7a5db51987d9af 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -117,7 +117,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
             ```python
             >>> from transformers import T5Tokenizer
 
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
+            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
             >>> tokenizer.encode("Hello <extra_id_0>.")
             [8774, 32099, 3, 5, 1]
             ```
@@ -125,7 +125,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
             ```python
             >>> from transformers import T5Tokenizer
 
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
+            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
             >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
             [8774, 32099, 5, 1]
             ```
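
For comparison, a sketch that runs both modes shown above side by side (the sentencepiece backend is required; the expected ids are the ones from the docstring):

```python
from transformers import T5Tokenizer

legacy_tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
fixed_tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)

text = "Hello <extra_id_0>."
print(legacy_tokenizer.encode(text))  # [8774, 32099, 3, 5, 1] - keeps the spurious space token (3)
print(fixed_tokenizer.encode(text))   # [8774, 32099, 5, 1]    - drops it
```
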
diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py
index 4f76f16d5180db..cf0477bac1056f 100644
--- a/src/transformers/models/longformer/tokenization_longformer.py
+++ b/src/transformers/models/longformer/tokenization_longformer.py
@@ -112,7 +112,7 @@ def get_pairs(word):
     return pairs
 
 
-# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer with roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, RobertaTokenizer->LongformerTokenizer
+# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer with FacebookAI/roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, RobertaTokenizer->LongformerTokenizer
 class LongformerTokenizer(PreTrainedTokenizer):
     """
     Constructs a Longformer tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py
index fb35a8b67bba7a..e40ebff3b65c13 100644
--- a/src/transformers/models/longformer/tokenization_longformer_fast.py
+++ b/src/transformers/models/longformer/tokenization_longformer_fast.py
@@ -87,7 +87,7 @@
 }
 
 
-# Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast with roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, Roberta->Longformer
+# Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast with FacebookAI/roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, Roberta->Longformer
 class LongformerTokenizerFast(PreTrainedTokenizerFast):
     """
     Construct a "fast" Longformer tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py
index 36e273d5725a4f..d47f644ba37da0 100644
--- a/src/transformers/models/longt5/modeling_flax_longt5.py
+++ b/src/transformers/models/longt5/modeling_flax_longt5.py
@@ -1828,7 +1828,7 @@ def encode(
         ```python
         >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
         >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")
 
         >>> text = "My friends are cool but they eat too many carbs."
@@ -1890,7 +1890,7 @@ def decode(
         >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
         >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")
 
         >>> text = "My friends are cool but they eat too many carbs."
@@ -2119,7 +2119,7 @@ class FlaxLongT5Model(FlaxLongT5PreTrainedModel):
     ```python
     >>> from transformers import AutoTokenizer, FlaxLongT5Model
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
     >>> model = FlaxLongT5Model.from_pretrained("google/long-t5-local-base")
 
     >>> input_ids = tokenizer(
@@ -2278,7 +2278,7 @@ def decode(
         >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
         >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")
 
         >>> text = "summarize: My friends are cool but they eat too many carbs."
@@ -2426,7 +2426,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     ```python
     >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
     >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")
 
     >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
index 874aaa331d7e26..02cdf289432b38 100644
--- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
@@ -81,10 +81,10 @@ class MegatronBertConfig(PretrainedConfig):
     ```python
     >>> from transformers import MegatronBertConfig, MegatronBertModel
 
-    >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
+    >>> # Initializing a MEGATRON_BERT google-bert/bert-base-uncased style configuration
     >>> configuration = MegatronBertConfig()
 
-    >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration
+    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
     >>> model = MegatronBertModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
index b535e599ad6ca4..15ccfb4dcb1ff8 100644
--- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
+++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
@@ -550,7 +550,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
     # see https://github.com/huggingface/transformers/issues/13906)
 
     if args.tokenizer_name is None:
-        tokenizer_name = "gpt2"
+        tokenizer_name = "openai-community/gpt2"
     else:
         tokenizer_name = args.tokenizer_name
 
diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
index 88d54f10e2605b..38060f8af5c7b0 100644
--- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
+++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
@@ -324,13 +324,13 @@ def main():
     if ds_args is not None:
         tokenizer_type = ds_args.tokenizer_type
         if tokenizer_type == "GPT2BPETokenizer":
-            tokenizer_model_name = "gpt2"
+            tokenizer_model_name = "openai-community/gpt2"
         elif tokenizer_type == "PretrainedFromHF":
             tokenizer_model_name = ds_args.tokenizer_name_or_path
         else:
             raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}")
     else:
-        tokenizer_model_name = "gpt2"
+        tokenizer_model_name = "openai-community/gpt2"
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
     tokenizer_class = type(tokenizer).__name__
diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py
index 71422e844d0f90..207d4230ba09b7 100644
--- a/src/transformers/models/mgp_str/processing_mgp_str.py
+++ b/src/transformers/models/mgp_str/processing_mgp_str.py
@@ -71,8 +71,8 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
             raise ValueError("You need to specify a `tokenizer`.")
 
         self.char_tokenizer = tokenizer
-        self.bpe_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-        self.wp_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         super().__init__(image_processor, tokenizer)
 
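
To make the role of the two auxiliary tokenizers concrete, a small sketch contrasting the byte-level BPE and WordPiece tokenizations the processor keeps around (the example word is arbitrary):

```python
from transformers import AutoTokenizer

bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # byte-level BPE
wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")  # WordPiece

word = "chihuahua"
print(bpe_tokenizer.tokenize(word))  # byte-level BPE pieces
print(wp_tokenizer.tokenize(word))   # WordPiece pieces, word-internal units prefixed with ##
```
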
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index f9d42afc22ee61..100273a5ac5628 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -1470,8 +1470,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, MT5Model
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
-        >>> model = MT5Model.from_pretrained("mt5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
+        >>> model = MT5Model.from_pretrained("google-mt5/mt5-small")
 
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
@@ -1706,8 +1706,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, MT5ForConditionalGeneration
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
-        >>> model = MT5ForConditionalGeneration.from_pretrained("mt5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
+        >>> model = MT5ForConditionalGeneration.from_pretrained("google-mt5/mt5-small")
 
         >>> # training
         >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
@@ -2017,8 +2017,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, MT5EncoderModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
-        >>> model = MT5EncoderModel.from_pretrained("mt5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
+        >>> model = MT5EncoderModel.from_pretrained("google-mt5/mt5-small")
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
         ... ).input_ids  # Batch size 1
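
Putting the snippet above together into a runnable training step, using the identifiers from the patch; the label string with sentinel tokens is an illustrative target, not taken from the diff:

```python
from transformers import AutoTokenizer, MT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google-mt5/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google-mt5/mt5-small")

# Span-corruption style inputs: sentinel tokens mark the masked spans.
input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids

loss = model(input_ids=input_ids, labels=labels).loss
loss.backward()
```
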
diff --git a/src/transformers/models/musicgen/convert_musicgen_transformers.py b/src/transformers/models/musicgen/convert_musicgen_transformers.py
index d4b61046e5ea00..f1eb9e40704dfe 100644
--- a/src/transformers/models/musicgen/convert_musicgen_transformers.py
+++ b/src/transformers/models/musicgen/convert_musicgen_transformers.py
@@ -138,7 +138,7 @@ def convert_musicgen_checkpoint(
         decoder_state_dict, hidden_size=decoder_config.hidden_size
     )
 
-    text_encoder = T5EncoderModel.from_pretrained("t5-base")
+    text_encoder = T5EncoderModel.from_pretrained("google-t5/t5-base")
     audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
     decoder = MusicgenForCausalLM(decoder_config).eval()
 
@@ -172,7 +172,7 @@ def convert_musicgen_checkpoint(
         raise ValueError("Incorrect shape for logits")
 
     # now construct the processor
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
     feature_extractor = AutoFeatureExtractor.from_pretrained(
         "facebook/encodec_32khz", padding_side="left", feature_size=decoder_config.audio_channels
     )
diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py
index 9a6518a4d11881..2514a487632385 100644
--- a/src/transformers/models/musicgen/modeling_musicgen.py
+++ b/src/transformers/models/musicgen/modeling_musicgen.py
@@ -1576,8 +1576,6 @@ def from_sub_models_pretrained(
                 Information necessary to initiate the text encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `t5-base`, or namespaced under a user or
-                      organization name, like `google/flan-t5-base.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -1585,8 +1583,6 @@ def from_sub_models_pretrained(
                 Information necessary to initiate the audio encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `facebook/encodec_24khz`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -1594,8 +1590,6 @@ def from_sub_models_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `gpt2`, or namespaced under a user or
-                      organization name, like `facebook/musicgen-small`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -1622,7 +1616,7 @@ def from_sub_models_pretrained(
 
         >>> # initialize a musicgen model from a t5 text encoder, encodec audio encoder, and musicgen decoder
         >>> model = MusicgenForConditionalGeneration.from_sub_models_pretrained(
-        ...     text_encoder_pretrained_model_name_or_path="t5-base",
+        ...     text_encoder_pretrained_model_name_or_path="google-t5/t5-base",
         ...     audio_encoder_pretrained_model_name_or_path="facebook/encodec_24khz",
         ...     decoder_pretrained_model_name_or_path="facebook/musicgen-small",
         ... )
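
A sketch of round-tripping the composite model assembled above; the local directory name is arbitrary:

```python
from transformers import MusicgenForConditionalGeneration

model = MusicgenForConditionalGeneration.from_sub_models_pretrained(
    text_encoder_pretrained_model_name_or_path="google-t5/t5-base",
    audio_encoder_pretrained_model_name_or_path="facebook/encodec_24khz",
    decoder_pretrained_model_name_or_path="facebook/musicgen-small",
)

# Once assembled, the composite model is saved and reloaded like any other checkpoint.
model.save_pretrained("./musicgen-t5-encodec")
model = MusicgenForConditionalGeneration.from_pretrained("./musicgen-t5-encodec")
```
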
diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py
index dd6f349249e3e7..c50948895ae131 100644
--- a/src/transformers/models/openai/configuration_openai.py
+++ b/src/transformers/models/openai/configuration_openai.py
@@ -21,7 +21,9 @@
 
 logger = logging.get_logger(__name__)
 
-OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/config.json"}
+OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/config.json"
+}
 
 
 class OpenAIGPTConfig(PretrainedConfig):
@@ -29,7 +31,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`OpenAIGPTModel`] or a [`TFOpenAIGPTModel`]. It is
     used to instantiate a GPT model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT
-    [openai-gpt](https://huggingface.co/openai-gpt) architecture from OpenAI.
+    [openai-community/openai-gpt](https://huggingface.co/openai-community/openai-gpt) architecture from OpenAI.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -108,7 +110,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
     model_type = "openai-gpt"
     attribute_map = {
         "max_position_embeddings": "n_positions",
         "hidden_size": "n_embd",
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py
index ebb83cfc6bd428..747118bd27f228 100644
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -43,12 +43,12 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "openai-gpt"
+_CHECKPOINT_FOR_DOC = "openai-community/openai-gpt"
 _CONFIG_FOR_DOC = "OpenAIGPTConfig"
 
 OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "openai-gpt",
-    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
+    "openai-community/openai-gpt",
+    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
 ]
 
 
@@ -678,8 +678,8 @@ def forward(
         >>> from transformers import AutoTokenizer, OpenAIGPTDoubleHeadsModel
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
-        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
+        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-community/openai-gpt")
         >>> tokenizer.add_special_tokens(
         ...     {"cls_token": "[CLS]"}
         ... )  # Add a [CLS] to the vocabulary (we should train it also!)
diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py
index 8c213bcebdb160..34bc5aa522d20a 100644
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -52,12 +52,12 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "openai-gpt"
+_CHECKPOINT_FOR_DOC = "openai-community/openai-gpt"
 _CONFIG_FOR_DOC = "OpenAIGPTConfig"
 
 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "openai-gpt",
-    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
+    "openai-community/openai-gpt",
+    # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
 ]
 
 
@@ -731,8 +731,8 @@ def call(
         >>> import tensorflow as tf
         >>> from transformers import AutoTokenizer, TFOpenAIGPTDoubleHeadsModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
-        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
+        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained("openai-community/openai-gpt")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
         >>> tokenizer.add_special_tokens({"cls_token": "[CLS]"})
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index cfdeb3207a6d96..e189b15035b8c0 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -33,12 +33,16 @@
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"},
-    "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"},
+    "vocab_file": {
+        "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
+    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "openai-gpt": 512,
+    "openai-community/openai-gpt": 512,
 }
 
 
diff --git a/src/transformers/models/openai/tokenization_openai_fast.py b/src/transformers/models/openai/tokenization_openai_fast.py
index 2df26c3a2f626d..e1f04722ee27e1 100644
--- a/src/transformers/models/openai/tokenization_openai_fast.py
+++ b/src/transformers/models/openai/tokenization_openai_fast.py
@@ -27,13 +27,19 @@
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"},
-    "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"},
-    "tokenizer_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json"},
+    "vocab_file": {
+        "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
+    },
+    "merges_file": {
+        "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
+    },
+    "tokenizer_file": {
+        "openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/tokenizer.json"
+    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "openai-gpt": 512,
+    "openai-community/openai-gpt": 512,
 }
 
 
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index eb1576197e5e4a..81eb503ddbe944 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -2192,10 +2192,10 @@ def forward(
         >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
         >>> import torch
 
-        >>> tokenizer_enc = BertTokenizer.from_pretrained("bert-large-uncased")
+        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
         >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
         >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "bert-large-uncased", "microsoft/prophetnet-large-uncased"
+        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
         ... )
 
         >>> ARTICLE = (
diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py
index b790dd1efc550d..1efa2ef811ecbe 100644
--- a/src/transformers/models/qdqbert/configuration_qdqbert.py
+++ b/src/transformers/models/qdqbert/configuration_qdqbert.py
@@ -21,7 +21,7 @@
 logger = logging.get_logger(__name__)
 
 QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
+    "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/config.json",
     # QDQBERT models can be loaded from any BERT checkpoint, available at https://huggingface.co/models?filter=bert
 }
 
@@ -31,7 +31,7 @@ class QDQBertConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to instantiate an
     QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the BERT
-    [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
+    [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -76,10 +76,10 @@ class QDQBertConfig(PretrainedConfig):
     ```python
     >>> from transformers import QDQBertModel, QDQBertConfig
 
-    >>> # Initializing a QDQBERT bert-base-uncased style configuration
+    >>> # Initializing a QDQBERT google-bert/bert-base-uncased style configuration
     >>> configuration = QDQBertConfig()
 
-    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> # Initializing a model from the google-bert/bert-base-uncased style configuration
     >>> model = QDQBertModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py
index 33d6d6b2088102..8c610ecaedbfc4 100755
--- a/src/transformers/models/qdqbert/modeling_qdqbert.py
+++ b/src/transformers/models/qdqbert/modeling_qdqbert.py
@@ -66,11 +66,11 @@
             " https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization."
         )
 
-_CHECKPOINT_FOR_DOC = "bert-base-uncased"
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
 _CONFIG_FOR_DOC = "QDQBertConfig"
 
 QDQBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "bert-base-uncased",
+    "google-bert/bert-base-uncased",
     # See all BERT models at https://huggingface.co/models?filter=bert
 ]
 
@@ -1076,10 +1076,10 @@ def forward(
         >>> from transformers import AutoTokenizer, QDQBertLMHeadModel, QDQBertConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        >>> config = QDQBertConfig.from_pretrained("bert-base-cased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+        >>> config = QDQBertConfig.from_pretrained("google-bert/bert-base-cased")
         >>> config.is_decoder = True
-        >>> model = QDQBertLMHeadModel.from_pretrained("bert-base-cased", config=config)
+        >>> model = QDQBertLMHeadModel.from_pretrained("google-bert/bert-base-cased", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1319,8 +1319,8 @@ def forward(
         >>> from transformers import AutoTokenizer, QDQBertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = QDQBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
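
A sketch finishing the next-sentence-prediction example above (QDQBERT additionally needs NVIDIA's pytorch-quantization toolkit installed); the interpretation of the argmax follows the usual BERT NSP convention:

```python
import torch

from transformers import AutoTokenizer, QDQBertForNextSentencePrediction

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits

# index 0 = "next_sentence follows prompt", index 1 = "it does not"
print(logits.argmax(dim=-1))
```
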
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 09fc9dabe84e58..a840b0681eddbe 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -260,8 +260,6 @@ def from_pretrained_question_encoder_generator(
                 Information necessary to initiate the question encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -273,8 +271,6 @@ def from_pretrained_question_encoder_generator(
                 Information necessary to initiate the generator. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -304,7 +300,7 @@ def from_pretrained_question_encoder_generator(
 
         >>> # initialize a RAG from two pretrained models.
         >>> model = RagModel.from_pretrained_question_encoder_generator(
-        ...     "facebook/dpr-question_encoder-single-nq-base", "t5-small"
+        ...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./rag")
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index e586bed87c8099..9d8ed650497528 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -248,7 +248,7 @@ def from_pretrained_question_encoder_generator(
                 Information necessary to initiate the question encoder. Can be either:
 
                     - A string with the *shortcut name* of a pretrained model to load from cache or download, e.g.,
-                      `bert-base-uncased`.
+                      `google-bert/bert-base-uncased`.
                     - A string with the *identifier name* of a pretrained model that was user-uploaded to our S3, e.g.,
                       `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
@@ -260,7 +260,7 @@ def from_pretrained_question_encoder_generator(
                 Information necessary to initiate the generator. Can be either:
 
                     - A string with the *shortcut name* of a pretrained model to load from cache or download, e.g.,
-                      `t5-small`.
+                      `google-t5/t5-small`.
                     - A string with the *identifier name* of a pretrained model that was user-uploaded to our S3, e.g.,
                       `facebook/bart-base`.
                     - A path to a *directory* containing model weights saved using
@@ -290,7 +290,7 @@ def from_pretrained_question_encoder_generator(
 
         >>> # initialize a RAG from two pretrained models.
         >>> model = TFRagModel.from_pretrained_question_encoder_generator(
-        ...     "facebook/dpr-question_encoder-single-nq-base", "t5-small"
+        ...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
         ... )
         >>> # alternatively, initialize from pytorch pretrained models can also be done
         >>> model = TFRagModel.from_pretrained_question_encoder_generator(
diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py
index 86334f0a224e89..8cc35d6090ceeb 100644
--- a/src/transformers/models/roberta/configuration_roberta.py
+++ b/src/transformers/models/roberta/configuration_roberta.py
@@ -25,12 +25,12 @@
 logger = logging.get_logger(__name__)
 
 ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
-    "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
-    "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
-    "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
-    "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
-    "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
+    "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json",
+    "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/config.json",
+    "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/config.json",
+    "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/config.json",
+    "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/config.json",
+    "openai-community/roberta-large-openai-detector": "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/config.json",
 }
 
 
@@ -39,7 +39,7 @@ class RobertaConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
     used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
-    [roberta-base](https://huggingface.co/roberta-base) architecture.
+    [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py
index 70a6f540a2352a..ecdd31386b21eb 100644
--- a/src/transformers/models/roberta/modeling_flax_roberta.py
+++ b/src/transformers/models/roberta/modeling_flax_roberta.py
@@ -43,7 +43,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
 _CONFIG_FOR_DOC = "RobertaConfig"
 
 remat = nn_partitioning.remat
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 8f34098f7bbbb5..f755bd9d566a92 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -48,16 +48,16 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
 _CONFIG_FOR_DOC = "RobertaConfig"
 
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "roberta-base",
-    "roberta-large",
-    "roberta-large-mnli",
-    "distilroberta-base",
-    "roberta-base-openai-detector",
-    "roberta-large-openai-detector",
+    "FacebookAI/roberta-base",
+    "FacebookAI/roberta-large",
+    "FacebookAI/roberta-large-mnli",
+    "distilbert/distilroberta-base",
+    "openai-community/roberta-base-openai-detector",
+    "openai-community/roberta-large-openai-detector",
     # See all RoBERTa models at https://huggingface.co/models?filter=roberta
 ]
 
@@ -936,10 +936,10 @@ def forward(
         >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-        >>> config = AutoConfig.from_pretrained("roberta-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
         >>> config.is_decoder = True
-        >>> model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)
+        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
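
For completeness, a sketch of the same causal-LM setup that also turns the logits into a next-token prediction; the decoding step is an addition for illustration, not part of the docstring:

```python
import torch

from transformers import AutoConfig, AutoTokenizer, RobertaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy pick for the last position; the model is not trained as a decoder,
# so the prediction is not meaningful, only the mechanics are shown.
next_token = logits[0, -1].argmax()
print(tokenizer.decode(next_token))
```
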
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py
index afe773ec97b7d7..0bc5e85e808a56 100644
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
@@ -62,14 +62,14 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
 _CONFIG_FOR_DOC = "RobertaConfig"
 
 TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "roberta-base",
-    "roberta-large",
-    "roberta-large-mnli",
-    "distilroberta-base",
+    "FacebookAI/roberta-base",
+    "FacebookAI/roberta-large",
+    "FacebookAI/roberta-large-mnli",
+    "distilbert/distilroberta-base",
     # See all RoBERTa models at https://huggingface.co/models?filter=roberta
 ]
 
diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py
index b7b3c75be180cd..c7dc51b972944c 100644
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -34,34 +34,34 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
-        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
-        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
-        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
-        "roberta-large-openai-detector": (
-            "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json"
+        "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/vocab.json",
+        "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/vocab.json",
+        "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/vocab.json",
+        "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/vocab.json",
+        "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/vocab.json",
+        "openai-community/roberta-large-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/vocab.json"
         ),
     },
     "merges_file": {
-        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
-        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
-        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
-        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
-        "roberta-large-openai-detector": (
-            "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt"
+        "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/merges.txt",
+        "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/merges.txt",
+        "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/merges.txt",
+        "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/merges.txt",
+        "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/merges.txt",
+        "openai-community/roberta-large-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/merges.txt"
         ),
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "roberta-base": 512,
-    "roberta-large": 512,
-    "roberta-large-mnli": 512,
-    "distilroberta-base": 512,
-    "roberta-base-openai-detector": 512,
-    "roberta-large-openai-detector": 512,
+    "FacebookAI/roberta-base": 512,
+    "FacebookAI/roberta-large": 512,
+    "FacebookAI/roberta-large-mnli": 512,
+    "distilbert/distilroberta-base": 512,
+    "openai-community/roberta-base-openai-detector": 512,
+    "openai-community/roberta-large-openai-detector": 512,
 }
 
 
@@ -114,7 +114,7 @@ class RobertaTokenizer(PreTrainedTokenizer):
     ```python
     >>> from transformers import RobertaTokenizer
 
-    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    >>> tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
     >>> tokenizer("Hello world")["input_ids"]
     [0, 31414, 232, 2]
 
diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py
index 05f64ac2ab185a..00341e870f8bc8 100644
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -30,46 +30,46 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
-        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
-        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
-        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
-        "roberta-large-openai-detector": (
-            "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json"
+        "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/vocab.json",
+        "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/vocab.json",
+        "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/vocab.json",
+        "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/vocab.json",
+        "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/vocab.json",
+        "openai-community/roberta-large-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/vocab.json"
         ),
     },
     "merges_file": {
-        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
-        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
-        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
-        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
-        "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
-        "roberta-large-openai-detector": (
-            "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt"
+        "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/merges.txt",
+        "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/merges.txt",
+        "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/merges.txt",
+        "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/merges.txt",
+        "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/merges.txt",
+        "openai-community/roberta-large-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/merges.txt"
         ),
     },
     "tokenizer_file": {
-        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
-        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
-        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
-        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
-        "roberta-base-openai-detector": (
-            "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json"
+        "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/tokenizer.json",
+        "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/tokenizer.json",
+        "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/tokenizer.json",
+        "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/tokenizer.json",
+        "openai-community/roberta-base-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/tokenizer.json"
         ),
-        "roberta-large-openai-detector": (
-            "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json"
+        "openai-community/roberta-large-openai-detector": (
+            "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/tokenizer.json"
         ),
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "roberta-base": 512,
-    "roberta-large": 512,
-    "roberta-large-mnli": 512,
-    "distilroberta-base": 512,
-    "roberta-base-openai-detector": 512,
-    "roberta-large-openai-detector": 512,
+    "FacebookAI/roberta-base": 512,
+    "FacebookAI/roberta-large": 512,
+    "FacebookAI/roberta-large-mnli": 512,
+    "distilbert/distilroberta-base": 512,
+    "openai-community/roberta-base-openai-detector": 512,
+    "openai-community/roberta-large-openai-detector": 512,
 }
 
 
@@ -84,7 +84,7 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
     ```python
     >>> from transformers import RobertaTokenizerFast
 
-    >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+    >>> tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base")
     >>> tokenizer("Hello world")["input_ids"]
     [0, 31414, 232, 2]
 
diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
index 1957a30f41b258..f9325138165a7c 100644
--- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
+++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
@@ -31,7 +31,7 @@
 }
 
 
-# Copied from transformers.models.roberta.configuration_roberta.RobertaConfig with roberta-base->andreasmadsen/efficient_mlm_m0.40,RoBERTa->RoBERTa-PreLayerNorm,Roberta->RobertaPreLayerNorm,roberta->roberta-prelayernorm
+# Copied from transformers.models.roberta.configuration_roberta.RobertaConfig with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,RoBERTa->RoBERTa-PreLayerNorm,Roberta->RobertaPreLayerNorm,roberta->roberta-prelayernorm
 class RobertaPreLayerNormConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`RobertaPreLayerNormModel`] or a [`TFRobertaPreLayerNormModel`]. It is
diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
index cb22bbe14a0f2a..7c37950e478b6f 100644
--- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
+++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
@@ -867,7 +867,7 @@ def forward(
     """RoBERTa-PreLayerNorm Model with a `language modeling` head on top for CLM fine-tuning.""",
     ROBERTA_PRELAYERNORM_START_DOCSTRING,
 )
-# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
 class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
index 378f082e4b9c17..32a58ec5589eed 100644
--- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
@@ -52,7 +52,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
 
     >>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-    >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & bert-base-uncased style configurations
+    >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & google-bert/bert-base-uncased style configurations
     >>> model = SpeechEncoderDecoderModel(config=config)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
index b9975510abfd9d..e3bbd86266ea11 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
@@ -796,8 +796,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -805,8 +803,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 5028e30344ccb8..942dfb5f9c49fc 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -301,8 +301,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -314,8 +312,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -343,7 +339,7 @@ def from_encoder_decoder_pretrained(
 
         >>> # initialize a wav2vec2bert from a pretrained Wav2Vec2 and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
         >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "facebook/wav2vec2-base-960h", "bert-base-uncased"
+        ...     "facebook/wav2vec2-base-960h", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./wav2vec2bert")
diff --git a/src/transformers/models/switch_transformers/convert_big_switch.py b/src/transformers/models/switch_transformers/convert_big_switch.py
index 86c673b48a4ede..e4b8af07cd4c88 100644
--- a/src/transformers/models/switch_transformers/convert_big_switch.py
+++ b/src/transformers/models/switch_transformers/convert_big_switch.py
@@ -185,7 +185,7 @@ def sanity_check():
         "/home/arthur_huggingface_co/transformers/switch_converted", device_map="auto"
     )
 
-    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
     text = "A  walks into a bar a orders a  with  pinch of ."
 
     input_ids = tokenizer(text, return_tensors="pt").input_ids
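
The sanity check above relies on T5's sentinel tokens; a minimal sketch of how they map to vocabulary ids (assuming Hub access for the `google-t5/t5-small` tokenizer):

```python
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

# Sentinels occupy the top of the 32100-token vocabulary:
# <extra_id_0> -> 32099, <extra_id_1> -> 32098, and so on.
print(tokenizer.convert_tokens_to_ids(["<extra_id_0>", "<extra_id_1>", "<extra_id_2>"]))
```
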
diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py
index 05d737d035afa3..6a1d3c529e0ac5 100644
--- a/src/transformers/models/t5/configuration_t5.py
+++ b/src/transformers/models/t5/configuration_t5.py
@@ -23,11 +23,11 @@
 logger = logging.get_logger(__name__)
 
 T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "t5-small": "https://huggingface.co/t5-small/resolve/main/config.json",
-    "t5-base": "https://huggingface.co/t5-base/resolve/main/config.json",
-    "t5-large": "https://huggingface.co/t5-large/resolve/main/config.json",
-    "t5-3b": "https://huggingface.co/t5-3b/resolve/main/config.json",
-    "t5-11b": "https://huggingface.co/t5-11b/resolve/main/config.json",
+    "google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/config.json",
+    "google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/config.json",
+    "google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/config.json",
+    "google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/config.json",
+    "google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/config.json",
 }
 
 
@@ -36,7 +36,7 @@ class T5Config(PretrainedConfig):
     This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
     instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the T5
-    [t5-small](https://huggingface.co/t5-small) architecture.
+    [google-t5/t5-small](https://huggingface.co/google-t5/t5-small) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py
index 09575fdcc3b82e..94b24bd42f9671 100644
--- a/src/transformers/models/t5/modeling_flax_t5.py
+++ b/src/transformers/models/t5/modeling_flax_t5.py
@@ -49,7 +49,7 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "t5-small"
+_CHECKPOINT_FOR_DOC = "google-t5/t5-small"
 _CONFIG_FOR_DOC = "T5Config"
 
 remat = nn_partitioning.remat
@@ -1090,8 +1090,8 @@ def encode(
         ```python
         >>> from transformers import AutoTokenizer, FlaxT5ForConditionalGeneration
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
         >>> text = "My friends are cool but they eat too many carbs."
         >>> inputs = tokenizer(text, return_tensors="np")
@@ -1152,8 +1152,8 @@ def decode(
         >>> from transformers import AutoTokenizer, FlaxT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
         >>> text = "My friends are cool but they eat too many carbs."
         >>> inputs = tokenizer(text, return_tensors="np")
@@ -1378,8 +1378,8 @@ class FlaxT5Model(FlaxT5PreTrainedModel):
     ```python
     >>> from transformers import AutoTokenizer, FlaxT5Model
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-    >>> model = FlaxT5Model.from_pretrained("t5-small")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+    >>> model = FlaxT5Model.from_pretrained("google-t5/t5-small")
 
     >>> input_ids = tokenizer(
     ...     "Studies have been shown that owning a dog is good for you", return_tensors="np"
@@ -1630,8 +1630,8 @@ def decode(
         >>> from transformers import AutoTokenizer, FlaxT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
         >>> text = "summarize: My friends are cool but they eat too many carbs."
         >>> inputs = tokenizer(text, return_tensors="np")
@@ -1778,8 +1778,8 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     ```python
     >>> from transformers import AutoTokenizer, FlaxT5ForConditionalGeneration
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-    >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+    >>> model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
     >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
     >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors="np")
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 9d4ba820d0442c..a3febdd1aa7bb6 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -53,18 +53,18 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "T5Config"
-_CHECKPOINT_FOR_DOC = "t5-small"
+_CHECKPOINT_FOR_DOC = "google-t5/t5-small"
 
 ####################################################
 # This dict contains ids and associated url
 # for the pretrained weights provided with the models
 ####################################################
 T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "t5-small",
-    "t5-base",
-    "t5-large",
-    "t5-3b",
-    "t5-11b",
+    "google-t5/t5-small",
+    "google-t5/t5-base",
+    "google-t5/t5-large",
+    "google-t5/t5-3b",
+    "google-t5/t5-11b",
     # See all T5 models at https://huggingface.co/models?filter=t5
 ]
 
@@ -196,17 +196,17 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
             have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
             following number of attention modules:
 
-                - t5-small: 6
-                - t5-base: 12
-                - t5-large: 24
-                - t5-3b: 24
-                - t5-11b: 24
+                - google-t5/t5-small: 6
+                - google-t5/t5-base: 12
+                - google-t5/t5-large: 24
+                - google-t5/t5-3b: 24
+                - google-t5/t5-11b: 24
 
     Example:
 
     ```python
-    # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules:
+    model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b")
     device_map = {
         0: [0, 1, 2],
         1: [3, 4, 5, 6, 7, 8, 9],
@@ -222,8 +222,8 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
     Example:
 
     ```python
-    # On a 4 GPU machine with t5-3b:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    # On a 4 GPU machine with google-t5/t5-3b:
+    model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b")
     device_map = {
         0: [0, 1, 2],
         1: [3, 4, 5, 6, 7, 8, 9],
@@ -1463,8 +1463,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, T5Model
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = T5Model.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = T5Model.from_pretrained("google-t5/t5-small")
 
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
@@ -1678,8 +1678,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, T5ForConditionalGeneration
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
         >>> # training
         >>> input_ids = tokenizer("The  walks in  park", return_tensors="pt").input_ids
@@ -1967,8 +1967,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, T5EncoderModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = T5EncoderModel.from_pretrained("google-t5/t5-small")
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
         ... ).input_ids  # Batch size 1
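
The `parallelize` docstring above lists the attention-block counts per checkpoint; as an offline sanity check of a 4-GPU map for `google-t5/t5-3b` (the last two rows are one plausible completion of the map that the hunk truncates):

```python
# google-t5/t5-3b has 24 attention blocks; every block must appear exactly once.
device_map = {
    0: [0, 1, 2],
    1: [3, 4, 5, 6, 7, 8, 9],
    2: [10, 11, 12, 13, 14, 15, 16],
    3: [17, 18, 19, 20, 21, 22, 23],
}

assigned = sorted(block for blocks in device_map.values() for block in blocks)
assert assigned == list(range(24)), "device_map must cover blocks 0-23 exactly once"
print({gpu: len(blocks) for gpu, blocks in device_map.items()})  # {0: 3, 1: 7, 2: 7, 3: 7}
```
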
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index c0a05a8a39e31d..c809659477bcc6 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -59,11 +59,11 @@
 _CONFIG_FOR_DOC = "T5Config"
 
 TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "t5-small",
-    "t5-base",
-    "t5-large",
-    "t5-3b",
-    "t5-11b",
+    "google-t5/t5-small",
+    "google-t5/t5-base",
+    "google-t5/t5-large",
+    "google-t5/t5-3b",
+    "google-t5/t5-11b",
     # See all T5 models at https://huggingface.co/models?filter=t5
 ]
 
@@ -1236,8 +1236,8 @@ def call(
         ```python
         >>> from transformers import AutoTokenizer, TFT5Model
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = TFT5Model.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = TFT5Model.from_pretrained("google-t5/t5-small")
 
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="tf"
@@ -1418,8 +1418,8 @@ def call(
         ```python
         >>> from transformers import AutoTokenizer, TFT5ForConditionalGeneration
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
 
         >>> # training
         >>> inputs = tokenizer("The  walks in  park", return_tensors="tf").input_ids
@@ -1642,8 +1642,8 @@ def call(
         ```python
         >>> from transformers import AutoTokenizer, TFT5EncoderModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = TFT5EncoderModel.from_pretrained("t5-small")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
+        >>> model = TFT5EncoderModel.from_pretrained("google-t5/t5-small")
 
         >>> input_ids = tokenizer(
         ...     "Studies have been shown that owning a dog is good for you", return_tensors="tf"
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index af2d8ef6e04adc..ffd58a4d5a537c 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -39,22 +39,22 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
-        "t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model",
-        "t5-large": "https://huggingface.co/t5-large/resolve/main/spiece.model",
-        "t5-3b": "https://huggingface.co/t5-3b/resolve/main/spiece.model",
-        "t5-11b": "https://huggingface.co/t5-11b/resolve/main/spiece.model",
+        "google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/spiece.model",
+        "google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model",
+        "google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/spiece.model",
+        "google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/spiece.model",
+        "google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/spiece.model",
     }
 }
 
 
 # TODO(PVP) - this should be removed in Transformers v5
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "t5-small": 512,
-    "t5-base": 512,
-    "t5-large": 512,
-    "t5-3b": 512,
-    "t5-11b": 512,
+    "google-t5/t5-small": 512,
+    "google-t5/t5-base": 512,
+    "google-t5/t5-large": 512,
+    "google-t5/t5-3b": 512,
+    "google-t5/t5-11b": 512,
 }
 
 SPIECE_UNDERLINE = "▁"
@@ -117,7 +117,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             ```python
             >>> from transformers import T5Tokenizer
 
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
+            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
             >>> tokenizer.encode("Hello .")
             [8774, 32099, 3, 5, 1]
             ```
@@ -125,7 +125,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             ```python
             >>> from transformers import T5Tokenizer
 
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
+            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
             >>> tokenizer.encode("Hello .")  # the extra space `[3]` is no longer here
             [8774, 32099, 5, 1]
             ```
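
For completeness, the legacy-versus-new behaviour shown in the two snippets above can be compared side by side; a minimal sketch (needs Hub access and `sentencepiece` installed):

```python
from transformers import T5Tokenizer

text = "Hello <extra_id_0>."

legacy_tok = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
new_tok = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)

print(legacy_tok.encode(text))  # [8774, 32099, 3, 5, 1] - spurious space token 3 before the period
print(new_tok.encode(text))     # [8774, 32099, 5, 1]
```
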
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index a0fedd9e3be894..71a7bd07b4d52a 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -37,29 +37,29 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
-        "t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model",
-        "t5-large": "https://huggingface.co/t5-large/resolve/main/spiece.model",
-        "t5-3b": "https://huggingface.co/t5-3b/resolve/main/spiece.model",
-        "t5-11b": "https://huggingface.co/t5-11b/resolve/main/spiece.model",
+        "google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/spiece.model",
+        "google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/spiece.model",
+        "google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/spiece.model",
+        "google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/spiece.model",
+        "google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/spiece.model",
     },
     "tokenizer_file": {
-        "t5-small": "https://huggingface.co/t5-small/resolve/main/tokenizer.json",
-        "t5-base": "https://huggingface.co/t5-base/resolve/main/tokenizer.json",
-        "t5-large": "https://huggingface.co/t5-large/resolve/main/tokenizer.json",
-        "t5-3b": "https://huggingface.co/t5-3b/resolve/main/tokenizer.json",
-        "t5-11b": "https://huggingface.co/t5-11b/resolve/main/tokenizer.json",
+        "google-t5/t5-small": "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer.json",
+        "google-t5/t5-base": "https://huggingface.co/google-t5/t5-base/resolve/main/tokenizer.json",
+        "google-t5/t5-large": "https://huggingface.co/google-t5/t5-large/resolve/main/tokenizer.json",
+        "google-t5/t5-3b": "https://huggingface.co/google-t5/t5-3b/resolve/main/tokenizer.json",
+        "google-t5/t5-11b": "https://huggingface.co/google-t5/t5-11b/resolve/main/tokenizer.json",
     },
 }
 
 
 # TODO(PVP) - this should be removed in Transformers v5
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "t5-small": 512,
-    "t5-base": 512,
-    "t5-large": 512,
-    "t5-3b": 512,
-    "t5-11b": 512,
+    "google-t5/t5-small": 512,
+    "google-t5/t5-base": 512,
+    "google-t5/t5-large": 512,
+    "google-t5/t5-3b": 512,
+    "google-t5/t5-11b": 512,
 }
 
 
diff --git a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
index b82adf690e7e55..428406d82c685f 100644
--- a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
+++ b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
@@ -183,7 +183,7 @@ def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
 
     # Check outputs on an image
     image_processor = ViTImageProcessor(size=encoder_config.image_size)
-    tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
+    tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-large")
     processor = TrOCRProcessor(image_processor, tokenizer)
 
     pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values
diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py
index a93a68016899b7..1bf8469f77e66d 100644
--- a/src/transformers/models/umt5/modeling_umt5.py
+++ b/src/transformers/models/umt5/modeling_umt5.py
@@ -1418,7 +1418,7 @@ class PreTrainedModel
 
     @add_start_docstrings_to_model_forward(UMT5_ENCODER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
-    # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.forward with T5->UMT5, t5-small->google/umt5-small
+    # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.forward with T5->UMT5, google-t5/t5-small->google/umt5-small
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py
index 015db07453d17d..e597d0d7e778b7 100644
--- a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py
+++ b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py
@@ -224,7 +224,7 @@ def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
 
     # Define processor
     image_processor = ViltImageProcessor(size=384)
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
     processor = ViltProcessor(image_processor, tokenizer)
 
     # Forward pass on example inputs (image + text)
diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
index ba380ed3ea3f80..a4aa663f98526f 100644
--- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
@@ -59,7 +59,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
 
     >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-    >>> # Initializing a ViTBert model (with random weights) from a ViT & bert-base-uncased style configurations
+    >>> # Initializing a ViTBert model (with random weights) from a ViT & google-bert/bert-base-uncased style configurations
     >>> model = VisionEncoderDecoderModel(config=config)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index 899acd10703b67..987c9a1afa3d19 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -421,7 +421,7 @@ def encode(
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
         >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         ... )
 
         >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
@@ -500,7 +500,7 @@ def decode(
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
         >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         ... )
 
         >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
@@ -627,11 +627,11 @@ def __call__(
         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> # load output tokenizer
-        >>> tokenizer_output = AutoTokenizer.from_pretrained("gpt2")
+        >>> tokenizer_output = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
         >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         ... )
 
         >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
@@ -746,8 +746,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
@@ -771,7 +769,7 @@ def from_encoder_decoder_pretrained(
 
         >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
         >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-gpt2")
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
index a323c0607f4d7b..75ff2dbd82e48b 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -335,8 +335,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
@@ -362,7 +360,7 @@ def from_encoder_decoder_pretrained(
 
         >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
         >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
@@ -487,11 +485,11 @@ def call(
         >>> import requests
 
         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
-        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
         >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         ... )
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index f7134c94ff01d1..88b5efd0476086 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -391,8 +391,6 @@ def from_encoder_decoder_pretrained(
                 Information necessary to initiate the text decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
@@ -420,7 +418,7 @@ def from_encoder_decoder_pretrained(
 
         >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
         >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index f38b6b931f5ab7..ba8bf7091b3f94 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -426,8 +426,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -439,8 +437,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the text model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -468,7 +464,7 @@ def from_vision_text_pretrained(
 
         >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
         >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
-        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
@@ -560,11 +556,11 @@ def from_vision_text_pretrained(
     ...     AutoTokenizer,
     ... )
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
     >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
     >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
     >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
-    ...     "google/vit-base-patch16-224", "bert-base-uncased"
+    ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
     ... )
 
     >>> # contrastive training
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index 3f3cc81795be44..6f7e30d3f6fa6f 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -374,11 +374,11 @@ def call(
         ...     AutoTokenizer,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
         >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
         >>> model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
-        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
         ... )
 
         >>> # contrastive training
@@ -477,8 +477,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -488,8 +486,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the text model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -515,7 +511,7 @@ def from_vision_text_pretrained(
 
         >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
         >>> model = TFVisionTextDualEncoderModel.from_vision_text_pretrained(
-        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 106ff462e3e3bb..cd4d5bd7a1f197 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -319,11 +319,11 @@ def forward(
         ...     AutoTokenizer,
         ... )
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
         >>> processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
         >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
-        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
         ... )
 
         >>> # contrastive training
@@ -425,8 +425,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -438,8 +436,6 @@ def from_vision_text_pretrained(
                 Information necessary to initiate the text model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                      user or organization name, like `dbmdz/bert-base-german-cased`.
                     - A path to a *directory* containing model weights saved using
                       [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                     - A path or url to a *PyTorch checkpoint folder* (e.g, `./pt_model`). In this case, `from_pt`
@@ -467,7 +463,7 @@ def from_vision_text_pretrained(
 
         >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
         >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
-        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ...     "google/vit-base-patch16-224", "google-bert/bert-base-uncased"
         ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index f8a146ed2c4eb7..4af7696fc39634 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -733,7 +733,7 @@ def forward(
         from transformers import AutoTokenizer, VisualBertModel
         import torch
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
@@ -920,7 +920,7 @@ def forward(
         # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
         from transformers import AutoTokenizer, VisualBertForPreTraining
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
@@ -1060,7 +1060,7 @@ def forward(
         from transformers import AutoTokenizer, VisualBertForMultipleChoice
         import torch
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")
 
         prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
@@ -1211,7 +1211,7 @@ def forward(
         from transformers import AutoTokenizer, VisualBertForQuestionAnswering
         import torch
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
 
         text = "Who is eating the apple?"
@@ -1337,7 +1337,7 @@ def forward(
         from transformers import AutoTokenizer, VisualBertForVisualReasoning
         import torch
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2")
 
         text = "Who is eating the apple?"
@@ -1503,7 +1503,7 @@ def forward(
         from transformers import AutoTokenizer, VisualBertForRegionToPhraseAlignment
         import torch
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
         model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         text = "Who is eating the apple?"
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 916cca51a9894c..b388be245f1389 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -131,8 +131,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 This can be either:
 
                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a feature extractor file saved using the
                   [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                 - a path or url to a saved feature extractor JSON *file*, e.g.,
diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py
index cd8d721bfc37d2..2992a3ab322d63 100644
--- a/src/transformers/models/xlm/configuration_xlm.py
+++ b/src/transformers/models/xlm/configuration_xlm.py
@@ -24,16 +24,16 @@
 logger = logging.get_logger(__name__)
 
 XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlm-mlm-en-2048": "https://huggingface.co/xlm-mlm-en-2048/resolve/main/config.json",
-    "xlm-mlm-ende-1024": "https://huggingface.co/xlm-mlm-ende-1024/resolve/main/config.json",
-    "xlm-mlm-enfr-1024": "https://huggingface.co/xlm-mlm-enfr-1024/resolve/main/config.json",
-    "xlm-mlm-enro-1024": "https://huggingface.co/xlm-mlm-enro-1024/resolve/main/config.json",
-    "xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/xlm-mlm-tlm-xnli15-1024/resolve/main/config.json",
-    "xlm-mlm-xnli15-1024": "https://huggingface.co/xlm-mlm-xnli15-1024/resolve/main/config.json",
-    "xlm-clm-enfr-1024": "https://huggingface.co/xlm-clm-enfr-1024/resolve/main/config.json",
-    "xlm-clm-ende-1024": "https://huggingface.co/xlm-clm-ende-1024/resolve/main/config.json",
-    "xlm-mlm-17-1280": "https://huggingface.co/xlm-mlm-17-1280/resolve/main/config.json",
-    "xlm-mlm-100-1280": "https://huggingface.co/xlm-mlm-100-1280/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/config.json",
+    "FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/config.json",
+    "FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/config.json",
+    "FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/config.json",
 }
 
 
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`XLMModel`] or a [`TFXLMModel`]. It is used to
     instantiate a XLM model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the
-    [xlm-mlm-en-2048](https://huggingface.co/xlm-mlm-en-2048) architecture.
+    [FacebookAI/xlm-mlm-en-2048](https://huggingface.co/FacebookAI/xlm-mlm-en-2048) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
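For context on the renames in this file: the `FacebookAI/*` identifiers point at the same repositories as the old root-level names, which are expected to keep resolving through Hub-side redirects. A minimal sketch of that equivalence (the redirect behaviour is an assumption about the Hub, not something this diff enforces):

```python
from transformers import AutoConfig

# Both identifiers should yield the same configuration, assuming the legacy
# root-level name still redirects to the namespaced repository on the Hub.
new_cfg = AutoConfig.from_pretrained("FacebookAI/xlm-mlm-en-2048")
old_cfg = AutoConfig.from_pretrained("xlm-mlm-en-2048")
assert new_cfg.to_dict() == old_cfg.to_dict()
```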
diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py
index 63d807317b28b2..173f1d0acdb03d 100644
--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -63,20 +63,20 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlm-mlm-en-2048"
+_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-mlm-en-2048"
 _CONFIG_FOR_DOC = "XLMConfig"
 
 TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlm-mlm-en-2048",
-    "xlm-mlm-ende-1024",
-    "xlm-mlm-enfr-1024",
-    "xlm-mlm-enro-1024",
-    "xlm-mlm-tlm-xnli15-1024",
-    "xlm-mlm-xnli15-1024",
-    "xlm-clm-enfr-1024",
-    "xlm-clm-ende-1024",
-    "xlm-mlm-17-1280",
-    "xlm-mlm-100-1280",
+    "FacebookAI/xlm-mlm-en-2048",
+    "FacebookAI/xlm-mlm-ende-1024",
+    "FacebookAI/xlm-mlm-enfr-1024",
+    "FacebookAI/xlm-mlm-enro-1024",
+    "FacebookAI/xlm-mlm-tlm-xnli15-1024",
+    "FacebookAI/xlm-mlm-xnli15-1024",
+    "FacebookAI/xlm-clm-enfr-1024",
+    "FacebookAI/xlm-clm-ende-1024",
+    "FacebookAI/xlm-mlm-17-1280",
+    "FacebookAI/xlm-mlm-100-1280",
     # See all XLM models at https://huggingface.co/models?filter=xlm
 ]
 
diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py
index 2b7265489bdddf..de07829974d747 100755
--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -50,20 +50,20 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlm-mlm-en-2048"
+_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-mlm-en-2048"
 _CONFIG_FOR_DOC = "XLMConfig"
 
 XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlm-mlm-en-2048",
-    "xlm-mlm-ende-1024",
-    "xlm-mlm-enfr-1024",
-    "xlm-mlm-enro-1024",
-    "xlm-mlm-tlm-xnli15-1024",
-    "xlm-mlm-xnli15-1024",
-    "xlm-clm-enfr-1024",
-    "xlm-clm-ende-1024",
-    "xlm-mlm-17-1280",
-    "xlm-mlm-100-1280",
+    "FacebookAI/xlm-mlm-en-2048",
+    "FacebookAI/xlm-mlm-ende-1024",
+    "FacebookAI/xlm-mlm-enfr-1024",
+    "FacebookAI/xlm-mlm-enro-1024",
+    "FacebookAI/xlm-mlm-tlm-xnli15-1024",
+    "FacebookAI/xlm-mlm-xnli15-1024",
+    "FacebookAI/xlm-clm-enfr-1024",
+    "FacebookAI/xlm-clm-ende-1024",
+    "FacebookAI/xlm-mlm-17-1280",
+    "FacebookAI/xlm-mlm-100-1280",
     # See all XLM models at https://huggingface.co/models?filter=xlm
 ]
 
@@ -1030,8 +1030,8 @@ def forward(
         >>> from transformers import AutoTokenizer, XLMForQuestionAnswering
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("xlm-mlm-en-2048")
-        >>> model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-en-2048")
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
+        >>> model = XLMForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")
 
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
         ...     0
diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py
index 49d22934e072d4..a99b5cb73c9e71 100644
--- a/src/transformers/models/xlm/tokenization_xlm.py
+++ b/src/transformers/models/xlm/tokenization_xlm.py
@@ -35,62 +35,62 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "xlm-mlm-en-2048": "https://huggingface.co/xlm-mlm-en-2048/resolve/main/vocab.json",
-        "xlm-mlm-ende-1024": "https://huggingface.co/xlm-mlm-ende-1024/resolve/main/vocab.json",
-        "xlm-mlm-enfr-1024": "https://huggingface.co/xlm-mlm-enfr-1024/resolve/main/vocab.json",
-        "xlm-mlm-enro-1024": "https://huggingface.co/xlm-mlm-enro-1024/resolve/main/vocab.json",
-        "xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/xlm-mlm-tlm-xnli15-1024/resolve/main/vocab.json",
-        "xlm-mlm-xnli15-1024": "https://huggingface.co/xlm-mlm-xnli15-1024/resolve/main/vocab.json",
-        "xlm-clm-enfr-1024": "https://huggingface.co/xlm-clm-enfr-1024/resolve/main/vocab.json",
-        "xlm-clm-ende-1024": "https://huggingface.co/xlm-clm-ende-1024/resolve/main/vocab.json",
-        "xlm-mlm-17-1280": "https://huggingface.co/xlm-mlm-17-1280/resolve/main/vocab.json",
-        "xlm-mlm-100-1280": "https://huggingface.co/xlm-mlm-100-1280/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/vocab.json",
+        "FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/vocab.json",
     },
     "merges_file": {
-        "xlm-mlm-en-2048": "https://huggingface.co/xlm-mlm-en-2048/resolve/main/merges.txt",
-        "xlm-mlm-ende-1024": "https://huggingface.co/xlm-mlm-ende-1024/resolve/main/merges.txt",
-        "xlm-mlm-enfr-1024": "https://huggingface.co/xlm-mlm-enfr-1024/resolve/main/merges.txt",
-        "xlm-mlm-enro-1024": "https://huggingface.co/xlm-mlm-enro-1024/resolve/main/merges.txt",
-        "xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/xlm-mlm-tlm-xnli15-1024/resolve/main/merges.txt",
-        "xlm-mlm-xnli15-1024": "https://huggingface.co/xlm-mlm-xnli15-1024/resolve/main/merges.txt",
-        "xlm-clm-enfr-1024": "https://huggingface.co/xlm-clm-enfr-1024/resolve/main/merges.txt",
-        "xlm-clm-ende-1024": "https://huggingface.co/xlm-clm-ende-1024/resolve/main/merges.txt",
-        "xlm-mlm-17-1280": "https://huggingface.co/xlm-mlm-17-1280/resolve/main/merges.txt",
-        "xlm-mlm-100-1280": "https://huggingface.co/xlm-mlm-100-1280/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-en-2048": "https://huggingface.co/FacebookAI/xlm-mlm-en-2048/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-ende-1024": "https://huggingface.co/FacebookAI/xlm-mlm-ende-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enfr-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-enro-1024": "https://huggingface.co/FacebookAI/xlm-mlm-enro-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-tlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-tlm-xnli15-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-xnli15-1024": "https://huggingface.co/FacebookAI/xlm-mlm-xnli15-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-clm-enfr-1024": "https://huggingface.co/FacebookAI/xlm-clm-enfr-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-clm-ende-1024": "https://huggingface.co/FacebookAI/xlm-clm-ende-1024/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-17-1280": "https://huggingface.co/FacebookAI/xlm-mlm-17-1280/resolve/main/merges.txt",
+        "FacebookAI/xlm-mlm-100-1280": "https://huggingface.co/FacebookAI/xlm-mlm-100-1280/resolve/main/merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlm-mlm-en-2048": 512,
-    "xlm-mlm-ende-1024": 512,
-    "xlm-mlm-enfr-1024": 512,
-    "xlm-mlm-enro-1024": 512,
-    "xlm-mlm-tlm-xnli15-1024": 512,
-    "xlm-mlm-xnli15-1024": 512,
-    "xlm-clm-enfr-1024": 512,
-    "xlm-clm-ende-1024": 512,
-    "xlm-mlm-17-1280": 512,
-    "xlm-mlm-100-1280": 512,
+    "FacebookAI/xlm-mlm-en-2048": 512,
+    "FacebookAI/xlm-mlm-ende-1024": 512,
+    "FacebookAI/xlm-mlm-enfr-1024": 512,
+    "FacebookAI/xlm-mlm-enro-1024": 512,
+    "FacebookAI/xlm-mlm-tlm-xnli15-1024": 512,
+    "FacebookAI/xlm-mlm-xnli15-1024": 512,
+    "FacebookAI/xlm-clm-enfr-1024": 512,
+    "FacebookAI/xlm-clm-ende-1024": 512,
+    "FacebookAI/xlm-mlm-17-1280": 512,
+    "FacebookAI/xlm-mlm-100-1280": 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
-    "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
-    "xlm-mlm-ende-1024": {
+    "FacebookAI/xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
+    "FacebookAI/xlm-mlm-ende-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {0: "de", 1: "en"},
         "lang2id": {"de": 0, "en": 1},
     },
-    "xlm-mlm-enfr-1024": {
+    "FacebookAI/xlm-mlm-enfr-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {0: "en", 1: "fr"},
         "lang2id": {"en": 0, "fr": 1},
     },
-    "xlm-mlm-enro-1024": {
+    "FacebookAI/xlm-mlm-enro-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {0: "en", 1: "ro"},
         "lang2id": {"en": 0, "ro": 1},
     },
-    "xlm-mlm-tlm-xnli15-1024": {
+    "FacebookAI/xlm-mlm-tlm-xnli15-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {
             0: "ar",
@@ -127,7 +127,7 @@
             "zh": 14,
         },
     },
-    "xlm-mlm-xnli15-1024": {
+    "FacebookAI/xlm-mlm-xnli15-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {
             0: "ar",
@@ -164,17 +164,17 @@
             "zh": 14,
         },
     },
-    "xlm-clm-enfr-1024": {
+    "FacebookAI/xlm-clm-enfr-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {0: "en", 1: "fr"},
         "lang2id": {"en": 0, "fr": 1},
     },
-    "xlm-clm-ende-1024": {
+    "FacebookAI/xlm-clm-ende-1024": {
         "do_lowercase_and_remove_accent": True,
         "id2lang": {0: "de", 1: "en"},
         "lang2id": {"de": 0, "en": 1},
     },
-    "xlm-mlm-17-1280": {
+    "FacebookAI/xlm-mlm-17-1280": {
         "do_lowercase_and_remove_accent": False,
         "id2lang": {
             0: "ar",
@@ -215,7 +215,7 @@
             "zh": 16,
         },
     },
-    "xlm-mlm-100-1280": {
+    "FacebookAI/xlm-mlm-100-1280": {
         "do_lowercase_and_remove_accent": False,
         "id2lang": {
             0: "af",
@@ -512,7 +512,7 @@ def remove_non_printing_char(text):
 
 
 def romanian_preprocessing(text):
-    """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`"""
+    """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `FacebookAI/xlm-mlm-enro-1024`"""
     # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
     text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
     text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
@@ -807,7 +807,7 @@ def _tokenize(self, text, lang="en", bypass_tokenizer=False):
             text = text.split()
         elif lang not in self.lang_with_custom_tokenizer:
             text = self.moses_pipeline(text, lang=lang)
-            # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
+            # TODO: make sure we are using `FacebookAI/xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
             if lang == "ro":
                 text = romanian_preprocessing(text)
             text = self.moses_tokenize(text, lang=lang)
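The `lang2id`/`id2lang` tables renamed above feed the language embeddings of the XLM models; a short illustrative sketch of how they are typically consumed (mirrors the library's multilingual examples, checkpoint download assumed):

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")

input_ids = tokenizer("Wikipedia was used to", return_tensors="pt")["input_ids"]
# One language id per input token, looked up from the renamed lang2id table.
langs = torch.full_like(input_ids, tokenizer.lang2id["en"])
outputs = model(input_ids, langs=langs)
```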
diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
index 37bd32186af4d5..e705b95b177877 100644
--- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
@@ -2216,10 +2216,10 @@ def forward(
         >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
         >>> import torch
 
-        >>> tokenizer_enc = BertTokenizer.from_pretrained("bert-large-uncased")
+        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
         >>> tokenizer_dec = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
         >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
-        ...     "bert-large-uncased", "patrickvonplaten/xprophetnet-large-uncased-standalone"
+        ...     "google-bert/bert-large-uncased", "patrickvonplaten/xprophetnet-large-uncased-standalone"
         ... )
 
         >>> ARTICLE = (
diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
index 517b751f422003..65c536ba437346 100644
--- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
@@ -25,19 +25,19 @@
 logger = logging.get_logger(__name__)
 
 XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json",
-    "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json",
-    "xlm-roberta-large-finetuned-conll02-dutch": (
-        "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json"
+    "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/config.json",
+    "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/config.json",
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
+        "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json"
     ),
-    "xlm-roberta-large-finetuned-conll02-spanish": (
-        "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json"
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
+        "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json"
     ),
-    "xlm-roberta-large-finetuned-conll03-english": (
-        "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json"
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
+        "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json"
     ),
-    "xlm-roberta-large-finetuned-conll03-german": (
-        "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json"
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
+        "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json"
     ),
 }
 
@@ -47,7 +47,7 @@ class XLMRobertaConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`XLMRobertaModel`] or a [`TFXLMRobertaModel`]. It
     is used to instantiate a XLM-RoBERTa model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the XLMRoBERTa
-    [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) architecture.
+    [FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -101,10 +101,10 @@ class XLMRobertaConfig(PretrainedConfig):
     ```python
     >>> from transformers import XLMRobertaConfig, XLMRobertaModel
 
-    >>> # Initializing a XLM-RoBERTa xlm-roberta-base style configuration
+    >>> # Initializing an XLM-RoBERTa FacebookAI/xlm-roberta-base style configuration
     >>> configuration = XLMRobertaConfig()
 
-    >>> # Initializing a model (with random weights) from the xlm-roberta-base style configuration
+    >>> # Initializing a model (with random weights) from the FacebookAI/xlm-roberta-base style configuration
     >>> model = XLMRobertaModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
index e8247b3f28de39..0017be6bd8c145 100644
--- a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
@@ -46,14 +46,14 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlm-roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
 _CONFIG_FOR_DOC = "XLMRobertaConfig"
 
 remat = nn_partitioning.remat
 
 FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlm-roberta-base",
-    "xlm-roberta-large",
+    "FacebookAI/xlm-roberta-base",
+    "FacebookAI/xlm-roberta-large",
     # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta
 ]
 
diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
index c33f12298a261b..dcf1b018b2af66 100644
--- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
@@ -64,12 +64,12 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlm-roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
 _CONFIG_FOR_DOC = "XLMRobertaConfig"
 
 TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlm-roberta-base",
-    "xlm-roberta-large",
+    "FacebookAI/xlm-roberta-base",
+    "FacebookAI/xlm-roberta-large",
     "joeddav/xlm-roberta-large-xnli",
     "cardiffnlp/twitter-xlm-roberta-base-sentiment",
     # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta
diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
index 95ea2e7dca7bd1..8abd77b8c30215 100644
--- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
@@ -48,16 +48,16 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlm-roberta-base"
+_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
 _CONFIG_FOR_DOC = "XLMRobertaConfig"
 
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlm-roberta-base",
-    "xlm-roberta-large",
-    "xlm-roberta-large-finetuned-conll02-dutch",
-    "xlm-roberta-large-finetuned-conll02-spanish",
-    "xlm-roberta-large-finetuned-conll03-english",
-    "xlm-roberta-large-finetuned-conll03-german",
+    "FacebookAI/xlm-roberta-base",
+    "FacebookAI/xlm-roberta-large",
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
     # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta
 ]
 
@@ -940,10 +940,10 @@ def forward(
         >>> from transformers import AutoTokenizer, XLMRobertaForCausalLM, AutoConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-        >>> config = AutoConfig.from_pretrained("roberta-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
         >>> config.is_decoder = True
-        >>> model = XLMRobertaForCausalLM.from_pretrained("roberta-base", config=config)
+        >>> model = XLMRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index f704d136faee5f..3f87bd9b0dd9fa 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -33,30 +33,30 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
-        "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll02-dutch": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
+        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll02-spanish": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll03-english": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll03-german": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
         ),
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlm-roberta-base": 512,
-    "xlm-roberta-large": 512,
-    "xlm-roberta-large-finetuned-conll02-dutch": 512,
-    "xlm-roberta-large-finetuned-conll02-spanish": 512,
-    "xlm-roberta-large-finetuned-conll03-english": 512,
-    "xlm-roberta-large-finetuned-conll03-german": 512,
+    "FacebookAI/xlm-roberta-base": 512,
+    "FacebookAI/xlm-roberta-large": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-english": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-german": 512,
 }
 
 
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
index 41079e29d8ca8b..8f2c1e02a0a37e 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -36,46 +36,46 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
-        "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
-        "xlm-roberta-large-finetuned-conll02-dutch": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
+        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll02-spanish": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll03-english": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
         ),
-        "xlm-roberta-large-finetuned-conll03-german": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
         ),
     },
     "tokenizer_file": {
-        "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/tokenizer.json",
-        "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/tokenizer.json",
-        "xlm-roberta-large-finetuned-conll02-dutch": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/tokenizer.json"
+        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/tokenizer.json",
+        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/tokenizer.json",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/tokenizer.json"
         ),
-        "xlm-roberta-large-finetuned-conll02-spanish": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/tokenizer.json"
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/tokenizer.json"
         ),
-        "xlm-roberta-large-finetuned-conll03-english": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/tokenizer.json"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/tokenizer.json"
         ),
-        "xlm-roberta-large-finetuned-conll03-german": (
-            "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/tokenizer.json"
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
+            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/tokenizer.json"
         ),
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlm-roberta-base": 512,
-    "xlm-roberta-large": 512,
-    "xlm-roberta-large-finetuned-conll02-dutch": 512,
-    "xlm-roberta-large-finetuned-conll02-spanish": 512,
-    "xlm-roberta-large-finetuned-conll03-english": 512,
-    "xlm-roberta-large-finetuned-conll03-german": 512,
+    "FacebookAI/xlm-roberta-base": 512,
+    "FacebookAI/xlm-roberta-large": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-english": 512,
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-german": 512,
 }
 
 
diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
index e2dee1cbe4e11b..acb9c630970975 100644
--- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
@@ -88,10 +88,10 @@ class XLMRobertaXLConfig(PretrainedConfig):
     ```python
     >>> from transformers import XLMRobertaXLConfig, XLMRobertaXLModel
 
-    >>> # Initializing a XLM_ROBERTA_XL bert-base-uncased style configuration
+    >>> # Initializing an XLM_ROBERTA_XL google-bert/bert-base-uncased style configuration
     >>> configuration = XLMRobertaXLConfig()
 
-    >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration
+    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
     >>> model = XLMRobertaXLModel(configuration)
 
     >>> # Accessing the model configuration
diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
index 48bb28bf4ee2c6..2799752ca4bdd9 100644
--- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
@@ -906,10 +906,10 @@ def forward(
         >>> from transformers import AutoTokenizer, RobertaForCausalLM, RobertaConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-        >>> config = RobertaConfig.from_pretrained("roberta-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+        >>> config = RobertaConfig.from_pretrained("FacebookAI/roberta-base")
         >>> config.is_decoder = True
-        >>> model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)
+        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
         >>> prediction_logits = outputs.logits
diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py
index 9ebc1f8bb9fb6f..8528bb06394d25 100644
--- a/src/transformers/models/xlnet/configuration_xlnet.py
+++ b/src/transformers/models/xlnet/configuration_xlnet.py
@@ -24,8 +24,8 @@
 logger = logging.get_logger(__name__)
 
 XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "xlnet-base-cased": "https://huggingface.co/xlnet-base-cased/resolve/main/config.json",
-    "xlnet-large-cased": "https://huggingface.co/xlnet-large-cased/resolve/main/config.json",
+    "xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/config.json",
+    "xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/config.json",
 }
 
 
@@ -34,7 +34,7 @@ class XLNetConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`XLNetModel`] or a [`TFXLNetModel`]. It is used to
     instantiate a XLNet model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the
-    [xlnet-large-cased](https://huggingface.co/xlnet-large-cased) architecture.
+    [xlnet/xlnet-large-cased](https://huggingface.co/xlnet/xlnet-large-cased) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py
index 9bf26872f80b57..598af1b707a5e9 100644
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -57,12 +57,12 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlnet-base-cased"
+_CHECKPOINT_FOR_DOC = "xlnet/xlnet-base-cased"
 _CONFIG_FOR_DOC = "XLNetConfig"
 
 TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlnet-base-cased",
-    "xlnet-large-cased",
+    "xlnet/xlnet-base-cased",
+    "xlnet/xlnet-large-cased",
     # See all XLNet models at https://huggingface.co/models?filter=xlnet
 ]
 
@@ -1325,8 +1325,8 @@ def call(
         >>> import numpy as np
         >>> from transformers import AutoTokenizer, TFXLNetLMHeadModel
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-large-cased")
-        >>> model = TFXLNetLMHeadModel.from_pretrained("xlnet-large-cased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-large-cased")
+        >>> model = TFXLNetLMHeadModel.from_pretrained("xlnet/xlnet-large-cased")
 
         >>> # We show how to setup inputs to predict a next token using a bi-directional context.
         >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[
diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py
index c987c1e187a4f5..6def87ef07b4e3 100755
--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -40,12 +40,12 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "xlnet-base-cased"
+_CHECKPOINT_FOR_DOC = "xlnet/xlnet-base-cased"
 _CONFIG_FOR_DOC = "XLNetConfig"
 
 XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "xlnet-base-cased",
-    "xlnet-large-cased",
+    "xlnet/xlnet-base-cased",
+    "xlnet/xlnet-large-cased",
     # See all XLNet models at https://huggingface.co/models?filter=xlnet
 ]
 
@@ -1393,8 +1393,8 @@ def forward(
         >>> from transformers import AutoTokenizer, XLNetLMHeadModel
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-large-cased")
-        >>> model = XLNetLMHeadModel.from_pretrained("xlnet-large-cased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-large-cased")
+        >>> model = XLNetLMHeadModel.from_pretrained("xlnet/xlnet-large-cased")
 
         >>> # We show how to setup inputs to predict a next token using a bi-directional context.
         >>> input_ids = torch.tensor(
@@ -1970,8 +1970,8 @@ def forward(
         >>> from transformers import AutoTokenizer, XLNetForQuestionAnswering
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-        >>> model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
+        >>> tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")
+        >>> model = XLNetForQuestionAnswering.from_pretrained("xlnet/xlnet-base-cased")
 
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
         ...     0
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index adc201abb96856..808a7ff5bfc07f 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -32,14 +32,14 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "xlnet-base-cased": "https://huggingface.co/xlnet-base-cased/resolve/main/spiece.model",
-        "xlnet-large-cased": "https://huggingface.co/xlnet-large-cased/resolve/main/spiece.model",
+        "xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
+        "xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlnet-base-cased": None,
-    "xlnet-large-cased": None,
+    "xlnet/xlnet-base-cased": None,
+    "xlnet/xlnet-large-cased": None,
 }
 
 # Segments (not really needed)
diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
index 589675f0062cd5..c43016a1a77799 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -36,18 +36,18 @@
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "xlnet-base-cased": "https://huggingface.co/xlnet-base-cased/resolve/main/spiece.model",
-        "xlnet-large-cased": "https://huggingface.co/xlnet-large-cased/resolve/main/spiece.model",
+        "xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/spiece.model",
+        "xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/spiece.model",
     },
     "tokenizer_file": {
-        "xlnet-base-cased": "https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer.json",
-        "xlnet-large-cased": "https://huggingface.co/xlnet-large-cased/resolve/main/tokenizer.json",
+        "xlnet/xlnet-base-cased": "https://huggingface.co/xlnet/xlnet-base-cased/resolve/main/tokenizer.json",
+        "xlnet/xlnet-large-cased": "https://huggingface.co/xlnet/xlnet-large-cased/resolve/main/tokenizer.json",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "xlnet-base-cased": None,
-    "xlnet-large-cased": None,
+    "xlnet/xlnet-base-cased": None,
+    "xlnet/xlnet-large-cased": None,
 }
 
 SPIECE_UNDERLINE = "▁"
diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py
index cb048fb85e28d5..ba5ba6b7271b23 100644
--- a/src/transformers/models/xmod/modeling_xmod.py
+++ b/src/transformers/models/xmod/modeling_xmod.py
@@ -1045,7 +1045,7 @@ def forward(
         >>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
         >>> import torch
 
-        >>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
         >>> config = AutoConfig.from_pretrained("facebook/xmod-base")
         >>> config.is_decoder = True
         >>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 72e8b2b4aa9232..8ee0137a20b3ff 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -713,12 +713,12 @@ def pipeline(
 
     >>> # Question answering pipeline, specifying the checkpoint identifier
     >>> oracle = pipeline(
-    ...     "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="bert-base-cased"
+    ...     "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased"
     ... )
 
     >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
     >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
     >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
     ```"""
     if model_kwargs is None:
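Downstream usage of the renamed checkpoints is unchanged apart from the identifier itself; a minimal sketch (assumes a standard `transformers` install with access to the Hub):

```python
from transformers import pipeline

# The namespaced id drops straight into the existing pipeline API.
fill_masker = pipeline("fill-mask", model="google-bert/bert-base-uncased")
print(fill_masker("Paris is the [MASK] of France.")[0]["token_str"])
```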
diff --git a/src/transformers/pipelines/feature_extraction.py b/src/transformers/pipelines/feature_extraction.py
index 118baeccd0d6a2..e8adb11b687da6 100644
--- a/src/transformers/pipelines/feature_extraction.py
+++ b/src/transformers/pipelines/feature_extraction.py
@@ -22,7 +22,7 @@ class FeatureExtractionPipeline(Pipeline):
     ```python
     >>> from transformers import pipeline
 
-    >>> extractor = pipeline(model="bert-base-uncased", task="feature-extraction")
+    >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction")
     >>> result = extractor("This is a simple test.", return_tensors=True)
     >>> result.shape  # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input string.
     torch.Size([1, 8, 768])
diff --git a/src/transformers/pipelines/fill_mask.py b/src/transformers/pipelines/fill_mask.py
index 1d54c615ea258c..a6f240822322f7 100644
--- a/src/transformers/pipelines/fill_mask.py
+++ b/src/transformers/pipelines/fill_mask.py
@@ -41,7 +41,7 @@ class FillMaskPipeline(Pipeline):
     ```python
     >>> from transformers import pipeline
 
-    >>> fill_masker = pipeline(model="bert-base-uncased")
+    >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
     >>> fill_masker("This is a simple [MASK].")
     [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
     ```
@@ -70,7 +70,7 @@ class FillMaskPipeline(Pipeline):
     ```python
     >>> from transformers import pipeline
 
-    >>> fill_masker = pipeline(model="bert-base-uncased")
+    >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
     >>> tokenizer_kwargs = {"truncation": True}
     >>> fill_masker(
     ...     "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 09f0b0c4490765..bb8abdfcf7f500 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -222,7 +222,7 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
     `"summarization"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
-    currently, '*bart-large-cnn*', '*t5-small*', '*t5-base*', '*t5-large*', '*t5-3b*', '*t5-11b*'. See the up-to-date
+    currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date
     list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list
     of available parameters, see the [following
     documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
@@ -235,7 +235,7 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
     summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
 
     # use t5 in tf
-    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
+    summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
     summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
     ```"""
 
diff --git a/src/transformers/pipelines/text_classification.py b/src/transformers/pipelines/text_classification.py
index 2b7717934ddcd3..0c54fe1706c034 100644
--- a/src/transformers/pipelines/text_classification.py
+++ b/src/transformers/pipelines/text_classification.py
@@ -55,7 +55,7 @@ class TextClassificationPipeline(Pipeline):
     ```python
     >>> from transformers import pipeline
 
-    >>> classifier = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")
+    >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
     >>> classifier("This movie is disgustingly good !")
     [{'label': 'POSITIVE', 'score': 1.0}]
 
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index 839395d7fe0528..ce7e180601f97e 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -31,7 +31,7 @@ class TextGenerationPipeline(Pipeline):
     ```python
     >>> from transformers import pipeline
 
-    >>> generator = pipeline(model="gpt2")
+    >>> generator = pipeline(model="openai-community/gpt2")
     >>> generator("I can't believe you did such a ", do_sample=False)
     [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}]
 
@@ -48,7 +48,7 @@ class TextGenerationPipeline(Pipeline):
     `"text-generation"`.
 
     The models that this pipeline can use are models that have been trained with an autoregressive language modeling
-    objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available models
+    objective, which includes the uni-directional models in the library (e.g. openai-community/gpt2). See the list of available models
     on [huggingface.co/models](https://huggingface.co/models?filter=text-generation).
     """
 
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 30cbfddeed7d39..5b46d5ea4a4801 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -432,8 +432,7 @@ def from_pretrained(
                 This can be either:
 
                 - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                  huggingface.co.
                 - a path to a *directory* containing a feature extractor file saved using the
                   [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                 - a path or url to a saved feature extractor JSON *file*, e.g.,
diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py
index 16745f756ca525..6cea1b5512392d 100644
--- a/src/transformers/quantizers/quantizer_bnb_4bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -204,7 +204,7 @@ def create_quantized_param(
         else:
             new_value = param_value.to("cpu")
 
-            # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
+            # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
             # Since weights are saved in the correct "orientation", we skip transposing when loading.
             if issubclass(module.source_cls, Conv1D):
                 new_value = new_value.T
diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py
index d41a280f89a4f8..193da44d2c855f 100644
--- a/src/transformers/quantizers/quantizer_bnb_8bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_8bit.py
@@ -190,7 +190,7 @@ def create_quantized_param(
                 "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
             )
 
-        # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
+        # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
         # Since weights are saved in the correct "orientation", we skip transposing when loading.
         if issubclass(module.source_cls, Conv1D):
             if fp16_statistics is None:
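The comment updated in the two quantizer files relies on GPT-2-style `Conv1D` storing its weight as `(in_features, out_features)`, the transpose of `nn.Linear`'s `(out_features, in_features)`; a rough sketch of that layout difference (illustrative only, checkpoint download assumed):

```python
import torch
from transformers import GPT2Model

model = GPT2Model.from_pretrained("openai-community/gpt2")
conv1d = model.h[0].attn.c_attn      # transformers' Conv1D layer
print(conv1d.weight.shape)           # torch.Size([768, 2304]) -> (in, out)

linear = torch.nn.Linear(768, 2304)
print(linear.weight.shape)           # torch.Size([2304, 768]) -> (out, in)
# Hence the transpose (`.T`) applied before quantization in the hunks above.
```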
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 0ff7e718af20a9..0ceea9c7d45295 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -1317,7 +1317,7 @@ def LoggingLevel(level):
 
     ```python
     with LoggingLevel(logging.INFO):
-        AutoModel.from_pretrained("gpt2")  # calls logger.info() several times
+        AutoModel.from_pretrained("openai-community/gpt2")  # calls logger.info() several times
     ```
     """
     orig_level = transformers_logging.get_verbosity()
@@ -1603,7 +1603,7 @@ def python_one_liner_max_rss(self, one_liner_str):
         Example:
 
         ```
-        one_liner_str = 'from transformers import AutoModel; AutoModel.from_pretrained("t5-large")'
+        one_liner_str = 'from transformers import AutoModel; AutoModel.from_pretrained("google-t5/t5-large")'
         max_rss = self.python_one_liner_max_rss(one_liner_str)
         ```
         """
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 50a42b4bb5de52..8f1b15c1c11438 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -452,8 +452,8 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-        model = BertModel.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
 
         num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
         print("We have added", num_added_toks, "tokens")
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index d389af676fd0c8..f4a467c32fa92d 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -916,8 +916,8 @@ def add_special_tokens(
 
         ```python
         # Let's see how to add a new classification token to GPT-2
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        model = GPT2Model.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        model = GPT2Model.from_pretrained("openai-community/gpt2")
 
         special_tokens_dict = {"cls_token": "<CLS>"}
 
@@ -1005,8 +1005,8 @@ def add_tokens(
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-        model = BertModel.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
 
         num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
         print("We have added", num_added_toks, "tokens")
@@ -1821,8 +1821,6 @@ def from_pretrained(
                 Can be either:
 
                 - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                  user or organization name, like `dbmdz/bert-base-german-cased`.
                 - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                   using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
@@ -1871,7 +1869,7 @@ def from_pretrained(
         ```python
         # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
         # Download vocabulary from huggingface.co and cache.
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         # Download vocabulary from huggingface.co (user-uploaded) and cache.
         tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
@@ -1883,7 +1881,7 @@ def from_pretrained(
         tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")
 
         # You can link tokens to special vocabulary when instantiating
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", unk_token="<unk>")
         # You should be sure '<unk>' is in the vocabulary when doing that.
         # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
         assert tokenizer.unk_token == "<unk>"
diff --git a/src/transformers/training_args_seq2seq.py b/src/transformers/training_args_seq2seq.py
index ccacbbb3702708..88ae662570abef 100644
--- a/src/transformers/training_args_seq2seq.py
+++ b/src/transformers/training_args_seq2seq.py
@@ -48,8 +48,7 @@ class Seq2SeqTrainingArguments(TrainingArguments):
             Allows loading a [`~generation.GenerationConfig`] from the `from_pretrained` method. This can be either:
 
             - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
-              under a user or organization name, like `dbmdz/bert-base-german-cased`.
+              huggingface.co.
             - a path to a *directory* containing a configuration file saved using the
               [`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
             - a [`~generation.GenerationConfig`] object.
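For readers of the hunk above: `generation_config` accepts a Hub model id, a local directory, or a `GenerationConfig` object. A minimal sketch of the three forms follows; the output directory, model id, and generation settings are illustrative assumptions rather than values taken from this PR.

```python
from transformers import GenerationConfig, Seq2SeqTrainingArguments

# 1. A Hub model id (illustrative) -- resolved via `GenerationConfig.from_pretrained`
args = Seq2SeqTrainingArguments(
    output_dir="out", predict_with_generate=True, generation_config="google-t5/t5-small"
)

# 2. A local directory containing a saved `generation_config.json`
args = Seq2SeqTrainingArguments(
    output_dir="out", predict_with_generate=True, generation_config="./my_model_directory/"
)

# 3. A `GenerationConfig` object built in code
gen_cfg = GenerationConfig(max_new_tokens=64, num_beams=4)
args = Seq2SeqTrainingArguments(output_dir="out", predict_with_generate=True, generation_config=gen_cfg)
```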
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 3aa452cf27a2cd..984fba1b6b743b 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -332,7 +332,7 @@ def cached_file(
 
     ```python
     # Download a model weight from the Hub and cache it.
-    model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin")
+    model_weights_file = cached_file("google-bert/bert-base-uncased", "pytorch_model.bin")
     ```
     """
     use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
@@ -531,9 +531,9 @@ def get_file_from_repo(
 
     ```python
     # Download a tokenizer configuration from huggingface.co and cache.
-    tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json")
+    tokenizer_config = get_file_from_repo("google-bert/bert-base-uncased", "tokenizer_config.json")
     # This model does not have a tokenizer config so the result will be None.
-    tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json")
+    tokenizer_config = get_file_from_repo("FacebookAI/xlm-roberta-base", "tokenizer_config.json")
     ```
     """
     use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
@@ -819,7 +819,7 @@ def push_to_hub(
         ```python
         from transformers import {object_class}
 
-        {object} = {object_class}.from_pretrained("bert-base-cased")
+        {object} = {object_class}.from_pretrained("google-bert/bert-base-cased")
 
         # Push the {object} to your namespace with the name "my-finetuned-bert".
         {object}.push_to_hub("my-finetuned-bert")
diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index d2ab879f24ab61..d26cfca678c7b0 100644
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -393,8 +393,6 @@ class GPTQConfig(QuantizationConfigMixin):
             The tokenizer used to process the dataset. You can pass either:
                 - A custom tokenizer object.
                 - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                    Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
-                    user or organization name, like `dbmdz/bert-base-german-cased`.
                 - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                     using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
         dataset (`Union[List[str]]`, *optional*):
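To make the `tokenizer` forms listed above concrete, here is a minimal sketch that passes a Hub model id; the 4-bit setting, the `c4` calibration dataset, and the GPT-2 checkpoint are illustrative assumptions, and actually running the quantization requires an optional GPTQ backend such as `auto-gptq`.

```python
from transformers import AutoModelForCausalLM, GPTQConfig

# Pass the tokenizer as a Hub model id (a tokenizer object or a local directory would also work).
quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer="openai-community/gpt2")

# Quantize while loading; needs a GPTQ backend installed to run.
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2", device_map="auto", quantization_config=quantization_config
)
```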
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index fe623d972c86f0..e2d25a28316219 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -70,7 +70,7 @@
 # default torch.distributed port
 DEFAULT_MASTER_PORT = "10999"
 
-T5_SMALL = "t5-small"
+T5_SMALL = "google-t5/t5-small"
 T5_TINY = "patrickvonplaten/t5-tiny-random"
 GPT2_TINY = "sshleifer/tiny-gpt2"
 GPTJ_TINY = "hf-internal-testing/tiny-random-gptj"
diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py
index e51fe1e7cfcca2..08c8b86dc07e93 100644
--- a/tests/deepspeed/test_model_zoo.py
+++ b/tests/deepspeed/test_model_zoo.py
@@ -50,7 +50,7 @@
 # default torch.distributed port
 DEFAULT_MASTER_PORT = "10999"
 
-T5_SMALL = "t5-small"
+T5_SMALL = "google-t5/t5-small"
 
 # *** Working Models ***
 ALBERT_TINY = "hf-internal-testing/tiny-albert"
@@ -105,7 +105,7 @@
 
 # issues with tokenizer
 CTRL_TINY = "hf-internal-testing/tiny-random-ctrl"
-TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl"  # same as ctrl
+TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl"  # same as Salesforce/ctrl
 
 # other issues with tiny models
 IBERT_TINY = "hf-internal-testing/tiny-random-ibert"  # multiple issues with either mlm/qa/clas
@@ -218,9 +218,9 @@ def make_task_cmds():
             "xlnet",
             # "hubert", # missing tokenizer files
             # "ibert", # multiple issues with either mlm/qa/clas
-            # "transfo-xl", # tokenizer issues as ctrl
-            # "ctrl", # tokenizer issues
-            # "openai-gpt", missing model files
+            # "transfo-xl", # tokenizer issues as Salesforce/ctrl
+            # "Salesforce/ctrl", # tokenizer issues
+            # "openai-community/openai-gpt", missing model files
             # "tapas", multiple issues
         ],
         "img_clas": [
diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py
index d883f29ed3698c..aa5b3537531dbe 100644
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -256,7 +256,7 @@ def run_cmd_and_get_logs(self, use_accelerate, sharding_strategy, launcher, scri
 
     def get_base_args(self, output_dir, num_epochs, logging_steps):
         return f"""
-            --model_name_or_path bert-base-cased
+            --model_name_or_path google-bert/bert-base-cased
             --task_name mrpc
             --output_dir {output_dir}
             --overwrite_output_dir
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index dc69a673eface2..7aabee4b521552 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -52,7 +52,7 @@ def test_save_load_config(self, config_name):
         self.assertEqual(loaded_config.max_time, None)
 
     def test_from_model_config(self):
-        model_config = AutoConfig.from_pretrained("gpt2")
+        model_config = AutoConfig.from_pretrained("openai-community/gpt2")
         generation_config_from_model = GenerationConfig.from_model_config(model_config)
         default_generation_config = GenerationConfig()
 
diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py
index 7efa4281b0937e..f4f13dd8d555ea 100644
--- a/tests/generation/test_framework_agnostic.py
+++ b/tests/generation/test_framework_agnostic.py
@@ -157,10 +157,10 @@ def test_transition_scores_greedy_search(self):
         is_pt = not model_cls.__name__.startswith("TF")
 
         articles = ["Justin Timberlake", "Michael Phelps"]
-        tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left")
+        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left")
         tokenizer.pad_token = tokenizer.eos_token
 
-        model = model_cls.from_pretrained("distilgpt2")
+        model = model_cls.from_pretrained("distilbert/distilgpt2")
         input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids
         if is_pt:
             model = model.to(torch_device)
@@ -193,10 +193,10 @@ def test_transition_scores_greedy_search_normalized(self):
         is_pt = not model_cls.__name__.startswith("TF")
 
         articles = ["Justin Timberlake", "Michael Phelps"]
-        tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left")
+        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", padding_side="left")
         tokenizer.pad_token = tokenizer.eos_token
 
-        model = model_cls.from_pretrained("distilgpt2")
+        model = model_cls.from_pretrained("distilbert/distilgpt2")
         input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids
         if is_pt:
             model = model.to(torch_device)
@@ -375,7 +375,7 @@ def test_transition_scores_early_stopping(self):
         is_pt = not model_cls.__name__.startswith("TF")
 
         input_ids = create_tensor_fn(2 * [[822, 10, 571, 33, 25, 58, 2625, 10, 27, 141, 3, 9, 307, 239, 6, 1]])
-        model = model_cls.from_pretrained("t5-small")
+        model = model_cls.from_pretrained("google-t5/t5-small")
         if is_pt:
             model = model.to(torch_device)
             input_ids = input_ids.to(torch_device)
diff --git a/tests/generation/test_streamers.py b/tests/generation/test_streamers.py
index 361f39e03e0f5c..c82a5e99e0ded0 100644
--- a/tests/generation/test_streamers.py
+++ b/tests/generation/test_streamers.py
@@ -89,8 +89,8 @@ def test_text_streamer_decode_kwargs(self):
         # Tests that we can pass `decode_kwargs` to the streamer to control how the tokens are decoded. Must be tested
         # with actual models -- the dummy models' tokenizers are not aligned with their models, and
         # `skip_special_tokens=True` has no effect on them
-        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-        model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(torch_device)
+        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+        model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(torch_device)
         model.config.eos_token_id = -1
 
         input_ids = torch.ones((1, 5), device=torch_device).long() * model.config.bos_token_id
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 4a13487cf8935d..c91ff7993a171b 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -2840,8 +2840,8 @@ def test_transition_scores_group_beam_search_encoder_decoder(self):
         self.assertTrue(torch.allclose(transition_scores_sum, outputs.sequences_scores, atol=1e-3))
 
     def test_beam_search_low_memory(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        model = AutoModelForCausalLM.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         tokenizer.pad_token_id = tokenizer.eos_token_id
         model_inputs = tokenizer("I", return_tensors="pt")["input_ids"]
 
@@ -2857,8 +2857,8 @@ def test_beam_search_example_integration(self):
         # PT-only test: TF doesn't have a BeamSearchScorer
         # exactly the example provided in the docstrings of beam search, which previously
         # failed after directly copying from it. Refer to PR #15555
-        tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         encoder_input_str = "translate English to German: How old are you?"
         encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
@@ -2898,8 +2898,8 @@ def test_beam_search_example_integration(self):
     @slow
     def test_constrained_beam_search(self):
         # PT-only test: TF doesn't have constrained beam search
-        model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device)
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         force_tokens = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids
         force_tokens_2 = tokenizer("big weapons", add_prefix_space=True, add_special_tokens=False).input_ids
@@ -2936,8 +2936,8 @@ def test_constrained_beam_search(self):
     @slow
     def test_constrained_beam_search_mixed(self):
         # PT-only test: TF doesn't have constrained beam search
-        model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device)
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         force_phrase = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids
         flexible_phrases = tokenizer(
@@ -2977,8 +2977,8 @@ def test_constrained_beam_search_mixed(self):
     @slow
     def test_constrained_beam_search_mixed_mixin(self):
         # PT-only test: TF doesn't have constrained beam search
-        model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device)
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         force_word = "scared"
         force_flexible = ["scream", "screams", "screaming", "screamed"]
@@ -3014,8 +3014,8 @@ def test_constrained_beam_search_mixed_mixin(self):
 
     @slow
     def test_cfg_mixin(self):
-        model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(torch_device)
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         input = tokenizer(["The dragon flew over Paris,"], return_tensors="pt", return_attention_mask=True)
         input["input_ids"] = input["input_ids"].to(torch_device)
@@ -3055,8 +3055,8 @@ def test_cfg_mixin(self):
     @slow
     def test_constrained_beam_search_example_translation_mixin(self):
         # PT-only test: TF doesn't have constrained beam search
-        tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         encoder_input_str = "translate English to German: How old are you?"
         force_words = ["sind"]
@@ -3080,8 +3080,8 @@ def test_constrained_beam_search_example_translation_mixin(self):
     @slow
     def test_constrained_beam_search_example_integration(self):
         # PT-only test: TF doesn't have constrained beam search
-        tokenizer = AutoTokenizer.from_pretrained("t5-base")
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
 
         encoder_input_str = "translate English to German: How old are you?"
         encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py
index 75c84ad0d3d3ff..823315bc6785bb 100644
--- a/tests/models/albert/test_modeling_albert.py
+++ b/tests/models/albert/test_modeling_albert.py
@@ -331,7 +331,7 @@ def test_model_from_pretrained(self):
 class AlbertModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_no_head_absolute_embedding(self):
-        model = AlbertModel.from_pretrained("albert-base-v2")
+        model = AlbertModel.from_pretrained("albert/albert-base-v2")
         input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
         attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
         with torch.no_grad():
diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py
index 0bdc8065bce9aa..956de9ebdc9e57 100644
--- a/tests/models/albert/test_modeling_flax_albert.py
+++ b/tests/models/albert/test_modeling_flax_albert.py
@@ -139,7 +139,7 @@ def setUp(self):
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("albert-base-v2")
+            model = model_class_name.from_pretrained("albert/albert-base-v2")
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
 
@@ -148,7 +148,7 @@ def test_model_from_pretrained(self):
 class FlaxAlbertModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_no_head_absolute_embedding(self):
-        model = FlaxAlbertModel.from_pretrained("albert-base-v2")
+        model = FlaxAlbertModel.from_pretrained("albert/albert-base-v2")
         input_ids = np.array([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
         attention_mask = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
         output = model(input_ids, attention_mask=attention_mask)[0]
diff --git a/tests/models/albert/test_modeling_tf_albert.py b/tests/models/albert/test_modeling_tf_albert.py
index 7314eb4749a8c0..7bea29fa9cb1d5 100644
--- a/tests/models/albert/test_modeling_tf_albert.py
+++ b/tests/models/albert/test_modeling_tf_albert.py
@@ -311,7 +311,7 @@ def test_model_from_pretrained(self):
 class TFAlbertModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_masked_lm(self):
-        model = TFAlbertForPreTraining.from_pretrained("albert-base-v2")
+        model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2")
         input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
         output = model(input_ids)[0]
 
diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py
index d9bb86bf29948c..343cba168f28ff 100644
--- a/tests/models/albert/test_tokenization_albert.py
+++ b/tests/models/albert/test_tokenization_albert.py
@@ -127,6 +127,6 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="albert-base-v2",
+            model_name="albert/albert-base-v2",
             revision="6b6560eaf5ff2e250b00c50f380c5389a9c2d82e",
         )
diff --git a/tests/models/auto/test_configuration_auto.py b/tests/models/auto/test_configuration_auto.py
index fa05952d29a32f..8b202b90921097 100644
--- a/tests/models/auto/test_configuration_auto.py
+++ b/tests/models/auto/test_configuration_auto.py
@@ -46,7 +46,7 @@ def test_module_spec(self):
         self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto"))
 
     def test_config_from_model_shortcut(self):
-        config = AutoConfig.from_pretrained("bert-base-uncased")
+        config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
         self.assertIsInstance(config, BertConfig)
 
     def test_config_model_type_from_local_file(self):
diff --git a/tests/models/auto/test_modeling_flax_auto.py b/tests/models/auto/test_modeling_flax_auto.py
index 5880551f54dac8..8880972e044e40 100644
--- a/tests/models/auto/test_modeling_flax_auto.py
+++ b/tests/models/auto/test_modeling_flax_auto.py
@@ -30,7 +30,7 @@
 class FlaxAutoModelTest(unittest.TestCase):
     @slow
     def test_bert_from_pretrained(self):
-        for model_name in ["bert-base-cased", "bert-large-uncased"]:
+        for model_name in ["google-bert/bert-base-cased", "google-bert/bert-large-uncased"]:
             with self.subTest(model_name):
                 config = AutoConfig.from_pretrained(model_name)
                 self.assertIsNotNone(config)
@@ -42,7 +42,7 @@ def test_bert_from_pretrained(self):
 
     @slow
     def test_roberta_from_pretrained(self):
-        for model_name in ["roberta-base", "roberta-large"]:
+        for model_name in ["FacebookAI/roberta-base", "FacebookAI/roberta-large"]:
             with self.subTest(model_name):
                 config = AutoConfig.from_pretrained(model_name)
                 self.assertIsNotNone(config)
@@ -54,7 +54,7 @@ def test_roberta_from_pretrained(self):
 
     @slow
     def test_bert_jax_jit(self):
-        for model_name in ["bert-base-cased", "bert-large-uncased"]:
+        for model_name in ["google-bert/bert-base-cased", "google-bert/bert-large-uncased"]:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = FlaxBertModel.from_pretrained(model_name)
             tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX)
@@ -67,7 +67,7 @@ def eval(**kwargs):
 
     @slow
     def test_roberta_jax_jit(self):
-        for model_name in ["roberta-base", "roberta-large"]:
+        for model_name in ["FacebookAI/roberta-base", "FacebookAI/roberta-large"]:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = FlaxRobertaModel.from_pretrained(model_name)
             tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX)
diff --git a/tests/models/auto/test_modeling_tf_auto.py b/tests/models/auto/test_modeling_tf_auto.py
index 9c284a78aee56c..e0758610871a86 100644
--- a/tests/models/auto/test_modeling_tf_auto.py
+++ b/tests/models/auto/test_modeling_tf_auto.py
@@ -85,7 +85,7 @@ class TFNewModel(TFBertModel):
 class TFAutoModelTest(unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):
-        model_name = "bert-base-cased"
+        model_name = "google-bert/bert-base-cased"
         config = AutoConfig.from_pretrained(model_name)
         self.assertIsNotNone(config)
         self.assertIsInstance(config, BertConfig)
@@ -96,7 +96,7 @@ def test_model_from_pretrained(self):
 
     @slow
     def test_model_for_pretraining_from_pretrained(self):
-        model_name = "bert-base-cased"
+        model_name = "google-bert/bert-base-cased"
         config = AutoConfig.from_pretrained(model_name)
         self.assertIsNotNone(config)
         self.assertIsInstance(config, BertConfig)
@@ -155,7 +155,7 @@ def test_model_for_encoder_decoder_lm(self):
     @slow
     def test_sequence_classification_model_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -167,7 +167,7 @@ def test_sequence_classification_model_from_pretrained(self):
     @slow
     def test_question_answering_model_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
diff --git a/tests/models/auto/test_modeling_tf_pytorch.py b/tests/models/auto/test_modeling_tf_pytorch.py
index 3e213f29562ab2..77b19a8e3a7976 100644
--- a/tests/models/auto/test_modeling_tf_pytorch.py
+++ b/tests/models/auto/test_modeling_tf_pytorch.py
@@ -75,7 +75,7 @@ class TFPTAutoModelTest(unittest.TestCase):
     @slow
     def test_model_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -91,7 +91,7 @@ def test_model_from_pretrained(self):
     @slow
     def test_model_for_pretraining_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -185,7 +185,7 @@ def test_model_for_encoder_decoder_lm(self):
     @slow
     def test_sequence_classification_model_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
@@ -201,7 +201,7 @@ def test_sequence_classification_model_from_pretrained(self):
     @slow
     def test_question_answering_model_from_pretrained(self):
         # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-        for model_name in ["bert-base-uncased"]:
+        for model_name in ["google-bert/bert-base-uncased"]:
             config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py
index 597c995b6e3227..8ebf834f12ae08 100644
--- a/tests/models/auto/test_tokenization_auto.py
+++ b/tests/models/auto/test_tokenization_auto.py
@@ -176,12 +176,14 @@ def test_model_name_edge_cases_in_mappings(self):
 
     @require_tokenizers
     def test_from_pretrained_use_fast_toggle(self):
-        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer)
-        self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast)
+        self.assertIsInstance(
+            AutoTokenizer.from_pretrained("google-bert/bert-base-cased", use_fast=False), BertTokenizer
+        )
+        self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)
 
     @require_tokenizers
     def test_do_lower_case(self):
-        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=False)
+        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)
         sample = "Hello, world. How are you?"
         tokens = tokenizer.tokenize(sample)
         self.assertEqual("[UNK]", tokens[0])
@@ -211,15 +213,15 @@ def test_auto_tokenizer_from_local_folder(self):
         self.assertEqual(tokenizer2.vocab_size, 12)
 
     def test_auto_tokenizer_fast_no_slow(self):
-        tokenizer = AutoTokenizer.from_pretrained("ctrl")
+        tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
         # There is no fast CTRL so this always gives us a slow tokenizer.
         self.assertIsInstance(tokenizer, CTRLTokenizer)
 
     def test_get_tokenizer_config(self):
         # Check we can load the tokenizer config of an online model.
-        config = get_tokenizer_config("bert-base-cased")
+        config = get_tokenizer_config("google-bert/bert-base-cased")
         _ = config.pop("_commit_hash", None)
-        # If we ever update bert-base-cased tokenizer config, this dict here will need to be updated.
+        # If we ever update google-bert/bert-base-cased tokenizer config, this dict here will need to be updated.
         self.assertEqual(config, {"do_lower_case": False})
 
         # This model does not have a tokenizer_config so we get back an empty dict.
diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 2601c92cfb76df..bc38356852935b 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -627,7 +627,7 @@ def test_torchscript_device_change(self):
 class BertModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_no_head_absolute_embedding(self):
-        model = BertModel.from_pretrained("bert-base-uncased")
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
         input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
         attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
         with torch.no_grad():
diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py
index 822689917513eb..fca54dbed84c3e 100644
--- a/tests/models/bert/test_modeling_flax_bert.py
+++ b/tests/models/bert/test_modeling_flax_bert.py
@@ -158,6 +158,6 @@ def setUp(self):
     def test_model_from_pretrained(self):
         # Only check this for base model, not necessary for all model classes.
         # This will also help speed up tests.
-        model = FlaxBertModel.from_pretrained("bert-base-cased")
+        model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased")
         outputs = model(np.ones((1, 1)))
         self.assertIsNotNone(outputs)
diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py
index f9383756e3b2de..bee1ccf0d1500e 100644
--- a/tests/models/bert/test_tokenization_bert.py
+++ b/tests/models/bert/test_tokenization_bert.py
@@ -242,7 +242,7 @@ def test_clean_text(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
diff --git a/tests/models/bert/test_tokenization_bert_tf.py b/tests/models/bert/test_tokenization_bert_tf.py
index 16ac1d4867e3d3..f950e7439c331d 100644
--- a/tests/models/bert/test_tokenization_bert_tf.py
+++ b/tests/models/bert/test_tokenization_bert_tf.py
@@ -16,7 +16,7 @@
     from transformers.models.bert import TFBertTokenizer
 
 
-TOKENIZER_CHECKPOINTS = ["bert-base-uncased", "bert-base-cased"]
+TOKENIZER_CHECKPOINTS = ["google-bert/bert-base-uncased", "google-bert/bert-base-cased"]
 TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"
 
 if is_tf_available():
diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index cedf7492cfb22c..d2a7accb3900ea 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -488,7 +488,7 @@ def test_tokenizer_mismatch_warning(self):
                     " is called from."
                 )
             )
-        EXAMPLE_BERT_ID = "bert-base-cased"
+        EXAMPLE_BERT_ID = "google-bert/bert-base-cased"
         with self.assertLogs("transformers", level="WARNING") as cm:
             BertJapaneseTokenizer.from_pretrained(EXAMPLE_BERT_ID)
             self.assertTrue(
diff --git a/tests/models/camembert/test_modeling_camembert.py b/tests/models/camembert/test_modeling_camembert.py
index a15ab8caa2318c..f2fba59496da4f 100644
--- a/tests/models/camembert/test_modeling_camembert.py
+++ b/tests/models/camembert/test_modeling_camembert.py
@@ -31,7 +31,7 @@
 class CamembertModelIntegrationTest(unittest.TestCase):
     @slow
     def test_output_embeds_base_model(self):
-        model = CamembertModel.from_pretrained("camembert-base")
+        model = CamembertModel.from_pretrained("almanach/camembert-base")
         model.to(torch_device)
 
         input_ids = torch.tensor(
diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py
index 7f72d304d5c09a..33254b96de8d56 100644
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -128,7 +128,7 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="camembert-base",
+            model_name="almanach/camembert-base",
             revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf",
             sequences=sequences,
         )
diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index db41052d4cd0e2..2e0f41da4d5bd0 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -50,7 +50,7 @@ class DPRReaderTokenizationTest(BertTokenizationTest):
 
     @slow
     def test_decode_best_spans(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
 
         text_1 = tokenizer.encode("question sequence", add_special_tokens=False)
         text_2 = tokenizer.encode("title sequence", add_special_tokens=False)
@@ -73,7 +73,7 @@ def test_decode_best_spans(self):
 
     @slow
     def test_call(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
 
         text_1 = tokenizer.encode("question sequence", add_special_tokens=False)
         text_2 = tokenizer.encode("title sequence", add_special_tokens=False)
diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
index 25444d7d32ffa6..2ff3e3aa5094b1 100644
--- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
+++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
@@ -671,7 +671,9 @@ def test_real_model_save_load_from_pretrained(self):
 @require_torch
 class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model(self):
-        return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "google-bert/bert-base-cased"
+        )
 
     def get_encoder_decoder_model(self, config, decoder_config):
         encoder_model = BertModel(config)
@@ -937,7 +939,9 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "FacebookAI/roberta-base", "FacebookAI/roberta-base"
+        )
 
 
 @require_torch
@@ -994,7 +998,9 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "openai-community/gpt2"
+        )
 
     def test_encoder_decoder_model_shared_weights(self):
         pass
@@ -1004,8 +1010,8 @@ def test_bert2gpt2_summarization(self):
         model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
 
         model.to(torch_device)
-        tokenizer_in = AutoTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_out = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         ARTICLE_STUDENTS = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents."""
 
@@ -1067,7 +1073,7 @@ def prepare_config_and_inputs(self):
 
     def get_pretrained_model(self):
         return EncoderDecoderModel.from_encoder_decoder_pretrained(
-            "bert-large-uncased", "microsoft/prophetnet-large-uncased"
+            "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
         )
 
     def test_encoder_decoder_model_shared_weights(self):
@@ -1122,7 +1128,9 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "facebook/bart-large")
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-large-uncased", "facebook/bart-large"
+        )
 
     def test_encoder_decoder_model_shared_weights(self):
         pass
@@ -1131,10 +1139,12 @@ def test_encoder_decoder_model_shared_weights(self):
 @require_torch
 class EncoderDecoderModelTest(unittest.TestCase):
     def get_from_encoderdecoder_pretrained_model(self):
-        return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
+        return EncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
+        )
 
     def get_decoder_config(self):
-        config = AutoConfig.from_pretrained("bert-base-uncased")
+        config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
         config.is_decoder = True
         config.add_cross_attention = True
         return config
@@ -1143,8 +1153,10 @@ def get_encoderdecoder_model(self):
         return EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
 
     def get_encoder_decoder_models(self):
-        encoder_model = BertModel.from_pretrained("bert-base-uncased")
-        decoder_model = BertLMHeadModel.from_pretrained("bert-base-uncased", config=self.get_decoder_config())
+        encoder_model = BertModel.from_pretrained("google-bert/bert-base-uncased")
+        decoder_model = BertLMHeadModel.from_pretrained(
+            "google-bert/bert-base-uncased", config=self.get_decoder_config()
+        )
         return {"encoder": encoder_model, "decoder": decoder_model}
 
     def _check_configuration_tie(self, model):
diff --git a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
index 362a5f74a1b6ad..c8f76a144be703 100644
--- a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
+++ b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py
@@ -483,12 +483,14 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "openai-community/gpt2"
+        )
 
     @slow
     def test_bert2gpt2_summarization(self):
-        tokenizer_in = AutoTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_out = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         model = FlaxEncoderDecoderModel.from_pretrained(
             "patrickvonplaten/bert2gpt2-cnn_dailymail-fp16", pad_token_id=tokenizer_out.eos_token_id
@@ -539,7 +541,9 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "facebook/bart-base")
+        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "facebook/bart-base"
+        )
 
 
 @require_flax
@@ -576,13 +580,17 @@ def prepare_config_and_inputs(self):
         }
 
     def get_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
+        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "google-bert/bert-base-cased"
+        )
 
 
 @require_flax
 class FlaxEncoderDecoderModelTest(unittest.TestCase):
     def get_from_encoderdecoder_pretrained_model(self):
-        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
+        return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "openai-community/gpt2"
+        )
 
     def _check_configuration_tie(self, model):
         module = model.module.bind(model.params)
diff --git a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py
index a9d32474c3dd97..99a09ada169b69 100644
--- a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py
+++ b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py
@@ -764,7 +764,7 @@ def prepare_config_and_inputs(self):
     def test_bert2bert_summarization(self):
         from transformers import EncoderDecoderModel
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         """Not working, because pt checkpoint has `encoder.encoder.layer...` while tf model has `encoder.bert.encoder.layer...`.
         (For Bert decoder, there is no issue, because `BertModel` is wrapped into `decoder` as `bert`)
@@ -864,8 +864,8 @@ def prepare_config_and_inputs(self):
     def test_bert2gpt2_summarization(self):
         from transformers import EncoderDecoderModel
 
-        tokenizer_in = AutoTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_out = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         """Not working, because pt checkpoint has `encoder.encoder.layer...` while tf model has `encoder.bert.encoder.layer...`.
         (For GPT2 decoder, there is no issue)
@@ -1016,10 +1016,12 @@ def prepare_config_and_inputs(self):
 @require_tf
 class TFEncoderDecoderModelTest(unittest.TestCase):
     def get_from_encoderdecoder_pretrained_model(self):
-        return TFEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
+        return TFEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google-bert/bert-base-cased", "google-bert/bert-base-cased"
+        )
 
     def get_decoder_config(self):
-        config = AutoConfig.from_pretrained("bert-base-cased")
+        config = AutoConfig.from_pretrained("google-bert/bert-base-cased")
         config.is_decoder = True
         config.add_cross_attention = True
         return config
@@ -1028,9 +1030,9 @@ def get_encoderdecoder_model(self):
         return TFEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
 
     def get_encoder_decoder_models(self):
-        encoder_model = TFBertModel.from_pretrained("bert-base-cased", name="encoder")
+        encoder_model = TFBertModel.from_pretrained("google-bert/bert-base-cased", name="encoder")
         decoder_model = TFBertLMHeadModel.from_pretrained(
-            "bert-base-cased", config=self.get_decoder_config(), name="decoder"
+            "google-bert/bert-base-cased", config=self.get_decoder_config(), name="decoder"
         )
         return {"encoder": encoder_model, "decoder": decoder_model}
 
@@ -1055,8 +1057,10 @@ def test_configuration_tie(self):
 @require_tf
 class TFEncoderDecoderModelSaveLoadTests(unittest.TestCase):
     def get_encoder_decoder_config(self):
-        encoder_config = AutoConfig.from_pretrained("bert-base-uncased")
-        decoder_config = AutoConfig.from_pretrained("bert-base-uncased", is_decoder=True, add_cross_attention=True)
+        encoder_config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
+        decoder_config = AutoConfig.from_pretrained(
+            "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True
+        )
         return EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
 
     def get_encoder_decoder_config_small(self):
@@ -1160,8 +1164,8 @@ def test_encoder_decoder_from_pretrained(self):
         load_weight_prefix = TFEncoderDecoderModel.load_weight_prefix
 
         config = self.get_encoder_decoder_config()
-        encoder_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        decoder_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        encoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        decoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         input_ids = encoder_tokenizer("who sings does he love me with reba", return_tensors="tf").input_ids
         decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids
@@ -1173,10 +1177,10 @@ def test_encoder_decoder_from_pretrained(self):
             # So we create pretrained models (without `load_weight_prefix`), save them, and later,
             # we load them using `from_pretrained`.
             # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder)
-            encoder = TFAutoModel.from_pretrained("bert-base-uncased", name="encoder")
+            encoder = TFAutoModel.from_pretrained("google-bert/bert-base-uncased", name="encoder")
             # It's necessary to specify `add_cross_attention=True` here.
             decoder = TFAutoModelForCausalLM.from_pretrained(
-                "bert-base-uncased", is_decoder=True, add_cross_attention=True, name="decoder"
+                "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True, name="decoder"
             )
             pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder")
             pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder")
diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py
index 1e24ad0b00d034..fbf2d6c333fd8a 100644
--- a/tests/models/gpt2/test_modeling_flax_gpt2.py
+++ b/tests/models/gpt2/test_modeling_flax_gpt2.py
@@ -237,10 +237,10 @@ def test_bool_attention_mask_in_generation(self):
 
     @slow
     def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="", padding_side="left")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token="", padding_side="left")
         inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
 
-        model = FlaxGPT2LMHeadModel.from_pretrained("gpt2")
+        model = FlaxGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
         model.do_sample = False
         model.config.pad_token_id = model.config.eos_token_id
 
@@ -359,6 +359,6 @@ def test_equivalence_flax_to_pt(self):
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("gpt2", from_pt=True)
+            model = model_class_name.from_pretrained("openai-community/gpt2", from_pt=True)
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 245b29d56a6cf3..c9ecbdde6673a1 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -98,7 +98,7 @@ def __init__(
         self.pad_token_id = vocab_size - 1
 
     def get_large_model_config(self):
-        return GPT2Config.from_pretrained("gpt2")
+        return GPT2Config.from_pretrained("openai-community/gpt2")
 
     def prepare_config_and_inputs(
         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
@@ -582,9 +582,9 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
 
     @slow
     def test_batch_generation(self):
-        model = GPT2LMHeadModel.from_pretrained("gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
         model.to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         tokenizer.padding_side = "left"
 
@@ -641,9 +641,9 @@ def test_batch_generation(self):
 
     @slow
     def test_batch_generation_2heads(self):
-        model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
+        model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
         model.to(torch_device)
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         tokenizer.padding_side = "left"
 
@@ -722,7 +722,7 @@ def _test_lm_generate_gpt2_helper(
         verify_outputs=True,
     ):
         model = GPT2LMHeadModel.from_pretrained(
-            "gpt2",
+            "openai-community/gpt2",
             reorder_and_upcast_attn=reorder_and_upcast_attn,
             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
         )
@@ -759,8 +759,8 @@ def test_lm_generate_gpt2_with_scale_attn_by_inverse_layer_idx(self):
 
     @slow
     def test_gpt2_sample(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        model = GPT2LMHeadModel.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
         model.to(torch_device)
 
         torch.manual_seed(0)
@@ -787,8 +787,8 @@ def test_gpt2_sample(self):
 
     @slow
     def test_gpt2_sample_max_time(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        model = GPT2LMHeadModel.from_pretrained("gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
         model.to(torch_device)
 
         torch.manual_seed(0)
@@ -833,8 +833,8 @@ def test_contrastive_search_gpt2(self):
             "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based"
         )
 
-        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
-        gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-large").to(torch_device)
+        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large")
+        gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large").to(torch_device)
         input_ids = gpt2_tokenizer(article, return_tensors="pt").input_ids.to(torch_device)
 
         outputs = gpt2_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256)
diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py
index d636097dc28622..060d4b71985bc8 100644
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ b/tests/models/gpt2/test_modeling_tf_gpt2.py
@@ -461,8 +461,8 @@ def test_onnx_compliancy(self):
 class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_greedy_distilgpt2_batch_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -488,8 +488,8 @@ def test_lm_generate_greedy_distilgpt2_batch_special(self):
 
     @slow
     def test_lm_generate_sample_distilgpt2_batch_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -522,8 +522,8 @@ def test_lm_generate_sample_distilgpt2_batch_special(self):
 
     @slow
     def test_lm_generate_greedy_distilgpt2_beam_search_special(self):
-        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -550,8 +550,8 @@ def test_lm_generate_greedy_distilgpt2_beam_search_special(self):
     @slow
     def test_lm_generate_distilgpt2_left_padding(self):
         """Tests that the generated text is the same, regarless of left padding"""
-        model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -582,8 +582,8 @@ def test_lm_generate_distilgpt2_left_padding(self):
 
     @slow
     def test_lm_generate_gpt2_greedy_xla(self):
-        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -612,8 +612,8 @@ def test_lm_generate_gpt2_sample_xla(self):
 
         # forces the generation to happen on CPU, to avoid GPU-related quirks
         with tf.device(":/CPU:0"):
-            model = TFGPT2LMHeadModel.from_pretrained("gpt2")
-            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+            model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
+            tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
             tokenizer.pad_token = tokenizer.eos_token
             tokenizer.padding_side = "left"
@@ -642,8 +642,8 @@ def test_lm_generate_gpt2_sample_xla(self):
 
     @slow
     def test_lm_generate_gpt2_beam_search_xla(self):
-        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2")
+        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
 
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.padding_side = "left"
@@ -671,8 +671,8 @@ def test_contrastive_search_gpt2(self):
             "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based"
         )
 
-        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
-        gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2-large")
+        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large")
+        gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
         input_ids = gpt2_tokenizer(article, return_tensors="tf")
 
         outputs = gpt2_model.generate(**input_ids, penalty_alpha=0.6, top_k=4, max_length=256)
@@ -705,8 +705,8 @@ def test_contrastive_search_gpt2_xla(self):
             "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based"
         )
 
-        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
-        gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2-large")
+        gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large")
+        gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
         input_ids = gpt2_tokenizer(article, return_tensors="tf")
 
         xla_generate = tf.function(gpt2_model.generate, jit_compile=True)
diff --git a/tests/models/gpt2/test_tokenization_gpt2_tf.py b/tests/models/gpt2/test_tokenization_gpt2_tf.py
index a3eac86fa604ec..0cea50db3188b2 100644
--- a/tests/models/gpt2/test_tokenization_gpt2_tf.py
+++ b/tests/models/gpt2/test_tokenization_gpt2_tf.py
@@ -15,8 +15,8 @@
     from transformers.models.gpt2 import TFGPT2Tokenizer
 
 
-TOKENIZER_CHECKPOINTS = ["gpt2"]
-TINY_MODEL_CHECKPOINT = "gpt2"
+TOKENIZER_CHECKPOINTS = ["openai-community/gpt2"]
+TINY_MODEL_CHECKPOINT = "openai-community/gpt2"
 
 if is_tf_available():
 
diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
index 58574a8b1da3ea..ca41495a842c77 100644
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
@@ -202,7 +202,9 @@ def test_use_cache_forward_with_attn_mask(self):
 
     @slow
     def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|endoftext|>", padding_side="left")
+        tokenizer = GPT2Tokenizer.from_pretrained(
+            "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left"
+        )
         inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
 
         model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py
index 48061f84d86cbe..aa3b7a99aa0fdf 100644
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ b/tests/models/gptj/test_modeling_flax_gptj.py
@@ -199,7 +199,9 @@ def test_use_cache_forward_with_attn_mask(self):
 
     @tooslow
     def test_batch_generation(self):
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="<|endoftext|>", padding_side="left")
+        tokenizer = GPT2Tokenizer.from_pretrained(
+            "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left"
+        )
         inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
 
         model = FlaxGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py
index 32dc0f952fee55..42524ca65a67aa 100644
--- a/tests/models/longformer/test_tokenization_longformer.py
+++ b/tests/models/longformer/test_tokenization_longformer.py
@@ -28,7 +28,7 @@
 
 
 @require_tokenizers
-# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
+# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with FacebookAI/roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
 class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     # Ignore copy
     tokenizer_class = LongformerTokenizer
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index 9d2af513e1a406..e793a9a507093d 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -1373,7 +1373,7 @@ def test_training_new_tokenizer(self):
         inputs = new_tokenizer(text, xpaths=xpaths)
         self.assertEqual(len(inputs["input_ids"]), 2)
         decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
-        expected_result = (  # original expected result "this is the" seems contradicts to roberta-based tokenizer
+        expected_result = (  # original expected result "this is the" seems to contradict the roberta-based tokenizer
             "thisisthe"
         )
 
diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py
index babed7a8d9bfdc..92ddd88684b790 100644
--- a/tests/models/mobilebert/test_tokenization_mobilebert.py
+++ b/tests/models/mobilebert/test_tokenization_mobilebert.py
@@ -258,7 +258,7 @@ def test_clean_text(self):
         )
 
     @slow
-    # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_sequence_builders with bert-base-uncased->google/mobilebert-uncased
+    # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_sequence_builders with google-bert/bert-base-uncased->google/mobilebert-uncased
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("google/mobilebert-uncased")
 
diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py
index ac34bcce7b9548..9e7dd443e2b8c2 100644
--- a/tests/models/mt5/test_modeling_mt5.py
+++ b/tests/models/mt5/test_modeling_mt5.py
@@ -104,7 +104,7 @@ def __init__(
         self.decoder_layers = decoder_layers
 
     def get_large_model_config(self):
-        return MT5Config.from_pretrained("t5-base")
+        return MT5Config.from_pretrained("google-t5/t5-base")
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
@@ -940,7 +940,7 @@ def __init__(
         self.is_training = is_training
 
     def get_large_model_config(self):
-        return MT5Config.from_pretrained("t5-base")
+        return MT5Config.from_pretrained("google-t5/t5-base")
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py
index 98d74ee5f8070d..718c224bf04895 100644
--- a/tests/models/openai/test_modeling_openai.py
+++ b/tests/models/openai/test_modeling_openai.py
@@ -279,7 +279,7 @@ def test_model_from_pretrained(self):
 class OPENAIGPTModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_openai_gpt(self):
-        model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
+        model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")
         model.to(torch_device)
         input_ids = torch.tensor([[481, 4735, 544]], dtype=torch.long, device=torch_device)  # the president is
         expected_output_ids = [
diff --git a/tests/models/openai/test_modeling_tf_openai.py b/tests/models/openai/test_modeling_tf_openai.py
index 231758064f2d18..6704ec97532b33 100644
--- a/tests/models/openai/test_modeling_tf_openai.py
+++ b/tests/models/openai/test_modeling_tf_openai.py
@@ -262,7 +262,7 @@ def test_model_from_pretrained(self):
 class TFOPENAIGPTModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_openai_gpt(self):
-        model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
+        model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")
         input_ids = tf.convert_to_tensor([[481, 4735, 544]], dtype=tf.int32)  # the president is
         expected_output_ids = [
             481,
diff --git a/tests/models/pix2struct/test_processor_pix2struct.py b/tests/models/pix2struct/test_processor_pix2struct.py
index 318e6f301f6eb8..88335296f03590 100644
--- a/tests/models/pix2struct/test_processor_pix2struct.py
+++ b/tests/models/pix2struct/test_processor_pix2struct.py
@@ -41,7 +41,7 @@ def setUp(self):
         self.tmpdirname = tempfile.mkdtemp()
 
         image_processor = Pix2StructImageProcessor()
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         processor = Pix2StructProcessor(image_processor, tokenizer)
 
diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py
index d10abb733e07a9..e8c6d17986d2d5 100644
--- a/tests/models/qdqbert/test_modeling_qdqbert.py
+++ b/tests/models/qdqbert/test_modeling_qdqbert.py
@@ -563,7 +563,7 @@ def test_inference_no_head_absolute_embedding(self):
         quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
         quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
 
-        model = QDQBertModel.from_pretrained("bert-base-uncased")
+        model = QDQBertModel.from_pretrained("google-bert/bert-base-uncased")
         input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
         attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
         output = model(input_ids, attention_mask=attention_mask)[0]
diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py
index 6a5a3878fd4354..7dbd8df6ef29f6 100644
--- a/tests/models/realm/test_tokenization_realm.py
+++ b/tests/models/realm/test_tokenization_realm.py
@@ -236,7 +236,7 @@ def test_clean_text(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py
index f82479aa706fd0..d205a0e75f8035 100644
--- a/tests/models/roberta/test_modeling_flax_roberta.py
+++ b/tests/models/roberta/test_modeling_flax_roberta.py
@@ -154,6 +154,6 @@ def setUp(self):
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("roberta-base", from_pt=True)
+            model = model_class_name.from_pretrained("FacebookAI/roberta-base", from_pt=True)
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py
index 6cacf605a26a03..402d60d37a42a4 100644
--- a/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/models/roberta/test_modeling_roberta.py
@@ -527,7 +527,7 @@ def test_create_position_ids_from_inputs_embeds(self):
 class RobertaModelIntegrationTest(TestCasePlus):
     @slow
     def test_inference_masked_lm(self):
-        model = RobertaForMaskedLM.from_pretrained("roberta-base")
+        model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
 
         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         with torch.no_grad():
@@ -547,7 +547,7 @@ def test_inference_masked_lm(self):
 
     @slow
     def test_inference_no_head(self):
-        model = RobertaModel.from_pretrained("roberta-base")
+        model = RobertaModel.from_pretrained("FacebookAI/roberta-base")
 
         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         with torch.no_grad():
@@ -565,7 +565,7 @@ def test_inference_no_head(self):
 
     @slow
     def test_inference_classification_head(self):
-        model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
+        model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large-mnli")
 
         input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         with torch.no_grad():
diff --git a/tests/models/roberta/test_modeling_tf_roberta.py b/tests/models/roberta/test_modeling_tf_roberta.py
index 2f2859391ad3af..37377ab5ba52e6 100644
--- a/tests/models/roberta/test_modeling_tf_roberta.py
+++ b/tests/models/roberta/test_modeling_tf_roberta.py
@@ -666,7 +666,7 @@ def test_model_from_pretrained(self):
 class TFRobertaModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_masked_lm(self):
-        model = TFRobertaForMaskedLM.from_pretrained("roberta-base")
+        model = TFRobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
@@ -680,7 +680,7 @@ def test_inference_masked_lm(self):
 
     @slow
     def test_inference_no_head(self):
-        model = TFRobertaModel.from_pretrained("roberta-base")
+        model = TFRobertaModel.from_pretrained("FacebookAI/roberta-base")
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
@@ -692,7 +692,7 @@ def test_inference_no_head(self):
 
     @slow
     def test_inference_classification_head(self):
-        model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli")
+        model = TFRobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large-mnli")
 
         input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         output = model(input_ids)[0]
diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py
index 3190ab13be4ea1..5d457c4cb4446d 100644
--- a/tests/models/roberta/test_tokenization_roberta.py
+++ b/tests/models/roberta/test_tokenization_roberta.py
@@ -105,7 +105,7 @@ def roberta_dict_integration_testing(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("roberta-base")
+        tokenizer = self.tokenizer_class.from_pretrained("FacebookAI/roberta-base")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
index 65dbe65974d4c4..0074323460a9f3 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py
@@ -134,7 +134,7 @@ def prepare_config_and_inputs_for_decoder(self):
 
 
 @require_flax
-# Copied from tests.models.roberta.test_modeling_flax_roberta.FlaxRobertaModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta-base->andreasmadsen/efficient_mlm_m0.40
+# Copied from tests.models.roberta.test_modeling_flax_roberta.FlaxRobertaModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40
 class FlaxRobertaPreLayerNormModelTest(FlaxModelTesterMixin, unittest.TestCase):
     test_head_masking = True
 
diff --git a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
index f2c75e702bf765..62ce0d660a0abc 100644
--- a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
+++ b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py
@@ -578,7 +578,7 @@ def test_real_model_save_load_from_pretrained(self):
 class FlaxWav2Vec2GPT2ModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model_and_inputs(self):
         model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-large-lv60", "gpt2-medium"
+            "facebook/wav2vec2-large-lv60", "openai-community/gpt2-medium"
         )
         batch_size = 13
         input_values = floats_tensor([batch_size, 512], scale=1.0)
@@ -812,7 +812,7 @@ def test_flaxwav2vec2bart_pt_flax_equivalence(self):
 class FlaxWav2Vec2BertModelTest(FlaxEncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model_and_inputs(self):
         model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-large-lv60", "bert-large-uncased"
+            "facebook/wav2vec2-large-lv60", "google-bert/bert-large-uncased"
         )
         batch_size = 13
         input_values = floats_tensor([batch_size, 512], model.config.encoder.vocab_size)
diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
index 368232331a2ac0..c3503702c2ac82 100644
--- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
+++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py
@@ -445,7 +445,7 @@ def test_real_model_save_load_from_pretrained(self):
 class Wav2Vec2BertModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model_and_inputs(self):
         model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/wav2vec2-base-960h", "bert-base-cased"
+            "facebook/wav2vec2-base-960h", "google-bert/bert-base-cased"
         )
         batch_size = 13
         input_values = floats_tensor([batch_size, 512], scale=1.0)
@@ -509,7 +509,7 @@ def prepare_config_and_inputs(self):
 class Speech2TextBertModelTest(EncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model_and_inputs(self):
         model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "facebook/s2t-small-librispeech-asr", "bert-base-cased"
+            "facebook/s2t-small-librispeech-asr", "google-bert/bert-base-cased"
         )
         batch_size = 13
         input_features = floats_tensor([batch_size, 7, 80], scale=1.0)
diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py
index aa226f82ae3606..b21fa405c39f9c 100644
--- a/tests/models/switch_transformers/test_modeling_switch_transformers.py
+++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py
@@ -1065,7 +1065,7 @@ def test_small_generate(self):
         model = SwitchTransformersForConditionalGeneration.from_pretrained(
             "google/switch-base-8", torch_dtype=torch.bfloat16
         ).eval()
-        tokenizer = AutoTokenizer.from_pretrained("t5-small", use_fast=False, legacy=False)
+        tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", use_fast=False, legacy=False)
         model = model.to(torch_device)
 
         input_ids = tokenizer(
@@ -1093,7 +1093,7 @@ def test_small_batch_generate(self):
         model = SwitchTransformersForConditionalGeneration.from_pretrained(
             "google/switch-base-8", torch_dtype=torch.bfloat16
         ).eval()
-        tokenizer = AutoTokenizer.from_pretrained("t5-small", use_fast=False, legacy=False)
+        tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", use_fast=False, legacy=False)
 
         inputs = [
             "A  walks into a bar and orders a  with  pinch of ."
diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py
index d5d729dac9aff8..204b84989be0f5 100644
--- a/tests/models/t5/test_modeling_flax_t5.py
+++ b/tests/models/t5/test_modeling_flax_t5.py
@@ -773,8 +773,8 @@ def test_small_integration_test(self):
         >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
         """
 
-        model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("Hello there", return_tensors="np").input_ids
         labels = tokenizer("Hi I am", return_tensors="np").input_ids
@@ -849,11 +849,11 @@ def test_small_byt5_integration_test(self):
 
     @slow
     def test_small_generation(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
+        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
         model.config.max_length = 8
         model.config.num_beams = 1
         model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("summarize: Hello there", return_tensors="np").input_ids
 
@@ -864,11 +864,11 @@ def test_small_generation(self):
 
     @slow
     def test_small_generation_bfloat16(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small", dtype=jnp.bfloat16)
+        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-small", dtype=jnp.bfloat16)
         model.config.max_length = 8
         model.config.num_beams = 1
         model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("summarize: Hello there", return_tensors="np").input_ids
 
@@ -879,8 +879,8 @@ def test_small_generation_bfloat16(self):
 
     @slow
     def test_summarization(self):
-        model = FlaxT5ForConditionalGeneration.from_pretrained("t5-base")
-        tok = T5Tokenizer.from_pretrained("t5-base")
+        model = FlaxT5ForConditionalGeneration.from_pretrained("google-t5/t5-base")
+        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
 
         FRANCE_ARTICLE = (  # @noqa
             "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index 9defe3b23ef68b..c0a43dfeab69cc 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -108,7 +108,7 @@ def __init__(
         self.decoder_layers = decoder_layers
 
     def get_large_model_config(self):
-        return T5Config.from_pretrained("t5-base")
+        return T5Config.from_pretrained("google-t5/t5-base")
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
@@ -942,7 +942,7 @@ def __init__(
         self.is_training = is_training
 
     def get_large_model_config(self):
-        return T5Config.from_pretrained("t5-base")
+        return T5Config.from_pretrained("google-t5/t5-base")
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
@@ -1096,36 +1096,40 @@ def import_accelerate_mock(name, *args, **kwargs):
         with unittest.mock.patch("builtins.__import__", side_effect=import_accelerate_mock):
             accelerate_available = False
 
-            model = T5ForConditionalGeneration.from_pretrained("t5-small", torch_dtype=torch.float16)
+            model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", torch_dtype=torch.float16)
             self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
             self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
 
             # Load without in bf16
-            model = T5ForConditionalGeneration.from_pretrained("t5-small", torch_dtype=torch.bfloat16)
+            model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", torch_dtype=torch.bfloat16)
             self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
             self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
 
         # Load using `accelerate` in bf16
-        model = T5ForConditionalGeneration.from_pretrained("t5-small", torch_dtype=torch.bfloat16, device_map="auto")
+        model = T5ForConditionalGeneration.from_pretrained(
+            "google-t5/t5-small", torch_dtype=torch.bfloat16, device_map="auto"
+        )
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
 
         # Load using `accelerate` in bf16
         model = T5ForConditionalGeneration.from_pretrained(
-            "t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+            "google-t5/t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
         )
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
 
         # Load without using `accelerate`
         model = T5ForConditionalGeneration.from_pretrained(
-            "t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True
+            "google-t5/t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True
         )
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
 
         # Load using `accelerate`
-        model = T5ForConditionalGeneration.from_pretrained("t5-small", torch_dtype=torch.float16, device_map="auto")
+        model = T5ForConditionalGeneration.from_pretrained(
+            "google-t5/t5-small", torch_dtype=torch.float16, device_map="auto"
+        )
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
 
@@ -1136,11 +1140,11 @@ def import_accelerate_mock(name, *args, **kwargs):
 class T5ModelIntegrationTests(unittest.TestCase):
     @cached_property
     def model(self):
-        return T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device)
+        return T5ForConditionalGeneration.from_pretrained("google-t5/t5-base").to(torch_device)
 
     @cached_property
     def tokenizer(self):
-        return T5Tokenizer.from_pretrained("t5-base")
+        return T5Tokenizer.from_pretrained("google-t5/t5-base")
 
     @slow
     def test_torch_quant(self):
@@ -1157,11 +1161,11 @@ def test_torch_quant(self):
 
     @slow
     def test_small_generation(self):
-        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(torch_device)
+        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(torch_device)
         model.config.max_length = 8
         model.config.num_beams = 1
         model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("summarize: Hello there", return_tensors="pt").input_ids.to(torch_device)
 
@@ -1184,8 +1188,8 @@ def test_small_integration_test(self):
         >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
         """
 
-        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(torch_device)
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(torch_device)
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("Hello there", return_tensors="pt").input_ids
         labels = tokenizer("Hi I am", return_tensors="pt").input_ids
@@ -1501,7 +1505,7 @@ def test_translation_en_to_de(self):
 
     @slow
     def test_translation_en_to_fr(self):
-        model = self.model  # t5-base
+        model = self.model  # google-t5/t5-base
         tok = self.tokenizer
         use_task_specific_params(model, "translation_en_to_fr")
 
diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py
index 9976e20baf330c..cab41c2b04121f 100644
--- a/tests/models/t5/test_modeling_tf_t5.py
+++ b/tests/models/t5/test_modeling_tf_t5.py
@@ -302,7 +302,7 @@ def test_t5_decoder_model_past_large_inputs(self):
 
     @slow
     def test_model_from_pretrained(self):
-        model = TFT5Model.from_pretrained("t5-small")
+        model = TFT5Model.from_pretrained("google-t5/t5-small")
         self.assertIsNotNone(model)
 
     def test_generate_with_headmasking(self):
@@ -448,8 +448,8 @@ def test_train_pipeline_custom_model(self):
 class TFT5GenerationIntegrationTests(unittest.TestCase):
     @slow
     def test_greedy_xla_generate_simple(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         # two examples with different lengths to confirm that attention masks are operational in XLA
         sentences = [
@@ -476,8 +476,8 @@ def test_greedy_xla_generate_simple(self):
 
     @slow
     def test_greedy_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         sentences = ["Yesterday, my name was", "Today is a beautiful day and"]
         input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
@@ -505,8 +505,8 @@ def test_sample_xla_generate_simple(self):
 
         # forces the generation to happen on CPU, to avoid GPU-related quirks
         with tf.device(":/CPU:0"):
-            model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-            tokenizer = T5Tokenizer.from_pretrained("t5-small")
+            model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+            tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
             sentence = "Translate English to German: I have two bananas"
             input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids
@@ -526,8 +526,8 @@ def test_sample_xla_generate_simple(self):
 
     @slow
     def test_sample_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         sentences = ["I really love my", "Translate English to German: the transformers are truly amazing"]
         input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
@@ -557,8 +557,8 @@ def test_sample_generate(self):
     @unittest.skip("Skip for now as TF 2.13 breaks it on GPU")
     @slow
     def test_beam_search_xla_generate_simple(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         # tests XLA with task specific arguments
         task_specific_config = getattr(model.config, "task_specific_params", {})
@@ -590,8 +590,8 @@ def test_beam_search_xla_generate_simple(self):
 
     @slow
     def test_beam_search_generate(self):
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         sentences = ["I really love my", "Translate English to German: the transformers are truly amazing"]
         input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
@@ -622,7 +622,7 @@ def test_pipeline_conversational(self):
 class TFT5ModelIntegrationTests(unittest.TestCase):
     @cached_property
     def model(self):
-        return TFT5ForConditionalGeneration.from_pretrained("t5-base")
+        return TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-base")
 
     @slow
     def test_small_integration_test(self):
@@ -638,8 +638,8 @@ def test_small_integration_test(self):
         >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
         """
 
-        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = TFT5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
 
         input_ids = tokenizer("Hello there", return_tensors="tf").input_ids
         labels = tokenizer("Hi I am", return_tensors="tf").input_ids
@@ -703,7 +703,7 @@ def test_small_byt5_integration_test(self):
     @slow
     def test_summarization(self):
         model = self.model
-        tok = T5Tokenizer.from_pretrained("t5-base")
+        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
 
         FRANCE_ARTICLE = (  # @noqa
             "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
@@ -948,7 +948,7 @@ def test_summarization(self):
 
     @slow
     def test_translation_en_to_de(self):
-        tok = T5Tokenizer.from_pretrained("t5-base")
+        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
         model = self.model
 
         task_specific_config = getattr(model.config, "task_specific_params", {})
@@ -978,7 +978,7 @@ def test_translation_en_to_de(self):
     @slow
     def test_translation_en_to_fr(self):
         model = self.model
-        tok = T5Tokenizer.from_pretrained("t5-base")
+        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
 
         task_specific_config = getattr(model.config, "task_specific_params", {})
         translation_config = task_specific_config.get("translation_en_to_fr", {})
@@ -1015,7 +1015,7 @@ def test_translation_en_to_fr(self):
     @slow
     def test_translation_en_to_ro(self):
         model = self.model
-        tok = T5Tokenizer.from_pretrained("t5-base")
+        tok = T5Tokenizer.from_pretrained("google-t5/t5-base")
 
         task_specific_config = getattr(model.config, "task_specific_params", {})
         translation_config = task_specific_config.get("translation_en_to_ro", {})
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index 5fa0e19c792b29..fdd4f253001470 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -138,11 +138,11 @@ def test_full_tokenizer(self):
 
     @cached_property
     def t5_base_tokenizer(self):
-        return T5Tokenizer.from_pretrained("t5-base")
+        return T5Tokenizer.from_pretrained("google-t5/t5-base")
 
     @cached_property
     def t5_base_tokenizer_fast(self):
-        return T5TokenizerFast.from_pretrained("t5-base")
+        return T5TokenizerFast.from_pretrained("google-t5/t5-base")
 
     def get_tokenizer(self, **kwargs) -> T5Tokenizer:
         return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@@ -373,7 +373,7 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="t5-base",
+            model_name="google-t5/t5-base",
             revision="5a7ff2d8f5117c194c7e32ec1ccbf04642cca99b",
         )
 
@@ -400,7 +400,7 @@ def test_get_sentinel_token_ids_for_fasttokenizer(self):
         self.assertListEqual(sorted(tokenizer.get_sentinel_token_ids()), sorted(range(1000, 1010)))
 
     def test_some_edge_cases(self):
-        tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
 
         sp_tokens = tokenizer.sp_model.encode("</s>>", out_type=str)
         self.assertEqual(sp_tokens, ["<", "/", "s", ">", ">"])
@@ -426,8 +426,8 @@ def test_some_edge_cases(self):
 
     def test_fast_slow_edge_cases(self):
         # We are testing spaces before and spaces after special tokens + space transformations
-        slow_tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
-        fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
+        slow_tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
+        fast_tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-base", legacy=False, from_slow=True)
         slow_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=False))
         fast_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=False))
 
@@ -445,7 +445,7 @@ def test_fast_slow_edge_cases(self):
         with self.subTest(f"fast {edge_case} normalized = False"):
             self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_SLOW)
 
-        fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
+        fast_tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-base", legacy=False, from_slow=True)
         fast_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=True))
 
         # `normalized=True` is the default normalization scheme when adding a token. Normalize -> don't strip the space.
@@ -604,7 +604,7 @@ def test_integration_seqio(self):
                 )
 
         # Test with T5
-        hf_tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        hf_tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
         vocab_path = "gs://t5-data/vocabs/cc_all.32000/sentencepiece.model"
         t5x_tokenizer = SentencePieceVocabulary(vocab_path, extra_ids=300)
         for text in input_texts:
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index b25873eae54368..5bd961dbb3d1e3 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -603,7 +603,7 @@ def __init__(
         self.is_training = is_training
 
     def get_large_model_config(self):
-        return UMT5Config.from_pretrained("t5-base")
+        return UMT5Config.from_pretrained("google-t5/t5-base")
 
     def prepare_config_and_inputs(self):
         input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
diff --git a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
index c6926e002a9b63..98c3a275825b0b 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py
@@ -426,7 +426,7 @@ def prepare_config_and_inputs(self):
 
     def get_pretrained_model(self):
         return FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "gpt2"
+            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         )
 
 
@@ -434,7 +434,7 @@ def get_pretrained_model(self):
 class FlaxVisionEncoderDecoderModelTest(unittest.TestCase):
     def get_from_encoderdecoder_pretrained_model(self):
         return FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-            "google/vit-base-patch16-224-in21k", "gpt2"
+            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
         )
 
     def _check_configuration_tie(self, model):
diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
index 057df26d303b69..b87673c0511251 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py
@@ -627,7 +627,9 @@ def test_real_model_save_load_from_pretrained(self):
 @require_tf
 class TFViT2GPT2EncoderDecoderModelTest(TFVisionEncoderDecoderMixin, unittest.TestCase):
     def get_pretrained_model(self):
-        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained("google/vit-base-patch16-224-in21k", "gpt2")
+        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        )
 
     def get_encoder_decoder_model(self, config, decoder_config):
         encoder_model = TFViTModel(config, name="encoder")
@@ -672,10 +674,12 @@ def prepare_config_and_inputs(self):
 @require_tf
 class TFVisionEncoderDecoderModelTest(unittest.TestCase):
     def get_from_encoderdecoder_pretrained_model(self):
-        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained("google/vit-base-patch16-224-in21k", "gpt2")
+        return TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+            "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        )
 
     def get_decoder_config(self):
-        config = AutoConfig.from_pretrained("gpt2")
+        config = AutoConfig.from_pretrained("openai-community/gpt2")
         config.is_decoder = True
         config.add_cross_attention = True
         return config
@@ -685,7 +689,9 @@ def get_encoderdecoder_model(self):
 
     def get_encoder_decoder_models(self):
         encoder_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
-        decoder_model = TFGPT2LMHeadModel.from_pretrained("gpt2", config=self.get_decoder_config(), name="decoder")
+        decoder_model = TFGPT2LMHeadModel.from_pretrained(
+            "openai-community/gpt2", config=self.get_decoder_config(), name="decoder"
+        )
         return {"encoder": encoder_model, "decoder": decoder_model}
 
     def _check_configuration_tie(self, model):
@@ -714,7 +720,7 @@ def prepare_img():
 class TFVisionEncoderDecoderModelSaveLoadTests(unittest.TestCase):
     def get_encoder_decoder_config(self):
         encoder_config = AutoConfig.from_pretrained("google/vit-base-patch16-224-in21k")
-        decoder_config = AutoConfig.from_pretrained("gpt2", is_decoder=True, add_cross_attention=True)
+        decoder_config = AutoConfig.from_pretrained("openai-community/gpt2", is_decoder=True, add_cross_attention=True)
         return VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
 
     def get_encoder_decoder_config_small(self):
@@ -829,7 +835,7 @@ def test_encoder_decoder_from_pretrained(self):
 
         config = self.get_encoder_decoder_config()
         image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
-        decoder_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
 
         img = prepare_img()
         pixel_values = image_processor(images=img, return_tensors="tf").pixel_values
@@ -845,7 +851,7 @@ def test_encoder_decoder_from_pretrained(self):
             encoder = TFAutoModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
             # It's necessary to specify `add_cross_attention=True` here.
             decoder = TFAutoModelForCausalLM.from_pretrained(
-                "gpt2", is_decoder=True, add_cross_attention=True, name="decoder"
+                "openai-community/gpt2", is_decoder=True, add_cross_attention=True, name="decoder"
             )
             pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder")
             pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder")
diff --git a/tests/models/xlm/test_modeling_tf_xlm.py b/tests/models/xlm/test_modeling_tf_xlm.py
index 7bfa33828f70f3..51ba6c2476b180 100644
--- a/tests/models/xlm/test_modeling_tf_xlm.py
+++ b/tests/models/xlm/test_modeling_tf_xlm.py
@@ -369,7 +369,7 @@ def test_model_from_pretrained(self):
 class TFXLMModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_xlm_mlm_en_2048(self):
-        model = TFXLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
+        model = TFXLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-mlm-en-2048")
         input_ids = tf.convert_to_tensor([[14, 447]], dtype=tf.int32)  # the president
         expected_output_ids = [
             14,
diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py
index b551e7e645d516..09ad95e81ac822 100644
--- a/tests/models/xlm/test_modeling_xlm.py
+++ b/tests/models/xlm/test_modeling_xlm.py
@@ -514,7 +514,7 @@ def test_model_from_pretrained(self):
 class XLMModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_xlm_mlm_en_2048(self):
-        model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
+        model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-mlm-en-2048")
         model.to(torch_device)
         input_ids = torch.tensor([[14, 447]], dtype=torch.long, device=torch_device)  # the president
         expected_output_ids = [
diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py
index 6e3103521585c8..4b5982ca9855c8 100644
--- a/tests/models/xlm/test_tokenization_xlm.py
+++ b/tests/models/xlm/test_tokenization_xlm.py
@@ -85,7 +85,7 @@ def test_full_tokenizer(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
+        tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
diff --git a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py
index 0ceaa739f3fa86..6af80600607569 100644
--- a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py
@@ -32,8 +32,8 @@
 class FlaxXLMRobertaModelIntegrationTest(unittest.TestCase):
     @slow
     def test_flax_xlm_roberta_base(self):
-        model = FlaxXLMRobertaModel.from_pretrained("xlm-roberta-base")
-        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+        model = FlaxXLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
+        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
         text = "The dog is cute and lives in the garden house"
         input_ids = jnp.array([tokenizer.encode(text)])
 
diff --git a/tests/models/xlm_roberta/test_modeling_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_xlm_roberta.py
index ca9db17270dcea..d9b69bb9ab5f2a 100644
--- a/tests/models/xlm_roberta/test_modeling_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_modeling_xlm_roberta.py
@@ -32,7 +32,7 @@
 class XLMRobertaModelIntegrationTest(unittest.TestCase):
     @slow
     def test_xlm_roberta_base(self):
-        model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
+        model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
         input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
         # The dog is cute and lives in the garden house
 
@@ -51,7 +51,7 @@ def test_xlm_roberta_base(self):
 
     @slow
     def test_xlm_roberta_large(self):
-        model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
+        model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-large")
         input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
         # The dog is cute and lives in the garden house
 
diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
index 1cba1c01d58081..6e2d4446a02df7 100644
--- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -212,7 +212,7 @@ def test_save_pretrained(self):
 
     @cached_property
     def big_tokenizer(self):
-        return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+        return XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
 
     def test_picklable_without_disk(self):
         with tempfile.NamedTemporaryFile() as f:
@@ -338,6 +338,6 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="xlm-roberta-base",
+            model_name="FacebookAI/xlm-roberta-base",
             revision="d9d8a8ea5eb94b1c6654ae9249df7793cd2933d3",
         )
diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py
index 03eba74f4065df..5d17299f9b3926 100644
--- a/tests/models/xlnet/test_modeling_tf_xlnet.py
+++ b/tests/models/xlnet/test_modeling_tf_xlnet.py
@@ -491,7 +491,7 @@ def test_loss_computation(self):
 class TFXLNetModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_xlnet_base_cased(self):
-        model = TFXLNetLMHeadModel.from_pretrained("xlnet-base-cased")
+        model = TFXLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased")
         # fmt: off
         input_ids = tf.convert_to_tensor(
             [
diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py
index 2b0c95cd6d13d0..cd5a3d52b34801 100644
--- a/tests/models/xlnet/test_modeling_xlnet.py
+++ b/tests/models/xlnet/test_modeling_xlnet.py
@@ -694,7 +694,7 @@ def test_model_from_pretrained(self):
 class XLNetModelLanguageGenerationTest(unittest.TestCase):
     @slow
     def test_lm_generate_xlnet_base_cased(self):
-        model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
+        model = XLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased")
         model.to(torch_device)
         # fmt: off
         input_ids = torch.tensor(
diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py
index 9fb28658aab4da..8a7476fad92a96 100644
--- a/tests/models/xlnet/test_tokenization_xlnet.py
+++ b/tests/models/xlnet/test_tokenization_xlnet.py
@@ -186,7 +186,7 @@ def test_tokenizer_no_lower(self):
 
     @slow
     def test_sequence_builders(self):
-        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+        tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
@@ -203,6 +203,6 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="xlnet-base-cased",
+            model_name="xlnet/xlnet-base-cased",
             revision="c841166438c31ec7ca9a106dee7bb312b73ae511",
         )
diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py
index fc1ce44e35d836..1a9eab5507e8da 100644
--- a/tests/models/xmod/test_modeling_xmod.py
+++ b/tests/models/xmod/test_modeling_xmod.py
@@ -630,7 +630,7 @@ def test_multilingual_batch(self):
 
     @slow
     def test_end_to_end_mask_fill(self):
-        tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
+        tokenizer = XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
         model = XmodForMaskedLM.from_pretrained("facebook/xmod-base", default_language="en_XX")
         model.to(torch_device)
 
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index e760d279014640..5e3e15f39c10ea 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -143,7 +143,7 @@ class MyPipeline(TextClassificationPipeline):
         self.assertIsInstance(text_classifier, MyPipeline)
 
     def test_check_task(self):
-        task = get_task("gpt2")
+        task = get_task("openai-community/gpt2")
         self.assertEqual(task, "text-generation")
 
         with self.assertRaises(RuntimeError):
diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py
index 571b320d617fa1..bbf2b6cf3f439b 100644
--- a/tests/pipelines/test_pipelines_fill_mask.py
+++ b/tests/pipelines/test_pipelines_fill_mask.py
@@ -169,13 +169,13 @@ def test_fp16_casting(self):
     @slow
     @require_torch
     def test_large_model_pt(self):
-        unmasker = pipeline(task="fill-mask", model="distilroberta-base", top_k=2, framework="pt")
+        unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="pt")
         self.run_large_test(unmasker)
 
     @slow
     @require_tf
     def test_large_model_tf(self):
-        unmasker = pipeline(task="fill-mask", model="distilroberta-base", top_k=2, framework="tf")
+        unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2, framework="tf")
         self.run_large_test(unmasker)
 
     def run_large_test(self, unmasker):
diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py
index b139fbfd2f7982..eda9ac014bf730 100644
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -468,7 +468,7 @@ def test_dbmdz_english(self):
     @slow
     def test_aggregation_strategy_byte_level_tokenizer(self):
         sentence = "Groenlinks praat over Schiphol."
-        ner = pipeline("ner", model="xlm-roberta-large-finetuned-conll02-dutch", aggregation_strategy="max")
+        ner = pipeline("ner", model="FacebookAI/xlm-roberta-large-finetuned-conll02-dutch", aggregation_strategy="max")
         self.assertEqual(
             nested_simplify(ner(sentence)),
             [
diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py
index 9c37014ab81d31..2e61d97c1dc8c9 100644
--- a/tests/pipelines/test_pipelines_zero_shot.py
+++ b/tests/pipelines/test_pipelines_zero_shot.py
@@ -199,7 +199,9 @@ def test_small_model_tf(self):
     @slow
     @require_torch
     def test_large_model_pt(self):
-        zero_shot_classifier = pipeline("zero-shot-classification", model="roberta-large-mnli", framework="pt")
+        zero_shot_classifier = pipeline(
+            "zero-shot-classification", model="FacebookAI/roberta-large-mnli", framework="pt"
+        )
         outputs = zero_shot_classifier(
             "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
         )
@@ -254,7 +256,9 @@ def test_large_model_pt(self):
     @slow
     @require_tf
     def test_large_model_tf(self):
-        zero_shot_classifier = pipeline("zero-shot-classification", model="roberta-large-mnli", framework="tf")
+        zero_shot_classifier = pipeline(
+            "zero-shot-classification", model="FacebookAI/roberta-large-mnli", framework="tf"
+        )
         outputs = zero_shot_classifier(
             "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
         )
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 4c33270af67421..782e9a082fd7df 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -43,7 +43,7 @@
 
 
 def get_some_linear_layer(model):
-    if model.config.model_type == "gpt2":
+    if model.config.model_type == "gpt2":  # model_type is the architecture id ("gpt2"), not a Hub repo name
         return model.transformer.h[0].mlp.c_fc
     elif model.config.model_type == "opt":
         try:
@@ -283,7 +283,7 @@ def test_fp32_4bit_conversion(self):
         r"""
         Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
         """
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", load_in_4bit=True, device_map="auto")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", load_in_4bit=True, device_map="auto")
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
 
 
@@ -295,7 +295,7 @@ def test_fp32_4bit_conversion(self):
 class Bnb4BitT5Test(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_name = "t5-small"
+        cls.model_name = "google-t5/t5-small"
         cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
         cls.input_text = "Translate in German: Hello, my dog is cute"
@@ -311,7 +311,7 @@ def tearDown(self):
     def test_inference_without_keep_in_fp32(self):
         r"""
         Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
-        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
         both cases.
         """
         from transformers import T5ForConditionalGeneration
@@ -319,7 +319,7 @@ def test_inference_without_keep_in_fp32(self):
         modules = T5ForConditionalGeneration._keep_in_fp32_modules
         T5ForConditionalGeneration._keep_in_fp32_modules = None
 
-        # test with `t5-small`
+        # test with `google-t5/t5-small`
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
         encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
         _ = model.generate(**encoded_input)
@@ -335,12 +335,12 @@ def test_inference_without_keep_in_fp32(self):
     def test_inference_with_keep_in_fp32(self):
         r"""
         Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
-        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
         both cases.
         """
         from transformers import T5ForConditionalGeneration
 
-        # test with `t5-small`
+        # test with `google-t5/t5-small`
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
 
         # there was a bug with decoders - this test checks that it is fixed
@@ -362,7 +362,7 @@ def setUp(self):
         super().setUp()
         # model_name
         self.model_name = "bigscience/bloom-560m"
-        self.seq_to_seq_name = "t5-small"
+        self.seq_to_seq_name = "google-t5/t5-small"
 
         # Different types of model
 
@@ -509,7 +509,7 @@ def test_training(self):
 
 
 class Bnb4BitGPT2Test(Bnb4BitTest):
-    model_name = "gpt2-xl"
+    model_name = "openai-community/gpt2-xl"
     EXPECTED_RELATIVE_DIFFERENCE = 3.3191854854152187
 
 
@@ -647,7 +647,7 @@ class GPTSerializationTest(BaseSerializationTest):
     default BaseSerializationTest config tested with GPT family model
     """
 
-    model_name = "gpt2-xl"
+    model_name = "openai-community/gpt2-xl"
 
 
 @require_bitsandbytes
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 0ce7274d2598ba..b926c80398c25a 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -42,7 +42,7 @@
 
 
 def get_some_linear_layer(model):
-    if model.config.model_type == "gpt2":
+    if model.config.model_type == "gpt2":  # model_type is the architecture id ("gpt2"), not a Hub repo name
         return model.transformer.h[0].mlp.c_fc
     return model.transformer.h[0].mlp.dense_4h_to_h
 
@@ -174,7 +174,7 @@ def test_get_keys_to_not_convert(self):
             model = OPTForCausalLM(config)
         self.assertEqual(get_keys_to_not_convert(model).sort(), ["lm_head", "model.decoder.embed_tokens"].sort())
 
-        model_id = "roberta-large"
+        model_id = "FacebookAI/roberta-large"
         config = AutoConfig.from_pretrained(model_id, revision="716877d372b884cad6d419d828bac6c85b3b18d9")
         with init_empty_weights():
             model = AutoModelForMaskedLM.from_config(config)
@@ -240,7 +240,7 @@ def test_llm_skip(self):
 
         quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_skip_modules=["classifier"])
         seq_classification_model = AutoModelForSequenceClassification.from_pretrained(
-            "roberta-large-mnli", quantization_config=quantization_config
+            "FacebookAI/roberta-large-mnli", quantization_config=quantization_config
         )
         self.assertTrue(seq_classification_model.roberta.encoder.layer[0].output.dense.weight.dtype == torch.int8)
         self.assertTrue(
@@ -340,7 +340,7 @@ def test_fp32_int8_conversion(self):
         r"""
         Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly.
         """
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", load_in_8bit=True, device_map="auto")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", load_in_8bit=True, device_map="auto")
         self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
 
     def test_int8_serialization(self):
@@ -447,7 +447,7 @@ def test_int8_from_pretrained(self):
 class MixedInt8T5Test(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_name = "t5-small"
+        cls.model_name = "google-t5/t5-small"
         cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
         cls.input_text = "Translate in German: Hello, my dog is cute"
@@ -463,7 +463,7 @@ def tearDown(self):
     def test_inference_without_keep_in_fp32(self):
         r"""
         Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly.
-        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
         both cases.
         """
         from transformers import T5ForConditionalGeneration
@@ -471,7 +471,7 @@ def test_inference_without_keep_in_fp32(self):
         modules = T5ForConditionalGeneration._keep_in_fp32_modules
         T5ForConditionalGeneration._keep_in_fp32_modules = None
 
-        # test with `t5-small`
+        # test with `google-t5/t5-small`
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")
         encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
         _ = model.generate(**encoded_input)
@@ -487,14 +487,14 @@ def test_inference_without_keep_in_fp32(self):
     def test_inference_with_keep_in_fp32(self):
         r"""
         Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly.
-        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
         both cases.
         """
         import bitsandbytes as bnb
 
         from transformers import T5ForConditionalGeneration
 
-        # test with `t5-small`
+        # test with `google-t5/t5-small`
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")
 
         # there was a bug with decoders - this test checks that it is fixed
@@ -514,14 +514,14 @@ def test_inference_with_keep_in_fp32_serialized(self):
         r"""
         Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly on
         a serialized model.
-        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
         both cases.
         """
         import bitsandbytes as bnb
 
         from transformers import T5ForConditionalGeneration
 
-        # test with `t5-small`
+        # test with `google-t5/t5-small`
         model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")
 
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -548,7 +548,7 @@ def setUp(self):
         super().setUp()
         # model_name
         self.model_name = "bigscience/bloom-560m"
-        self.seq_to_seq_name = "t5-small"
+        self.seq_to_seq_name = "google-t5/t5-small"
 
         # Different types of model
 
@@ -842,7 +842,7 @@ def test_training(self):
 
 
 class MixedInt8GPT2Test(MixedInt8Test):
-    model_name = "gpt2-xl"
+    model_name = "openai-community/gpt2-xl"
     EXPECTED_RELATIVE_DIFFERENCE = 1.8720077507258357
     EXPECTED_OUTPUTS = set()
     EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a big fan of")
diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py
index cc7f9e5e84f8bf..2ea029a285517d 100644
--- a/tests/sagemaker/test_multi_node_data_parallel.py
+++ b/tests/sagemaker/test_multi_node_data_parallel.py
@@ -25,21 +25,21 @@
         {
             "framework": "pytorch",
             "script": "run_glue.py",
-            "model_name_or_path": "distilbert-base-cased",
+            "model_name_or_path": "distilbert/distilbert-base-cased",
             "instance_type": "ml.p3.16xlarge",
             "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
         },
         {
             "framework": "pytorch",
             "script": "run_ddp.py",
-            "model_name_or_path": "distilbert-base-cased",
+            "model_name_or_path": "distilbert/distilbert-base-cased",
             "instance_type": "ml.p3.16xlarge",
             "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
         },
         {
             "framework": "tensorflow",
             "script": "run_tf_dist.py",
-            "model_name_or_path": "distilbert-base-cased",
+            "model_name_or_path": "distilbert/distilbert-base-cased",
             "instance_type": "ml.p3.16xlarge",
             "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
         },
diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py
index 95d5b9fa855904..216d31de47106a 100644
--- a/tests/sagemaker/test_multi_node_model_parallel.py
+++ b/tests/sagemaker/test_multi_node_model_parallel.py
@@ -25,14 +25,14 @@
         {
             "framework": "pytorch",
             "script": "run_glue_model_parallelism.py",
-            "model_name_or_path": "roberta-large",
+            "model_name_or_path": "FacebookAI/roberta-large",
             "instance_type": "ml.p3dn.24xlarge",
             "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
         },
         {
             "framework": "pytorch",
             "script": "run_glue.py",
-            "model_name_or_path": "roberta-large",
+            "model_name_or_path": "FacebookAI/roberta-large",
             "instance_type": "ml.p3dn.24xlarge",
             "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
         },
diff --git a/tests/sagemaker/test_single_node_gpu.py b/tests/sagemaker/test_single_node_gpu.py
index f2a62547e787c6..53d966bd1e8591 100644
--- a/tests/sagemaker/test_single_node_gpu.py
+++ b/tests/sagemaker/test_single_node_gpu.py
@@ -25,14 +25,14 @@
         {
             "framework": "pytorch",
             "script": "run_glue.py",
-            "model_name_or_path": "distilbert-base-cased",
+            "model_name_or_path": "distilbert/distilbert-base-cased",
             "instance_type": "ml.g4dn.xlarge",
             "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9},
         },
         {
             "framework": "tensorflow",
             "script": "run_tf.py",
-            "model_name_or_path": "distilbert-base-cased",
+            "model_name_or_path": "distilbert/distilbert-base-cased",
             "instance_type": "ml.g4dn.xlarge",
             "results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9},
         },
diff --git a/tests/test_configuration_utils.py b/tests/test_configuration_utils.py
index 413060ddfdebd2..5c9861e48bb122 100644
--- a/tests/test_configuration_utils.py
+++ b/tests/test_configuration_utils.py
@@ -255,7 +255,7 @@ def test_legacy_load_from_url(self):
         )
 
     def test_local_versioning(self):
-        configuration = AutoConfig.from_pretrained("bert-base-cased")
+        configuration = AutoConfig.from_pretrained("google-bert/bert-base-cased")
         configuration.configuration_files = ["config.4.0.0.json"]
 
         with tempfile.TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index cef56822dc3e95..0d52e5a87bed35 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -709,7 +709,7 @@ def test_from_pretrained_low_cpu_mem_usage_functional(self):
     def test_from_pretrained_low_cpu_mem_usage_measured(self):
         # test that `from_pretrained(..., low_cpu_mem_usage=True)` uses less cpu memory than default
 
-        mname = "bert-base-cased"
+        mname = "google-bert/bert-base-cased"
 
         preamble = "from transformers import AutoModel"
         one_liner_str = f'{preamble}; AutoModel.from_pretrained("{mname}", low_cpu_mem_usage=False)'
@@ -753,9 +753,9 @@ def test_model_parallelism_gpt2(self):
         for i in range(12):
             device_map[f"transformer.h.{i}"] = 0 if i <= 5 else 1
 
-        model = AutoModelForCausalLM.from_pretrained("gpt2", device_map=device_map)
+        model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", device_map=device_map)
 
-        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
         inputs = tokenizer("Hello, my name is", return_tensors="pt")
         output = model.generate(inputs["input_ids"].to(0))
 
@@ -1165,7 +1165,7 @@ def f(input_ids):
     @slow
     def test_pretrained_low_mem_new_config(self):
         # Checking for 1 model(the same one which was described in the issue) .
-        model_ids = ["gpt2"]
+        model_ids = ["openai-community/gpt2"]
 
         for model_id in model_ids:
             model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_id)
@@ -1246,7 +1246,7 @@ def test_safetensors_torch_from_torch_sharded(self):
             self.assertTrue(torch.equal(p1, p2))
 
     def test_modifying_model_config_causes_warning_saving_generation_config(self):
-        model = AutoModelForCausalLM.from_pretrained("gpt2")
+        model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
         model.config.top_k = 1
         with tempfile.TemporaryDirectory() as tmp_dir:
             with self.assertLogs("transformers.modeling_utils", level="WARNING") as logs:
@@ -1514,7 +1514,7 @@ def test_push_to_hub_with_description(self):
 The commit description supports markdown synthax see:
 ```python
 >>> form transformers import AutoConfig
->>> config = AutoConfig.from_pretrained("bert-base-uncased")
+>>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
 ```
 """
         commit_details = model.push_to_hub(
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index e5b9a34702e2f5..d0c5874911449c 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3990,7 +3990,7 @@ def test_save_slow_from_fast_and_reload_fast(self):
 
     # TODO This is ran for all models but only tests bert...
     def test_clean_up_tokenization_spaces(self):
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
         assert tokenizer.clean_up_tokenization_spaces is True
 
         tokens = tokenizer.encode("This shouldn't be! He'll go.")
diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py
index 3f7f7249f97c40..3f23fdb156b585 100644
--- a/tests/test_tokenization_utils.py
+++ b/tests/test_tokenization_utils.py
@@ -73,11 +73,11 @@ def test_cached_files_are_used_when_internet_is_down_missing_files(self):
         response_mock.json.return_value = {}
 
         # Download this model to make sure it's in the cache.
-        _ = GPT2TokenizerFast.from_pretrained("gpt2")
+        _ = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
 
         # Under the mock environment we get a 500 error when trying to reach the tokenizer.
         with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
-            _ = GPT2TokenizerFast.from_pretrained("gpt2")
+            _ = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
             # This check we did call the fake head request
             mock_head.assert_called()
 
@@ -86,7 +86,7 @@ def test_legacy_load_from_one_file(self):
         try:
             tmp_file = tempfile.mktemp()
             with open(tmp_file, "wb") as f:
-                http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)
+                http_get("https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model", f)
 
             _ = AlbertTokenizer.from_pretrained(tmp_file)
         finally:
@@ -101,7 +101,7 @@ def test_legacy_load_from_one_file(self):
             with open("tokenizer.json", "wb") as f:
                 http_get("https://huggingface.co/hf-internal-testing/tiny-random-bert/blob/main/tokenizer.json", f)
             tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
-            # The tiny random BERT has a vocab size of 1024, tiny gpt2 as a vocab size of 1000
+            # The tiny random BERT has a vocab size of 1024, the tiny random GPT-2 has a vocab size of 1000
             self.assertEqual(tokenizer.vocab_size, 1000)
             # Tokenizer should depend on the remote checkpoint, not the local tokenizer.json file.
 
@@ -110,7 +110,7 @@ def test_legacy_load_from_one_file(self):
 
     def test_legacy_load_from_url(self):
         # This test is for deprecated behavior and can be removed in v5
-        _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert-base-v1/resolve/main/spiece.model")
+        _ = AlbertTokenizer.from_pretrained("https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model")
 
 
 @is_staging_test
diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py
index 48ac31b97c41ca..6e24009ecd0830 100644
--- a/tests/tokenization/test_tokenization_fast.py
+++ b/tests/tokenization/test_tokenization_fast.py
@@ -132,7 +132,7 @@ def test_init_from_tokenizers_model(self):
 
         sentences = ["Hello, y'all!", "How are you 😁 ? There should not be any issue right?"]
 
-        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer = Tokenizer.from_pretrained("google-t5/t5-base")
         # Enable padding
         tokenizer.enable_padding(pad_id=0, pad_token="", length=512, pad_to_multiple_of=8)
         self.assertEqual(
@@ -179,7 +179,7 @@ def test_init_from_tokenizers_model(self):
 @require_tokenizers
 class TokenizerVersioningTest(unittest.TestCase):
     def test_local_versioning(self):
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
         json_tokenizer = json.loads(tokenizer._tokenizer.to_str())
         json_tokenizer["model"]["vocab"]["huggingface"] = len(tokenizer)
 
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index 186fabb7aea086..e5838dd4a32156 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -91,8 +91,8 @@ def test_tensor_type_from_str(self):
     def test_batch_encoding_pickle(self):
         import numpy as np
 
-        tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
+        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
 
         # Python no tensor
         with self.subTest("BatchEncoding (Python, return_tensors=None)"):
@@ -119,8 +119,8 @@ def test_batch_encoding_pickle_tf(self):
         def tf_array_equals(t1, t2):
             return tf.reduce_all(tf.equal(t1, t2))
 
-        tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
+        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
 
         with self.subTest("BatchEncoding (Python, return_tensors=TENSORFLOW)"):
             self.assert_dump_and_restore(
@@ -137,8 +137,8 @@ def tf_array_equals(t1, t2):
     def test_batch_encoding_pickle_pt(self):
         import torch
 
-        tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
+        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
 
         with self.subTest("BatchEncoding (Python, return_tensors=PYTORCH)"):
             self.assert_dump_and_restore(
@@ -152,8 +152,8 @@ def test_batch_encoding_pickle_pt(self):
 
     @require_tokenizers
     def test_batch_encoding_is_fast(self):
-        tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
+        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
 
         with self.subTest("Python Tokenizer"):
             self.assertFalse(tokenizer_p("Small example to_encode").is_fast)
@@ -163,7 +163,7 @@ def test_batch_encoding_is_fast(self):
 
     @require_tokenizers
     def test_batch_encoding_word_to_tokens(self):
-        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
+        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
         encoded = tokenizer_r(["Test", "\xad", "test"], is_split_into_words=True)
 
         self.assertEqual(encoded.word_to_tokens(0), TokenSpan(start=1, end=2))
@@ -235,7 +235,7 @@ def test_batch_encoding_with_labels_jax(self):
 
     def test_padding_accepts_tensors(self):
         features = [{"input_ids": np.array([0, 1, 2])}, {"input_ids": np.array([0, 1, 2, 3])}]
-        tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         batch = tokenizer.pad(features, padding=True)
         self.assertTrue(isinstance(batch["input_ids"], np.ndarray))
@@ -249,7 +249,7 @@ def test_padding_accepts_tensors_pt(self):
         import torch
 
         features = [{"input_ids": torch.tensor([0, 1, 2])}, {"input_ids": torch.tensor([0, 1, 2, 3])}]
-        tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         batch = tokenizer.pad(features, padding=True)
         self.assertTrue(isinstance(batch["input_ids"], torch.Tensor))
@@ -263,7 +263,7 @@ def test_padding_accepts_tensors_tf(self):
         import tensorflow as tf
 
         features = [{"input_ids": tf.constant([0, 1, 2])}, {"input_ids": tf.constant([0, 1, 2, 3])}]
-        tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
 
         batch = tokenizer.pad(features, padding=True)
         self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 55cc35cf6aa3eb..fe958cdad9df2b 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1536,7 +1536,7 @@ def test_auto_batch_size_finder(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             testargs = f"""
                 run_glue.py
-                --model_name_or_path distilbert-base-uncased
+                --model_name_or_path distilbert/distilbert-base-uncased
                 --task_name mrpc
                 --do_train
                 --do_eval
@@ -1885,7 +1885,7 @@ def test_load_best_model_from_safetensors(self):
 
     @slow
     def test_trainer_eval_mrpc(self):
-        MODEL_ID = "bert-base-cased-finetuned-mrpc"
+        MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
         data_args = GlueDataTrainingArguments(
@@ -1900,7 +1900,7 @@ def test_trainer_eval_mrpc(self):
 
     @slow
     def test_trainer_eval_multiple(self):
-        MODEL_ID = "gpt2"
+        MODEL_ID = "openai-community/gpt2"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
         dataset = LineByLineTextDataset(
@@ -1929,7 +1929,7 @@ def test_trainer_eval_multiple(self):
 
     @slow
     def test_trainer_eval_lm(self):
-        MODEL_ID = "distilroberta-base"
+        MODEL_ID = "distilbert/distilroberta-base"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         dataset = LineByLineTextDataset(
             tokenizer=tokenizer,
@@ -2383,7 +2383,7 @@ def test_end_to_end_example(self):
                 "launch",
                 script_path,
                 "--model_name_or_path",
-                "t5-small",
+                "google-t5/t5-small",
                 "--per_device_train_batch_size",
                 "1",
                 "--output_dir",
diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py
index 3f875e6d36573a..7a76ede3a55f5b 100644
--- a/tests/trainer/test_trainer_seq2seq.py
+++ b/tests/trainer/test_trainer_seq2seq.py
@@ -35,7 +35,7 @@ class Seq2seqTrainerTester(TestCasePlus):
     @require_torch
     def test_finetune_bert2bert(self):
         bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
 
         bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
         bert2bert.config.eos_token_id = tokenizer.sep_token_id
@@ -144,11 +144,11 @@ def test_return_sequences(self):
         MAX_TARGET_LENGTH = 256
 
         dataset = datasets.load_dataset("gsm8k", "main", split="train[:38]")
-        model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
+        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
         data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest")
         gen_config = GenerationConfig.from_pretrained(
-            "t5-small", max_length=None, min_length=None, max_new_tokens=256, min_new_tokens=1, num_beams=5
+            "google-t5/t5-small", max_length=None, min_length=None, max_new_tokens=256, min_new_tokens=1, num_beams=5
         )
 
         training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True)
diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py
index 61ccc184f5519e..b7eceb6e76c34c 100644
--- a/tests/utils/test_add_new_model_like.py
+++ b/tests/utils/test_add_new_model_like.py
@@ -228,7 +228,7 @@ def test_simplify_replacements(self):
         )
 
     def test_replace_model_patterns(self):
-        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
+        bert_model_patterns = ModelPatterns("Bert", "google-bert/bert-base-cased")
         new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
         bert_test = '''class TFBertPreTrainedModel(PreTrainedModel):
     """
@@ -312,14 +312,14 @@ def test_replace_model_patterns(self):
         # in others.
         self.assertEqual(replacements, "")
 
-        roberta_model_patterns = ModelPatterns("RoBERTa", "roberta-base", model_camel_cased="Roberta")
+        roberta_model_patterns = ModelPatterns("RoBERTa", "FacebookAI/roberta-base", model_camel_cased="Roberta")
         new_roberta_model_patterns = ModelPatterns(
             "RoBERTa-New", "huggingface/roberta-new-base", model_camel_cased="RobertaNew"
         )
         roberta_test = '''# Copied from transformers.models.bert.BertModel with Bert->Roberta
 class RobertaModel(RobertaPreTrainedModel):
     """ The base RoBERTa model. """
-    checkpoint = roberta-base
+    checkpoint = FacebookAI/roberta-base
     base_model_prefix = "roberta"
         '''
         roberta_expected = '''# Copied from transformers.models.bert.BertModel with Bert->RobertaNew
@@ -346,7 +346,7 @@ def test_get_module_from_file(self):
             get_module_from_file("/models/gpt2/modeling_gpt2.py")
 
     def test_duplicate_module(self):
-        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
+        bert_model_patterns = ModelPatterns("Bert", "google-bert/bert-base-cased")
         new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
         bert_test = '''class TFBertPreTrainedModel(PreTrainedModel):
     """
@@ -395,7 +395,7 @@ def test_duplicate_module(self):
             self.check_result(dest_file_name, bert_expected)
 
     def test_duplicate_module_with_copied_from(self):
-        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
+        bert_model_patterns = ModelPatterns("Bert", "google-bert/bert-base-cased")
         new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
         bert_test = '''# Copied from transformers.models.xxx.XxxModel with Xxx->Bert
 class TFBertPreTrainedModel(PreTrainedModel):
@@ -656,7 +656,7 @@ def test_get_model_files_tf_and_flax(self):
         self.assertEqual(test_files, wav2vec2_test_files)
 
     def test_find_base_model_checkpoint(self):
-        self.assertEqual(find_base_model_checkpoint("bert"), "bert-base-uncased")
+        self.assertEqual(find_base_model_checkpoint("bert"), "google-bert/bert-base-uncased")
         self.assertEqual(find_base_model_checkpoint("gpt2"), "gpt2")
 
     def test_retrieve_model_classes(self):
@@ -719,7 +719,7 @@ def test_retrieve_info_for_model_with_bert(self):
 
         bert_model_patterns = bert_info["model_patterns"]
         self.assertEqual(bert_model_patterns.model_name, "BERT")
-        self.assertEqual(bert_model_patterns.checkpoint, "bert-base-uncased")
+        self.assertEqual(bert_model_patterns.checkpoint, "google-bert/bert-base-uncased")
         self.assertEqual(bert_model_patterns.model_type, "bert")
         self.assertEqual(bert_model_patterns.model_lower_cased, "bert")
         self.assertEqual(bert_model_patterns.model_camel_cased, "Bert")
@@ -768,7 +768,7 @@ def test_retrieve_info_for_model_pt_tf_with_bert(self):
 
         bert_model_patterns = bert_info["model_patterns"]
         self.assertEqual(bert_model_patterns.model_name, "BERT")
-        self.assertEqual(bert_model_patterns.checkpoint, "bert-base-uncased")
+        self.assertEqual(bert_model_patterns.checkpoint, "google-bert/bert-base-uncased")
         self.assertEqual(bert_model_patterns.model_type, "bert")
         self.assertEqual(bert_model_patterns.model_lower_cased, "bert")
         self.assertEqual(bert_model_patterns.model_camel_cased, "Bert")
diff --git a/tests/utils/test_hub_utils.py b/tests/utils/test_hub_utils.py
index dffc018e284cbc..c1320baaddaff3 100644
--- a/tests/utils/test_hub_utils.py
+++ b/tests/utils/test_hub_utils.py
@@ -105,7 +105,7 @@ def test_has_file(self):
 
     def test_get_file_from_repo_distant(self):
         # `get_file_from_repo` returns None if the file does not exist
-        self.assertIsNone(get_file_from_repo("bert-base-cased", "ahah.txt"))
+        self.assertIsNone(get_file_from_repo("google-bert/bert-base-cased", "ahah.txt"))
 
         # The function raises if the repository does not exist.
         with self.assertRaisesRegex(EnvironmentError, "is not a valid model identifier"):
@@ -113,9 +113,9 @@ def test_get_file_from_repo_distant(self):
 
         # The function raises if the revision does not exist.
         with self.assertRaisesRegex(EnvironmentError, "is not a valid git identifier"):
-            get_file_from_repo("bert-base-cased", CONFIG_NAME, revision="ahaha")
+            get_file_from_repo("google-bert/bert-base-cased", CONFIG_NAME, revision="ahaha")
 
-        resolved_file = get_file_from_repo("bert-base-cased", CONFIG_NAME)
+        resolved_file = get_file_from_repo("google-bert/bert-base-cased", CONFIG_NAME)
         # The name is the cached name which is not very easy to test, so instead we load the content.
         config = json.loads(open(resolved_file, "r").read())
         self.assertEqual(config["hidden_size"], 768)
diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py
index 02ec510baba64f..8cb2c4e2fea58f 100644
--- a/utils/check_config_docstrings.py
+++ b/utils/check_config_docstrings.py
@@ -30,7 +30,7 @@
 CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING
 
 # Regex pattern used to find the checkpoint mentioned in the docstring of `config_class`.
-# For example, `[bert-base-uncased](https://huggingface.co/bert-base-uncased)`
+# For example, `[google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)`
 _re_checkpoint = re.compile(r"\[(.+?)\]\((https://huggingface\.co/.+?)\)")
 
 
@@ -55,7 +55,7 @@ def get_checkpoint_from_config_class(config_class):
     checkpoints = _re_checkpoint.findall(config_source)
 
     # Each `checkpoint` is a tuple of a checkpoint name and a checkpoint link.
-    # For example, `('bert-base-uncased', 'https://huggingface.co/bert-base-uncased')`
+    # For example, `('google-bert/bert-base-uncased', 'https://huggingface.co/google-bert/bert-base-uncased')`
     for ckpt_name, ckpt_link in checkpoints:
         # allow the link to end with `/`
         if ckpt_link.endswith("/"):