diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 00000000..c27484e6 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,18 @@ +# pygaggle Notebooks + +[![PyPI](https://img.shields.io/pypi/v/pygaggle?color=brightgreen)](https://pypi.org/project/pygaggle/) + +This holds static copies of notebooks for [PyGaggle](https://github.com/castorini/pygaggle), a neural IR and QA toolkit. + +## Colab + +The notebooks in this repo are sync'ed (by hand) with notebooks in Colab. +These online demos provide a low-effort way to try out PyGaggle's features: + ++ PyGaggle demo on CovidQA: [[GitHub]](pygaggle_covidqa_demo.ipynb) + +Click "Open in Playground" and you'll be able to replicate our results! + +## Pre-Built Indexes + +For convenience, we've pre-built a few common indexes, available to download [here](https://git.uwaterloo.ca/jimmylin/anserini-indexes). diff --git a/notebooks/pygaggle_covidqa_demo.ipynb b/notebooks/pygaggle_covidqa_demo.ipynb new file mode 100644 index 00000000..9b7e0d14 --- /dev/null +++ b/notebooks/pygaggle_covidqa_demo.ipynb @@ -0,0 +1,4237 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "pygaggle-covidqa-demo", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true, + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "bTLkqIX91TwU", + "colab_type": "text" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/castorini/pygaggle/blob/master/notebooks/pygaggle_covidqa_demo.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNJwwHLn1gLk", + "colab_type": "text" + }, + "source": [ + "# **PyGaggle CovidQA demo**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PpcDAefWlAQD", + "colab_type": "text" + 
}, + "source": [ + "## Install pyserini" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YqZRcyqHLmQw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "!pip install pyserini\n", + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U99w0fNfLdOI", + "colab_type": "text" + }, + "source": [ + "## Checkout GPU, install transformers and pygaggle" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5QJ5LE9da1j6", + "colab_type": "code", + "outputId": "2e753be9-c710-4615-97f5-3b7f3ecc19cc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 306 + } + }, + "source": [ + "!nvidia-smi" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mon May 4 21:26:13 2020 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 440.64.00 Driver Version: 418.67 CUDA Version: 10.1 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", + "|===============================+======================+======================|\n", + "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 34C P8 29W / 149W | 0MiB / 11441MiB | 0% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wLalD_vXZQWa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "# Install huggingface\n", + "!pip uninstall -y transformers\n", + "!pip install transformers" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "f57NyzEsAfH2", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "# Clone the master branch from pygaggle\n", + "!rm -rf pygaggle && pip uninstall -y pygaggle\n", + "!git clone https://github.com/castorini/pygaggle.git # use master once that branch is merged\n", + "!cd pygaggle && pip install --editable ." 
+ ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "50YgOdw3mZkI", + "colab_type": "text" + }, + "source": [ + "## Get the CORD-19 paragraph index from 2020-04-10" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "O3fns_omiNku", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "%cd /content/pygaggle\n", + "!sh scripts/update-index.sh" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LCR84XvJnZhx", + "colab_type": "text" + }, + "source": [ + "## Let's start off with BM-25" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LHrIADRnm8iP", + "colab_type": "text" + }, + "source": [ + "### First, we use the natural query string format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fVGfhujZi0E6", + "colab_type": "code", + "outputId": "6282812d-8f6e-43fc-e47e-788cd2c83d3e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 377 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:42:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:42:58.128420: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:42:59\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "100% 124/124 [00:08<00:00, 14.85it/s]\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.15\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.2164\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.61976\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 0.63185\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.24284\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.22116\n", + "precision@1\t0.15\n", + "recall@3\t0.2163978494623656\n", + "recall@50\t0.619758064516129\n", + "recall@1000\t0.6318548387096774\n", + "mrr\t0.24284268136968856\n", + "mrr@10\t0.22115655401945727\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "niO0PTMjnIHc", + "colab_type": "text" + }, + "source": [ + "### Then, we evaluate with keyword query format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "b-q7o10Kmwfu", + "colab_type": "code", + "outputId": "c4df4fbb-a06a-4ce2-9386-c7442807b5fd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25 \\ \n", + " --split kq" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:50:07\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:50:07.779993: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:50:09\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "usage: evaluate_kaggle_highlighter.py [-h] [--dataset DATASET] --method\n", + " {transformer,bm25,t5,seq_class_transformer,qa_transformer,random}\n", + " [--model-name MODEL_NAME]\n", + " [--split {nq,kq}]\n", + " [--batch-size BATCH_SIZE]\n", + " [--device DEVICE]\n", + " [--tokenizer-name TOKENIZER_NAME]\n", + " [--do-lower-case]\n", + " [--metrics {precision@1,recall@3,recall@50,recall@1000,mrr,mrr@10} [{precision@1,recall@3,recall@50,recall@1000,mrr,mrr@10} ...]]\n", + "evaluate_kaggle_highlighter.py: error: argument --method: invalid choice: 'bm25 --split' (choose from 'transformer', 'bm25', 't5', 'seq_class_transformer', 'qa_transformer', 'random')\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cfMPj8E3lVaI", + "colab_type": "text" + }, + "source": [ + "## Let's evaluate using our best neural ranker, T5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wnCu696xoiSC", + "colab_type": "text" + }, + "source": [ + "### Again, we first use the natural query string format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "v7l9gCPIkU-s", + "colab_type": "code", + "outputId": "d224fd01-0bcf-4396-b688-76665d85b95e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method t5\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:43:22\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:43:22.711873: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:43:24\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "2020-05-04 21:43:29.564495: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:29.721468: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: T5 model weights not in cache.\n", + "2020-05-04 21:43:30.114116: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.246026: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.381720: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.data-00000-of-00002...\n", + "2020-05-04 21:43:30.528048: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.957448: I 
tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.data-00001-of-00002...\n", + "2020-05-04 21:43:31.089205: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:32.011302: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:32.813789: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:33.608747: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:34.332896: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:35.230923: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:36.061146: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:36.812021: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:37.640353: I 
tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:38.410710: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:39.224997: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:39.936081: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:40.722613: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:41.429997: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:42.172067: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:42.944362: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:43.645444: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:44.385052: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, 
and GCE metadata check was skipped.\n", + "2020-05-04 21:43:45.158262: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:46.089485: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:46.819715: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:47.534050: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:48.278465: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:49.047111: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:49.827815: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:50.568785: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:51.305316: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:51.993255: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] 
Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:52.138638: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:52\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.index...\n", + "2020-05-04 21:43:52.265979: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:52.654611: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:52\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.meta...\n", + "2020-05-04 21:43:52.787316: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:53.797288: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:54.088040: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:54\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 acquired on /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b.lock\n", + "\u001b[32m2020-05-04 21:43:54\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: 
https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpwsf7lnkt\n", + "Downloading: 100% 1.20k/1.20k [00:00<00:00, 744kB/s]\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json in cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: creating metadata file for /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 released on /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b.lock\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": 
false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", 
+ " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file /root/.cache/covidex/ranker/config.json\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", 
+ " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_utils: loading weights file /root/.cache/covidex/ranker/model.ckpt.index\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Converting TensorFlow checkpoint from /root/.cache/covidex/ranker/model.ckpt\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_000/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", 
+ "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_006/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_009/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr 
with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_010/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_011/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vc with shape 
[768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading 
TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_008/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight encoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight global_step with shape []\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding with shape [32128, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vc with shape [32128]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for 
['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 
'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 
'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vr\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy 
weight of shape (3072, 768) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 
'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight 
['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 
'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_009/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_009/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) 
for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 
'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 
'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 
'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch 
weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch 
weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/layer_norm/scale_slot_v\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (3072, 768) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 
21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 
21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping global_step\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['shared', 'embedding']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Weights not copied to PyTorch model: \n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 
1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 acquired on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp8x66xr_c\n", + "Downloading: 100% 792k/792k [00:00<00:00, 1.89MB/s]\n", + "\u001b[32m2020-05-04 
21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model in cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: creating metadata file for /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 released on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m tokenization_utils: loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "100% 124/124 [05:31<00:00, 2.67s/it]\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.27419\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.43502\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.93057\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 1.0\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.4224\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.40976\n", + 
"precision@1\t0.27419354838709675\n", + "recall@3\t0.43502304147465437\n", + "recall@50\t0.9305683563748081\n", + "recall@1000\t1.0\n", + "mrr\t0.4224002621206025\n", + "mrr@10\t0.4097638248847927\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A6hvKTS3pht1", + "colab_type": "text" + }, + "source": [ + "### Finally, we evaluate using the keyword query string format." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WkJ42FRolJ0f", + "colab_type": "code", + "outputId": "1cedcbe2-feb3-4618-cf98-dfd6f3d4adfb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method t5 --split kq" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:51:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:51:13.856830: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:51:15\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "2020-05-04 21:51:20.636924: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:51:20.774761: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] 
Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file /root/.cache/covidex/ranker/config.json\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": 
\"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_utils: loading weights file /root/.cache/covidex/ranker/model.ckpt.index\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Converting TensorFlow checkpoint from /root/.cache/covidex/ranker/model.ckpt\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_000/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_003/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr 
with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_004/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale_slot_v with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Loading TF weight encoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vr with shape 
[768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_009/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight encoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight global_step with shape []\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding with shape [32128, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vc with shape [32128]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 
'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 
'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 
'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", 
+ "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vr\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for 
['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 
'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 
'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vr\n", 
+ "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_010/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 
'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight 
['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 
'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 
'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 
'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 
21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vr\n", 
+ "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 
'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 
'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 
'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 
'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
global_step\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['shared', 'embedding']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Weights not copied to PyTorch model: \n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", 
+ " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m tokenization_utils: loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "100% 124/124 [05:22<00:00, 2.60s/it]\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.24194\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.36379\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.92304\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 1.0\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.3825\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.37012\n", + "precision@1\t0.24193548387096775\n", + "recall@3\t0.36378648233486943\n", + "recall@50\t0.9230414746543779\n", + "recall@1000\t1.0\n", + "mrr\t0.38249784501639117\n", + "mrr@10\t0.3701228878648234\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file