diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 00000000..c27484e6 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,18 @@ +# pygaggle Notebooks + +[![PyPI](https://img.shields.io/pypi/v/pygaggle?color=brightgreen)](https://pypi.org/project/pygaggle/) + +This holds static copies of notebooks for [PyGaggle](https://github.com/castorini/pygaggle), a neural IR and QA toolkit. + +## Colab + +The notebooks in this repo are sync'ed (by hand) with notebooks in Colab. +These online demos provide a low-effort way to try out PyGaggle's features: + ++ PyGaggle demo on CovidQA: [[GitHub]](pygaggle_covidqa_demo.ipynb) + +Click "Open in Playground" and you'll be able to replicate our results! + +## Pre-Built Indexes + +For convenience, we've pre-built a few common indexes, available to download [here](https://git.uwaterloo.ca/jimmylin/anserini-indexes). diff --git a/notebooks/pygaggle_covidqa_demo.ipynb b/notebooks/pygaggle_covidqa_demo.ipynb new file mode 100644 index 00000000..9b7e0d14 --- /dev/null +++ b/notebooks/pygaggle_covidqa_demo.ipynb @@ -0,0 +1,4237 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "pygaggle-covidqa-demo", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true, + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "bTLkqIX91TwU", + "colab_type": "text" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/castorini/pygaggle/blob/master/notebooks/pygaggle_covidqa_demo.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNJwwHLn1gLk", + "colab_type": "text" + }, + "source": [ + "# **PyGaggle CovidQA demo**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PpcDAefWlAQD", + "colab_type": "text" + 
}, + "source": [ + "## Install pyserini" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YqZRcyqHLmQw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "!pip install pyserini\n", + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U99w0fNfLdOI", + "colab_type": "text" + }, + "source": [ + "## Checkout GPU, install transformers and pygaggle" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5QJ5LE9da1j6", + "colab_type": "code", + "outputId": "2e753be9-c710-4615-97f5-3b7f3ecc19cc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 306 + } + }, + "source": [ + "!nvidia-smi" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mon May 4 21:26:13 2020 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 440.64.00 Driver Version: 418.67 CUDA Version: 10.1 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", + "|===============================+======================+======================|\n", + "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 34C P8 29W / 149W | 0MiB / 11441MiB | 0% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wLalD_vXZQWa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "# Install huggingface\n", + "!pip uninstall -y transformers\n", + "!pip install transformers" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "f57NyzEsAfH2", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "# Clone the master branch from pygaggle\n", + "!rm -rf pygaggle && pip uninstall -y pygaggle\n", + "!git clone https://github.com/castorini/pygaggle.git # use master once that branch is merged\n", + "!cd pygaggle && pip install --editable ." 
+ ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "50YgOdw3mZkI", + "colab_type": "text" + }, + "source": [ + "## Get the CORD-19 paragraph index from 2020-04-10" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "O3fns_omiNku", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%%capture\n", + "%cd /content/pygaggle\n", + "!sh scripts/update-index.sh" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LCR84XvJnZhx", + "colab_type": "text" + }, + "source": [ + "## Let's start off with BM-25" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LHrIADRnm8iP", + "colab_type": "text" + }, + "source": [ + "### First, we use the natural query string format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fVGfhujZi0E6", + "colab_type": "code", + "outputId": "6282812d-8f6e-43fc-e47e-788cd2c83d3e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 377 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:42:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:42:58.128420: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:42:59\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:43:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "100% 124/124 [00:08<00:00, 14.85it/s]\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.15\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.2164\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.61976\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 0.63185\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.24284\n", + "\u001b[32m2020-05-04 21:43:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.22116\n", + "precision@1\t0.15\n", + "recall@3\t0.2163978494623656\n", + "recall@50\t0.619758064516129\n", + "recall@1000\t0.6318548387096774\n", + "mrr\t0.24284268136968856\n", + "mrr@10\t0.22115655401945727\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "niO0PTMjnIHc", + "colab_type": "text" + }, + "source": [ + "### Then, we evaluate with keyword query format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "b-q7o10Kmwfu", + "colab_type": "code", + "outputId": "c4df4fbb-a06a-4ce2-9386-c7442807b5fd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25 \\ \n", + " --split kq" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:50:07\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:50:07.779993: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:50:09\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "usage: evaluate_kaggle_highlighter.py [-h] [--dataset DATASET] --method\n", + " {transformer,bm25,t5,seq_class_transformer,qa_transformer,random}\n", + " [--model-name MODEL_NAME]\n", + " [--split {nq,kq}]\n", + " [--batch-size BATCH_SIZE]\n", + " [--device DEVICE]\n", + " [--tokenizer-name TOKENIZER_NAME]\n", + " [--do-lower-case]\n", + " [--metrics {precision@1,recall@3,recall@50,recall@1000,mrr,mrr@10} [{precision@1,recall@3,recall@50,recall@1000,mrr,mrr@10} ...]]\n", + "evaluate_kaggle_highlighter.py: error: argument --method: invalid choice: 'bm25 --split' (choose from 'transformer', 'bm25', 't5', 'seq_class_transformer', 'qa_transformer', 'random')\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cfMPj8E3lVaI", + "colab_type": "text" + }, + "source": [ + "## Let's evaluate using our best neural ranker, T5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wnCu696xoiSC", + "colab_type": "text" + }, + "source": [ + "### Again, we first use the natural query string format" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "v7l9gCPIkU-s", + "colab_type": "code", + "outputId": "d224fd01-0bcf-4396-b688-76665d85b95e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method t5\n" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:43:22\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:43:22.711873: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:43:24\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:43:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "2020-05-04 21:43:29.564495: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:29.721468: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: T5 model weights not in cache.\n", + "2020-05-04 21:43:30.114116: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.246026: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.381720: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.data-00000-of-00002...\n", + "2020-05-04 21:43:30.528048: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:30.957448: I 
tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.data-00001-of-00002...\n", + "2020-05-04 21:43:31.089205: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:32.011302: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:32.813789: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:33.608747: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:34.332896: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:35.230923: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:36.061146: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:36.812021: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:37.640353: I 
tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:38.410710: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:39.224997: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:39.936081: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:40.722613: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:41.429997: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:42.172067: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:42.944362: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:43.645444: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:44.385052: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, 
and GCE metadata check was skipped.\n", + "2020-05-04 21:43:45.158262: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:46.089485: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:46.819715: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:47.534050: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:48.278465: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:49.047111: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:49.827815: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:50.568785: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:51.305316: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:51.993255: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] 
Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:52.138638: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:52\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.index...\n", + "2020-05-04 21:43:52.265979: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:52.654611: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:52\u001b[0m \u001b[1;30m[INFO]\u001b[0m serialize: Caching model.ckpt-1009900.meta...\n", + "2020-05-04 21:43:52.787316: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:53.797288: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:43:54.088040: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:43:54\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 acquired on /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b.lock\n", + "\u001b[32m2020-05-04 21:43:54\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: 
https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpwsf7lnkt\n", + "Downloading: 100% 1.20k/1.20k [00:00<00:00, 744kB/s]\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json in cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: creating metadata file for /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 released on /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b.lock\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": 
false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", 
+ " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file /root/.cache/covidex/ranker/config.json\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", 
+ " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:43:55\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_utils: loading weights file /root/.cache/covidex/ranker/model.ckpt.index\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Converting TensorFlow checkpoint from /root/.cache/covidex/ranker/model.ckpt\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_000/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", 
+ "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_006/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:02\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_009/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr 
with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_010/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_011/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vc with shape 
[768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading 
TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_008/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight encoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight global_step with shape []\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding with shape [32128, 768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vc with shape [32128]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for 
['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 
'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 
'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vr\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy 
weight of shape (3072, 768) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 
'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for 
['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight 
['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:04\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 
'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_009/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_009/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) 
for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 
'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 
'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 
'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 
21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch 
weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch 
weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/layer_norm/scale_slot_v\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + 
"\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (3072, 768) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:05\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 
21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 
21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + 
"\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping global_step\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['shared', 'embedding']\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vc\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vr\n", + "\u001b[32m2020-05-04 21:44:06\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Weights not copied to PyTorch model: \n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 
1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 acquired on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock\n", + "\u001b[32m2020-05-04 21:44:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp8x66xr_c\n", + "Downloading: 100% 792k/792k [00:00<00:00, 1.89MB/s]\n", + "\u001b[32m2020-05-04 
21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model in cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: creating metadata file for /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m filelock: Lock 139706941352256 released on /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f.lock\n", + "\u001b[32m2020-05-04 21:44:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m tokenization_utils: loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "100% 124/124 [05:31<00:00, 2.67s/it]\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.27419\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.43502\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.93057\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 1.0\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.4224\n", + "\u001b[32m2020-05-04 21:50:03\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.40976\n", + 
"precision@1\t0.27419354838709675\n", + "recall@3\t0.43502304147465437\n", + "recall@50\t0.9305683563748081\n", + "recall@1000\t1.0\n", + "mrr\t0.4224002621206025\n", + "mrr@10\t0.4097638248847927\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A6hvKTS3pht1", + "colab_type": "text" + }, + "source": [ + "### Finally, we evaluate using the keyword query string format." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WkJ42FRolJ0f", + "colab_type": "code", + "outputId": "1cedcbe2-feb3-4618-cf98-dfd6f3d4adfb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "!python -um pygaggle.run.evaluate_kaggle_highlighter --method t5 --split kq" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\u001b[32m2020-05-04 21:51:13\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: PyTorch version 1.5.0+cu101 available.\n", + "2020-05-04 21:51:13.856830: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1\n", + "\u001b[32m2020-05-04 21:51:15\u001b[0m \u001b[1;30m[INFO]\u001b[0m file_utils: TensorFlow version 2.2.0-rc3 available.\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Average spans: 1.5725806451612903\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random P@1: 0.011513690878122968\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random R@3: 0.034106620910472035\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m kaggle: Random MRR: 0.05247032691539293\n", + "2020-05-04 21:51:20.636924: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "2020-05-04 21:51:20.774761: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] 
Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file /root/.cache/covidex/ranker/config.json\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", + " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": 
\"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:51:20\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_utils: loading weights file /root/.cache/covidex/ranker/model.ckpt.index\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Converting TensorFlow checkpoint from /root/.cache/covidex/ranker/model.ckpt\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_000/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_000/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_001/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_001/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_002/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_003/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr 
with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_003/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_004/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_004/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_005/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_005/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_006/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_007/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_008/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_009/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:28\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_010/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
decoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/EncDecAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight decoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/block_011/layer_002/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight decoder/final_layer_norm/scale_slot_v with shape [768]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v with shape [12, 32]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Loading TF weight encoder/block_000/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_000/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v with shape [768, 768]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_001/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_002/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_002/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_002/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_003/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/SelfAttention/v_slot_vr with shape 
[768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_004/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_005/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_005/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_006/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_006/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_007/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_007/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_008/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 
21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_008/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight 
encoder/block_009/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_009/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o 
with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_010/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/k_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF 
weight encoder/block_011/layer_000/SelfAttention/o_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/o_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/q_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v with shape [768, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vc with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/SelfAttention/v_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_000/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel with shape [768, 3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc with shape [3072]\n", + 
"\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel with shape [3072, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc with shape [3072]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/block_011/layer_001/layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight encoder/final_layer_norm/scale_slot_v with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight global_step with shape []\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding with shape [32128, 768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vc with shape [32128]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Loading TF weight shared/embedding_slot_vr with shape [768]\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_000', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 
'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_000', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_000/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:29\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_001/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_001', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_001/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 
'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_002', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_002/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_003/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_003', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_003/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_004', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_004/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'EncDecAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_005', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_005/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 
'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_006', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_006/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'q']\n", 
+ "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_007', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_007/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/o_slot_vr\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for 
['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_008', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_008/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 
'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 
'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_009', 'layer_002', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_009/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/k_slot_vr\n", 
+ "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
decoder/block_010/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_010', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_010/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 
'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight 
['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: 
Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'EncDecAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/EncDecAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'DenseReluDense', 'wo', 
'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'block_011', 'layer_002', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/block_011/layer_002/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['decoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping decoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 
21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (12, 32) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'relative_attention_bias']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768, 768) for ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + 
"\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_000', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_000/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 
'block_001', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:30\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 
'block_001', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_001', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_001/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_002/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_002', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_002/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of 
shape (768,) for ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_003', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_003/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m 
modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 
21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_004', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_004/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_005/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_005', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_005/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/o_slot_vr\n", 
+ "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 
'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_006', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_006/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize 
PyTorch weight ['encoder', 'block_007', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_007', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_007/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_008/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_008', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_008/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_009/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_009', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_009/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 
'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 
'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + 
"\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_010', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_010/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'k']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/k_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'o']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/o_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 
'SelfAttention', 'q']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/q_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 768) for ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'SelfAttention', 'v']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/SelfAttention/v_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_000', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_000/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768, 3072) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wi', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wi/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (3072, 768) for ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'DenseReluDense', 'wo', 'kernel']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/DenseReluDense/wo/kernel_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'block_011', 'layer_001', 'layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/block_011/layer_001/layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Transposing numpy weight of shape (768,) for ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['encoder', 'final_layer_norm', 'scale']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping encoder/final_layer_norm/scale_slot_v\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping 
global_step\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Initialize PyTorch weight ['shared', 'embedding']\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vc\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Skipping shared/embedding_slot_vr\n", + "\u001b[32m2020-05-04 21:51:31\u001b[0m \u001b[1;30m[INFO]\u001b[0m modeling_t5: Weights not copied to PyTorch model: \n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b\n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m configuration_utils: Model config T5Config {\n", + " \"_num_labels\": 2,\n", + " \"architectures\": [\n", + " \"T5WithLMHeadModel\"\n", + " ],\n", + " \"bad_words_ids\": null,\n", + " \"bos_token_id\": null,\n", + " \"d_ff\": 3072,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 768,\n", + " \"decoder_start_token_id\": 0,\n", + " \"do_sample\": false,\n", + " \"dropout_rate\": 0.1,\n", + " \"early_stopping\": false,\n", + " \"eos_token_id\": 1,\n", + " \"finetuning_task\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_factor\": 1.0,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": true,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"t5\",\n", + " \"n_positions\": 512,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beams\": 1,\n", + " \"num_heads\": 12,\n", 
+ " \"num_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"prefix\": null,\n", + " \"pruned_heads\": {},\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"repetition_penalty\": 1.0,\n", + " \"task_specific_params\": {\n", + " \"summarization\": {\n", + " \"early_stopping\": true,\n", + " \"length_penalty\": 2.0,\n", + " \"max_length\": 200,\n", + " \"min_length\": 30,\n", + " \"no_repeat_ngram_size\": 3,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"summarize: \"\n", + " },\n", + " \"translation_en_to_de\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to German: \"\n", + " },\n", + " \"translation_en_to_fr\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to French: \"\n", + " },\n", + " \"translation_en_to_ro\": {\n", + " \"early_stopping\": true,\n", + " \"max_length\": 300,\n", + " \"num_beams\": 4,\n", + " \"prefix\": \"translate English to Romanian: \"\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torchscript\": false,\n", + " \"use_bfloat16\": false,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "\u001b[32m2020-05-04 21:51:35\u001b[0m \u001b[1;30m[INFO]\u001b[0m tokenization_utils: loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f\n", + "100% 124/124 [05:22<00:00, 2.60s/it]\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: precision@1 0.24194\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m 
\u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@3 0.36379\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@50 0.92304\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: recall@1000 1.0\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr 0.3825\n", + "\u001b[32m2020-05-04 21:56:57\u001b[0m \u001b[1;30m[INFO]\u001b[0m evaluate_kaggle_highlighter: mrr@10 0.37012\n", + "precision@1\t0.24193548387096775\n", + "recall@3\t0.36378648233486943\n", + "recall@50\t0.9230414746543779\n", + "recall@1000\t1.0\n", + "mrr\t0.38249784501639117\n", + "mrr@10\t0.3701228878648234\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file