diff --git a/torchbenchmark/models/maml/LICENSE b/torchbenchmark/models/maml/LICENSE
deleted file mode 100644
index 6f1b377b33..0000000000
--- a/torchbenchmark/models/maml/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2019 Jackie Loong
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/torchbenchmark/models/maml/README.md b/torchbenchmark/models/maml/README.md
deleted file mode 100644
index efa8d42459..0000000000
--- a/torchbenchmark/models/maml/README.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# MAML-Pytorch
-PyTorch implementation of the supervised learning experiments from the paper
-[Model-Agnostic Meta-Learning (MAML)](https://arxiv.org/abs/1703.03400).
-
-> Version 1.0: Both the `MiniImagenet` and `Omniglot` datasets are supported! Have fun~
-
-> Version 2.0: Rewrote the meta learner and the basic learner and fixed some serious bugs from version 1.0.
-
-For the TensorFlow implementation, please visit the official repo [HERE](https://github.com/cbfinn/maml) and a simpler version [HERE](https://github.com/dragen1860/MAML-TensorFlow).
-
-For a first-order approximation implementation, namely Reptile, please visit [HERE](https://github.com/dragen1860/Reptile-Pytorch).
-
-![heart](res/heart.gif)
-
-# Platform
-- python: 3.x
-- Pytorch: 0.4+
-
-# MiniImagenet
-
-## Howto
-
-For the 5-way 1-shot experiment, it allocates nearly 6GB of GPU memory.
-
-1. Download the `MiniImagenet` dataset from [here](https://github.com/dragen1860/LearningToCompare-Pytorch/issues/4) and the `train/val/test.csv` splits from [here](https://github.com/twitter/meta-learning-lstm/tree/master/data/miniImagenet).
-2. Extract it like:
-```shell
-miniimagenet/
-├── images
-    ├── n0210891500001298.jpg
-    ├── n0287152500001298.jpg
-    ...
-├── test.csv
-├── val.csv
-└── train.csv
-```
-3. Modify the `path` in `miniimagenet_train.py`:
-```python
-    mini = MiniImagenet('miniimagenet/', mode='train', n_way=args.n_way, k_shot=args.k_spt,
-                        k_query=args.k_qry,
-                        batchsz=10000, resize=args.imgsz)
-    ...
-    mini_test = MiniImagenet('miniimagenet/', mode='test', n_way=args.n_way, k_shot=args.k_spt,
-                             k_query=args.k_qry,
-                             batchsz=100, resize=args.imgsz)
-```
-so that it points to your actual data path.
-
-4. Just run `python miniimagenet_train.py`; a running screenshot is shown below:
-![screenshot-miniimagenet](res/mini-screen.png)
-
-If your reproduced performance is not as good, try training for more epochs; MAML is notorious for being hard to train.
-This implementation therefore only provides a basic starting point for your research,
-and the performance below is real and was achieved on my machine.
-
-## Benchmark
-
-| Model                               | Fine Tune | 5-way Acc. |        | 20-way Acc.|        |
-|-------------------------------------|-----------|------------|--------|------------|--------|
-|                                     |           | 1-shot     | 5-shot | 1-shot     | 5-shot |
-| Matching Nets                       | N         | 43.56%     | 55.31% | 17.31%     | 22.69% |
-| Meta-LSTM                           |           | 43.44%     | 60.60% | 16.70%     | 26.06% |
-| MAML                                | Y         | 48.7%      | 63.11% | 16.49%     | 19.29% |
-| **Ours**                            | Y         | 46.2%      | 60.3%  | -          | -      |
-
-# Omniglot
-
-## Howto
-Run `python omniglot_train.py`; the program will download the `omniglot` dataset automatically.
-
-Decrease the value of `args.task_num` to fit your GPU memory capacity.
-
-For the 5-way 1-shot experiment, it allocates nearly 3GB of GPU memory.
-
-# Citing this Repo
-```
-@misc{MAML_Pytorch,
-  author = {Liangqu Long},
-  title = {MAML-Pytorch Implementation},
-  year = {2018},
-  publisher = {GitHub},
-  journal = {GitHub repository},
-  howpublished = {\url{https://github.com/dragen1860/MAML-Pytorch}},
-  commit = {master}
-}
-```
diff --git a/torchbenchmark/models/maml/__init__.py b/torchbenchmark/models/maml/__init__.py
deleted file mode 100644
index 9b0d2d7456..0000000000
--- a/torchbenchmark/models/maml/__init__.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import numpy as np
-import random
-import time
-import torch
-from argparse import Namespace
-from .meta import Meta
-from pathlib import Path
-from typing import Tuple
-from ...util.model import BenchmarkModel
-from torchbenchmark.tasks import OTHER
-
-torch.backends.cudnn.deterministic = True
-torch.backends.cudnn.benchmark = False
-
-
-class Model(BenchmarkModel):
-    task = OTHER.OTHER_TASKS
-    DEFAULT_TRAIN_BSIZE = 1
-    DEFAULT_EVAL_BSIZE = 1
-    ALLOW_CUSTOMIZE_BSIZE = False
-    # Skip the correctness check, because maml runs backward and an optimizer step in eval(),
-    # which returns non-deterministic results.
-    SKIP_CORRECTNESS_CHECK = True
-
-    def __init__(self, test, device, jit, batch_size=None, extra_args=[]):
-        super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)
-
-        # load from disk or synthesize data
-        use_data_file = False
-        debug_print = False
-        root = str(Path(__file__).parent)
-        args = Namespace(**{
-            'n_way': 5,
-            'k_spt': 1,
-            'k_qry': 15,
-            'imgsz': 28,
-            'imgc': 1,
-            'task_num': 32,
-            'meta_lr': 1e-3,
-            'update_lr': 0.4,
-            'update_step': 5,
-            'update_step_test': 10
-        })
-        config = [
-            ('conv2d', [64, args.imgc, 3, 3, 2, 0]),
-            ('relu', [True]),
-            ('bn', [64]),
-            ('conv2d', [64, 64, 3, 3, 2, 0]),
-            ('relu', [True]),
-            ('bn', [64]),
-            ('conv2d', [64, 64, 3, 3, 2, 0]),
-            ('relu', [True]),
-            ('bn', [64]),
-            ('conv2d', [64, 64, 2, 2, 1, 0]),
-            ('relu', [True]),
-            ('bn', [64]),
-            ('flatten', []),
-            ('linear', [args.n_way, 64])
-        ]
-
-        self.module = Meta(args, config).to(device)
-
-        if use_data_file:
-            self.example_inputs = torch.load(f'{root}/batch.pt')
-            self.example_inputs = tuple([torch.from_numpy(i).to(self.device) for i in self.example_inputs])
-        else:
-            # synthesize data parameterized by arg values
-            self.example_inputs = (
-                torch.randn(args.task_num, args.n_way, args.imgc, args.imgsz, args.imgsz).to(device),
-                torch.randint(0, args.n_way, [args.task_num, args.n_way], dtype=torch.long).to(device),
-                torch.randn(args.task_num, args.n_way * args.k_qry, args.imgc, args.imgsz, args.imgsz).to(device),
-                torch.randint(0, args.n_way, [args.task_num, args.n_way * args.k_qry], dtype=torch.long).to(device))
-
-        # print input shapes
-        if debug_print:
-            for i in range(len(self.example_inputs)):
-                print(self.example_inputs[i].shape)
-
-    def get_module(self):
-        return self.module, self.example_inputs
-
-    def eval(self) -> Tuple[torch.Tensor]:
-        out = self.module(*self.example_inputs)
-        return (out, )
-
-    def train(self):
-        raise NotImplementedError("MAML model doesn't support train.")
-
-    def eval_in_nograd(self):
-        return False
diff --git a/torchbenchmark/models/maml/batch.pt b/torchbenchmark/models/maml/batch.pt
deleted file mode 100644
index 5ac4490bc4..0000000000
--- a/torchbenchmark/models/maml/batch.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:141141dcc7e260454405e7f15ec5550495bbeaed691c3ab19181b0dc00a9d91d
-size 10428868
diff --git a/torchbenchmark/models/maml/install.py b/torchbenchmark/models/maml/install.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/torchbenchmark/models/maml/learner.py b/torchbenchmark/models/maml/learner.py
deleted file mode 100644
index 974405026a..0000000000
--- a/torchbenchmark/models/maml/learner.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import torch
-from torch import nn
-from torch.nn import functional as F
-import numpy as np
-from typing import List
-
-
-class Learner(nn.Module):
-    """
-    Functional CNN learner whose parameters can be supplied externally as fast weights.
-    """
-
-    def __init__(self, config, imgc, imgsz):
-        """
-        :param config: network config, type: list of (string, list)
-        :param imgc: 1 or 3
-        :param imgsz: 28 or 84
-        """
-        super(Learner, self).__init__()
-
-        self.config = config
-
-        # this list contains all tensors that need to be optimized
-        self.vars = nn.ParameterList()
-        # running_mean and running_var
-        self.vars_bn = nn.ParameterList()
-
-        for i, (name, param) in enumerate(self.config):
-            if name == 'conv2d':
-                # [ch_out, ch_in, kernelsz, kernelsz]
-                w = nn.Parameter(torch.ones(*param[:4]))
-                # gain=1 according to cbfinn's implementation
-                torch.nn.init.kaiming_normal_(w)
-                self.vars.append(w)
-                # [ch_out]
-                self.vars.append(nn.Parameter(torch.zeros(param[0])))
-
-            elif name == 'convt2d':
-                # [ch_in, ch_out, kernelsz, kernelsz, stride, padding]
-                w = nn.Parameter(torch.ones(*param[:4]))
-                # gain=1 according to cbfinn's implementation
-                torch.nn.init.kaiming_normal_(w)
-                self.vars.append(w)
-                # [ch_in, ch_out]
-                self.vars.append(nn.Parameter(torch.zeros(param[1])))
-
-            elif name == 'linear':
-                # [ch_out, ch_in]
-                w = nn.Parameter(torch.ones(*param))
-                # gain=1 according to cbfinn's implementation
-                torch.nn.init.kaiming_normal_(w)
-                self.vars.append(w)
-                # [ch_out]
-                self.vars.append(nn.Parameter(torch.zeros(param[0])))
-
-            elif name == 'bn':
-                # [ch_out]
-                w = nn.Parameter(torch.ones(param[0]))
-                self.vars.append(w)
-                # [ch_out]
-                self.vars.append(nn.Parameter(torch.zeros(param[0])))
-
-                # must set requires_grad=False
-                running_mean = nn.Parameter(torch.zeros(param[0]), requires_grad=False)
-                running_var = nn.Parameter(torch.ones(param[0]), requires_grad=False)
-                self.vars_bn.extend([running_mean, running_var])
-
-            elif name in ['tanh', 'relu', 'upsample', 'avg_pool2d', 'max_pool2d',
-                          'flatten', 'reshape', 'leakyrelu', 'sigmoid']:
-                continue
-            else:
-                raise NotImplementedError
-
-    def extra_repr(self):
-        info = ''
-
-        for name, param in self.config:
-            if name == 'conv2d':
-                tmp = 'conv2d:(ch_in:%d, ch_out:%d, k:%dx%d, stride:%d, padding:%d)'\
-                      %(param[1], param[0], param[2], param[3], param[4], param[5],)
-                info += tmp + '\n'
-
-            elif name == 'convt2d':
-                tmp = 'convTranspose2d:(ch_in:%d, ch_out:%d, k:%dx%d, stride:%d, padding:%d)'\
-                      %(param[0], param[1], param[2], param[3], param[4], param[5],)
-                info += tmp + '\n'
-
-            elif name == 'linear':
-                tmp = 'linear:(in:%d, out:%d)'%(param[1], param[0])
-                info += tmp + '\n'
-
-            elif name == 'leakyrelu':
-                tmp = 'leakyrelu:(slope:%f)'%(param[0])
-                info += tmp + '\n'
-
-            elif name == 'avg_pool2d':
-                tmp = 'avg_pool2d:(k:%d, stride:%d, padding:%d)'%(param[0], param[1], param[2])
-                info += tmp + '\n'
-            elif name == 'max_pool2d':
-                tmp = 'max_pool2d:(k:%d, stride:%d, padding:%d)'%(param[0], param[1], param[2])
-                info += tmp + '\n'
-            elif name in ['flatten', 'tanh', 'relu', 'upsample', 'reshape', 'sigmoid', 'use_logits', 'bn']:
-                tmp = name + ':' + str(tuple(param))
-                info += tmp + '\n'
-            else:
-                raise NotImplementedError
-
-        return info
-
-    def forward(self, x, vars=None, bn_training=True):
-        """
-        This function can also be called during fine-tuning; however, during fine-tuning we do not
-        wish to update running_mean/running_var. Although the bn weights/biases are updated, they
-        have been separated out as fast_weights. To avoid updating running_mean/running_var, pass
-        bn_training=False; the bn weight/bias will still be updated through fast_weights without
-        dirtying the initial theta parameters.
-        :param x: [b, 1, 28, 28]
-        :param vars: optional list of parameters to use instead of self.vars
-        :param bn_training: set False to not update the batch-norm running statistics
-        :return: the network output x
-        """
-
-        if vars is None:
-            vars = self.vars
-
-        idx = 0
-        bn_idx = 0
-
-        for name, param in self.config:
-            if name == 'conv2d':
-                w, b = vars[idx], vars[idx + 1]
-                # remember to keep forward_encoder and forward_decoder synchronized!
-                x = F.conv2d(x, w, b, stride=param[4], padding=param[5])
-                idx += 2
-                # print(name, param, '\tout:', x.shape)
-            elif name == 'convt2d':
-                w, b = vars[idx], vars[idx + 1]
-                # remember to keep forward_encoder and forward_decoder synchronized!
-                x = F.conv_transpose2d(x, w, b, stride=param[4], padding=param[5])
-                idx += 2
-                # print(name, param, '\tout:', x.shape)
-            elif name == 'linear':
-                w, b = vars[idx], vars[idx + 1]
-                x = F.linear(x, w, b)
-                idx += 2
-                # print('forward:', idx, x.norm().item())
-            elif name == 'bn':
-                w, b = vars[idx], vars[idx + 1]
-                running_mean, running_var = self.vars_bn[bn_idx], self.vars_bn[bn_idx+1]
-                x = F.batch_norm(x, running_mean, running_var, weight=w, bias=b, training=bn_training)
-                idx += 2
-                bn_idx += 2
-
-            elif name == 'flatten':
-                # print(x.shape)
-                x = x.view(x.size(0), -1)
-            elif name == 'reshape':
-                # [b, 8] => [b, 2, 2, 2]
-                x = x.view(x.size(0), *param)
-            elif name == 'relu':
-                x = F.relu(x, inplace=param[0])
-            elif name == 'leakyrelu':
-                x = F.leaky_relu(x, negative_slope=param[0], inplace=param[1])
-            elif name == 'tanh':
-                x = torch.tanh(x)
-            elif name == 'sigmoid':
-                x = torch.sigmoid(x)
-            elif name == 'upsample':
-                x = F.upsample_nearest(x, scale_factor=param[0])
-            elif name == 'max_pool2d':
-                x = F.max_pool2d(x, param[0], param[1], param[2])
-            elif name == 'avg_pool2d':
-                x = F.avg_pool2d(x, param[0], param[1], param[2])
-            else:
-                raise NotImplementedError
-
-        # make sure all variables were used properly
-        assert idx == len(vars)
-        assert bn_idx == len(self.vars_bn)
-
-        return x
-
-    def zero_grad(self, vars=None):
-        """
-        Zero the gradients of self.vars (or of the given vars).
-        :param vars: optional list of parameters whose gradients should be zeroed
-        :return:
-        """
-        with torch.no_grad():
-            if vars is None:
-                for p in self.vars:
-                    if p.grad is not None:
-                        p.grad.zero_()
-            else:
-                for p in vars:
-                    if p.grad is not None:
-                        p.grad.zero_()
-
-    def parameters(self):
-        """
-        Override this function since the default implementation returns a generator.
-        :return: self.vars
-        """
-        return self.vars
\ No newline at end of file
diff --git a/torchbenchmark/models/maml/meta.py b/torchbenchmark/models/maml/meta.py
deleted file mode 100644
index 9c53d79979..0000000000
--- a/torchbenchmark/models/maml/meta.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import torch
-from torch import nn
-from torch import optim
-from torch.nn import functional as F
-from torch.utils.data import TensorDataset, DataLoader
-import numpy as np
-
-from .learner import Learner
-from copy import deepcopy
-
-
-class Meta(nn.Module):
-    """
-    Meta Learner
-    """
-    def __init__(self, args, config):
-        """
-        :param args: namespace of hyperparameters (n_way, k_spt, k_qry, learning rates, update steps, ...)
-        :param config: network config passed on to Learner
-        """
-        super(Meta, self).__init__()
-
-        self.update_lr = args.update_lr
-        self.meta_lr = args.meta_lr
-        self.n_way = args.n_way
-        self.k_spt = args.k_spt
-        self.k_qry = args.k_qry
-        self.task_num = args.task_num
-        self.update_step = args.update_step
-        self.update_step_test = args.update_step_test
-
-        self.net = Learner(config, args.imgc, args.imgsz)
-        self.meta_optim = optim.Adam(self.net.parameters(), lr=self.meta_lr)
-
-    def clip_grad_by_norm_(self, grad, max_norm):
-        """
-        In-place gradient clipping.
-        :param grad: list of gradients
-        :param max_norm: maximum allowable norm
-        :return: average norm of the gradients
-        """
-        total_norm = 0
-        counter = 0
-        for g in grad:
-            param_norm = g.data.norm(2)
-            total_norm += param_norm.item() ** 2
-            counter += 1
-        total_norm = total_norm ** (1. / 2)
-
-        clip_coef = max_norm / (total_norm + 1e-6)
-        if clip_coef < 1:
-            for g in grad:
-                g.data.mul_(clip_coef)
-
-        return total_norm/counter
-
-    def forward(self, x_spt, y_spt, x_qry, y_qry):
-        if self.training:
-            return self.forward_train(x_spt, y_spt, x_qry, y_qry)
-        else:
-            return self.finetunning(x_spt[0], y_spt[0], x_qry[0], y_qry[0])
-
-    def forward_train(self, x_spt, y_spt, x_qry, y_qry):
-        """
-        :param x_spt: [b, setsz, c_, h, w]
-        :param y_spt: [b, setsz]
-        :param x_qry: [b, querysz, c_, h, w]
-        :param y_qry: [b, querysz]
-        :return: accuracies on the query set after each update step
-        """
-        task_num, setsz, c_, h, w = x_spt.size()
-        querysz = x_qry.size(1)
-
-        losses_q = [0 for _ in range(self.update_step + 1)]  # losses_q[i] is the loss on step i
-        corrects = [0 for _ in range(self.update_step + 1)]
-
-        for i in range(task_num):
-
-            # 1. run the i-th task and compute the loss for k=0
-            logits = self.net(x_spt[i], vars=None, bn_training=True)
-            loss = F.cross_entropy(logits, y_spt[i])
-            grad = torch.autograd.grad(loss, self.net.parameters())
-            fast_weights = list([p[1] - self.update_lr * p[0] for p in zip(grad, self.net.parameters())])
-
-            # this is the loss and accuracy before the first update
-            with torch.no_grad():
-                # [setsz, nway]
-                logits_q = self.net(x_qry[i], self.net.parameters(), bn_training=True)
-                loss_q = F.cross_entropy(logits_q, y_qry[i])
-                losses_q[0] += loss_q
-
-                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-                correct = torch.eq(pred_q, y_qry[i]).sum().item()
-                corrects[0] = corrects[0] + correct
-
-            # this is the loss and accuracy after the first update
-            with torch.no_grad():
-                # [setsz, nway]
-                logits_q = self.net(x_qry[i], fast_weights, bn_training=True)
-                loss_q = F.cross_entropy(logits_q, y_qry[i])
-                losses_q[1] += loss_q
-                # [setsz]
-                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-                correct = torch.eq(pred_q, y_qry[i]).sum().item()
-                corrects[1] = corrects[1] + correct
-
-            for k in range(1, self.update_step):
-                # 1. run the i-th task and compute the loss for k=1~K-1
-                logits = self.net(x_spt[i], fast_weights, bn_training=True)
-                loss = F.cross_entropy(logits, y_spt[i])
-                # 2. compute grad on theta_pi
-                grad = torch.autograd.grad(loss, fast_weights)
-                # 3. theta_pi = theta_pi - train_lr * grad
-                fast_weights = [p[1] - self.update_lr * p[0] for p in zip(grad, fast_weights)]
-
-                logits_q = self.net(x_qry[i], fast_weights, bn_training=True)
-                # loss_q is overwritten each step; only the loss_q of the last update step is kept.
-                loss_q = F.cross_entropy(logits_q, y_qry[i])
-                losses_q[k + 1] += loss_q
-
-                with torch.no_grad():
-                    pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-                    correct = torch.eq(pred_q, y_qry[i]).sum().item()  # scalar
-                    corrects[k + 1] = corrects[k + 1] + correct
-
-        # end of all tasks
-        # average the last-step query-set loss over all tasks
-        loss_q = losses_q[-1] / task_num
-
-        # optimize theta parameters
-        self.meta_optim.zero_grad()
-        loss_q.backward()
-        # print('meta update')
-        # for p in self.net.parameters()[:5]:
-        #     print(torch.norm(p).item())
-        self.meta_optim.step()
-
-        accs = torch.tensor(corrects) / (querysz * task_num)
-
-        return accs
-
-    def finetunning(self, x_spt, y_spt, x_qry, y_qry):
-        """
-        :param x_spt: [setsz, c_, h, w]
-        :param y_spt: [setsz]
-        :param x_qry: [querysz, c_, h, w]
-        :param y_qry: [querysz]
-        :return: accuracies on the query set after each update step
-        """
-        querysz = x_qry.size(0)
-
-        corrects = [0 for _ in range(self.update_step_test + 1)]
-
-        # in order not to ruin the state of running_mean/variance and bn_weight/bias,
-        # we fine-tune on a copy of the model instead of self.net
-        net = deepcopy(self.net)
-
-        # 1. run the task and compute the loss for k=0
-        logits = net(x_spt)
-        loss = F.cross_entropy(logits, y_spt)
-        grad = torch.autograd.grad(loss, net.parameters())
-        fast_weights = list(map(lambda p: p[1] - self.update_lr * p[0], zip(grad, net.parameters())))
-
-        # this is the loss and accuracy before the first update
-        with torch.no_grad():
-            # [setsz, nway]
-            logits_q = net(x_qry, net.parameters(), bn_training=True)
-            # [setsz]
-            pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-            # scalar
-            correct = torch.eq(pred_q, y_qry).sum().item()
-            corrects[0] = corrects[0] + correct
-
-        # this is the loss and accuracy after the first update
-        with torch.no_grad():
-            # [setsz, nway]
-            logits_q = net(x_qry, fast_weights, bn_training=True)
-            # [setsz]
-            pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-            # scalar
-            correct = torch.eq(pred_q, y_qry).sum().item()
-            corrects[1] = corrects[1] + correct
-
-        for k in range(1, self.update_step_test):
-            # 1. run the task and compute the loss for k=1~K-1
-            logits = net(x_spt, fast_weights, bn_training=True)
-            loss = F.cross_entropy(logits, y_spt)
-            # 2. compute grad on theta_pi
-            grad = torch.autograd.grad(loss, fast_weights)
-            # 3. theta_pi = theta_pi - train_lr * grad
-            fast_weights = list(map(lambda p: p[1] - self.update_lr * p[0], zip(grad, fast_weights)))
-
-            logits_q = net(x_qry, fast_weights, bn_training=True)
-            # loss_q is overwritten each step; only the loss_q of the last update step is kept.
-            loss_q = F.cross_entropy(logits_q, y_qry)
-
-            with torch.no_grad():
-                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
-                correct = torch.eq(pred_q, y_qry).sum().item()  # scalar
-                corrects[k + 1] = corrects[k + 1] + correct
-
-        del net
-
-        accs = torch.tensor(corrects) / querysz
-
-        return accs
-
-
-def main():
-    pass
-
-
-if __name__ == '__main__':
-    main()
diff --git a/torchbenchmark/models/maml/metadata.yaml b/torchbenchmark/models/maml/metadata.yaml
deleted file mode 100644
index 06b9110c46..0000000000
--- a/torchbenchmark/models/maml/metadata.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-eval_benchmark: false
-eval_deterministic: true
-eval_nograd: true
-train_benchmark: false
-train_deterministic: true
-not_implemented:
-  - jit: true
\ No newline at end of file
diff --git a/torchbenchmark/models/maml/origin b/torchbenchmark/models/maml/origin
deleted file mode 100644
index c3f766d92f..0000000000
--- a/torchbenchmark/models/maml/origin
+++ /dev/null
@@ -1 +0,0 @@
-https://github.com/dragen1860/MAML-Pytorch
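Note for readers unfamiliar with the removed model: the heart of the deleted `meta.py` is the standard MAML double loop — an inner loop that builds per-task `fast_weights` with plain SGD steps on the support set, and an outer loop that backpropagates the query-set loss of the adapted weights into the original parameters. The sketch below is an illustrative toy reimplementation of that pattern, not part of this repository: the functional `net(x, params)`, `feat_dim`, and the reduced `task_num=4` are assumptions (the other hyperparameter values mirror the defaults in the deleted `__init__.py`), and it passes `create_graph=True` for the full second-order MAML, whereas the deleted code calls `torch.autograd.grad` without it.

```python
# Illustrative-only sketch of the MAML double loop (hypothetical toy setup, not the deleted benchmark).
import torch
import torch.nn.functional as F

torch.manual_seed(0)

n_way, k_spt, k_qry, task_num = 5, 1, 15, 4          # task_num reduced for the sketch
update_lr, meta_lr, update_step = 0.4, 1e-3, 5
feat_dim = 64                                         # assumed toy feature size

# A tiny functional "learner": logits = x @ W.T + b, with parameters passed in explicitly
# so that inner-loop fast weights never overwrite the meta-parameters.
def net(x, params):
    w, b = params
    return F.linear(x, w, b)

meta_params = [
    torch.zeros(n_way, feat_dim, requires_grad=True),
    torch.zeros(n_way, requires_grad=True),
]
meta_optim = torch.optim.Adam(meta_params, lr=meta_lr)

# Synthetic episodes, shaped like the benchmark's example_inputs (features instead of images).
x_spt = torch.randn(task_num, n_way * k_spt, feat_dim)
y_spt = torch.randint(0, n_way, (task_num, n_way * k_spt))
x_qry = torch.randn(task_num, n_way * k_qry, feat_dim)
y_qry = torch.randint(0, n_way, (task_num, n_way * k_qry))

meta_loss = 0.0
for i in range(task_num):
    # Inner loop: a few SGD steps on the support set, kept in the autograd graph
    # (create_graph=True) so the outer loss can differentiate through them.
    fast = meta_params
    for _ in range(update_step):
        loss = F.cross_entropy(net(x_spt[i], fast), y_spt[i])
        grads = torch.autograd.grad(loss, fast, create_graph=True)
        fast = [p - update_lr * g for p, g in zip(fast, grads)]
    # Outer objective: query-set loss of the adapted (fast) weights.
    meta_loss = meta_loss + F.cross_entropy(net(x_qry[i], fast), y_qry[i])

# Meta update on the original parameters, averaged over tasks.
meta_optim.zero_grad()
(meta_loss / task_num).backward()
meta_optim.step()
print('meta loss:', (meta_loss / task_num).item())
```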