diff --git a/.devcontainer/environment.yml b/.devcontainer/environment.yml
index 8525b32f..1c981788 100644
--- a/.devcontainer/environment.yml
+++ b/.devcontainer/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn=0.24.2
   - scipy=1.6.2
   - pip=21.0.1
+  - conda-forge::opencv=4.5
   - setuptools=58.0.4
   - pytorch::pytorch=1.11.0
   - pytorch::torchtext=0.12.0
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
index 3fbd847c..743f212a 100644
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
diff --git a/binder/environment.yml b/binder/environment.yml
index 8525b32f..1c981788 100644
--- a/binder/environment.yml
+++ b/binder/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn=0.24.2
   - scipy=1.6.2
   - pip=21.0.1
+  - conda-forge::opencv=4.5
   - setuptools=58.0.4
   - pytorch::pytorch=1.11.0
   - pytorch::torchtext=0.12.0
diff --git a/binder/requirements.txt b/binder/requirements.txt
index 3fbd847c..743f212a 100644
--- a/binder/requirements.txt
+++ b/binder/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
diff --git a/lessons/5-NLP/18-Transformers/torchnlp.py b/lessons/5-NLP/18-Transformers/torchnlp.py
new file mode 100644
index 00000000..e8563177
--- /dev/null
+++ b/lessons/5-NLP/18-Transformers/torchnlp.py
@@ -0,0 +1,111 @@
+import builtins
+import torch
+import torchtext
+import collections
+import os
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+vocab = None
+tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
+
+def load_dataset(ngrams=1,min_freq=1):
+    global vocab, tokenizer
+    print("Loading dataset...")
+    train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
+    train_dataset = list(train_dataset)
+    test_dataset = list(test_dataset)
+    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
+    print('Building vocab...')
+    counter = collections.Counter()
+    for (label, line) in train_dataset:
+        counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams))
+    vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
+    return train_dataset,test_dataset,classes,vocab
+
+stoi_hash = {}
+def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
+    v = vocab if voc is None else voc
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
+
+def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
+    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
+    loss_fn = loss_fn.to(device)
+    net.train()
+    total_loss,acc,count,i = 0,0,0,0
+    for labels,features in dataloader:
+        optimizer.zero_grad()
+        features, labels = features.to(device), labels.to(device)
+        out = net(features)
+        loss = loss_fn(out,labels) #cross_entropy(out,labels)
+        loss.backward()
+        optimizer.step()
+        total_loss+=loss
+        _,predicted = torch.max(out,1)
+        acc+=(predicted==labels).sum()
+        count+=len(labels)
+        i+=1
+        if i%report_freq==0:
+            print(f"{count}: acc={acc.item()/count}")
+        if epoch_size and count>epoch_size:
+            break
+    return total_loss.item()/count, acc.item()/count
+
+def padify(b,voc=None,tokenizer=tokenizer):
+    # b is the list of tuples of length batch_size
+    #   - first element of a tuple = label,
+    #   - second = feature (text sequence)
+    # build vectorized sequence
+    v = [encode(x[1],voc=voc,tokenizer=tokenizer) for x in b]
+    # compute max length of a sequence in this minibatch
+    l = max(map(len,v))
+    return ( # tuple of two tensors - labels and features
+        torch.LongTensor([t[0]-1 for t in b]),
+        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
+    )
+
+def offsetify(b,voc=None):
+    # first, compute data tensor from all sequences
+    x = [torch.tensor(encode(t[1],voc=voc)) for t in b]
+    # now, compute the offsets by accumulating the tensor of sequence lengths
+    o = [0] + [len(t) for t in x]
+    o = torch.tensor(o[:-1]).cumsum(dim=0)
+    return (
+        torch.LongTensor([t[0]-1 for t in b]), # labels
+        torch.cat(x), # text
+        o
+    )
+
+def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,use_pack_sequence=False):
+    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
+    loss_fn = loss_fn.to(device)
+    net.train()
+    total_loss,acc,count,i = 0,0,0,0
+    for labels,text,off in dataloader:
+        optimizer.zero_grad()
+        labels,text = labels.to(device), text.to(device)
+        if use_pack_sequence:
+            off = off.to('cpu')
+        else:
+            off = off.to(device)
+        out = net(text, off)
+        loss = loss_fn(out,labels) #cross_entropy(out,labels)
+        loss.backward()
+        optimizer.step()
+        total_loss+=loss
+        _,predicted = torch.max(out,1)
+        acc+=(predicted==labels).sum()
+        count+=len(labels)
+        i+=1
+        if i%report_freq==0:
+            print(f"{count}: acc={acc.item()/count}")
+        if epoch_size and count>epoch_size:
+            break
+    return total_loss.item()/count, acc.item()/count
+
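
Reviewer note (not part of the diff): a minimal usage sketch of the new torchnlp.py helpers, assuming the pinned torchtext 0.12 / PyTorch 1.11 environment and that the script is run from lessons/5-NLP/18-Transformers so the module is importable. The EmbClassifier model and the hyperparameters below are illustrative, not taken from the lesson notebooks.

# Usage sketch: train a small EmbeddingBag classifier on AG_NEWS with the new helpers.
import torch
from torch.utils.data import DataLoader
from torchnlp import load_dataset, offsetify, train_epoch_emb

# load_dataset downloads AG_NEWS and builds the shared vocabulary.
train_dataset, test_dataset, classes, vocab = load_dataset()

# offsetify collates a batch into (labels, flat token tensor, offsets),
# the input format expected by nn.EmbeddingBag.
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=offsetify, shuffle=True)

class EmbClassifier(torch.nn.Module):
    # Illustrative model: EmbeddingBag followed by a linear classification head.
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, text, offsets):
        return self.fc(self.embedding(text, offsets))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = EmbClassifier(len(vocab), 32, len(classes)).to(device)

# train_epoch_emb consumes (labels, text, offsets) batches and prints running accuracy.
train_epoch_emb(net, train_loader, lr=0.01, epoch_size=25000)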