Minor env corrections - move opencv to conda install
shwars committed May 25, 2022
1 parent de581e2 commit ed8bfb0
Showing 5 changed files with 113 additions and 2 deletions.
1 change: 1 addition & 0 deletions .devcontainer/environment.yml
@@ -13,6 +13,7 @@ dependencies:
 - scikit-learn=0.24.2
 - scipy=1.6.2
 - pip=21.0.1
+- conda-forge::opencv=4.5
 - setuptools=58.0.4
 - pytorch::pytorch=1.11.0
 - pytorch::torchtext=0.12.0
1 change: 0 additions & 1 deletion .devcontainer/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
1 change: 1 addition & 0 deletions binder/environment.yml
@@ -13,6 +13,7 @@ dependencies:
 - scikit-learn=0.24.2
 - scipy=1.6.2
 - pip=21.0.1
+- conda-forge::opencv=4.5
 - setuptools=58.0.4
 - pytorch::pytorch=1.11.0
 - pytorch::torchtext=0.12.0
1 change: 0 additions & 1 deletion binder/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
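Since opencv now comes from the conda-forge channel rather than the opencv-python wheel in requirements.txt, a quick sanity check (illustrative only, not part of this commit) is to import it and confirm the version matches the pin above; both packages expose the same cv2 module.

# Illustrative sanity check, not part of the commit: the conda-forge opencv
# package provides the same cv2 module that opencv-python used to supply.
import cv2
print(cv2.__version__)   # expected to start with "4.5" given the pin above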
111 changes: 111 additions & 0 deletions lessons/5-NLP/18-Transformers/torchnlp.py
@@ -0,0 +1,111 @@
import builtins
import torch
import torchtext
import collections
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab = None
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

def load_dataset(ngrams=1, min_freq=1):
    global vocab, tokenizer
    print("Loading dataset...")
    train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
    train_dataset = list(train_dataset)
    test_dataset = list(test_dataset)
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    print('Building vocab...')
    counter = collections.Counter()
    for (label, line) in train_dataset:
        counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=ngrams))
    # build the vocabulary from token counts; tokens below min_freq are dropped
    vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
    return train_dataset, test_dataset, classes, vocab

stoi_hash = {}

def encode(x, voc=None, unk=0, tokenizer=tokenizer):
    # convert a piece of text into a list of token indices,
    # caching the string-to-index dictionary per vocabulary for speed
    global stoi_hash
    v = vocab if voc is None else voc
    if v in stoi_hash:
        stoi = stoi_hash[v]
    else:
        stoi = v.get_stoi()
        stoi_hash[v] = stoi
    return [stoi.get(s, unk) for s in tokenizer(x)]

def train_epoch(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count, i = 0, 0, 0, 0
    for labels, features in dataloader:
        optimizer.zero_grad()
        features, labels = features.to(device), labels.to(device)
        out = net(features)
        loss = loss_fn(out, labels)  # cross_entropy(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.detach()  # detach so the running total does not keep the autograd graph alive
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        i += 1
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    return total_loss.item()/count, acc.item()/count

def padify(b, voc=None, tokenizer=tokenizer):
    # b is a list of tuples of length batch_size
    # - first element of a tuple = label,
    # - second = feature (text sequence)
    # build vectorized sequence
    v = [encode(x[1], voc=voc, tokenizer=tokenizer) for x in b]
    # compute max length of a sequence in this minibatch
    l = max(map(len, v))
    return (  # tuple of two tensors - labels and features
        torch.LongTensor([t[0]-1 for t in b]),  # labels shifted to be 0-based
        torch.stack([torch.nn.functional.pad(torch.tensor(t), (0, l-len(t)), mode='constant', value=0) for t in v])
    )

def offsetify(b, voc=None):
    # first, compute data tensor from all sequences
    x = [torch.tensor(encode(t[1], voc=voc)) for t in b]
    # now, compute the offsets by accumulating the tensor of sequence lengths
    o = [0] + [len(t) for t in x]
    o = torch.tensor(o[:-1]).cumsum(dim=0)
    return (
        torch.LongTensor([t[0]-1 for t in b]),  # labels shifted to be 0-based
        torch.cat(x),                           # all text sequences concatenated
        o                                       # offset of each sequence within the concatenation
    )

def train_epoch_emb(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200, use_pack_sequence=False):
    optimizer = optimizer or torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count, i = 0, 0, 0, 0
    for labels, text, off in dataloader:
        optimizer.zero_grad()
        labels, text = labels.to(device), text.to(device)
        if use_pack_sequence:
            # pack_padded_sequence requires the lengths tensor to live on the CPU
            off = off.to('cpu')
        else:
            off = off.to(device)
        out = net(text, off)
        loss = loss_fn(out, labels)  # cross_entropy(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.detach()  # detach so the running total does not keep the autograd graph alive
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        i += 1
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    return total_loss.item()/count, acc.item()/count
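As a rough illustration of how the helpers in the new torchnlp.py might be wired together (a sketch, not part of this commit), the snippet below feeds offsetify batches into a small EmbeddingBag classifier and trains it with train_epoch_emb. The EmbClassifier class, the hyperparameters, and running it next to torchnlp.py (lessons/5-NLP/18-Transformers) are all assumptions made for the example.

# Hypothetical usage sketch -- illustrative only, not part of the commit.
# Assumes it runs in the same directory as torchnlp.py.
import torch
from torch.utils.data import DataLoader
import torchnlp

# load_dataset also sets the module-level vocab used by encode/offsetify
train_ds, test_ds, classes, voc = torchnlp.load_dataset()

class EmbClassifier(torch.nn.Module):  # class name and sizes are illustrative
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, text, offsets):
        return self.fc(self.embedding(text, offsets))

net = EmbClassifier(len(voc), 32, len(classes)).to(torchnlp.device)
# offsetify collates each batch into (labels, concatenated text, offsets) for EmbeddingBag
loader = DataLoader(train_ds, batch_size=16, collate_fn=torchnlp.offsetify, shuffle=True)
loss, acc = torchnlp.train_epoch_emb(net, loader, lr=0.01, epoch_size=5000)
print(f"train loss={loss}, acc={acc}")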
