diff --git a/.devcontainer/environment.yml b/.devcontainer/environment.yml
index 8525b32f..1c981788 100644
--- a/.devcontainer/environment.yml
+++ b/.devcontainer/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn=0.24.2
   - scipy=1.6.2
   - pip=21.0.1
+  - conda-forge::opencv=4.5
   - setuptools=58.0.4
   - pytorch::pytorch=1.11.0
   - pytorch::torchtext=0.12.0
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
index 3fbd847c..743f212a 100644
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
diff --git a/binder/environment.yml b/binder/environment.yml
index 8525b32f..1c981788 100644
--- a/binder/environment.yml
+++ b/binder/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn=0.24.2
   - scipy=1.6.2
   - pip=21.0.1
+  - conda-forge::opencv=4.5
   - setuptools=58.0.4
   - pytorch::pytorch=1.11.0
   - pytorch::torchtext=0.12.0
diff --git a/binder/requirements.txt b/binder/requirements.txt
index 3fbd847c..743f212a 100644
--- a/binder/requirements.txt
+++ b/binder/requirements.txt
@@ -4,7 +4,6 @@ huggingface==0.0.1
 imageio==2.16.2
 keras==2.8.0
 nltk==3.5
-opencv-python==4.5.1.48
 pandas==1.4.2
 pillow==9.1.0
 pygame==2.1.2
diff --git a/lessons/5-NLP/18-Transformers/torchnlp.py b/lessons/5-NLP/18-Transformers/torchnlp.py
new file mode 100644
index 00000000..e8563177
--- /dev/null
+++ b/lessons/5-NLP/18-Transformers/torchnlp.py
@@ -0,0 +1,111 @@
+import builtins
+import torch
+import torchtext
+import collections
+import os
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+vocab = None
+tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
+
+def load_dataset(ngrams=1,min_freq=1):
+    global vocab, tokenizer
+    print("Loading dataset...")
+    train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
+    train_dataset = list(train_dataset)
+    test_dataset = list(test_dataset)
+    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
+    print('Building vocab...')
+    counter = collections.Counter()
+    for (label, line) in train_dataset:
+        counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams))
+    vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
+    return train_dataset,test_dataset,classes,vocab
+
+stoi_hash = {}
+def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
+    v = vocab if voc is None else voc
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
+
+def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
+    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
+    loss_fn = loss_fn.to(device)
+    net.train()
+    total_loss,acc,count,i = 0,0,0,0
+    for labels,features in dataloader:
+        optimizer.zero_grad()
+        features, labels = features.to(device), labels.to(device)
+        out = net(features)
+        loss = loss_fn(out,labels) #cross_entropy(out,labels)
+        loss.backward()
+        optimizer.step()
+        total_loss+=loss
+        _,predicted = torch.max(out,1)
+        acc+=(predicted==labels).sum()
+        count+=len(labels)
+        i+=1
+        if i%report_freq==0:
+            print(f"{count}: acc={acc.item()/count}")
+        if epoch_size and count>epoch_size:
+            break
+    return total_loss.item()/count, acc.item()/count
+
+def padify(b,voc=None,tokenizer=tokenizer):
+    # b is the list of tuples of length batch_size
+    #   - first element of a tuple = label,
+    #   - second = feature (text sequence)
+    # build vectorized sequence
+    v = [encode(x[1],voc=voc,tokenizer=tokenizer) for x in b]
+    # compute max length of a sequence in this minibatch
+    l = max(map(len,v))
+    return ( # tuple of two tensors - labels and features
+        torch.LongTensor([t[0]-1 for t in b]),
+        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
+    )
+
+def offsetify(b,voc=None):
+    # first, compute data tensor from all sequences
+    x = [torch.tensor(encode(t[1],voc=voc)) for t in b]
+    # now, compute the offsets by accumulating the tensor of sequence lengths
+    o = [0] + [len(t) for t in x]
+    o = torch.tensor(o[:-1]).cumsum(dim=0)
+    return (
+        torch.LongTensor([t[0]-1 for t in b]), # labels
+        torch.cat(x), # text
+        o
+    )
+
+def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,use_pack_sequence=False):
+    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
+    loss_fn = loss_fn.to(device)
+    net.train()
+    total_loss,acc,count,i = 0,0,0,0
+    for labels,text,off in dataloader:
+        optimizer.zero_grad()
+        labels,text = labels.to(device), text.to(device)
+        if use_pack_sequence:
+            off = off.to('cpu')
+        else:
+            off = off.to(device)
+        out = net(text, off)
+        loss = loss_fn(out,labels) #cross_entropy(out,labels)
+        loss.backward()
+        optimizer.step()
+        total_loss+=loss
+        _,predicted = torch.max(out,1)
+        acc+=(predicted==labels).sum()
+        count+=len(labels)
+        i+=1
+        if i%report_freq==0:
+            print(f"{count}: acc={acc.item()/count}")
+        if epoch_size and count>epoch_size:
+            break
+    return total_loss.item()/count, acc.item()/count
+
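
Reviewer note (not part of the diff): a minimal usage sketch of the new torchnlp.py helpers, assuming the pinned torchtext 0.12 / PyTorch 1.11 environment and that the script is run from lessons/5-NLP/18-Transformers so the module is importable. The EmbClassifier model and the hyperparameters below are illustrative, not taken from the lesson notebooks.

# Usage sketch: train a small EmbeddingBag classifier on AG_NEWS with the new helpers.
import torch
from torch.utils.data import DataLoader
from torchnlp import load_dataset, offsetify, train_epoch_emb

# load_dataset downloads AG_NEWS and builds the shared vocabulary.
train_dataset, test_dataset, classes, vocab = load_dataset()

# offsetify collates a batch into (labels, flat token tensor, offsets),
# the input format expected by nn.EmbeddingBag.
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=offsetify, shuffle=True)

class EmbClassifier(torch.nn.Module):
    # Illustrative model: EmbeddingBag followed by a linear classification head.
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, text, offsets):
        return self.fc(self.embedding(text, offsets))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = EmbClassifier(len(vocab), 32, len(classes)).to(device)

# train_epoch_emb consumes (labels, text, offsets) batches and prints running accuracy.
train_epoch_emb(net, train_loader, lr=0.01, epoch_size=25000)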