From c5a4d693902a548bfe89c5aa15c98d9d914db42a Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Wed, 28 Dec 2022 17:02:22 +0800 Subject: [PATCH 01/13] fixed issue Signed-off-by: BAAI-OpenPlatform --- .../data/tokenizer/uni_tokenizer/tokenizer.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index c8ae4ef9..ee340666 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -146,34 +146,17 @@ def __init__(self, self.text_tokenizer._token_cls = "" self.text_tokenizer._token_sep = "" if add_block_symbols: - self._command_tokens.extend([ - CommandToken('sop', '<|startofpiece|>', self.num_tokens), - CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 + self.add_command_token('sop', '<|startofpiece|>') + self.add_command_token('eop', '<|endofpiece|>',) if add_task_mask: - self._command_tokens.extend([ - CommandToken('gMASK', '[gMASK]', self.num_tokens), - CommandToken('sMASK', '[sMASK]', self.num_tokens + 1) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 + self.add_command_token('gMASK', '[gMASK]') + self.add_command_token('sMASK', '[sMASK]') if add_decoder_mask: - self._command_tokens.extend( - [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) - self.num_tokens += 1 - self.num_command_tokens += 1 + self.add_command_token('dBLOCK', '[dBLOCK]') if add_sentinel_token > 0: for i in range(1, add_sentinel_token): - self._command_tokens.extend([ - CommandToken(f'MASK{i}', f'[MASK{i}]', - self.num_tokens), - CommandToken(f'sop{i}', f'<|startofpiece{i}|>', - self.num_tokens + 1) - ]) - self.num_tokens += 2 - self.num_command_tokens += 2 + self.add_command_token(f'MASK{i}', f'[MASK{i}]') + self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('roberta'): self.num_command_tokens = 6 @@ -315,7 +298,13 @@ def __init__(self, self.num_command_tokens += 6 self.token_end_id = self.text_tokenizer.convert_token_to_id( '') + + if add_block_symbols: + sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') + eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') + + self._command_tokens.extend([ CommandToken('sop', '<|startofpiece|>', self.num_tokens + 1), @@ -363,6 +352,7 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) + def get_vocab(self): return self.text_tokenizer.get_vocab() @@ -371,6 +361,14 @@ def get_command_id(self, name): """get command token corresponding to `name`""" return self.command_name_map[name].Id + def add_command_token(self, name, token): + try: + id = self.text_tokenizer.convert_token_to_id(token) + except KeyError: + id = self.num_tokens + self.num_tokens += 1 + self._command_tokens.append(CommandToken(name, token, id)) + return def rematch(self, text, tokens): """output the mapping relation between raw text and tokenizezd text """ From 9c130f7757ffbc4691935065784b2c2ff42a07bc Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Fri, 30 Dec 2022 16:17:01 +0800 Subject: [PATCH 02/13] saved work Signed-off-by: BAAI-OpenPlatform --- .../glm_blank_filling/glm_generate_samples.py | 3 +- .../glm_generate_samples_en.py | 24 ++++++++++++++- .../data/tokenizer/uni_tokenizer/tokenizer.py | 29 +++++++++---------- flagai/model/predictor/utils.py | 8 +++++ 4 files changed, 47 insertions(+), 17 deletions(-) diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index cc8818d6..128445ef 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -3,7 +3,8 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch - +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 856ef161..8e4db110 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -3,28 +3,50 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch -from flagai.model.predictor.predictor import Predictor +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") +from flagai.model.glm_model import GLMModel +from flagai.data.tokenizer import Tokenizer +from flagai.data.tokenizer.glm_large_en.glm_large_en_tokenizer import GLMLargeEnWordPieceTokenizer from flagai.auto_model.auto_loader import AutoLoader +from flagai.model.predictor.predictor import Predictor if __name__ == "__main__": """Main training program.""" print('Generate Samples') # Random seeds for reproducibility. # Model, + # model_name = 'GLM-large-en' + # model = GLMModel.from_pretrain(model_name=model_name, + # download_path="./checkpoints/") + # tokenizer = Tokenizer.from_pretrained(model_name) + + loader = AutoLoader(task_name='lm', model_name='GLM-large-en', only_download_config=False) model = loader.get_model() tokenizer = loader.get_tokenizer() + + # tokenizer = GLMLargeEnWordPieceTokenizer() + # import pdb;pdb.set_trace() model.cuda(torch.cuda.current_device()) + # predictor = Predictor(model, tokenizer) + predictor = Predictor(model, tokenizer) # generate samples text = [ 'Question: Is drinking beer bad for your health? Answer: [gMASK]', ] + # text = [ + # 'Question: Is fruit good for your health? Answer: [gMASK]', + # ] for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) + # output = predictor.predict_generate_beamsearch(t, + # out_max_length=66, + # beam_size=10) print(t, '\n', output) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index ee340666..791f7be8 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -145,18 +145,18 @@ def __init__(self, '') self.text_tokenizer._token_cls = "" self.text_tokenizer._token_sep = "" - if add_block_symbols: - self.add_command_token('sop', '<|startofpiece|>') - self.add_command_token('eop', '<|endofpiece|>',) - if add_task_mask: - self.add_command_token('gMASK', '[gMASK]') - self.add_command_token('sMASK', '[sMASK]') - if add_decoder_mask: - self.add_command_token('dBLOCK', '[dBLOCK]') - if add_sentinel_token > 0: - for i in range(1, add_sentinel_token): - self.add_command_token(f'MASK{i}', f'[MASK{i}]') - self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') + # if add_block_symbols: + # self.add_command_token('sop', '<|startofpiece|>') + # self.add_command_token('eop', '<|endofpiece|>',) + # if add_task_mask: + # self.add_command_token('gMASK', '[gMASK]') + # self.add_command_token('sMASK', '[sMASK]') + # if add_decoder_mask: + # self.add_command_token('dBLOCK', '[dBLOCK]') + # if add_sentinel_token > 0: + # for i in range(1, add_sentinel_token): + # self.add_command_token(f'MASK{i}', f'[MASK{i}]') + # self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('roberta'): self.num_command_tokens = 6 @@ -303,8 +303,6 @@ def __init__(self, if add_block_symbols: sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') - - self._command_tokens.extend([ CommandToken('sop', '<|startofpiece|>', self.num_tokens + 1), @@ -352,7 +350,7 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - + print([(k,v.Id) for k,v in self.command_name_map.items()]) def get_vocab(self): return self.text_tokenizer.get_vocab() @@ -369,6 +367,7 @@ def add_command_token(self, name, token): self.num_tokens += 1 self._command_tokens.append(CommandToken(name, token, id)) return + def rematch(self, text, tokens): """output the mapping relation between raw text and tokenizezd text """ diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 5b7726a2..cd53c54b 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1396,6 +1396,14 @@ def glm_generate_sample( context_length = len(context_tokens) context_length_tensor = torch.LongTensor([context_length]) context_length = context_length_tensor[0].item() + + + + # context_tokens[0] = context_tokens[0].Id + # import pdb;pdb.set_trace() + + + context_tokens_tensor = torch.LongTensor(context_tokens) text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) From a33213e24c59856df58596f97285a6658f0f81c5 Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Mon, 2 Jan 2023 18:17:13 +0800 Subject: [PATCH 03/13] work saved Signed-off-by: BAAI-OpenPlatform --- examples/AltCLIP/altclip_inference.py | 2 +- read_laion.py | 66 +++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 read_laion.py diff --git a/examples/AltCLIP/altclip_inference.py b/examples/AltCLIP/altclip_inference.py index 42320117..c737bf9b 100644 --- a/examples/AltCLIP/altclip_inference.py +++ b/examples/AltCLIP/altclip_inference.py @@ -19,7 +19,7 @@ tokenizer = loader.get_tokenizer() def inference(): - image = Image.open("./dog.jpeg") + image = Image.open("/home/yanzhaodong/anhforth/data/images/12.png") image = transform(image) image = torch.tensor(image["pixel_values"]).to(device) text = tokenizer(["a rat", "a dog", "a cat"])["input_ids"] diff --git a/read_laion.py b/read_laion.py new file mode 100644 index 00000000..ed3543b4 --- /dev/null +++ b/read_laion.py @@ -0,0 +1,66 @@ +import webdataset as wds +import torch +from itertools import islice +from PIL import Image +import io +# url = "/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/12740.tar" + +# dataset = wds.WebDataset(url) + +# for sample in islice(dataset, 0, 3): +# for key, value in sample.items(): +# if key == "jpg": +# # print(repr(value)) +# image_data = repr(value) +# image = Image.open(io.BytesIO(image_data)) +# image.show() +# break +# # print(key, repr(value)[:50]) +# print() +# import pdb;pdb.set_trace() +# dataset = wds.WebDataset(url).shuffle(1000).decode("torchrgb").to_tuple("jpg;png", "json") + +# dataloader = torch.utils.data.DataLoader(dataset, num_workers=1, batch_size=1) + +# for inputs, outputs in dataloader: +# print(inputs, outputs) +# break + +import pandas as pd +from glob import glob +import requests +path = "/home/yanzhaodong/anhforth/data/train-00000-of-00001-6f24a7497df494ae.parquet" + +def download_from_path(path, output_dir='/home/yanzhaodong/anhforth/data/images/'): + df = pd.read_parquet(path) + df = df.loc[df['TEXT'].str.contains('flower') | df['TEXT'].str.contains('plant') | + df['TEXT'].str.contains('vegetation') | df['TEXT'].str.contains('garden') | df['TEXT'].str.contains('floral')] + df = df.sort_values(by=["AESTHETIC_SCORE"], ascending=False) + df = df[:2000] + urls = list(df["URL"]) + from tqdm import trange + # url = "https://us.123rf.com/450wm/grandfailure/grandfailure1601/grandfailure160100013/50661747-woman-in-flower-fields-next-to-red-castle-and-mountain-illustration-painting.jpg?ver=6" + for i in trange(len(urls)): + url = urls[i] + try: + response = requests.get(url, verify=False, timeout=10) + except OSError or urllib3.exceptions.NewConnectionError: + pass + if response.status_code: + fp = open(output_dir+str(i)+'.png', 'wb') + fp.write(response.content) + fp.close() +# download_from_path(path) +import pdb; pdb.set_trace() +glob('/home/yanzhaodong/anhforth/data/images/*.png') + +# files = glob("/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/*.parquet") +# from tqdm import tqdm +# for f in tqdm(files): +# df = pd.read_parquet(f) +# df = df.loc[df['caption'].str.contains('flower')] + +# import pdb;pdb.set_trace() +# df = pd.read_parquet("/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/12740.parquet") +# df = df.loc[df['caption'].str.contains('flower')] +# import pdb;pdb.set_trace() \ No newline at end of file From cdfebbe9594c79afd7dadb6e395d9db080a381d2 Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Tue, 3 Jan 2023 00:12:12 +0800 Subject: [PATCH 04/13] fixed generation issue Signed-off-by: BAAI-OpenPlatform --- .../glm_generate_samples_en.py | 27 +++---------------- .../data/tokenizer/uni_tokenizer/tokenizer.py | 6 ++--- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 8e4db110..74247499 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -3,8 +3,6 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.data.tokenizer.glm_large_en.glm_large_en_tokenizer import GLMLargeEnWordPieceTokenizer @@ -13,40 +11,21 @@ if __name__ == "__main__": """Main training program.""" print('Generate Samples') - # Random seeds for reproducibility. - # Model, - # model_name = 'GLM-large-en' - # model = GLMModel.from_pretrain(model_name=model_name, - # download_path="./checkpoints/") - # tokenizer = Tokenizer.from_pretrained(model_name) - loader = AutoLoader(task_name='lm', - model_name='GLM-large-en', - only_download_config=False) + model_name='GLM-large-en', + only_download_config=False) model = loader.get_model() tokenizer = loader.get_tokenizer() - # tokenizer = GLMLargeEnWordPieceTokenizer() - # import pdb;pdb.set_trace() model.cuda(torch.cuda.current_device()) - - # predictor = Predictor(model, tokenizer) - predictor = Predictor(model, tokenizer) - # generate samples text = [ - 'Question: Is drinking beer bad for your health? Answer: [gMASK]', + 'Is drinking beer bad for your health?', ] - # text = [ - # 'Question: Is fruit good for your health? Answer: [gMASK]', - # ] for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # output = predictor.predict_generate_beamsearch(t, - # out_max_length=66, - # beam_size=10) print(t, '\n', output) diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 791f7be8..b12a494e 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -145,9 +145,9 @@ def __init__(self, '') self.text_tokenizer._token_cls = "" self.text_tokenizer._token_sep = "" - # if add_block_symbols: - # self.add_command_token('sop', '<|startofpiece|>') - # self.add_command_token('eop', '<|endofpiece|>',) + if add_block_symbols: + self.add_command_token('sop', '<|startofpiece|>') + self.add_command_token('eop', '<|endofpiece|>',) # if add_task_mask: # self.add_command_token('gMASK', '[gMASK]') # self.add_command_token('sMASK', '[sMASK]') From 5a587bf65723eba47fc955ba386e3899714dc746 Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Tue, 3 Jan 2023 17:35:20 +0800 Subject: [PATCH 05/13] updated Signed-off-by: BAAI-OpenPlatform --- examples/glm_blank_filling/glm_generate_samples_en.py | 4 +++- flagai/auto_model/auto_loader.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 74247499..41d27375 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -3,6 +3,8 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.data.tokenizer.glm_large_en.glm_large_en_tokenizer import GLMLargeEnWordPieceTokenizer @@ -13,7 +15,7 @@ print('Generate Samples') loader = AutoLoader(task_name='lm', - model_name='GLM-large-en', + model_name='GLM-large-en-generation', only_download_config=False) model = loader.get_model() tokenizer = loader.get_tokenizer() diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index 9b93e4b2..d7bf0d1f 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -76,6 +76,7 @@ def __getattr__(self, name): "glm-large-ch": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"], "alm-1.0": ["flagai.model.alm_model", "ALMModel", "alm", "nlp"], "glm-large-en": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"], + "glm-large-en-generation": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"], "gpt2-base-ch": ["flagai.model.gpt2_model", "GPT2Model", "gpt2", "nlp"], "cpm-large-ch": ["flagai.model.gpt2_model", "GPT2Model", "cpm", "nlp"], "opt-125m-en": ["flagai.model.opt_model", "OPTModel", "opt", "nlp"], From 9ff7c159cdca3f4b5968e70a7d08656145414565 Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Tue, 3 Jan 2023 17:36:49 +0800 Subject: [PATCH 06/13] moved local path Signed-off-by: BAAI-OpenPlatform --- examples/glm_blank_filling/glm_generate_samples.py | 2 -- examples/glm_blank_filling/glm_generate_samples_en.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 128445ef..574a7644 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -3,8 +3,6 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py index 41d27375..009b4ed1 100644 --- a/examples/glm_blank_filling/glm_generate_samples_en.py +++ b/examples/glm_blank_filling/glm_generate_samples_en.py @@ -3,8 +3,6 @@ # Licensed under the Apache License, Version 2.0 (the "License") import torch -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.data.tokenizer.glm_large_en.glm_large_en_tokenizer import GLMLargeEnWordPieceTokenizer From faf7246d2900690098d9ad4b2802fc243e57b3d0 Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform Date: Tue, 3 Jan 2023 17:47:17 +0800 Subject: [PATCH 07/13] minor modifications Signed-off-by: BAAI-OpenPlatform --- .../data/tokenizer/uni_tokenizer/tokenizer.py | 2 - flagai/model/predictor/utils.py | 8 --- read_laion.py | 66 ------------------- 3 files changed, 76 deletions(-) delete mode 100644 read_laion.py diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index a5b38477..028405cd 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -298,8 +298,6 @@ def __init__(self, self.num_command_tokens += 6 self.token_end_id = self.text_tokenizer.convert_token_to_id( '') - - if add_block_symbols: sop_id = self.text_tokenizer.convert_token_to_id('<|startofpiece|>') eop_id = self.text_tokenizer.convert_token_to_id('<|endofpiece|>') diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index cd53c54b..5b7726a2 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1396,14 +1396,6 @@ def glm_generate_sample( context_length = len(context_tokens) context_length_tensor = torch.LongTensor([context_length]) context_length = context_length_tensor[0].item() - - - - # context_tokens[0] = context_tokens[0].Id - # import pdb;pdb.set_trace() - - - context_tokens_tensor = torch.LongTensor(context_tokens) text = tokenizer.DecodeIds(context_tokens_tensor.tolist()) diff --git a/read_laion.py b/read_laion.py deleted file mode 100644 index ed3543b4..00000000 --- a/read_laion.py +++ /dev/null @@ -1,66 +0,0 @@ -import webdataset as wds -import torch -from itertools import islice -from PIL import Image -import io -# url = "/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/12740.tar" - -# dataset = wds.WebDataset(url) - -# for sample in islice(dataset, 0, 3): -# for key, value in sample.items(): -# if key == "jpg": -# # print(repr(value)) -# image_data = repr(value) -# image = Image.open(io.BytesIO(image_data)) -# image.show() -# break -# # print(key, repr(value)[:50]) -# print() -# import pdb;pdb.set_trace() -# dataset = wds.WebDataset(url).shuffle(1000).decode("torchrgb").to_tuple("jpg;png", "json") - -# dataloader = torch.utils.data.DataLoader(dataset, num_workers=1, batch_size=1) - -# for inputs, outputs in dataloader: -# print(inputs, outputs) -# break - -import pandas as pd -from glob import glob -import requests -path = "/home/yanzhaodong/anhforth/data/train-00000-of-00001-6f24a7497df494ae.parquet" - -def download_from_path(path, output_dir='/home/yanzhaodong/anhforth/data/images/'): - df = pd.read_parquet(path) - df = df.loc[df['TEXT'].str.contains('flower') | df['TEXT'].str.contains('plant') | - df['TEXT'].str.contains('vegetation') | df['TEXT'].str.contains('garden') | df['TEXT'].str.contains('floral')] - df = df.sort_values(by=["AESTHETIC_SCORE"], ascending=False) - df = df[:2000] - urls = list(df["URL"]) - from tqdm import trange - # url = "https://us.123rf.com/450wm/grandfailure/grandfailure1601/grandfailure160100013/50661747-woman-in-flower-fields-next-to-red-castle-and-mountain-illustration-painting.jpg?ver=6" - for i in trange(len(urls)): - url = urls[i] - try: - response = requests.get(url, verify=False, timeout=10) - except OSError or urllib3.exceptions.NewConnectionError: - pass - if response.status_code: - fp = open(output_dir+str(i)+'.png', 'wb') - fp.write(response.content) - fp.close() -# download_from_path(path) -import pdb; pdb.set_trace() -glob('/home/yanzhaodong/anhforth/data/images/*.png') - -# files = glob("/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/*.parquet") -# from tqdm import tqdm -# for f in tqdm(files): -# df = pd.read_parquet(f) -# df = df.loc[df['caption'].str.contains('flower')] - -# import pdb;pdb.set_trace() -# df = pd.read_parquet("/share/projset/laion400m/laion400m-full-release/img_data/laion400m-dat-release/12740.parquet") -# df = df.loc[df['caption'].str.contains('flower')] -# import pdb;pdb.set_trace() \ No newline at end of file From 4e6f259bc10e517b40e28d50b5a0c5ff7cb81954 Mon Sep 17 00:00:00 2001 From: zhaohu xing <920232796@qq.com> Date: Wed, 4 Jan 2023 10:49:49 +0800 Subject: [PATCH 08/13] improve lanuch Signed-off-by: zhaohu xing <920232796@qq.com> --- flagai/launch.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/flagai/launch.py b/flagai/launch.py index ecba3254..b9bd88c6 100644 --- a/flagai/launch.py +++ b/flagai/launch.py @@ -17,6 +17,7 @@ import socket from flagai.logger import log_dist +import signal def fetch_hostfile(hostfile_path): if not os.path.isfile(hostfile_path): @@ -128,7 +129,12 @@ def launch_dist(launcher='distributed_deepspeed', cmd_launch.append('"') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - subprocess.Popen(run_cmd, shell=True) + p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) + def signal_handler(signal, frame): + os.killpg(os.getpgid(p.pid), 9) + signal.signal(signal.SIGINT, signal_handler) + p.wait() + # subprocess.Popen(run_cmd, shell=True) node_rank += 1 elif num_nodes == 1 and launcher == 'distributed_torch': @@ -158,7 +164,13 @@ def launch_dist(launcher='distributed_deepspeed', cmd_launch.append('--not_call_launch') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - subprocess.Popen(run_cmd, shell=True) + # subprocess.Popen(run_cmd, shell=True) + + p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) + def signal_handler(signal, frame): + os.killpg(os.getpgid(p.pid), 9) + signal.signal(signal.SIGINT, signal_handler) + p.wait() elif launcher == 'distributed_deepspeed': if hostfile is None: @@ -206,7 +218,12 @@ def launch_dist(launcher='distributed_deepspeed', cmd_launch.append('--not_call_launch') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - subprocess.Popen(run_cmd, shell=True) + # subprocess.Popen(run_cmd, shell=True) + p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) + def signal_handler(signal, frame): + os.killpg(os.getpgid(p.pid), 9) + signal.signal(signal.SIGINT, signal_handler) + p.wait() elif num_nodes == 1 and launcher == 'simple_torch': # This launcher @@ -238,6 +255,11 @@ def launch_dist(launcher='distributed_deepspeed', run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - subprocess.Popen(run_cmd, shell=True) + # subprocess.Popen(run_cmd, shell=True) + p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) + def signal_handler(signal, frame): + os.killpg(os.getpgid(p.pid), 9) + signal.signal(signal.SIGINT, signal_handler) + p.wait() else: raise Exception('No aviable launcher') From 640d03e842f9d8f2b6dc92a6b61acc5d61251979 Mon Sep 17 00:00:00 2001 From: ldwang Date: Thu, 5 Jan 2023 20:58:48 +0800 Subject: [PATCH 09/13] Revert "improve lanuch" --- flagai/launch.py | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/flagai/launch.py b/flagai/launch.py index b9bd88c6..ecba3254 100644 --- a/flagai/launch.py +++ b/flagai/launch.py @@ -17,7 +17,6 @@ import socket from flagai.logger import log_dist -import signal def fetch_hostfile(hostfile_path): if not os.path.isfile(hostfile_path): @@ -129,12 +128,7 @@ def launch_dist(launcher='distributed_deepspeed', cmd_launch.append('"') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) - def signal_handler(signal, frame): - os.killpg(os.getpgid(p.pid), 9) - signal.signal(signal.SIGINT, signal_handler) - p.wait() - # subprocess.Popen(run_cmd, shell=True) + subprocess.Popen(run_cmd, shell=True) node_rank += 1 elif num_nodes == 1 and launcher == 'distributed_torch': @@ -164,13 +158,7 @@ def signal_handler(signal, frame): cmd_launch.append('--not_call_launch') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - # subprocess.Popen(run_cmd, shell=True) - - p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) - def signal_handler(signal, frame): - os.killpg(os.getpgid(p.pid), 9) - signal.signal(signal.SIGINT, signal_handler) - p.wait() + subprocess.Popen(run_cmd, shell=True) elif launcher == 'distributed_deepspeed': if hostfile is None: @@ -218,12 +206,7 @@ def signal_handler(signal, frame): cmd_launch.append('--not_call_launch') run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - # subprocess.Popen(run_cmd, shell=True) - p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) - def signal_handler(signal, frame): - os.killpg(os.getpgid(p.pid), 9) - signal.signal(signal.SIGINT, signal_handler) - p.wait() + subprocess.Popen(run_cmd, shell=True) elif num_nodes == 1 and launcher == 'simple_torch': # This launcher @@ -255,11 +238,6 @@ def signal_handler(signal, frame): run_cmd = ' '.join(cmd_launch) log_dist(run_cmd) - # subprocess.Popen(run_cmd, shell=True) - p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) - def signal_handler(signal, frame): - os.killpg(os.getpgid(p.pid), 9) - signal.signal(signal.SIGINT, signal_handler) - p.wait() + subprocess.Popen(run_cmd, shell=True) else: raise Exception('No aviable launcher') From e6378e8e5c29c7a35eb3c34cbc501f440271b020 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Fri, 6 Jan 2023 10:08:08 +0800 Subject: [PATCH 10/13] localized safetychecker Signed-off-by: Anhforth --- examples/AltDiffusion/generate.py | 4 +++- examples/gpt2_text_writting/generate.py | 2 +- flagai/model/predictor/utils.py | 29 +++++++++++++++++++++---- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index 9f4c54d3..b7dcd4e0 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -2,6 +2,8 @@ # # Licensed under the Apache License, Version 2.0 (the "License") import torch +import sys +sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor @@ -9,7 +11,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") loader = AutoLoader(task_name="text2img", #contrastive learning - model_name="AltDiffusion-m9", + model_name="AltDiffusion", model_dir="./checkpoints") model = loader.get_model() diff --git a/examples/gpt2_text_writting/generate.py b/examples/gpt2_text_writting/generate.py index 325dca15..42edd9b6 100644 --- a/examples/gpt2_text_writting/generate.py +++ b/examples/gpt2_text_writting/generate.py @@ -7,7 +7,7 @@ if __name__ == '__main__': loader = AutoLoader("seq2seq", "GPT2-base-ch", - model_dir="./state_dict/") + model_dir="./checkpoints/") model = loader.get_model() tokenizer = loader.get_tokenizer() predictor = Predictor(model, tokenizer) diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 5b7726a2..71443867 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -12,18 +12,39 @@ from itertools import islice from transformers import AutoFeatureExtractor import math - +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files join = os.path.join +def download(model_name, download_path): + try: + model_id = _get_model_id(model_name) + except: + print("Model hub is not reachable!") + # prepare the download path + # downloading the files + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(os.path.join(download_path, model_name), file_name, model_id) + else : + _get_checkpoint_path(os.path.join(download_path, model_name), file_name, model_id) + return + def get_safety_checker(): # load safety model from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker - safety_model_id = "CompVis/stable-diffusion-safety-checker" + path = os.getcwd() + "/checkpoints/" + import pdb;pdb.set_trace() + if not os.path.exists(path+"SafetyChecker"): + download("SafetyChecker", path) + # safety_model_id = "CompVis/stable-diffusion-safety-checker" safety_feature_extractor = AutoFeatureExtractor.from_pretrained( - safety_model_id) + path+"SafetyChecker") safety_checker = StableDiffusionSafetyChecker.from_pretrained( - safety_model_id) + path+"SafetyChecker") return safety_checker, safety_feature_extractor From 771551f61a57e054c04bfee6d6f74207a1e3adbc Mon Sep 17 00:00:00 2001 From: Anhforth Date: Fri, 6 Jan 2023 10:18:04 +0800 Subject: [PATCH 11/13] removed local path Signed-off-by: Anhforth --- examples/AltDiffusion/generate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index b7dcd4e0..d613107a 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -2,8 +2,6 @@ # # Licensed under the Apache License, Version 2.0 (the "License") import torch -import sys -sys.path.append("/home/yanzhaodong/anhforth/FlagAI") from flagai.auto_model.auto_loader import AutoLoader from flagai.model.predictor.predictor import Predictor From 4494960d4b292e448e1b19e852929e5cd2f579d7 Mon Sep 17 00:00:00 2001 From: Anhforth Date: Fri, 6 Jan 2023 10:19:35 +0800 Subject: [PATCH 12/13] change to m9 Signed-off-by: Anhforth --- examples/AltDiffusion/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index d613107a..9f4c54d3 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -9,7 +9,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") loader = AutoLoader(task_name="text2img", #contrastive learning - model_name="AltDiffusion", + model_name="AltDiffusion-m9", model_dir="./checkpoints") model = loader.get_model() From 11199af4a44cfb95c4ff91f1c751ba3531bda36d Mon Sep 17 00:00:00 2001 From: BAAI-OpenPlatform <107522723+BAAI-OpenPlatform@users.noreply.github.com> Date: Fri, 6 Jan 2023 11:42:55 +0800 Subject: [PATCH 13/13] Update utils.py --- flagai/model/predictor/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 71443867..82735233 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -37,7 +37,6 @@ def get_safety_checker(): # load safety model from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker path = os.getcwd() + "/checkpoints/" - import pdb;pdb.set_trace() if not os.path.exists(path+"SafetyChecker"): download("SafetyChecker", path) # safety_model_id = "CompVis/stable-diffusion-safety-checker"