From 0df2b4efb641be2bf07c55344bd378a37178fe33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=93=E8=BE=95?=
Date: Mon, 8 Aug 2022 20:11:06 +0800
Subject: [PATCH 1/4] support using cached data for huggingface datasets, and
 re-splitting the GLUE datasets into custom FL versions

---
 .../core/auxiliaries/data_builder.py          | 95 +++++++++++++++++--
 .../baseline/fedavg_transformer_on_cola.yaml  | 43 +++++++++
 2 files changed, 132 insertions(+), 6 deletions(-)
 create mode 100644 federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml

diff --git a/federatedscope/core/auxiliaries/data_builder.py b/federatedscope/core/auxiliaries/data_builder.py
index 87faefddc..aa6e02d76 100644
--- a/federatedscope/core/auxiliaries/data_builder.py
+++ b/federatedscope/core/auxiliaries/data_builder.py
@@ -1,8 +1,13 @@
+import os
 import pickle
 import logging
+from random import shuffle
+
 import numpy as np
 from collections import defaultdict
 
+from federatedscope.core.auxiliaries.utils import setup_seed
+
 import federatedscope.register as register
 
 logger = logging.getLogger(__name__)
@@ -285,8 +290,14 @@ def load_torchtext_data(name, splits=None, config=None):
 
     if config.model.type.endswith('transformers'):
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(
-            config.model.type.split('@')[0])
+
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                config.model.type.split('@')[0],
+                local_files_only=True,
+                cache_dir=os.path.join(os.getcwd(), "huggingface"))
+        except:
+            logging.error("")
 
         x_all = tokenizer(x_all,
                           return_tensors='pt',
@@ -402,6 +413,7 @@ def load_torch_geometric_data(name, splits=None, config=None):
 
 def load_huggingface_datasets_data(name, splits=None, config=None):
     from datasets import load_dataset
+    from datasets import load_from_disk
 
     if config.data.args:
         raw_args = config.data.args[0]
@@ -410,18 +422,46 @@
         assert 'max_len' in raw_args, "Miss key 'max_len' in " \
                                       "`config.data.args`."
         filtered_args = filter_dict(load_dataset, raw_args)
-    dataset = load_dataset(path=config.data.root,
-                           name=name,
-                           **filtered_args)
+    logger.info("Loading the huggingface dataset")
+    if "hg_cache_dir" in raw_args:
+        hugging_face_path = raw_args["hg_cache_dir"]
+    else:
+        hugging_face_path = os.getcwd()
+
+    if "load_disk_dir" in raw_args:
+        dataset = load_from_disk(raw_args["load_disk_dir"])
+    else:
+        dataset = load_dataset(path=config.data.root,
+                               name=name,
+                               **filtered_args)
 
     if config.model.type.endswith('transformers'):
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
         from transformers import AutoTokenizer
+        logger.info("Loading the huggingface tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(
-            config.model.type.split('@')[0])
+            config.model.type.split('@')[0],
+            local_files_only=True,
+            cache_dir=os.path.join(hugging_face_path, "transformers"))
 
         for split in dataset:
             x_all = [i['sentence'] for i in dataset[split]]
             targets = [i['label'] for i in dataset[split]]
 
+            if split == "train" and "used_train_ratio" in raw_args and \
+                    1 > raw_args['used_train_ratio'] > 0:
+                selected_idx = list(range(len(dataset[split])))
+                shuffle(selected_idx)
+                selected_idx = set(selected_idx[:int(
+                    len(selected_idx) * raw_args['used_train_ratio'])])
+                x_all = [
+                    element for i, element in enumerate(x_all)
+                    if i in selected_idx
+                ]
+                targets = [
+                    element for i, element in enumerate(targets)
+                    if i in selected_idx
+                ]
+
             x_all = tokenizer(x_all,
                               return_tensors='pt',
                               padding=True,
@@ -441,6 +481,42 @@
             (x, y) for x, y in zip(dataset['test'][0], dataset['test'][1])
         ] if (set(dataset['test'][1]) - set([-1])) else None,
     }
+    original_train_size = len(data_dict["train"])
+
+    if "half_val_dummy_test" in raw_args and raw_args[
+            "half_val_dummy_test"]:
+        # since the labels of the GLUE "test" split may be masked and the
+        # ground truth is only available via an official submission, for
+        # fast FL experiments we split the validation set into two halves
+        # of equal size to serve as the new val/test data
+        original_val = [(x, y) for x, y in zip(dataset['validation'][0],
+                                               dataset['validation'][1])]
+        data_dict["val"], data_dict[
+            "test"] = original_val[:len(original_val) //
+                                   2], original_val[len(original_val) //
+                                                    2:]
+    if "val_as_dummy_test" in raw_args and raw_args["val_as_dummy_test"]:
+        # use the validation set as a temporary test set,
+        # and part of the training set as the validation set
+        data_dict["test"] = data_dict["val"]
+        data_dict["val"] = []
+    if "part_train_dummy_val" in raw_args and 1 > raw_args[
+            "part_train_dummy_val"] > 0:
+        new_val_part = int(original_train_size *
+                           raw_args["part_train_dummy_val"])
+        data_dict["val"].extend(data_dict["train"][:new_val_part])
+        data_dict["train"] = data_dict["train"][new_val_part:]
+    if "part_train_dummy_test" in raw_args and 1 > raw_args[
+            "part_train_dummy_test"] > 0:
+        new_test_part = int(original_train_size *
+                            raw_args["part_train_dummy_test"])
+        data_dict["test"] = list(data_dict["val"] or [])  # copy, no aliasing
+        if data_dict["test"] is not None:
+            data_dict["test"].extend(data_dict["train"][:new_test_part])
+        else:
+            data_dict["test"] = data_dict["train"][:new_test_part]
+        data_dict["train"] = data_dict["train"][new_test_part:]
+
     return data_dict
 
 def load_openml_data(tid, splits=None, config=None):
@@ -529,6 +605,9 @@
         obj: The dataset object.
         cfg.node: The updated configuration.
""" + # fix the seed for data generation, + # will restore the user-specified on after the generation + setup_seed(12345) for func in register.data_dict.values(): data_and_config = func(config) if data_and_config is not None: @@ -615,6 +694,8 @@ def get_data(config): from federatedscope.attack.auxiliary import poisoning poisoning(data, modified_config) + setup_seed(config.seed) + if config.federate.mode.lower() == 'standalone': return data, modified_config else: @@ -631,6 +712,8 @@ def get_data(config): data_idx = config.distribute.data_idx return data[data_idx], config + setup_seed(config.seed) + def merge_data(all_data, merged_max_data_id, specified_dataset_name=None): if specified_dataset_name is None: diff --git a/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml b/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml new file mode 100644 index 000000000..b3e821615 --- /dev/null +++ b/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml @@ -0,0 +1,43 @@ +# different from federatedscope/nlp/baseline/fedavg_bert_on_sst2.yaml, +# this yaml demonstrate +# (1) using cached tokenizer via `load_disk_dir` and `hg_cache_dir` +# (2) using some GLUE validation data as partial test data of the FL version + +use_gpu: True +device: -1 +early_stop: + patience: 5 +seed: 1 +federate: + mode: standalone + local_update_steps: 1 + batch_or_epoch: epoch + total_round_num: 500 + client_num: 50 + sample_client_rate: 0.2 + unseen_clients_rate: 0.2 +data: + root: 'glue' + type: 'cola@huggingface_datasets' + args: [{'load_disk_dir': 'huggingface/datasets/glue/cola', + 'hg_cache_dir': 'huggingface', 'max_len': 128, + 'val_as_dummy_test': True, 'part_train_dummy_val': 0.2} ] + batch_size: 64 + splitter: 'lda' + splitter_args: [ { 'alpha': 0.4, 'min_size': 1} ] + num_workers: 0 +model: + type: 'google/bert_uncased_L-2_H-128_A-2@transformers' + task: 'SequenceClassification' + out_channels: 2 +optimizer: + lr: 0.1 + weight_decay: 0.0 + grad_clip: 5.0 +criterion: + type: CrossEntropyLoss +trainer: + type: nlptrainer +eval: + freq: 5 + metrics: ['acc', 'correct', 'f1'] \ No newline at end of file From 2acbeb44b2b1d19612e638eb6cd2449a2b180532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=93=E8=BE=95?= Date: Mon, 8 Aug 2022 20:15:52 +0800 Subject: [PATCH 2/4] support using cached data for huggingface datasets; and re-splitting the GLUE versions into custom FL versions --- .../nlp/baseline/fedavg_transformer_on_cola.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml b/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml index b3e821615..7eb686495 100644 --- a/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml +++ b/federatedscope/nlp/baseline/fedavg_transformer_on_cola.yaml @@ -10,8 +10,6 @@ early_stop: seed: 1 federate: mode: standalone - local_update_steps: 1 - batch_or_epoch: epoch total_round_num: 500 client_num: 50 sample_client_rate: 0.2 @@ -30,10 +28,12 @@ model: type: 'google/bert_uncased_L-2_H-128_A-2@transformers' task: 'SequenceClassification' out_channels: 2 -optimizer: - lr: 0.1 - weight_decay: 0.0 - grad_clip: 5.0 +train: + local_update_steps: 1 + batch_or_epoch: epoch + optimizer: + lr: 0.1 + weight_decay: 0.0 criterion: type: CrossEntropyLoss trainer: From 795da26d0bf658f8d57c6b97701a57cb6be78d87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=81=93=E8=BE=95?= Date: Wed, 10 Aug 2022 17:14:48 +0800 Subject: [PATCH 3/4] minor fix according to weirui's comments --- 
 .../core/auxiliaries/data_builder.py          | 23 ++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/federatedscope/core/auxiliaries/data_builder.py b/federatedscope/core/auxiliaries/data_builder.py
index 0000e8684..6f41f1452 100644
--- a/federatedscope/core/auxiliaries/data_builder.py
+++ b/federatedscope/core/auxiliaries/data_builder.py
@@ -290,14 +290,17 @@ def load_torchtext_data(name, splits=None, config=None):
 
     if config.model.type.endswith('transformers'):
         from transformers import AutoTokenizer
-
+        cache_path = os.path.join(os.getcwd(), "huggingface")
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 config.model.type.split('@')[0],
                 local_files_only=True,
-                cache_dir=os.path.join(os.getcwd(), "huggingface"))
-        except:
-            logging.error("")
+                cache_dir=cache_path)
+        except Exception as e:
+            logger.error(f"When loading the cached tokenizer from "
+                         f"{cache_path}, we encountered the exception: \n "
+                         f"{str(e)}")
+            raise  # otherwise `tokenizer` would be undefined below
 
         x_all = tokenizer(x_all,
@@ -412,8 +414,7 @@ def load_torch_geometric_data(name, splits=None, config=None):
     raise NotImplementedError
 
 def load_huggingface_datasets_data(name, splits=None, config=None):
-    from datasets import load_dataset
-    from datasets import load_from_disk
+    from datasets import load_dataset, load_from_disk
 
     if config.data.args:
         raw_args = config.data.args[0]
@@ -429,7 +430,14 @@ def load_huggingface_datasets_data(name, splits=None, config=None):
         hugging_face_path = os.getcwd()
 
     if "load_disk_dir" in raw_args:
-        dataset = load_from_disk(raw_args["load_disk_dir"])
+        load_path = raw_args["load_disk_dir"]
+        try:
+            dataset = load_from_disk(load_path)
+        except Exception as e:
+            logger.error(f"When loading the cached dataset from "
+                         f"{load_path}, we encountered the exception: \n "
+                         f"{str(e)}")
+            raise  # otherwise `dataset` would be undefined below
     else:
         dataset = load_dataset(path=config.data.root,
                                name=name,
                                **filtered_args)

From d2056dde6727a8694bc5fd6f7b84a465f48a20ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=93=E8=BE=95?=
Date: Wed, 10 Aug 2022 19:29:48 +0800
Subject: [PATCH 4/4] relax the accuracy threshold in the vertical FL unittest

---
 tests/test_vertical_fl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_vertical_fl.py b/tests/test_vertical_fl.py
index f2fc22e78..13d3f65ac 100644
--- a/tests/test_vertical_fl.py
+++ b/tests/test_vertical_fl.py
@@ -57,7 +57,7 @@ def test_vFL(self):
         test_results = Fed_runner.run()
         init_cfg.merge_from_other_cfg(backup_cfg)
         print(test_results)
-        self.assertGreater(test_results['server_global_eval']['test_acc'], 0.9)
+        self.assertGreater(test_results['server_global_eval']['test_acc'], 0.8)
 
 
 if __name__ == '__main__':
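
Usage note: the cached-loading options introduced in these patches
(`load_disk_dir` and `hg_cache_dir`) assume the dataset and the tokenizer
have already been downloaded to the expected locations. A minimal
preparation sketch, assuming one-off network access and the paths from
fedavg_transformer_on_cola.yaml (the script itself is illustrative and not
part of the patches):

    import os

    from datasets import load_dataset
    from transformers import AutoTokenizer

    # cache the GLUE/CoLA dataset where `load_disk_dir` points,
    # so that load_from_disk() can later read it fully offline
    load_dataset("glue", "cola").save_to_disk(
        os.path.join("huggingface", "datasets", "glue", "cola"))

    # cache the tokenizer under <hg_cache_dir>/transformers, matching the
    # cache_dir that load_huggingface_datasets_data() passes to
    # AutoTokenizer.from_pretrained(..., local_files_only=True)
    AutoTokenizer.from_pretrained(
        "google/bert_uncased_L-2_H-128_A-2",
        cache_dir=os.path.join("huggingface", "transformers"))

After this one-off step, the yaml above runs without network access.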
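
Re-splitting note: the dummy-split options compose in the order they are
applied in load_huggingface_datasets_data(): half_val_dummy_test,
val_as_dummy_test, part_train_dummy_val, part_train_dummy_test. With the
settings in fedavg_transformer_on_cola.yaml ('val_as_dummy_test': True,
'part_train_dummy_val': 0.2), the effective transformation is, as a
schematic sketch on toy lists (not the actual loader code):

    # toy splits standing in for the tokenized (x, y) pairs
    train = [("train_%d" % i, 0) for i in range(10)]
    val = [("val_%d" % i, 1) for i in range(2)]

    # val_as_dummy_test: the GLUE validation split becomes the dummy test set
    test, val = val, []

    # part_train_dummy_val=0.2: the first 20% of the training set becomes
    # the new validation set (the ratio is taken of the original train size)
    new_val_part = int(10 * 0.2)
    val = val + train[:new_val_part]
    train = train[new_val_part:]

    assert (len(train), len(val), len(test)) == (8, 2, 2)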