Dataserver synthetic data #508

Merged 25 commits on Aug 30, 2022

Commits (25)
29bcace  Update README.md (chaoyanghe, Aug 22, 2022)
7fcc52e  Update datasets.py (ray-ruisun, Aug 23, 2022)
e529ced  Update requirements.txt (ray-ruisun, Aug 24, 2022)
9149455  Update attack_defense_data_loader.py (ray-ruisun, Aug 24, 2022)
41fad73  Merge pull request #473 from ray-ruisun/patch-5 (chaoyanghe, Aug 24, 2022)
a2f702a  Merge pull request #477 from ray-ruisun/patch-6 (chaoyanghe, Aug 24, 2022)
f1e4541  Merge pull request #478 from ray-ruisun/patch-7 (chaoyanghe, Aug 24, 2022)
23ac067  Merge pull request #490 from FedML-AI/dev/v0.7.0 (chaoyanghe, Aug 26, 2022)
800a7df  Merge branch 'master' into test/v0.7.0 (chaoyanghe, Aug 26, 2022)
683cd68  Merge pull request #491 from FedML-AI/test/v0.7.0 (chaoyanghe, Aug 26, 2022)
cbea7cf  Merge pull request #492 from FedML-AI/dev/v0.7.0 (chaoyanghe, Aug 26, 2022)
30a5236  Merge pull request #493 from FedML-AI/test/v0.7.0 (chaoyanghe, Aug 26, 2022)
597528e  Merge pull request #495 from FedML-AI/dev/v0.7.0 (chaoyanghe, Aug 28, 2022)
cfa6d21  Merge pull request #496 from FedML-AI/test/v0.7.0 (chaoyanghe, Aug 28, 2022)
f9d9015  Merge pull request #497 from FedML-AI/dev/v0.7.0 (chaoyanghe, Aug 28, 2022)
48986e1  bugfix synthetic data url split data (RyanTrojans, Aug 28, 2022)
63fe2e4  Update data_loader.py (fedml-alex, Aug 28, 2022)
a841630  bugfix no s3 info and mqtt info (RyanTrojans, Aug 28, 2022)
d636a67  Merge remote-tracking branch 'origin/dataserver_synthetic_data' into … (RyanTrojans, Aug 28, 2022)
ef18213  bugfix no run_id == 0 (RyanTrojans, Aug 28, 2022)
7eaa8a9  bugfix no default mock run_id (RyanTrojans, Aug 28, 2022)
953a4d0  no run_id limit (RyanTrojans, Aug 29, 2022)
cdec673  bugfix no client_id_list[1] (RyanTrojans, Aug 29, 2022)
1811e9c  bugfix no args attribute synthetic data (RyanTrojans, Aug 29, 2022)
f31812d  bugfix client path and server path (RyanTrojans, Aug 29, 2022)
devops/scripts/requirements.txt (1 change: 0 additions & 1 deletion)

@@ -1,7 +1,6 @@
 protobuf
 grpcio
 grpcio-tools
-loguru
 dill
 multiprocess
 nvidia-ml-py3
(file path not shown in this view)

@@ -3,59 +3,60 @@
 import fedml
 from fedml import FedMLRunner
 from fedml.data.MNIST.data_loader import download_mnist, load_partition_data_mnist
-from .trainer.classification_aggregator import ClassificationAggregator
-from .trainer.classification_trainer import ClassificationTrainer
+from trainer.classification_aggregator import ClassificationAggregator
+from trainer.classification_trainer import ClassificationTrainer
+from fedml.data.data_loader import load


-def load_data(args):
-    download_mnist(args.data_cache_dir)
-    fedml.logging.info("load_data. dataset_name = %s" % args.dataset)
-
-    """
-    Please read through the data loader at to see how to customize the dataset for FedML framework.
-    """
-    (
-        client_num,
-        train_data_num,
-        test_data_num,
-        train_data_global,
-        test_data_global,
-        train_data_local_num_dict,
-        train_data_local_dict,
-        test_data_local_dict,
-        class_num,
-    ) = load_partition_data_mnist(
-        args,
-        args.batch_size,
-        train_path=args.data_cache_dir + "/MNIST/train",
-        test_path=args.data_cache_dir + "/MNIST/test",
-    )
-    """
-    For shallow NN or linear models,
-    we uniformly sample a fraction of clients each round (as the original FedAvg paper)
-    """
-    args.client_num_in_total = client_num
-    dataset = [
-        train_data_num,
-        test_data_num,
-        train_data_global,
-        test_data_global,
-        train_data_local_num_dict,
-        train_data_local_dict,
-        test_data_local_dict,
-        class_num,
-    ]
-    return dataset, class_num
-
-
-class LogisticRegression(torch.nn.Module):
-    def __init__(self, input_dim, output_dim):
-        super(LogisticRegression, self).__init__()
-        self.linear = torch.nn.Linear(input_dim, output_dim)
-
-    def forward(self, x):
-        outputs = torch.sigmoid(self.linear(x))
-        return outputs
+# def load_data(args):
+#     download_mnist(args.data_cache_dir)
+#     fedml.logging.info("load_data. dataset_name = %s" % args.dataset)
+#
+#     """
+#     Please read through the data loader at to see how to customize the dataset for FedML framework.
+#     """
+#     (
+#         client_num,
+#         train_data_num,
+#         test_data_num,
+#         train_data_global,
+#         test_data_global,
+#         train_data_local_num_dict,
+#         train_data_local_dict,
+#         test_data_local_dict,
+#         class_num,
+#     ) = load_partition_data_mnist(
+#         args,
+#         args.batch_size,
+#         train_path=args.data_cache_dir + "/MNIST/train",
+#         test_path=args.data_cache_dir + "/MNIST/test",
+#     )
+#     """
+#     For shallow NN or linear models,
+#     we uniformly sample a fraction of clients each round (as the original FedAvg paper)
+#     """
+#     args.client_num_in_total = client_num
+#     dataset = [
+#         train_data_num,
+#         test_data_num,
+#         train_data_global,
+#         test_data_global,
+#         train_data_local_num_dict,
+#         train_data_local_dict,
+#         test_data_local_dict,
+#         class_num,
+#     ]
+#     return dataset, class_num
+#
+#
+# class LogisticRegression(torch.nn.Module):
+#     def __init__(self, input_dim, output_dim):
+#         super(LogisticRegression, self).__init__()
+#         self.linear = torch.nn.Linear(input_dim, output_dim)
+#
+#     def forward(self, x):
+#         outputs = torch.sigmoid(self.linear(x))
+#         return outputs


 if __name__ == "__main__":
@@ -66,7 +67,7 @@ def forward(self, x):
     device = fedml.device.get_device(args)

     # load data
-    dataset, class_num = load_data(args)
+    dataset, class_num = load(args)

     # create model and trainer
     model = fedml.model.create(args, output_dim=class_num)
(file path not shown in this view)

@@ -2,50 +2,51 @@

 import fedml
 from fedml import FedMLRunner
-from fedml.data.MNIST.data_loader import download_mnist, load_partition_data_mnist
-from .trainer.classification_aggregator import ClassificationAggregator
-from .trainer.classification_trainer import ClassificationTrainer
+# from fedml.data.MNIST.data_loader import download_mnist, load_partition_data_mnist
+from trainer.classification_aggregator import ClassificationAggregator
+from trainer.classification_trainer import ClassificationTrainer
+from fedml.data.data_loader import load


-def load_data(args):
-    download_mnist(args.data_cache_dir)
-    fedml.logging.info("load_data. dataset_name = %s" % args.dataset)
-
-    """
-    Please read through the data loader at to see how to customize the dataset for FedML framework.
-    """
-    (
-        client_num,
-        train_data_num,
-        test_data_num,
-        train_data_global,
-        test_data_global,
-        train_data_local_num_dict,
-        train_data_local_dict,
-        test_data_local_dict,
-        class_num,
-    ) = load_partition_data_mnist(
-        args,
-        args.batch_size,
-        train_path=args.data_cache_dir + "/MNIST/train",
-        test_path=args.data_cache_dir + "/MNIST/test",
-    )
-    """
-    For shallow NN or linear models,
-    we uniformly sample a fraction of clients each round (as the original FedAvg paper)
-    """
-    args.client_num_in_total = client_num
-    dataset = [
-        train_data_num,
-        test_data_num,
-        train_data_global,
-        test_data_global,
-        train_data_local_num_dict,
-        train_data_local_dict,
-        test_data_local_dict,
-        class_num,
-    ]
-    return dataset, class_num
+# def load_data(args):
+#     download_mnist(args.data_cache_dir)
+#     fedml.logging.info("load_data. dataset_name = %s" % args.dataset)
+#
+#     """
+#     Please read through the data loader at to see how to customize the dataset for FedML framework.
+#     """
+#     (
+#         client_num,
+#         train_data_num,
+#         test_data_num,
+#         train_data_global,
+#         test_data_global,
+#         train_data_local_num_dict,
+#         train_data_local_dict,
+#         test_data_local_dict,
+#         class_num,
+#     ) = load_partition_data_mnist(
+#         args,
+#         args.batch_size,
+#         train_path=args.data_cache_dir + "/MNIST/train",
+#         test_path=args.data_cache_dir + "/MNIST/test",
+#     )
+#     """
+#     For shallow NN or linear models,
+#     we uniformly sample a fraction of clients each round (as the original FedAvg paper)
+#     """
+#     args.client_num_in_total = client_num
+#     dataset = [
+#         train_data_num,
+#         test_data_num,
+#         train_data_global,
+#         test_data_global,
+#         train_data_local_num_dict,
+#         train_data_local_dict,
+#         test_data_local_dict,
+#         class_num,
+#     ]
+#     return dataset, class_num


 class LogisticRegression(torch.nn.Module):
@@ -66,7 +67,7 @@ def forward(self, x):
     device = fedml.device.get_device(args)

     # load data
-    dataset, class_num = load_data(args)
+    dataset, class_num = load(args)

     # create model and trainer
     model = fedml.model.create(args, output_dim=class_num)
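Both example scripts above make the same switch: the hand-written load_data helper is commented out and data loading is delegated to the framework-level fedml.data.data_loader.load. A minimal sketch of the resulting entry point, assembled from the pieces visible in this diff (fedml.init() and FedMLRunner(...).run() follow the usual FedML example boilerplate and are assumptions here, not shown in this PR):

import fedml
from fedml import FedMLRunner
from fedml.data.data_loader import load  # unified loader used by this PR

if __name__ == "__main__":
    # parse the FedML config / command-line arguments
    args = fedml.init()

    # select the device for this process
    device = fedml.device.get_device(args)

    # one call now serves MNIST, synthetic data from the data server,
    # and private local data, depending on how args is configured
    dataset, class_num = load(args)

    # build the model and launch federated training
    model = fedml.model.create(args, output_dim=class_num)
    FedMLRunner(args, device, dataset, model).run()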
(file path not shown in this view)

@@ -50,7 +50,6 @@ def load_data_loader_from_file(cls, filename):
         """
         Loads DataLoader object from a file if available.

-        :param logger: loguru.Logger
         :param filename: string
         """
         print("Loading data loader from file: {}".format(filename))
python/fedml/data/cifar10/efficient_loader.py (18 changes: 12 additions & 6 deletions)

@@ -94,26 +94,30 @@ def _data_transforms_cifar10():
     return train_transform, valid_transform


-def load_cifar10_data(datadir, resize=32, augmentation=True, data_efficient_load=False):
+def load_cifar10_data(datadir, process_id, synthetic_data_url, private_local_data, resize=32, augmentation=True, data_efficient_load=False):
     train_transform, test_transform = _data_transforms_cifar10()

+    is_download = True;
+    if process_id != 0:
+        is_download = False if (len(synthetic_data_url) != 0 or len(private_local_data) != 0) else True;
+
     if data_efficient_load:
         cifar10_train_ds = CIFAR10(datadir, train=True, download=True, transform=train_transform)
         cifar10_test_ds = CIFAR10(datadir, train=False, download=True, transform=test_transform)
     else:
-        cifar10_train_ds = CIFAR10_truncated(datadir, train=True, download=True, transform=train_transform)
-        cifar10_test_ds = CIFAR10_truncated(datadir, train=False, download=True, transform=test_transform)
+        cifar10_train_ds = CIFAR10_truncated(datadir, train=True, download=is_download, transform=train_transform)
+        cifar10_test_ds = CIFAR10_truncated(datadir, train=False, download=is_download, transform=test_transform)

     X_train, y_train = cifar10_train_ds.data, cifar10_train_ds.targets
     X_test, y_test = cifar10_test_ds.data, cifar10_test_ds.targets

     return (X_train, y_train, X_test, y_test, cifar10_train_ds, cifar10_test_ds)


-def partition_data(dataset, datadir, partition, n_nets, alpha):
+def partition_data(dataset, datadir, partition, n_nets, alpha, process_id, synthetic_data_url, private_local_data):
     np.random.seed(10)
     logging.info("*********partition data***************")
-    X_train, y_train, X_test, y_test, cifar10_train_ds, cifar10_test_ds = load_cifar10_data(datadir)
+    X_train, y_train, X_test, y_test, cifar10_train_ds, cifar10_test_ds = load_cifar10_data(datadir, process_id, synthetic_data_url, private_local_data)
     n_train = X_train.shape[0]
     # n_test = X_test.shape[0]

@@ -311,6 +315,8 @@ def efficient_load_partition_data_cifar10(
     client_number,
     batch_size,
     process_id,
+    synthetic_data_url="",
+    private_local_data="",
     n_proc_in_silo=0,
     data_efficient_load=True,
 ):
@@ -323,7 +329,7 @@
         traindata_cls_counts,
         cifar10_train_ds,
         cifar10_test_ds,
-    ) = partition_data(dataset, data_dir, partition_method, client_number, partition_alpha, process_id)
+    ) = partition_data(dataset, data_dir, partition_method, client_number, partition_alpha, process_id, synthetic_data_url, private_local_data)
     class_num = len(np.unique(y_train))
     logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
     train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])
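The download guard added above can be read in isolation: rank 0 (the aggregator) always downloads CIFAR-10, while worker processes skip the torchvision download whenever a synthetic-data URL or a private local dataset is configured. A self-contained sketch of that decision (the helper name is mine, not part of the PR):

def should_download_cifar10(process_id: int, synthetic_data_url: str = "", private_local_data: str = "") -> bool:
    """Mirror the is_download logic from load_cifar10_data.

    Rank 0 always downloads; other ranks skip the download when either
    a synthetic data URL or a private local dataset path is provided.
    """
    if process_id == 0:
        return True
    return not (len(synthetic_data_url) != 0 or len(private_local_data) != 0)


# aggregator process: always downloads
assert should_download_cifar10(process_id=0, synthetic_data_url="https://example.com/data")
# worker with a synthetic data source: skips the torchvision download
assert not should_download_cifar10(process_id=1, synthetic_data_url="https://example.com/data")
# worker with no alternative source: falls back to downloading
assert should_download_cifar10(process_id=1)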
python/fedml/data/cifar100/datasets.py (10 changes: 2 additions & 8 deletions)

@@ -45,14 +45,8 @@ def __build_truncated_dataset__(self):

         cifar_dataobj = CIFAR100(self.root, self.train, self.transform, self.target_transform, self.download)

-        if self.train:
-            # print("train member of the class: {}".format(self.train))
-            # data = cifar_dataobj.train_data
-            data = cifar_dataobj.data
-            target = np.array(cifar_dataobj.targets)
-        else:
-            data = cifar_dataobj.data
-            target = np.array(cifar_dataobj.targets)
+        data = cifar_dataobj.data
+        target = np.array(cifar_dataobj.targets)

         if self.dataidxs is not None:
             data = data[self.dataidxs]
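The deleted if/else had two identical branches: current torchvision CIFAR datasets expose .data and .targets for both the train and test splits, so no branching is needed. A quick standalone check (assuming a recent torchvision; downloads CIFAR-100 into ./data):

from torchvision.datasets import CIFAR100

train_ds = CIFAR100("./data", train=True, download=True)
test_ds = CIFAR100("./data", train=False, download=True)

# both splits expose identical attribute names, so the train/test
# branch removed above was dead code
print(type(train_ds.data), len(train_ds.targets))  # ndarray, 50000
print(type(test_ds.data), len(test_ds.targets))    # ndarray, 10000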