Update xgb algo files #439

Merged: 27 commits, Nov 22, 2022

Commits
b677cc0
20221017
qbc2016 Oct 17, 2022
eb8f7db
Merge branch 'master' of https://github.com/alibaba/FederatedScope
qbc2016 Oct 18, 2022
f1e3b99
refine master
qbc2016 Nov 4, 2022
89f103e
Merge branch 'master' of https://github.com/alibaba/FederatedScope
qbc2016 Nov 7, 2022
60cda5b
fix yaml, need to fix givemesomecredit
qbc2016 Nov 7, 2022
0ef0135
temporary files, need further repair, may work for 'adult', no f…
qbc2016 Nov 8, 2022
228fa72
dataset 'adult' for vertical fl
qbc2016 Nov 8, 2022
78c0d4b
delete redundant
qbc2016 Nov 8, 2022
5db4bbf
fix typo
qbc2016 Nov 8, 2022
1667e6d
minor changes
qbc2016 Nov 8, 2022
60949b9
modified according to the comments
qbc2016 Nov 10, 2022
d44121d
add a parameter 'model' to dataset to decide whether to change the la…
qbc2016 Nov 10, 2022
96614a3
minor changes
qbc2016 Nov 10, 2022
f1af88d
Merge branch 'dev_vertical_data'
qbc2016 Nov 10, 2022
21ffc9e
Merge branch 'master' of https://github.com/alibaba/FederatedScope
qbc2016 Nov 10, 2022
8e2ae16
Merge branch 'master' of https://github.com/alibaba/FederatedScope
qbc2016 Nov 11, 2022
34bde98
add 3 more datasets for xgb_base
qbc2016 Nov 15, 2022
1f7b87a
rm 'test_acc' for Regression
qbc2016 Nov 16, 2022
0c89190
add round 0 logger info
qbc2016 Nov 16, 2022
131acef
refine test procedure (with much annotation)
qbc2016 Nov 17, 2022
e058f38
Merge branch 'master' of https://github.com/alibaba/FederatedScope in…
qbc2016 Nov 17, 2022
9f2d751
refine 3 other datasets .py files for partitioning test data
qbc2016 Nov 17, 2022
29f1404
add feedback during training with much annotation
qbc2016 Nov 18, 2022
2bb23ef
refine test procedure by modifying dataloader
qbc2016 Nov 21, 2022
5de12dd
minor changes
qbc2016 Nov 21, 2022
7458d55
minor changes
qbc2016 Nov 21, 2022
b6316d1
modified according to the comments
qbc2016 Nov 22, 2022
2 changes: 1 addition & 1 deletion federatedscope/core/auxiliaries/data_builder.py
@@ -28,7 +28,7 @@
'subreddit', 'synthetic', 'ciao', 'epinions', '.*?vertical_fl_data.*?',
'.*?movielens.*?', '.*?cikmcup.*?', 'graph_multi_domain.*?', 'cora',
'citeseer', 'pubmed', 'dblp_conf', 'dblp_org', 'csbm.*?', 'fb15k-237',
'wn18', 'adult'
'wn18', 'adult', 'abalone', 'credit', 'blog'
], # Dummy for FL dataset
}
DATA_TRANS_MAP = RegexInverseMap(TRANS_DATA_MAP, None)
2 changes: 1 addition & 1 deletion federatedscope/core/data/utils.py
@@ -82,7 +82,7 @@ def load_dataset(config):
elif config.data.type.lower() == 'vertical_fl_data':
from federatedscope.vertical_fl.dataloader import load_vertical_data
dataset, modified_config = load_vertical_data(config, generate=True)
elif config.data.type.lower() in ['adult']:
elif config.data.type.lower() in ['adult', 'abalone', 'credit', 'blog']:
from federatedscope.vertical_fl.dataloader import load_vertical_data
dataset, modified_config = load_vertical_data(config, generate=False)
elif 'movielens' in config.data.type.lower(
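Taken together with the data_builder change above, these four names become valid values of config.data.type. Only a handful of config attributes drive the routing; below is a minimal sketch of them, using types.SimpleNamespace as a hypothetical stand-in for FederatedScope's real config object (attribute names are taken from this diff and the dataloader diff that follows; values are illustrative, not defaults):

from types import SimpleNamespace as NS

# Hypothetical stand-in for the real config; only the attributes read by
# the dispatch above and by load_vertical_data (next file) are sketched.
config = NS(
    data=NS(
        type='abalone',  # routed to load_vertical_data(generate=False)
        args=[{'normalization': False, 'standardization': False}],
    ),
    federate=NS(client_num=2),
    xgb_base=NS(use=True,      # selects algo='xgb'
                dims=[4, 8]),  # cumulative feature boundaries per client
)

assert config.data.type.lower() in ['adult', 'abalone', 'credit', 'blog']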
54 changes: 54 additions & 0 deletions federatedscope/vertical_fl/dataloader/dataloader.py
@@ -1,6 +1,10 @@
import numpy as np
[Collaborator comment: Please add these datasets to README and docstring.]

from federatedscope.vertical_fl.dataset.adult import Adult
from federatedscope.vertical_fl.dataset.abalone import Abalone
from federatedscope.vertical_fl.dataset.credit \
import Credit
from federatedscope.vertical_fl.dataset.blog import Blog


def load_vertical_data(config=None, generate=False):
@@ -24,6 +28,8 @@ def load_vertical_data(config=None, generate=False):
elif config.xgb_base.use:
feature_partition = config.xgb_base.dims
algo = 'xgb'
else:
raise ValueError('You must provide the data partition')

if config.data.args:
args = config.data.args[0]
@@ -42,6 +48,54 @@
algo=algo)
data = dataset.data
return data, config
elif name == 'credit':
dataset = Credit(root=path,
name=name,
num_of_clients=config.federate.client_num,
feature_partition=feature_partition,
tr_frac=splits[0],
download=True,
seed=1234,
args=args,
algo=algo)
data = dataset.data
return data, config
elif name == 'adult':
dataset = Adult(root=path,
name=name,
num_of_clients=config.federate.client_num,
feature_partition=feature_partition,
tr_frac=splits[0],
download=True,
seed=1234,
[Collaborator comment (@rayrayraykk, Nov 22, 2022): We might maintain a global seed for data generation as it has been set here. @xieyxclack @yxdyc @DavdGao]
args=args,
algo=algo)
data = dataset.data
return data, config
elif name == 'abalone':
dataset = Abalone(root=path,
name=name,
num_of_clients=config.federate.client_num,
feature_partition=feature_partition,
tr_frac=splits[0],
download=True,
seed=1234,
args=args,
algo=algo)
data = dataset.data
return data, config
elif name == 'blog':
dataset = Blog(root=path,
name=name,
num_of_clients=config.federate.client_num,
feature_partition=feature_partition,
tr_frac=splits[0],
download=True,
seed=1234,
args=args,
algo=algo)
data = dataset.data
return data, config
elif generate:
# generate toy data for running a vertical FL example
INSTANCE_NUM = 1000
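Each branch above merely forwards config-derived values to the matching dataset class, so the classes can also be exercised stand-alone. A minimal sketch using the Abalone class shown in full below; all argument values here are illustrative, not defaults (abalone has 8 features, so feature_partition=[4, 8] splits them 4/4 between two clients):

from federatedscope.vertical_fl.dataset.abalone import Abalone

# Illustrative arguments only; the real call sites take them from config.
dataset = Abalone(root='data',
                  name='abalone',
                  num_of_clients=2,
                  feature_partition=[4, 8],
                  tr_frac=0.8,
                  download=True,
                  seed=1234,
                  args={'normalization': False, 'standardization': False},
                  algo='xgb')
data = dataset.data
# data[0] is the server side with the full test split; data[1] and data[2]
# hold per-client feature blocks, and the last client also holds the labels.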
153 changes: 153 additions & 0 deletions federatedscope/vertical_fl/dataset/abalone.py
@@ -0,0 +1,153 @@
import logging
import os
import os.path as osp

import pandas as pd
from torchvision.datasets.utils import download_and_extract_archive

logger = logging.getLogger(__name__)


class Abalone:
"""
Abalone Data Set
(https://archive.ics.uci.edu/ml/datasets/abalone)
Data Set Information:
Number of Instances: 4177
Number of Attributes: 8

Predicting the age of abalone from physical measurements.
Given is the attribute name, attribute type, the measurement unit
and a brief description.
The number of rings is the value to predict:
either as a continuous value or as a classification problem.

Name / Data Type / Measurement Unit / Description

Sex / nominal / -- / M, F, and I (infant)
Length / continuous / mm / Longest shell measurement
Diameter / continuous / mm / perpendicular to length
Height / continuous / mm / with meat in shell
Whole weight / continuous / grams / whole abalone
Shucked weight / continuous / grams / weight of meat
Viscera weight / continuous / grams / gut weight (after bleeding)
Shell weight / continuous / grams / after being dried
Rings / integer / -- / +1.5 gives the age in years

Arguments:
root (str): root path
name (str): name of dataset, 'abalone'
num_of_clients(int): number of clients
feature_partition(list): the number of features
partitioned to each client
tr_frac (float): train set proportion for each task; default=0.8
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr' or 'xgb'
download (bool): indicator to download dataset
seed: a random seed
"""
base_folder = 'abalone'
url = 'https://federatedscope.oss-cn-beijing.aliyuncs.com/abalone.zip'
raw_file = 'abalone.data'

def __init__(self,
root,
name,
num_of_clients,
feature_partition,
args,
algo=None,
tr_frac=0.8,
download=True,
seed=123):
self.root = root
self.name = name
self.num_of_clients = num_of_clients
self.feature_partition = feature_partition
self.tr_frac = tr_frac
self.seed = seed
self.args = args
self.algo = algo
self.data_dict = {}
self.data = {}

if download:
self.download()
if not self._check_existence():
raise RuntimeError("Dataset not found or corrupted. " +
"You can use download=True to download it")

self._get_data()
self._partition_data()

def _get_data(self):
fpath = os.path.join(self.root, self.base_folder)
file = osp.join(fpath, self.raw_file)
data = self._read_raw(file)
data = self._process(data)
train_num = int(self.tr_frac * len(data))
self.data_dict['train'] = data[:train_num]
self.data_dict['test'] = data[train_num:]

def _read_raw(self, file_path):
data = pd.read_csv(file_path, header=None)
return data

def _process(self, data):
data[0] = data[0].replace({'F': 2, 'M': 1, 'I': 0})
data = data.values
return data

def _check_existence(self):
fpath = os.path.join(self.root, self.base_folder, self.raw_file)
return osp.exists(fpath)

def download(self):
if self._check_existence():
logger.info("Files already exist")
return
download_and_extract_archive(self.url,
os.path.join(self.root, self.base_folder),
filename=self.url.split('/')[-1])

def _partition_data(self):

x = self.data_dict['train'][:, :-1]
y = self.data_dict['train'][:, -1]

test_data = {
'x': self.data_dict['test'][:, :-1],
'y': self.data_dict['test'][:, -1]
}

test_x = test_data['x']
test_y = test_data['y']

self.data = dict()
for i in range(self.num_of_clients + 1):
self.data[i] = dict()
if i == 0:
self.data[0]['train'] = None
self.data[0]['test'] = test_data
elif i == 1:
self.data[1]['train'] = {'x': x[:, :self.feature_partition[0]]}
self.data[1]['test'] = {
'x': test_x[:, :self.feature_partition[0]]
}
else:
self.data[i]['train'] = {
'x': x[:,
self.feature_partition[i -
2]:self.feature_partition[i -
1]]
}
self.data[i]['test'] = {
'x': test_x[:, self.feature_partition[i - 2]:self.
feature_partition[i - 1]]
}
self.data[i]['val'] = None

self.data[self.num_of_clients]['train']['y'] = y[:]
self.data[self.num_of_clients]['test']['y'] = test_y[:]
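In _partition_data, feature_partition acts as a list of cumulative column boundaries: client 1 receives columns [0, feature_partition[0]), client i for i >= 2 receives columns [feature_partition[i-2], feature_partition[i-1]), client 0 plays the server and keeps the complete test split, and the last client additionally holds the labels. A toy NumPy illustration of the same indexing:

import numpy as np

# 6 samples, 8 features, 2 clients, feature_partition = [4, 8].
x = np.arange(48).reshape(6, 8)
feature_partition = [4, 8]

client1_x = x[:, :feature_partition[0]]                      # columns 0..3
client2_x = x[:, feature_partition[0]:feature_partition[1]]  # columns 4..7
assert client1_x.shape == (6, 4)
assert client2_x.shape == (6, 4)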
14 changes: 12 additions & 2 deletions federatedscope/vertical_fl/dataset/adult.py
@@ -15,6 +15,8 @@ class Adult:
(https://archive.ics.uci.edu/ml/datasets/adult)
Fields
The dataset contains 15 columns
Training set: 'adult.data', 32561 instances
Testing set: 'adult.test', 16281 instances
Target field: Income
-- The income is divided into two classes: <=50K and >50K
Number of attributes: 14
@@ -30,7 +32,7 @@
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
model(str): the running model, 'lr' or 'xgb'
algo(str): the running model, 'lr' or 'xgb'
download (bool): indicator to download dataset
seed: a random seed
"""
@@ -146,19 +148,27 @@ def _partition_data(self, train_set, test_set):
self.data[i] = dict()
if i == 0:
self.data[0]['train'] = None
self.data[0]['test'] = test_data
elif i == 1:
self.data[1]['train'] = {'x': x[:, :self.feature_partition[0]]}
self.data[1]['test'] = {
'x': test_x[:, :self.feature_partition[0]]
}
else:
self.data[i]['train'] = {
'x': x[:,
self.feature_partition[i -
2]:self.feature_partition[i -
1]]
}
self.data[i]['test'] = {
'x': test_x[:, self.feature_partition[i - 2]:self.
feature_partition[i - 1]]
}
self.data[i]['val'] = None
self.data[i]['test'] = test_data

self.data[self.num_of_clients]['train']['y'] = y[:]
self.data[self.num_of_clients]['test']['y'] = test_y[:]

def _check_existence(self, file):
fpath = os.path.join(self.root, self.base_folder, file)
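The behavioral fix in this file: previously every client's 'test' entry was the complete test_data dict (all features plus labels), whereas now the test features are partitioned per client exactly like the training features, and only client 0 keeps the full test split. A toy check of the new invariant, with illustrative shapes (Adult has 14 attributes; the split point here is made up):

import numpy as np

# Illustrative shapes: 32561/16281 instances match adult.data/adult.test.
feature_partition = [7, 14]  # hypothetical 7/7 split across 2 clients
train_x = np.zeros((32561, 14))
test_x = np.zeros((16281, 14))

# Each client's test block must now match its train block in width.
assert train_x[:, :feature_partition[0]].shape[1] \
    == test_x[:, :feature_partition[0]].shape[1]
assert train_x[:, feature_partition[0]:feature_partition[1]].shape[1] \
    == test_x[:, feature_partition[0]:feature_partition[1]].shape[1]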