diff --git a/benchmark/pFL_Bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml b/benchmark/pFL_Bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml new file mode 100644 index 000000000..f04ccf8bf --- /dev/null +++ b/benchmark/pFL_Bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml @@ -0,0 +1,36 @@ +use_gpu: True +device: -1 +early_stop: + patience: 5 +seed: 1 +federate: + mode: standalone + local_update_steps: 1 + batch_or_epoch: epoch + total_round_num: 1000 + sample_client_rate: 0.2 + unseen_clients_rate: 0.2 +data: + root: data/ + type: femnist + splits: [0.6,0.2,0.2] + batch_size: 32 + subsample: 0.05 + num_workers: 0 + transform: [['ToTensor'], ['Normalize', {'mean': [0.1307], 'std': [0.3081]}]] +model: + type: convnet2 + hidden: 2048 + out_channels: 62 + dropout: 0.0 +optimizer: + lr: 0.01 + weight_decay: 0.0 + grad_clip: 5.0 +criterion: + type: CrossEntropyLoss +trainer: + type: cvtrainer +eval: + freq: 10 + metrics: ['acc', 'correct'] diff --git a/benchmark/pFL_Bench/FEMNIST-s02/run_fedopt_bn_plus_sweep.sh b/benchmark/pFL_Bench/FEMNIST-s02/run_fedopt_bn_plus_sweep.sh new file mode 100644 index 000000000..19be4e8f8 --- /dev/null +++ b/benchmark/pFL_Bench/FEMNIST-s02/run_fedopt_bn_plus_sweep.sh @@ -0,0 +1,17 @@ +set -e + +wandb sweep sweep_fedOpt.yaml +wandb sweep sweep_fedOpt_FT.yaml + +wandb sweep sweep_ditto_fedBN.yaml +wandb sweep sweep_ditto_fedBN_fedOpt.yaml +wandb sweep sweep_ditto_FT_fedBN.yaml +wandb sweep sweep_ditto_FT_fedBN_fedOpt.yaml + +wandb sweep sweep_fedBN_fedOpt.yaml +wandb sweep sweep_fedBN_FT_fedOpt.yaml + +wandb sweep sweep_fedEM_fedBN.yaml +wandb sweep sweep_fedEM_fedBN_fedOpt.yaml +wandb sweep sweep_fedEM_FT_fedBN.yaml +wandb sweep sweep_fedEM_FT_fedBN_fedOpt.yaml diff --git a/benchmark/pFL_Bench/FEMNIST-s02/sweep_ditto.yaml b/benchmark/pFL_Bench/FEMNIST-s02/sweep_ditto.yaml new file mode 100644 index 000000000..89e5f1637 --- /dev/null +++ b/benchmark/pFL_Bench/FEMNIST-s02/sweep_ditto.yaml @@ -0,0 +1,38 @@ + +program: federatedscope/main.py 
+project: pFL-bench +name: ditto,FEMNIST-s02 +method: grid +#method: bayes +metric: + goal: maximize + name: best_client_summarized_weighted_avg/val_acc +command: + - ${env} + - ${interpreter} + - ${program} + - "--cfg" + - "scripts/personalization_exp_scripts/pfl_bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml" + - "outdir" + - "exp_pfl_bench" + - "wandb.use" + - "True" + - "wandb.name_project" + - "pFL-bench" + - "wandb.name_user" + - "daoyuan" + - "federate.method" + - "Ditto" + - ${args_no_hyphens} +parameters: + optimizer.lr: + values: [0.05, 0.005, 0.5, 0.01, 0.1] + federate.local_update_steps: + values: [1, 3] + personalization.regular_weight: + values: [0.05, 0.1, 0.5, 0.8] + +early_terminate: + type: hyperband + min_iter: 5 # the first bucket indicates we called wandb at least 5 times + diff --git a/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedAvg_FT.yaml b/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedAvg_FT.yaml new file mode 100644 index 000000000..e1cfb97cf --- /dev/null +++ b/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedAvg_FT.yaml @@ -0,0 +1,40 @@ + +program: federatedscope/main.py +project: pFL-bench +name: fedAvg,FEMNIST-s02,FT +method: grid +#method: bayes +metric: + goal: maximize + name: best_client_summarized_weighted_avg/val_acc +command: + - ${env} + - ${interpreter} + - ${program} + - "--cfg" + - "scripts/personalization_exp_scripts/pfl_bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml" + - "outdir" + - "exp_pfl_bench" + - "wandb.use" + - "True" + - "wandb.name_project" + - "pFL-bench" + - "wandb.name_user" + - "daoyuan" + - "expname_tag" + - "finetune" + - "trainer.finetune.before_eval" + - "True" + - "trainer.finetune.steps" + - "5" + - ${args_no_hyphens} +parameters: + optimizer.lr: + values: [0.05, 0.005, 0.5, 0.01, 0.1] + federate.local_update_steps: + values: [1, 3] + +early_terminate: + type: hyperband + min_iter: 5 # the first bucket indicates we called wandb at least 5 times + diff --git 
a/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedEM_FT_fedBN_fedOpt.yaml b/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedEM_FT_fedBN_fedOpt.yaml new file mode 100644 index 000000000..7c1ef060b --- /dev/null +++ b/benchmark/pFL_Bench/FEMNIST-s02/sweep_fedEM_FT_fedBN_fedOpt.yaml @@ -0,0 +1,50 @@ + +program: federatedscope/main.py +project: pFL-bench +name: FedEM-FT-FedBN+fedOpt,FEMNIST-s02 +method: grid +#method: bayes +metric: + goal: maximize + name: best_client_summarized_weighted_avg/val_acc +command: + - ${env} + - ${interpreter} + - ${program} + - "--cfg" + - "scripts/personalization_exp_scripts/pfl_bench/FEMNIST-s02/fedavg_convnet2_on_femnist.yaml" + - "outdir" + - "exp_pfl_bench" + - "wandb.use" + - "True" + - "wandb.name_project" + - "pFL-bench" + - "wandb.name_user" + - "daoyuan" + - "federate.method" + - "FedEM" + - "model.model_num_per_trainer" + - "3" + - "expname_tag" + - "+fedBN+finetune+fedOpt" + - "trainer.finetune.before_eval" + - "True" + - "trainer.finetune.steps" + - "5" + - "personalization.local_param" + - "['bn', 'norms']" # FedBN + - "fedopt.use" + - "True" + - ${args_no_hyphens} +parameters: + optimizer.lr: + values: [0.05, 0.005, 0.5, 0.01, 0.1] + federate.local_update_steps: + values: [1, 3] + fedopt.lr_server: + values: [ 0.5, 0.05, 1.5, 0.1, 1.0] + +early_terminate: + type: hyperband + min_iter: 5 # the first bucket indicates we called wandb at least 5 times + diff --git a/benchmark/pFL_Bench/README.md b/benchmark/pFL_Bench/README.md new file mode 100644 index 000000000..ccbbfc047 --- /dev/null +++ b/benchmark/pFL_Bench/README.md @@ -0,0 +1,106 @@ +# pFL-Bench +The **pFL-Bench** is a comprehensive benchmark for personalized Federated Learning (pFL), which contains more than 10 diverse datasets, 20 competitive pFL baselines, and systematic evaluation with highlighted benefits and potential of pFL. See more details in our [paper](https://arxiv.org/abs/2206.03655). 
+ + +This repository includes the experimental data, environments, scripts and code of **pFL-Bench**. We welcome contributions of new pFL methods and datasets to keep pFL-Bench up-to-date and to evolve it! See more details about contribution [here](https://github.com/alibaba/FederatedScope#contributing). + +**NOTICE:** We are working on seamlessly and consistently fusing the new features in pFL-Bench into *FederatedScope*. However, since the underlying package *FederatedScope* is still being continuously and actively updated, the results can be a little different from the ones in our paper. +To fully reproduce the experimental results reported in the paper, please use the code versioned by this [branch](https://github.com/alibaba/FederatedScope/tree/Feature/pfl_bench) on which the experiments were conducted at the time. + + +# 1. Data +All the experimental data can be automatically downloaded and processed via *FederatedScope* from the original public data links. + +In case a slow or blocked internet connection prevents your download, we also provide a public [mirror](https://federatedscope.oss-cn-beijing.aliyuncs.com/pFL-Bench-data.zip) hosted on *aliyun*. +You can download the pre-processed datasets from our public mirror and unzip them into the `data` directory under your project root. + +If you use another customized data directory, please replace the value of `data.root` in the scripts accordingly. + +# 2. Docker Environment +The experiments are conducted on the *federatedscope-torch1.8-application* docker image; you can build it using the [Dockerfile](https://github.com/alibaba/FederatedScope/blob/master/enviroment/docker_files/federatedscope-torch1.8-application.Dockerfile). 
+ +We also provide a pre-built docker [image](https://federatedscope.oss-cn-beijing.aliyuncs.com/federatedscope_cuda10_torch18_app.tar); you can download it and create your image as +``` +docker load < federatedscope_cuda10_torch18_app.tar && docker tag 188b4 alibaba/federatedscope:app-env-torch1.8 +``` + +# 3. Run the experiments +We first use wandb sweep to find the best hyper-parameters, then repeat the best runs three times. +Here we provide some examples for FEMNIST-s02 at `benchmark/pfl_bench/FEMNIST-s02` for the hyper-parameter search space and hyper-parameter optimization (HPO) scripts, and the searched best configurations as yaml files for FEMNIST-s02 at `benchmark/pfl_bench/yaml_best_runs_example`. + +Since the search scripts and best config yaml files for all experiments involve about 600 files and 60,000+ lines of code, we omit them here. + +You can find the full scripts from another [branch](https://github.com/alibaba/FederatedScope/tree/Feature/pfl_bench/scripts/personalization_exp_scripts/pfl_bench) or the packed small zip [file](https://federatedscope.oss-cn-beijing.aliyuncs.com/pfl_bench_scripts.zip), in which we organize the scripts for all the methods and all the datasets as multiple directories named by dataset name. + +## 3.1 The searched best configs +We put all the config yaml files in the directory `benchmark/pfl_bench/yaml_best_runs`. +To reproduce the experiments with the searched best configurations, you can run the experiment as in the following example: +``` +python federatedscope/main.py --cfg benchmark/pfl_bench/yaml_best_runs/FedBN_FEMNIST-s02.yaml +``` +Then all the metrics will be tracked in your logfile and sent to the wandb monitor. 
+ +You can customize the yaml file such as your wandb project name, or directly add new config in the command such as +``` +python federatedscope/main.py --cfg benchmark/pfl_bench/yaml_best_runs/FedBN_FEMNIST-s02.yaml federate.local_update_steps 1 +``` +More examples for other methods including the combined pFL method variants (e.g., `FedEM-FedBN-FedOPT-FT_cifar10-alpha01.yaml`) are in the directory `benchmark/pfl_bench/yaml_best_runs`. + +## 3.2 Scripts for HPO +We use wandb sweep to find the best hyper-parameters, here are some scripts to do the sweep, + +### 3.2.1 For sweep machine +0. login to the wandb host, if you need private hosting, try wandb login [here](https://docs.wandb.ai/guides/self-hosted/local). +1. write your sweep HPO scripts, we provide the full HPO yamls in the `benchmark/pfl_bench` directory and organized by dataset name. See more details about sweep [here](https://docs.wandb.ai/guides/sweeps). + +2. start your sweep by `wandb sweep my_hpo.yaml`, it will print the sweep id such as +``` +wandb: Creating sweep from: sweep_fedAvg_FT.yaml +wandb: Created sweep with ID: mo45xa3d +wandb: View sweep at: http://xx.xx.xxx.xxx:8080/your_sweep_name/pFL-bench/sweeps/mo45xa3d +``` + + +### 3.2.2 For agent machine +0. - sync your FederatedScope codes to the agent machine + - sync your data to the agent machine, and make sure you put them in the right path, e.g., `/mnt1/user_name/FederatedScope` + + +1. enter the container +``` +docker run -u root --gpus all -it --rm -v "/mnt1:/mnt" --name your_name-pfl-bench -w /mnt/user_name/FederatedScope alibaba/federatedscope:app-env-torch1.8 /bin/bash +``` + +2. setup wandb and FederatedScope +```bash +wandb login --host=http://xx.xx.xx.xx:8080/ +python setup.py install +``` + +If necessary, install several missing packages in case of the docker image misses these package +`conda install fvcore pympler iopath` + + +3. 
run sweep agent, e.g., +```bash +nohup wandb agent your_name/pFL-bench/sweep_id & +``` + +### 3.2.3 For develop/debug machine +For the machine used for remote development and debug +1. enter the container: +``` +docker run -u root -p 8023:22 --gpus all -it --rm -v "/mnt1:/mnt" --name your_name-pfl-bench-debug -w /mnt/user_name/FederatedScope alibaba/federatedscope:app-env-torch1.8 /bin/bash +``` + +2. prepare the ssh and wandb +```bash +apt-get update && apt-get install -y openssh-server +mkdir /var/run/sshd +echo 'root:fsdebug' | chpasswd +sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +service ssh start +wandb login --host=http://xx.xx.xx.xx:8080/ +``` + +3. connect the machine and develop your pFL algorithm diff --git a/benchmark/pFL_Bench/res_analysis_plot/get_total_run_time.py b/benchmark/pFL_Bench/res_analysis_plot/get_total_run_time.py new file mode 100644 index 000000000..4846a9852 --- /dev/null +++ b/benchmark/pFL_Bench/res_analysis_plot/get_total_run_time.py @@ -0,0 +1,52 @@ + +import wandb + +api = wandb.Api() + +name_project = ["daoyuan/pFL-bench", + "daoyuan/pfl-bench-best-repeat"] + +total_run_time = 0 +run_cnt = 0 +run_finish_cnt = 0 +total_run_time_finished = 0 + + +def convert(seconds): + seconds_in_day = 60 * 60 * 24 + seconds_in_hour = 60 * 60 + seconds_in_minute = 60 + + days = seconds // seconds_in_day + hours = (seconds - (days * seconds_in_day)) // seconds_in_hour + minutes = (seconds - (days * seconds_in_day) - (hours * seconds_in_hour)) // seconds_in_minute + + return "%d:%02d:%02d" % (days, hours, minutes) + + +def print_run_time(): + print(f"Total_run_t: {convert(total_run_time)}, run_cnt={run_cnt}") + print(f"Total_run_t_finished: {convert(total_run_time_finished)}, run_cnt_finish={run_finish_cnt}") + + +for p in name_project: + runs = api.runs(p) + for run in runs: + try: + if '_runtime' in run.summary: + time_run = run.summary["_runtime"] + else: + time_run = run.summary['_wandb']["runtime"] + + 
if run.state == "finished": + total_run_time_finished += time_run + run_finish_cnt += 1 + total_run_time += time_run + run_cnt += 1 + if run_cnt % 200 == 0: + print_run_time() + except: + #print("something wrong") + continue + +print_run_time() diff --git a/benchmark/pFL_Bench/res_analysis_plot/plot_paper_figs.py b/benchmark/pFL_Bench/res_analysis_plot/plot_paper_figs.py new file mode 100644 index 000000000..f4a29dcd1 --- /dev/null +++ b/benchmark/pFL_Bench/res_analysis_plot/plot_paper_figs.py @@ -0,0 +1,598 @@ + + +import wandb +from collections import OrderedDict + +api = wandb.Api() + +name_project = "daoyuan/pFL-bench" + +filters_each_line_main_table = OrderedDict( + # {dataset_name: filter} + [ + # ("all", + # None, + # ), + # ("FEMNIST-all", + # {"$and": + # [ + # {"config.data.type": "femnist"}, + # ] + # } + # ), + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + # ("cifar10-alpha05", + # {"$and": + # [ + # {"config.data.type": "CIFAR10@torchvision"}, + # {"config.data.splitter_args": [{"alpha": 0.5}]}, + # ] + # } + # ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ] +) + +filters_each_line_all_cifar10 = OrderedDict( + # {dataset_name: filter} + [ + ("cifar10-alpha5", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 5}]}, + ] + } + ), + ("cifar10-alpha05", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.5}]}, + ] + } + ), + ("cifar10-alpha01", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.1}]}, + ] + } + ), + ] +) + +filters_each_line_femnist_all_s = OrderedDict( + # {dataset_name: filter} + [ + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, 
+ {"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s01", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.1}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s005", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.05}, + {"state": "finished"}, + ] + } + ), + + ] +) + +filters_each_line_all_graph = OrderedDict( + # {dataset_name: filter} + [ + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ("cora", + {"$and": + [ + {"config.data.type": "cora"}, + ] + } + ), + ("citeseer", + {"$and": + [ + {"config.data.type": "citeseer"}, + ] + } + ), + ] +) + +filters_each_line_all_nlp = OrderedDict( + # {dataset_name: filter} + [ + ("cola", + {"$and": + [ + {"config.data.type": "cola@huggingface_datasets"}, + ] + } + ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ] +) + +sweep_name_2_id = dict() +column_names_generalization = [ + "best_client_summarized_weighted_avg/test_acc", + "best_unseen_client_summarized_weighted_avg_unseen/test_acc", + "participation_gap" +] +column_names_fair = [ + "best_client_summarized_avg/test_acc", + "best_client_summarized_fairness/test_acc_std", + "best_client_summarized_fairness/test_acc_bottom_decile" +] +column_names_efficiency = [ + "sys_avg/total_flops", + "sys_avg/total_upload_bytes", + "sys_avg/total_download_bytes", + "sys_avg/global_convergence_round", + # "sys_avg/local_convergence_round" +] +column_names_generalization_for_plot = ["Acc (Parti.)", "Acc (Un-parti.)", "Generalization Gap"] +column_name_for_plot = { + "best_client_summarized_weighted_avg/test_acc": "Acc (Parti.)", + "total_flops": "Total Flops", + "communication_bytes": "Communication Bytes", + "sys_avg/global_convergence_round": "Convergence Round", +} +sorted_method_name_pair = [ + ("global-train", "Global-Train"), + ("isolated-train", "Isolated"), + ("fedavg", 
"FedAvg"), + ("fedavg-ft", "FedAvg-FT"), + ("fedopt", "FedOpt"), + ("fedopt-ft", "FedOpt-FT"), + ("pfedme", "pFedMe"), + ("ft-pfedme", "pFedMe-FT"), + ("fedbn", "FedBN"), + ("fedbn-ft", "FedBN-FT"), + ("fedbn-fedopt", "FedBN-FedOPT"), + ("fedbn-fedopt-ft", "FedBN-FedOPT-FT"), + ("ditto", "Ditto"), + ("ditto-ft", "Ditto-FT"), + ("ditto-fedbn", "Ditto-FedBN"), + ("ditto-fedbn-ft", "Ditto-FedBN-FT"), + ("ditto-fedbn-fedopt", "Ditto-FedBN-FedOpt"), + ("ditto-fedbn-fedopt-ft", "Ditto-FedBN-FedOpt-FT"), + ("fedem", "FedEM"), + ("fedem-ft", "FedEM-FT"), + ("fedbn-fedem", "FedEM-FedBN"), + ("fedbn-fedem-ft", "FedEM-FedBN-FT"), + ("fedbn-fedem-fedopt", "FedEM-FedBN-FedOPT"), + ("fedbn-fedem-fedopt-ft", "FedEM-FedBN-FedOPT-FT"), +] +sorted_keys = OrderedDict(sorted_method_name_pair) +expected_keys = set(list(sorted_keys.keys())) +expected_method_names = list(sorted_keys.values()) +expected_datasets_name = ["cola", "sst2", "pubmed", "cora", "citeseer", "cifar10-alpha5", "cifar10-alpha05", + "cifar10-alpha01", "FEMNIST-s02", "FEMNIST-s01", "FEMNIST-s005"] +expected_seed_set = ["1", "2", "3"] +expected_expname_tag = set() + +for method_name in expected_method_names: + for dataset_name in expected_datasets_name: + for seed in expected_seed_set: + expected_expname_tag.add(f"{method_name}_{dataset_name}_seed{seed}") + expected_expname_tag.add(f"{method_name}_{dataset_name}_repeat") + +from collections import defaultdict + +all_missing_scripts = defaultdict(list) + +all_res_structed = defaultdict(dict) +for expname_tag in expected_expname_tag: + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + if "repeat" in expname_tag: + all_res_structed[expname_tag][metric] = [] + else: + all_res_structed[expname_tag][metric] = "-" + + + +def check_run_stats(filter_seed_set=None): + for expname_tag in expected_expname_tag: + filter = {"$and": + [ + {"config.expname_tag": expname_tag}, + ] + } + filtered_runs = api.runs("pfl-bench-best-repeat", 
filters=filter) + method, dataname, seed = expname_tag.split("_") + finished_run_cnt = 0 + for run in filtered_runs: + if run.state != "finished": + print(f"run {run} is not fished") + else: + finished_run_cnt += 1 + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + try: + if method in ["Isolated", "Global-Train"]: + skip_generalize = "unseen" in metric or metric == "participation_gap" + skip_global_fairness = method == "Global-Train" and "fairness" in metric + if skip_generalize or skip_global_fairness: + all_res_structed[expname_tag][metric] = "-" + continue + + if metric == "participation_gap": + all_res_structed[expname_tag][metric] = all_res_structed[expname_tag]["best_unseen_client_summarized_weighted_avg_unseen/test_acc"] - all_res_structed[expname_tag]["best_client_summarized_weighted_avg/test_acc"] + else: + all_res_structed[expname_tag][metric] = run.summary[metric] + except KeyError: + print("Something wrong") + + print_missing = True + for seed in filter_seed_set: + if seed in expname_tag: + print_missing = False + if finished_run_cnt == 0 and print_missing: + print(f"Missing run {expname_tag})") + yaml_name = f"{method}_{dataname}.yaml" + if "Global" in method: + yaml_name = f"\'{yaml_name}\'" + expname_tag_new = expname_tag.replace("Global Train", "Global-Train") + else: + expname_tag_new = expname_tag + seed_num = seed.replace("seed", "") + all_missing_scripts[seed].append( + f"python federatedscope/main.py --cfg scripts/personalization_exp_scripts/pfl_bench/yaml_best_runs/{yaml_name} seed {seed_num} expname_tag {expname_tag_new} wandb.name_project pfl-bench-best-repeat") + elif finished_run_cnt != 1 and print_missing: + print(f"run_cnt = {finished_run_cnt} for the exp {expname_tag}") + + for seed in all_missing_scripts.keys(): + print( + f"+================= All MISSING SCRIPTS, seed={seed} =====================+, cnt={len(all_missing_scripts[seed])}") + for scipt in all_missing_scripts[seed]: + print(scipt) 
+ print() + + + +def bytes_to_unit_size(size_bytes): + import math + if size_bytes == 0: + return "0" + size_name = ("", "K", "M", "G", "T", "P", "E", "Z", "Y") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return f"{s}{size_name[i]}" + +def unit_size_to_bytes(size_str): + if not isinstance(size_str, str): + return size_str + else: + last_unit = size_str[-1] + size_name = ("", "K", "M", "G", "T", "P", "E", "Z", "Y") + if last_unit not in size_name: + return float(size_str) + else: + # need transform + import math + idx = size_name.index(last_unit) + p = math.pow(1024, idx) + return float(size_str[:-1]) * p + + +def avg_res_of_seeds(): + # add all res to repeat + for expname_tag in expected_expname_tag: + if "repeat" in expname_tag: + continue + else: + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + if all_res_structed[expname_tag][metric] == "-" and "Global" not in expname_tag and "Isolated" not in expname_tag: + print(f"missing {expname_tag} for metric {metric}") + method, dataname, seed = expname_tag.split("_") + cur_res = all_res_structed[expname_tag][metric] + all_res_structed[f"{method}_{dataname}_repeat"][metric].append(cur_res) + + for expname_tag in expected_expname_tag: + if "repeat" in expname_tag: + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + valid_res = [unit_size_to_bytes(v) for v in all_res_structed[expname_tag][metric] if v != "-"] + if len(valid_res) == 0: + all_res_structed[expname_tag][metric] = "-" + else: + res = sum(valid_res) / len(valid_res) + if "flops" in metric or "bytes" in metric: + res = bytes_to_unit_size(res) + all_res_structed[expname_tag][metric] = res + +def print_paper_table(filters_each_line_table): + res_of_each_line_generalization = OrderedDict() + res_of_each_line_fair = OrderedDict() + res_of_each_line_efficiency = OrderedDict() + res_of_each_line_commu_acc_trade = 
OrderedDict() + res_of_each_line_conver_acc_trade = OrderedDict() + + for key in expected_method_names: + res_of_each_line_generalization[key] = [] + res_of_each_line_fair[key] = [] + res_of_each_line_efficiency[key] = [] + for dataset_name in filters_each_line_table: + expname_tag = f"{key}_{dataset_name}_repeat" + for metric in column_names_generalization: + res_of_each_line_generalization[key].append(all_res_structed[expname_tag][metric]) + for metric in column_names_fair: + res_of_each_line_fair[key].append(all_res_structed[expname_tag][metric]) + for metric in column_names_efficiency: + res = all_res_structed[expname_tag][metric] + if "round" in metric: + res = "{:.2f}".format(res) + res_of_each_line_efficiency[key].append(res) + + print("\n=============res_of_each_line [Generalization]===============" + ",".join( + list(filters_each_line_table.keys()))) + # Acc, Unseen-ACC, Delta + for key in expected_method_names: + res_to_print = ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_generalization[key]] + res_to_print = [key] + res_to_print + print(",".join(res_to_print)) + + print("\n=============res_of_each_line [Fairness]===============" + ",".join(list( filters_each_line_table.keys()))) + for key in expected_method_names: + res_to_print = ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_fair[key]] + res_to_print = [key] + res_to_print + print(",".join(res_to_print)) + print("\n=============res_of_each_line [All Efficiency]===============" + ",".join( + list(filters_each_line_table.keys()))) + # FLOPS, UPLOAD, DOWNLOAD + for key in expected_method_names: + res_to_print = [str(v) for v in res_of_each_line_efficiency[key]] + res_to_print = [key] + res_to_print + print(",".join(res_to_print)) + print("\n=============res_of_each_line [flops, communication, acc]===============" + ",".join( + list(filters_each_line_table.keys()))) + for key in expected_method_names: + res_of_each_line_commu_acc_trade[key] = [] + 
dataset_num = 2 if "cola" in list(filters_each_line_table.keys()) else 3 + for i in range(dataset_num): + res_of_each_line_commu_acc_trade[key].extend( + [str(res_of_each_line_efficiency[key][i * 4])] + \ + [str(res_of_each_line_efficiency[key][i * 4 + 1])] + \ + ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_generalization[key][i * 3:i * 3 + 1]] + ) + + res_to_print = [str(v) for v in res_of_each_line_commu_acc_trade[key]] + res_to_print = [key] + res_to_print + print(",".join(res_to_print)) + + print("\n=============res_of_each_line [converge_round, acc]===============" + ",".join( + list(filters_each_line_table.keys()))) + for key in expected_method_names: + res_of_each_line_conver_acc_trade[key] = [] + dataset_num = 2 if "cola" in list(filters_each_line_table.keys()) else 3 + for i in range(dataset_num): + res_of_each_line_conver_acc_trade[key].extend( + [str(res_of_each_line_efficiency[key][i * 4 + 3])] + \ + # [str(res_of_each_line_efficiency[key][i * 4 + 4])] + \ + ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_fair[key][i * 3:i * 3 + 1]] + ) + + res_to_print = [str(v) for v in res_of_each_line_conver_acc_trade[key]] + res_to_print = [key] + res_to_print + print(",".join(res_to_print)) + +import json +with open('best_res_all_metric.json', 'r') as fp: + all_res_structed_load = json.load(fp) + for expname_tag in expected_expname_tag: + if "repeat" in expname_tag: + continue + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + all_res_structed[expname_tag][metric] = all_res_structed_load[expname_tag][metric] + + +# add all res to a df +import pandas as pd + +def load_data_to_pd(use_repeat_res=False): + all_res_for_pd = [] + for expname_tag in expected_expname_tag: + if not use_repeat_res: + if "repeat" in expname_tag: + continue + else: + if not "repeat" in expname_tag: + continue + res = expname_tag.split("_") # method, data, seed + for metric in column_names_generalization 
+ column_names_fair + column_names_efficiency: + res.append(all_res_structed[expname_tag][metric]) + s = "-" + alpha = "-" + if "FEMNIST-s0" in res[1]: + s = float(res[1].replace("FEMNIST-s0", "0.")) + if "cifar10-alpha0" in res[1]: + alpha = float(res[1].replace("cifar10-alpha0", "0.")) + elif "cifar10-alpha" in res[1]: + alpha = float(res[1].replace("cifar10-alpha", "")) + res.append(s) + res.append(alpha) + total_com_bytes = unit_size_to_bytes(res[-5]) + unit_size_to_bytes(res[-4]) + total_flops = unit_size_to_bytes(res[-6]) + res.append(total_com_bytes) + res.append(total_flops) + all_res_for_pd.append(res) + + all_res_pd = pd.DataFrame().from_records(all_res_for_pd, columns=["method", "data", "seed"] + column_names_generalization + column_names_fair + column_names_efficiency + ["s", "alpha", "communication_bytes", "total_flops"]) + return all_res_pd + + + + +def plot_generalization_lines(all_res_pd, data_cate, data_cate_name): + import seaborn as sns + from matplotlib import pyplot as plt + import matplotlib.pylab as pylab + + plt.clf() + sns.set() + #fig, axes = plt.subplots(1, 3, figsize=(6, 4)) + fig, axes = plt.subplots(1, 3, figsize=(6, 4)) + print(all_res_pd.columns.tolist()) + + + plot_data = all_res_pd.loc[all_res_pd["data"].isin(data_cate)] + + plot_data = plot_data.loc[plot_data["method"] != "Global-Train"] + plot_data = plot_data.loc[plot_data["method"] != "Isolated"] + plot_data = plot_data.loc[plot_data["method"] != "FedOpt"] + plot_data = plot_data.loc[plot_data["method"] != "FedOpt-FT"] + filter_out_methods = ["Global-Train", "Isolated", "FedOpt", "FedOpt-FT"] + for i, metric in enumerate(column_names_generalization): + plt.clf() + sns.set() + fig, axes = plt.subplots(1, 1, figsize=(2, 3)) + x = "data" + if data_cate_name == "femnist_all": + x = "s" + if data_cate_name == "cifar10_all": + x = "alpha" + + ax = sns.lineplot( + ax=axes, + data=plot_data, + x=x, y=metric, hue="method", style="method", + markers=True, dashes=True, + hue_order=[m for 
m in expected_method_names if m not in filter_out_methods], + sort=True, + ) + ax.set(ylabel=column_names_generalization_for_plot[i]) + plt.gca().invert_xaxis() + + if data_cate_name == "cifar10_all": + ax.set_xscale('log') + + plt.legend(bbox_to_anchor=(1, 1), loc=2, ncol=2, borderaxespad=0.) + plt.tight_layout() + plt.savefig(f"generalization_all_{data_cate_name}_{i}.pdf", bbox_inches='tight', pad_inches=0) + + plt.show() + + + +def plot_tradeoff(all_res_pd, data_cate, data_cate_name, metric_a, metric_b, fig_time): + import seaborn as sns + from matplotlib import pyplot as plt + import matplotlib.pylab as pylab + + plt.clf() + sns.set() + print(all_res_pd.columns.tolist()) + + plot_data = all_res_pd.loc[all_res_pd["data"].isin(data_cate)] + + plot_data = plot_data.loc[plot_data["method"] != "Global-Train"] + plot_data = plot_data.loc[plot_data["method"] != "Isolated"] + plot_data = plot_data.loc[plot_data["method"] != "FedOpt"] + plot_data = plot_data.loc[plot_data["method"] != "FedOpt-FT"] + filter_out_methods = ["Global-Train", "Isolated", "FedOpt", "FedOpt-FT"] + plt.clf() + sns.set() + fig, axes = plt.subplots(1, 1, figsize=(2, 3)) + + ax = sns.scatterplot( + ax=axes, + data=plot_data, + x=metric_a, y=metric_b, hue="method", style="method", + markers=True, + hue_order=[m for m in expected_method_names if m not in filter_out_methods], + s=100 + ) + ax.set(xlabel=column_name_for_plot[metric_a], ylabel=column_name_for_plot[metric_b]) + #plt.gca().invert_xaxis() + if metric_a == "total_flops": + ax.set_xscale('log') + + if data_cate_name == "cifar10_all": + ax.set_xscale('log') + + plt.legend(bbox_to_anchor=(1, 1), loc=2, ncol=2, borderaxespad=0.) 
+ plt.tight_layout() + plt.savefig(f"{fig_time}_{data_cate_name}.pdf", bbox_inches='tight', pad_inches=0) + + plt.show() + + + +check_run_stats(["1", "repeat"]) +avg_res_of_seeds() + +print_paper_table(filters_each_line_main_table) +print_paper_table(filters_each_line_femnist_all_s) +print_paper_table(filters_each_line_all_cifar10) +print_paper_table(filters_each_line_all_nlp) +print_paper_table(filters_each_line_all_graph) + +# plot line figures +all_res_pd = load_data_to_pd(use_repeat_res=False) +all_res_pd_repeat = load_data_to_pd(use_repeat_res=True) +plot_generalization_lines(all_res_pd, list( filters_each_line_femnist_all_s.keys()), data_cate_name="femnist_all") +plot_generalization_lines(all_res_pd, list(filters_each_line_all_cifar10.keys()), data_cate_name="cifar10_all") + +# plot trade-off figs +for data_name in list(filters_each_line_main_table.keys()): + plot_tradeoff(all_res_pd_repeat, [data_name], data_cate_name=data_name, metric_a="communication_bytes", metric_b="best_client_summarized_weighted_avg/test_acc", fig_time="com-acc") + +for data_name in list(filters_each_line_main_table.keys()): + plot_tradeoff(all_res_pd_repeat, [data_name], data_cate_name=data_name, metric_a="total_flops", metric_b="best_client_summarized_weighted_avg/test_acc", fig_time="flops-acc") + +for data_name in list(filters_each_line_main_table.keys()): + plot_tradeoff(all_res_pd_repeat, [data_name], data_cate_name=data_name, metric_a="sys_avg/global_convergence_round", metric_b="best_client_summarized_weighted_avg/test_acc", fig_time="round-acc") + + diff --git a/benchmark/pFL_Bench/res_analysis_plot/repeat_best_exp.py b/benchmark/pFL_Bench/res_analysis_plot/repeat_best_exp.py new file mode 100644 index 000000000..b84e625c1 --- /dev/null +++ b/benchmark/pFL_Bench/res_analysis_plot/repeat_best_exp.py @@ -0,0 +1,276 @@ + +import wandb +from collections import OrderedDict + +api = wandb.Api() + +name_project = "daoyuan/pFL-bench" + +filters_each_line_main_table = OrderedDict( + # 
{dataset_name: filter} + [ + # ("all", + # None, + # ), + # ("FEMNIST-all", + # {"$and": + # [ + # {"config.data.type": "femnist"}, + # ] + # } + # ), + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + # ("cifar10-alpha05", + # {"$and": + # [ + # {"config.data.type": "CIFAR10@torchvision"}, + # {"config.data.splitter_args": [{"alpha": 0.5}]}, + # ] + # } + # ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ] +) + +filters_each_line_all_cifar10 = OrderedDict( + # {dataset_name: filter} + [ + ("cifar10-alpha5", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 5}]}, + ] + } + ), + ("cifar10-alpha05", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.5}]}, + ] + } + ), + ("cifar10-alpha01", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.1}]}, + ] + } + ), + ] +) + +filters_each_line_femnist_all_s = OrderedDict( + # {dataset_name: filter} + [ + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s01", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.1}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s005", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.05}, + {"state": "finished"}, + ] + } + ), + + ] +) + +filters_each_line_all_graph = OrderedDict( + # {dataset_name: filter} + [ + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ("cora", + {"$and": + [ + {"config.data.type": "cora"}, + ] + } + ), + ("citeseer", + {"$and": + [ + {"config.data.type": 
"citeseer"}, + ] + } + ), + ] +) + +filters_each_line_all_nlp = OrderedDict( + # {dataset_name: filter} + [ + ("cola", + {"$and": + [ + {"config.data.type": "cola@huggingface_datasets"}, + ] + } + ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ] +) + + +sweep_name_2_id = dict() +column_names_generalization = [ + "best_client_summarized_weighted_avg/test_acc", + "best_unseen_client_summarized_weighted_avg_unseen/test_acc", + "participation_gap" +] +column_names_fair = [ + "best_client_summarized_avg/test_acc", + "best_client_summarized_fairness/test_acc_std", + "best_client_summarized_fairness/test_acc_bottom_decile" +] +column_names_efficiency = [ + "sys_avg/total_flops", + "sys_avg/total_upload_bytes", + "sys_avg/total_download_bytes", + "sys_avg/global_convergence_round", + # "sys_avg/local_convergence_round" +] +sorted_keys = OrderedDict( + [("global-train", "Global Train"), + ("isolated-train", "Isolated"), + ("fedavg", "FedAvg"), + ("fedavg-ft", "FedAvg-FT"), + ("fedopt", "FedOpt"), + ("fedopt-ft", "FedOpt-FT"), + ("pfedme", "pFedMe"), + ("ft-pfedme", "pFedMe-FT"), + ("fedbn", "FedBN"), + ("fedbn-ft", "FedBN-FT"), + ("fedbn-fedopt", "FedBN-FedOPT"), + ("fedbn-fedopt-ft", "FedBN-FedOPT-FT"), + ("ditto", "Ditto"), + ("ditto-ft", "Ditto-FT"), + ("ditto-fedbn", "Ditto-FedBN"), + ("ditto-fedbn-ft", "Ditto-FedBN-FT"), + ("ditto-fedbn-fedopt", "Ditto-FedBN-FedOpt"), + ("ditto-fedbn-fedopt-ft", "Ditto-FedBN-FedOpt-FT"), + ("fedem", "FedEM"), + ("fedem-ft", "FedEM-FT"), + ("fedbn-fedem", "FedEM-FedBN"), + ("fedbn-fedem-ft", "FedEM-FedBN-FT"), + ("fedbn-fedem-fedopt", "FedEM-FedBN-FedOPT"), + ("fedbn-fedem-fedopt-ft", "FedEM-FedBN-FedOPT-FT"), + ] +) +expected_keys = set(list(sorted_keys.keys())) + +def bytes_to_unit_size(size_bytes): + import math + if size_bytes == 0: + return "0" + size_name = ("", "K", "M", "G", "T", "P", "E", "Z", "Y") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = 
round(size_bytes / p, 2) + return f"{s}{size_name[i]}" + +def get_best_runs_within_sweep(sweep_id_lists): + best_run_list = [] + for sweep_id in sweep_id_lists: + sweep = api.sweep(f"{name_project}/{sweep_id}") + best_run = sweep.best_run() + best_run_list.append(best_run) + + +def get_sweep_filter_by(filter_name, filters_each_line_table): + filter = filters_each_line_table[filter_name] + filtered_runs = api.runs(name_project, filters=filter) + filtered_sweep_ids = set() + check_run_cnt = 0 + # may hang on + for run in filtered_runs: + if run.sweep is not None: + filtered_sweep_ids.add(run.sweep.id) + check_run_cnt += 1 + print(f"check_run_cnt is {check_run_cnt}") + return list(filtered_sweep_ids) + + +def get_runs_filter_by(filter_name, filters_each_line_table): + filter = filters_each_line_table[filter_name] + filtered_runs = api.runs(name_project, filters=filter) + return filtered_runs + +order = '-' + 'summary_metrics.best_client_summarized_weighted_avg/val_acc' + +def generate_repeat_scripts(best_cfg_path, seed_sets=None): + file_cnt = 0 + if seed_sets is None: + seed_sets = [2, 3] + from os import listdir + from os.path import isfile, join + onlyfiles = [f for f in listdir(best_cfg_path) if isfile(join(best_cfg_path, f))] + for file_name in onlyfiles: + exp_name = file_name + exp_name = exp_name.replace(".yaml", "") + method, data = exp_name.split("_") + for seed in seed_sets: + print(f"python federatedscope/main.py --cfg scripts/personalization_exp_scripts/pfl_bench/yaml_best_runs/{file_name} seed {seed} expname_tag {exp_name}_seed{seed} wandb.name_project pfl-bench-best-repeat") + file_cnt += 1 + if file_cnt % 10 == 0: + print(f"Seed={seed}, totally generated {file_cnt} run scripts\n\n") + + print(f"Seed={seed_sets}, totally generated {file_cnt} run scripts") + print(f"=============================== END ===============================") + +seed_sets = [2, 3] +for seed in seed_sets: + 
generate_repeat_scripts("/mnt/daoyuanchen.cdy/FederatedScope/scripts/personalization_exp_scripts/pfl_bench/yaml_best_runs", seed_sets=[seed]) \ No newline at end of file diff --git a/benchmark/pFL_Bench/res_analysis_plot/wandb_to_latex_res.py b/benchmark/pFL_Bench/res_analysis_plot/wandb_to_latex_res.py new file mode 100644 index 000000000..061abbaac --- /dev/null +++ b/benchmark/pFL_Bench/res_analysis_plot/wandb_to_latex_res.py @@ -0,0 +1,528 @@ +import copy +import os + +import wandb +from collections import OrderedDict + +import yaml + +api = wandb.Api() + +name_project = "daoyuan/pFL-bench" + +filters_each_line_main_table = OrderedDict( + # {dataset_name: filter} + [ + # ("all", + # None, + # ), + # ("FEMNIST-all", + # {"$and": + # [ + # {"config.data.type": "femnist"}, + # ] + # } + # ), + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + # ("cifar10-alpha05", + # {"$and": + # [ + # {"config.data.type": "CIFAR10@torchvision"}, + # {"config.data.splitter_args": [{"alpha": 0.5}]}, + # ] + # } + # ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ] +) + +filters_each_line_all_cifar10 = OrderedDict( + # {dataset_name: filter} + [ + ("cifar10-alpha5", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 5}]}, + ] + } + ), + ("cifar10-alpha05", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.5}]}, + ] + } + ), + ("cifar10-alpha01", + {"$and": + [ + {"config.data.type": "CIFAR10@torchvision"}, + {"config.data.splitter_args": [{"alpha": 0.1}]}, + ] + } + ), + ] +) + +filters_each_line_femnist_all_s = OrderedDict( + # {dataset_name: filter} + [ + ("FEMNIST-s02", + {"$and": + [ + {"config.data.type": "femnist"}, + 
{"config.federate.sample_client_rate": 0.2}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s01", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.1}, + {"state": "finished"}, + ] + } + ), + ("FEMNIST-s005", + {"$and": + [ + {"config.data.type": "femnist"}, + {"config.federate.sample_client_rate": 0.05}, + {"state": "finished"}, + ] + } + ), + + ] +) + +filters_each_line_all_graph = OrderedDict( + # {dataset_name: filter} + [ + ("pubmed", + {"$and": + [ + {"config.data.type": "pubmed"}, + ] + } + ), + ("cora", + {"$and": + [ + {"config.data.type": "cora"}, + ] + } + ), + ("citeseer", + {"$and": + [ + {"config.data.type": "citeseer"}, + ] + } + ), + ] +) + +filters_each_line_all_nlp = OrderedDict( + # {dataset_name: filter} + [ + ("cola", + {"$and": + [ + {"config.data.type": "cola@huggingface_datasets"}, + ] + } + ), + ("sst2", + {"$and": + [ + {"config.data.type": "sst2@huggingface_datasets"}, + ] + } + ), + ] +) + + +sweep_name_2_id = dict() +column_names_generalization = [ + "best_client_summarized_weighted_avg/test_acc", + "best_unseen_client_summarized_weighted_avg_unseen/test_acc", + "participation_gap" +] +column_names_fair = [ + "best_client_summarized_avg/test_acc", + "best_client_summarized_fairness/test_acc_std", + "best_client_summarized_fairness/test_acc_bottom_decile" +] +column_names_efficiency = [ + "sys_avg/total_flops", + "sys_avg/total_upload_bytes", + "sys_avg/total_download_bytes", + "sys_avg/global_convergence_round", + # "sys_avg/local_convergence_round" +] +sorted_method_name_pair = [ + ("global-train", "Global-Train"), + ("isolated-train", "Isolated"), + ("fedavg", "FedAvg"), + ("fedavg-ft", "FedAvg-FT"), + ("fedopt", "FedOpt"), + ("fedopt-ft", "FedOpt-FT"), + ("pfedme", "pFedMe"), + ("ft-pfedme", "pFedMe-FT"), + ("fedbn", "FedBN"), + ("fedbn-ft", "FedBN-FT"), + ("fedbn-fedopt", "FedBN-FedOPT"), + ("fedbn-fedopt-ft", "FedBN-FedOPT-FT"), + ("ditto", "Ditto"), + ("ditto-ft", "Ditto-FT"), + 
("ditto-fedbn", "Ditto-FedBN"), + ("ditto-fedbn-ft", "Ditto-FedBN-FT"), + ("ditto-fedbn-fedopt", "Ditto-FedBN-FedOpt"), + ("ditto-fedbn-fedopt-ft", "Ditto-FedBN-FedOpt-FT"), + ("fedem", "FedEM"), + ("fedem-ft", "FedEM-FT"), + ("fedbn-fedem", "FedEM-FedBN"), + ("fedbn-fedem-ft", "FedEM-FedBN-FT"), + ("fedbn-fedem-fedopt", "FedEM-FedBN-FedOPT"), + ("fedbn-fedem-fedopt-ft", "FedEM-FedBN-FedOPT-FT"), +] +sorted_method_name_to_print = OrderedDict(sorted_method_name_pair) +expected_keys = set(list(sorted_method_name_to_print.keys())) +expected_method_names = list(sorted_method_name_to_print.values()) +expected_datasets_name = ["cola", "sst2", "pubmed", "cora", "citeseer", "cifar10-alpha5", "cifar10-alpha05", + "cifar10-alpha01", "FEMNIST-s02", "FEMNIST-s01", "FEMNIST-s005"] +expected_seed_set = ["1"] +expected_expname_tag = set() + +for method_name in expected_method_names: + for dataset_name in expected_datasets_name: + for seed in expected_seed_set: + expected_expname_tag.add(f"{method_name}_{dataset_name}_seed{seed}") +from collections import defaultdict +all_res_structed = defaultdict(dict) +for expname_tag in expected_expname_tag: + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + all_res_structed[expname_tag][metric] = "-" + +def bytes_to_unit_size(size_bytes): + import math + if size_bytes == 0: + return "0" + size_name = ("", "K", "M", "G", "T", "P", "E", "Z", "Y") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return f"{s}{size_name[i]}" + +def unit_size_to_bytes(size_str): + if not isinstance(size_str, str): + return size_str + else: + last_unit = size_str[-1] + size_name = ("", "K", "M", "G", "T", "P", "E", "Z", "Y") + if last_unit not in size_name: + return float(size_str) + else: + # need transform + import math + idx = size_name.index(last_unit) + p = math.pow(1024, idx) + return float(size_str[:-1]) * p + +def get_best_runs_within_sweep(sweep_id_lists): + 
best_run_list = [] + for sweep_id in sweep_id_lists: + sweep = api.sweep(f"{name_project}/{sweep_id}") + best_run = sweep.best_run() + best_run_list.append(best_run) + + +def get_sweep_filter_by(filter_name, filters_each_line_table): + filter = filters_each_line_table[filter_name] + filtered_runs = api.runs(name_project, filters=filter) + filtered_sweep_ids = set() + check_run_cnt = 0 + # may hang on + for run in filtered_runs: + if run.sweep is not None: + filtered_sweep_ids.add(run.sweep.id) + check_run_cnt += 1 + print(f"check_run_cnt is {check_run_cnt}") + return list(filtered_sweep_ids) + + +order = '-' + 'summary_metrics.best_client_summarized_weighted_avg/val_acc' + + +def print_table_datasets_list(filters_each_line_table): + res_of_each_line_generalization = OrderedDict() + res_of_each_line_fair = OrderedDict() + res_of_each_line_efficiency = OrderedDict() + res_of_each_line_commu_acc_trade = OrderedDict() + res_of_each_line_conver_acc_trade = OrderedDict() + res_of_all_sweeps = OrderedDict() + for data_name in filters_each_line_table: + unseen_keys = copy.copy(expected_keys) + print(f"======= processing dataset {data_name}") + sweep_ids = get_sweep_filter_by(data_name, filters_each_line_table) + for sweep_id in sweep_ids: + sweep = api.sweep(f"{name_project}/{sweep_id}") + run_header = sweep.name + if sweep.order != order: + print(f"un-expected order for {run_header}") + best_run = sweep.best_run() + res_all_generalization = [] + res_all_fair = [] + res_all_efficiency = [] + if best_run.state != "finished": + print(f"==================Waring: the best_run with id={best_run} has state {best_run.state}. " + f"In weep_id={sweep_id}, sweep_name={run_header}") + else: + print(f"Finding the best_run with id={best_run}. 
" + f"In sweep_id={sweep_id}, sweep_name={run_header}") + + # for generalization results + wrong_sweep = False + if "isolated" in run_header or "global" in run_header: + try: + res = best_run.summary[column_names_generalization[0]] + res_all_generalization.append(res) + except KeyError: + print( + f"KeyError with key={column_names_generalization[0]}, sweep_id={sweep_id}, sweep_name={run_header}, best_run_id={best_run.id}") + wrong_sweep = True + if wrong_sweep: + continue + res_all_generalization.append("-") # un-seen + res_all_generalization.append("-") # Gap + else: + for column_name in column_names_generalization[0:2]: + try: + res = best_run.summary[column_name] + res_all_generalization.append(res) + except KeyError: + print(f"KeyError with key={column_name}, sweep_id={sweep_id}, sweep_name={run_header}, best_run_id={best_run.id}") + wrong_sweep = True + if wrong_sweep: + continue + res_all_generalization.append(res_all_generalization[-1] - res_all_generalization[-2]) + # -============== for fairness results ====== + for column_name in column_names_fair: + if "global" in run_header: + res_all_fair.append("-") + res_all_fair.append("-") + res_all_fair.append("-") + else: + try: + res = best_run.summary[column_name] + res_all_fair.append(res) + except KeyError: + print(f"KeyError with key={column_name}, sweep_id={sweep_id}, sweep_name={run_header}, best_run_id={best_run.id}") + res_all_fair.append("-") + wrong_sweep = True + + # -============== for efficiency results ====== + for column_name in column_names_efficiency: + try: + res = best_run.summary[column_name] + contain_unit = False + for size_unit in ["K", "M", "G", "T", "P", "E", "Z", "Y"]: + if size_unit in str(res): + contain_unit = True + if not contain_unit: + res = bytes_to_unit_size(float(res)) + + res_all_efficiency.append(res) + except KeyError: + print(f"KeyError with key={column_name}, sweep_id={sweep_id}, sweep_name={run_header}, best_run_id={best_run.id}") + wrong_sweep = True + 
res_all_efficiency.append("-") + + run_header = str.lower(run_header) + + # fix some run_header error + #best_run_cfg = json.loads(best_run.json_config) + best_run_cfg = best_run.config + + def remove_a_key(d, remove_key): + if isinstance(d, dict): + for key in list(d.keys()): + if key == remove_key: + del d[key] + else: + remove_a_key(d[key], remove_key) + remove_a_key(best_run_cfg, "cfg_check_funcs") + old_run_header = run_header + if best_run_cfg["trainer"]["finetune"]["before_eval"] is True and "ft" not in run_header: + run_header = run_header + ",ft" + elif best_run_cfg["fedopt"]["use"] is True and "fedopt" not in run_header: + run_header = run_header + ",fedopt" + if old_run_header != run_header: + print(f"processed {old_run_header} to new run header {run_header}") + + if run_header not in res_of_all_sweeps: + res_of_all_sweeps[run_header] = res_all_generalization + sweep_name_2_id[run_header] = sweep_id + else: + print(f"processed duplicated sweep with name {run_header}, plz check it with id {sweep_id}. 
" + f"The first appeared sweep has id {sweep_name_2_id[run_header]}") + + while run_header + "_dup" in res_of_all_sweeps: + run_header = run_header + "_dup" + run_header = run_header + "dup" + print(f"processed to new run header {run_header}") + res_of_all_sweeps[run_header] = res_all_generalization + + run_header = run_header.replace("-", ",") + run_header = run_header.replace("+", ",") + split_res = run_header.split(",") + filter_split_res = [] + for sub in split_res: + if "femnist" in sub or "cifar" in sub or "cora" in sub or "cola" in sub or "pubmed" in sub or "citeseer" in sub or "sst2" in sub \ + or "s02" in sub or "s005" in sub or "s01" in sub \ + or "alpha5" in sub or "alpha0.5" in sub or "alpha0.1" in sub: + pass + else: + filter_split_res.append(sub) + method_header = "-".join(sorted(filter_split_res)) + if method_header in unseen_keys: + unseen_keys.remove(method_header) + + # save all res into the structured dict + cur_seed = best_run_cfg["seed"] + exp_name_current = f"{sorted_method_name_to_print[method_header]}_{data_name}_seed{cur_seed}" + for i, metric in enumerate(column_names_generalization): + all_res_structed[exp_name_current][metric] = res_all_generalization[i] + for i, metric in enumerate(column_names_efficiency): + all_res_structed[exp_name_current][metric] = res_all_efficiency[i] + for i, metric in enumerate(column_names_fair): + all_res_structed[exp_name_current][metric] = res_all_fair[i] + + # save config + parent_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") + best_cfg_dir = os.path.join(parent_dir, "yaml_best_runs") + os.makedirs(best_cfg_dir, exist_ok=True) + yaml_f_name = f"{sorted_method_name_to_print[method_header]}_{data_name}.yaml" + with open(os.path.join(best_cfg_dir, yaml_f_name), 'w') as yml_f: + yaml.dump(best_run_cfg, yml_f, allow_unicode=True) + + if method_header not in res_of_each_line_generalization: + res_of_each_line_generalization[method_header] = res_all_generalization + 
res_of_each_line_fair[method_header] = res_all_fair + res_of_each_line_efficiency[method_header] = res_all_efficiency + else: + res_of_each_line_generalization[method_header].extend(res_all_generalization) + res_of_each_line_fair[method_header].extend(res_all_fair) + res_of_each_line_efficiency[method_header].extend(res_all_efficiency) + + for missing_header in unseen_keys: + print(f"the header is missing {missing_header} in dataset {data_name}") + if missing_header not in res_of_each_line_generalization: + res_of_each_line_generalization[missing_header] = ["-"] * 3 + res_of_each_line_fair[missing_header] = ["-"] * 3 + res_of_each_line_efficiency[missing_header] = ["-"] * 4 + else: + res_of_each_line_generalization[missing_header].extend(["-"] * 3) + res_of_each_line_fair[missing_header].extend(["-"] * 3) + res_of_each_line_efficiency[missing_header].extend(["-"] * 4) + + print("\n=============res_of_each_line [Generalization]===============" + ",".join( + list(filters_each_line_table.keys()))) + # Acc, Unseen-ACC, Delta + for key in sorted_method_name_to_print: + res_to_print = ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_generalization[key]] + res_to_print = [sorted_method_name_to_print[key]] + res_to_print + print(",".join(res_to_print)) + + print("\n=============res_of_each_line [Fairness]===============" + ",".join(list(filters_each_line_table.keys()))) + for key in sorted_method_name_to_print: + res_to_print = ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_fair[key]] + res_to_print = [sorted_method_name_to_print[key]] + res_to_print + print(",".join(res_to_print)) + print("\n=============res_of_each_line [All Efficiency]===============" + ",".join( + list(filters_each_line_table.keys()))) + # FLOPS, UPLOAD, DOWNLOAD + for key in sorted_method_name_to_print: + res_to_print = [str(v) for v in res_of_each_line_efficiency[key]] + res_to_print = [sorted_method_name_to_print[key]] + res_to_print + 
print(",".join(res_to_print)) + print("\n=============res_of_each_line [flops, communication, acc]===============" + ",".join( + list(filters_each_line_table.keys()))) + for key in sorted_method_name_to_print: + res_of_each_line_commu_acc_trade[key] = [] + dataset_num = 2 if "cola" in list(filters_each_line_table.keys()) else 3 + for i in range(dataset_num): + res_of_each_line_commu_acc_trade[key].extend( + [str(res_of_each_line_efficiency[key][i * 4])] + \ + [str(res_of_each_line_efficiency[key][i * 4 + 1])] + \ + ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_generalization[key][i * 3:i * 3 + 1]] + ) + + res_to_print = [str(v) for v in res_of_each_line_commu_acc_trade[key]] + res_to_print = [sorted_method_name_to_print[key]] + res_to_print + print(",".join(res_to_print)) + print("\n=============res_of_each_line [converge_round, acc]===============" + ",".join( + list(filters_each_line_table.keys()))) + for key in sorted_method_name_to_print: + res_of_each_line_conver_acc_trade[key] = [] + dataset_num = 2 if "cola" in list(filters_each_line_table.keys()) else 3 + for i in range(dataset_num): + res_of_each_line_conver_acc_trade[key].extend( + [str(res_of_each_line_efficiency[key][i * 4 + 3])] + \ + # [str(res_of_each_line_efficiency[key][i * 4 + 4])] + \ + ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_each_line_fair[key][i * 3:i * 3 + 1]] + ) + + res_to_print = [str(v) for v in res_of_each_line_conver_acc_trade[key]] + res_to_print = [sorted_method_name_to_print[key]] + res_to_print + print(",".join(res_to_print)) + # print("\n=============res_of_all_sweeps [Generalization]===============") + # for key in sorted(res_of_all_sweeps.keys()): + # res_to_print = ["{:.2f}".format(v * 100) if v != "-" else v for v in res_of_all_sweeps[key]] + # res_to_print = [key] + res_to_print + # print(",".join(res_to_print)) + # + + +print_table_datasets_list(filters_each_line_main_table) 
+print_table_datasets_list(filters_each_line_femnist_all_s) +print_table_datasets_list(filters_each_line_all_cifar10) +print_table_datasets_list(filters_each_line_all_nlp) +print_table_datasets_list(filters_each_line_all_graph) + +import json +with open('best_res_all_metric.json', 'w') as fp: + json.dump(all_res_structed, fp) + +for expname_tag in expected_expname_tag: + for metric in column_names_generalization + column_names_efficiency + column_names_fair: + if all_res_structed[expname_tag][metric] == "-": + print(f"Missing {expname_tag} for metric {metric}") + diff --git a/benchmark/pFL_Bench/yaml_best_runs_example/Ditto_FEMNIST-s02.yaml b/benchmark/pFL_Bench/yaml_best_runs_example/Ditto_FEMNIST-s02.yaml new file mode 100644 index 000000000..3d7864b5a --- /dev/null +++ b/benchmark/pFL_Bench/yaml_best_runs_example/Ditto_FEMNIST-s02.yaml @@ -0,0 +1,201 @@ +asyn: + min_received_num: 1 + min_received_rate: -1 + timeout: 3 + use: true +attack: + alpha_TV: 0.001 + alpha_prop_loss: 0 + attack_method: '' + attacker_id: -1 + classifier_PIA: randomforest + info_diff_type: l2 + inject_round: 0 + max_ite: 400 + reconstruct_lr: 0.01 + reconstruct_optim: Adam + target_label_ind: -1 +backend: torch +cfg_file: '' +criterion: + type: CrossEntropyLoss +data: + args: [] + batch_size: 32 + cSBM_phi: + - 0.5 + - 0.5 + - 0.5 + drop_last: false + graphsaint: + num_steps: 30 + walk_length: 2 + loader: '' + num_workers: 0 + pre_transform: [] + root: data/ + shuffle: true + sizes: + - 10 + - 5 + splits: + - 0.6 + - 0.2 + - 0.2 + splitter: '' + splitter_args: [] + subsample: 0.05 + target_transform: [] + transform: + - - ToTensor + - {} + - - Normalize + - mean: + - 0.1307 + std: + - 0.3081 + type: femnist +device: -1 +distribute: + use: false +early_stop: + delta: 0 + improve_indicator_mode: best + patience: 3 + the_smaller_the_better: true +eval: + best_res_update_round_wise_key: val_loss + freq: 10 + metrics: + - acc + - correct + monitoring: [] + report: + - weighted_avg + - avg + - 
fairness + - raw + save_data: false + split: + - test + - val +expname: Ditto_convnet2_on_femnist_lr0.05_lstep3_ +expname_tag: '' +federate: + batch_or_epoch: epoch + client_num: 200 + data_weighted_aggr: false + ignore_weight: false + join_in_info: [] + local_update_steps: 3 + make_global_eval: false + method: Ditto + mode: standalone + online_aggr: false + restore_from: '' + sample_client_num: 32 + sample_client_rate: 0.2 + save_to: '' + share_local_model: false + total_round_num: 1000 + unseen_clients_rate: 0.2 + use_ss: false +federate.local_update_steps: 3 +fedopt: + use: false +fedprox: + use: false +fedsageplus: + a: 1 + b: 1 + c: 1 + fedgen_epoch: 200 + gen_hidden: 128 + hide_portion: 0.5 + loc_epoch: 1 + num_pred: 5 +gcflplus: + EPS_1: 0.05 + EPS_2: 0.1 + seq_length: 5 + standardize: false +hpo: + fedex: + cutoff: 0 + diff: false + flatten_ss: true + gamma: 0 + num_arms: 16 + sched: auto + ss: '' + use: false + init_cand_num: 16 + init_strategy: random + larger_better: false + log_scale: false + metric: client_summarized_weighted_avg.test_loss + pbt: + max_stage: 5 + perf_threshold: 0.1 + plot_interval: 1 + scheduler: bruteforce + sha: + budgets: [] + elim_rate: 3 + elim_round_num: 3 + working_folder: hpo +model: + dropout: 0 + embed_size: 8 + graph_pooling: mean + hidden: 2048 + in_channels: 0 + layer: 2 + model_num_per_trainer: 1 + num_item: 0 + num_user: 0 + out_channels: 62 + task: node + type: convnet2 + use_bias: true +nbafl: + use: false +optimizer: + grad_clip: 5 + lr: 0.05 + momentum: 0 + type: SGD + weight_decay: 0 +optimizer.lr: 0.05 +outdir: exp_pfl_bench/Ditto_convnet2_on_femnist_lr0.05_lstep3_/sub_exp_20220523003100 +personalization: + K: 5 + beta: 1 + local_param: [] + local_update_steps: 1 + lr: 0.01 + regular_weight: 0.2 + share_non_trainable_para: false +personalization.regular_weight: 0.2 +regularizer: + mu: 0 + type: '' +seed: 1 +sgdmf: + use: false +trainer: + finetune: + before_eval: false + freeze_param: '' + lr: 0.01 + steps: 5 + 
type: cvtrainer +use_gpu: true +verbose: 1 +vertical: + use: false +wandb: + name_project: pFL-bench + name_user: daoyuan + online_track: true + use: true diff --git a/benchmark/pFL_Bench/yaml_best_runs_example/FedAvg-FT_FEMNIST-s02.yaml b/benchmark/pFL_Bench/yaml_best_runs_example/FedAvg-FT_FEMNIST-s02.yaml new file mode 100644 index 000000000..041646c69 --- /dev/null +++ b/benchmark/pFL_Bench/yaml_best_runs_example/FedAvg-FT_FEMNIST-s02.yaml @@ -0,0 +1,200 @@ +asyn: + min_received_num: 1 + min_received_rate: -1 + timeout: 3 + use: true +attack: + alpha_TV: 0.001 + alpha_prop_loss: 0 + attack_method: '' + attacker_id: -1 + classifier_PIA: randomforest + info_diff_type: l2 + inject_round: 0 + max_ite: 400 + reconstruct_lr: 0.01 + reconstruct_optim: Adam + target_label_ind: -1 +backend: torch +cfg_file: '' +criterion: + type: CrossEntropyLoss +data: + args: [] + batch_size: 32 + cSBM_phi: + - 0.5 + - 0.5 + - 0.5 + drop_last: false + graphsaint: + num_steps: 30 + walk_length: 2 + loader: '' + num_workers: 0 + pre_transform: [] + root: data/ + shuffle: true + sizes: + - 10 + - 5 + splits: + - 0.6 + - 0.2 + - 0.2 + splitter: '' + splitter_args: [] + subsample: 0.05 + target_transform: [] + transform: + - - ToTensor + - {} + - - Normalize + - mean: + - 0.1307 + std: + - 0.3081 + type: femnist +device: -1 +distribute: + use: false +early_stop: + delta: 0 + improve_indicator_mode: best + patience: 3 + the_smaller_the_better: true +eval: + best_res_update_round_wise_key: val_loss + freq: 10 + metrics: + - acc + - correct + monitoring: [] + report: + - weighted_avg + - avg + - fairness + - raw + save_data: false + split: + - test + - val +expname: FedAvg_convnet2_on_femnist_lr0.1_lstep1_finetune +expname_tag: finetune +federate: + batch_or_epoch: epoch + client_num: 200 + data_weighted_aggr: false + ignore_weight: false + join_in_info: [] + local_update_steps: 1 + make_global_eval: false + method: FedAvg + mode: standalone + online_aggr: false + restore_from: '' + 
sample_client_num: 32 + sample_client_rate: 0.2 + save_to: '' + share_local_model: false + total_round_num: 1000 + unseen_clients_rate: 0.2 + use_ss: false +federate.local_update_steps: 1 +fedopt: + use: false +fedprox: + use: false +fedsageplus: + a: 1 + b: 1 + c: 1 + fedgen_epoch: 200 + gen_hidden: 128 + hide_portion: 0.5 + loc_epoch: 1 + num_pred: 5 +gcflplus: + EPS_1: 0.05 + EPS_2: 0.1 + seq_length: 5 + standardize: false +hpo: + fedex: + cutoff: 0 + diff: false + flatten_ss: true + gamma: 0 + num_arms: 16 + sched: auto + ss: '' + use: false + init_cand_num: 16 + init_strategy: random + larger_better: false + log_scale: false + metric: client_summarized_weighted_avg.test_loss + pbt: + max_stage: 5 + perf_threshold: 0.1 + plot_interval: 1 + scheduler: bruteforce + sha: + budgets: [] + elim_rate: 3 + elim_round_num: 3 + working_folder: hpo +model: + dropout: 0 + embed_size: 8 + graph_pooling: mean + hidden: 2048 + in_channels: 0 + layer: 2 + model_num_per_trainer: 1 + num_item: 0 + num_user: 0 + out_channels: 62 + task: node + type: convnet2 + use_bias: true +nbafl: + use: false +optimizer: + grad_clip: 5 + lr: 0.1 + momentum: 0 + type: SGD + weight_decay: 0 +optimizer.lr: 0.1 +outdir: exp_pfl_bench/FedAvg_convnet2_on_femnist_lr0.1_lstep1_finetune +personalization: + K: 5 + beta: 1 + local_param: [] + local_update_steps: 1 + lr: 0.01 + regular_weight: 0.1 + share_non_trainable_para: false +regularizer: + mu: 0 + type: '' +seed: 1 +sgdmf: + use: false +trainer: + finetune: + before_eval: true + freeze_param: '' + lr: 0.01 + steps: 5 + type: cvtrainer +use_gpu: true +verbose: 1 +vertical: + use: false +wandb: + name_project: pFL-bench + name_user: daoyuan + online_track: true + use: true diff --git a/benchmark/pFL_Bench/yaml_best_runs_example/FedEM-FedBN-FedOPT-FT_FEMNIST-s02.yaml b/benchmark/pFL_Bench/yaml_best_runs_example/FedEM-FedBN-FedOPT-FT_FEMNIST-s02.yaml new file mode 100644 index 000000000..7a09cceb9 --- /dev/null +++ 
b/benchmark/pFL_Bench/yaml_best_runs_example/FedEM-FedBN-FedOPT-FT_FEMNIST-s02.yaml @@ -0,0 +1,207 @@ +asyn: + min_received_num: 1 + min_received_rate: -1 + timeout: 3 + use: true +attack: + alpha_TV: 0.001 + alpha_prop_loss: 0 + attack_method: '' + attacker_id: -1 + classifier_PIA: randomforest + info_diff_type: l2 + inject_round: 0 + max_ite: 400 + reconstruct_lr: 0.01 + reconstruct_optim: Adam + target_label_ind: -1 +backend: torch +cfg_file: '' +criterion: + type: CrossEntropyLoss +data: + args: [] + batch_size: 32 + cSBM_phi: + - 0.5 + - 0.5 + - 0.5 + drop_last: false + graphsaint: + num_steps: 30 + walk_length: 2 + loader: '' + num_workers: 0 + pre_transform: [] + root: data/ + server_holds_all: false + shuffle: true + sizes: + - 10 + - 5 + splits: + - 0.6 + - 0.2 + - 0.2 + splitter: '' + splitter_args: [] + subsample: 0.05 + target_transform: [] + transform: + - - ToTensor + - {} + - - Normalize + - mean: + - 0.1307 + std: + - 0.3081 + type: femnist +device: -1 +distribute: + use: false +early_stop: + delta: 0 + improve_indicator_mode: best + patience: 5 + the_smaller_the_better: true +eval: + best_res_update_round_wise_key: val_loss + freq: 10 + metrics: + - acc + - correct + monitoring: [] + report: + - weighted_avg + - avg + - fairness + - raw + save_data: false + split: + - test + - val +expname: FedEM_convnet2_on_femnist_lr0.05_lstep1_+fedBN+finetune+fedOpt +expname_tag: +fedBN+finetune+fedOpt +federate: + batch_or_epoch: epoch + client_num: 200 + data_weighted_aggr: false + ignore_weight: false + join_in_info: [] + local_update_steps: 1 + make_global_eval: false + method: FedEM + mode: standalone + online_aggr: false + restore_from: '' + sample_client_num: 32 + sample_client_rate: 0.2 + save_to: '' + share_local_model: false + total_round_num: 1000 + unseen_clients_rate: 0.2 + use_ss: false +federate.local_update_steps: 1 +fedopt: + lr_server: 0.5 + type_optimizer: SGD + use: true +fedopt.lr_server: 0.5 +fedprox: + use: false +fedsageplus: + a: 1 + b: 
1 + c: 1 + fedgen_epoch: 200 + gen_hidden: 128 + hide_portion: 0.5 + loc_epoch: 1 + num_pred: 5 +gcflplus: + EPS_1: 0.05 + EPS_2: 0.1 + seq_length: 5 + standardize: false +hpo: + fedex: + cutoff: 0 + diff: false + flatten_ss: true + gamma: 0 + num_arms: 16 + sched: auto + ss: '' + use: false + init_cand_num: 16 + init_strategy: random + larger_better: false + log_scale: false + metric: client_summarized_weighted_avg.test_loss + pbt: + max_stage: 5 + perf_threshold: 0.1 + plot_interval: 1 + scheduler: bruteforce + sha: + budgets: [] + elim_rate: 3 + elim_round_num: 3 + working_folder: hpo +model: + dropout: 0 + embed_size: 8 + graph_pooling: mean + hidden: 2048 + in_channels: 0 + layer: 2 + model_num_per_trainer: 3 + num_item: 0 + num_user: 0 + out_channels: 62 + task: node + type: convnet2 + use_bias: true +nbafl: + use: false +optimizer: + grad_clip: 5 + lr: 0.05 + momentum: 0 + type: SGD + weight_decay: 0 +optimizer.lr: 0.05 +outdir: exp_pfl_bench/FedEM_convnet2_on_femnist_lr0.05_lstep1_+fedBN+finetune+fedOpt/sub_exp_20220531144907 +personalization: + K: 5 + apfl_alpha: 0 + beta: 1 + local_param: + - bn + - norms + local_update_steps: 1 + lr: 0.01 + regular_weight: 0.1 + share_non_trainable_para: false +regularizer: + mu: 0 + type: '' +seed: 1 +sgdmf: + use: false +trainer: + finetune: + before_eval: true + freeze_param: '' + lr: 0.01 + steps: 5 + type: cvtrainer +use_gpu: true +verbose: 1 +vertical: + use: false +wandb: + name_project: pFL-bench + name_user: daoyuan + online_track: true + use: true