Merge pull request #2088 from FedML-AI/alexleung/dev_v070_for_refactor

Alexleung/dev v070 for refactor
FedML-AI · May 12, 2024 · dec0f51 · dec0f51
2 parents a35d433 + c81073e
commit dec0f51
Show file tree

Hide file tree

Showing 70 changed files with 3,378 additions and 1,210 deletions.
diff --git a/devops/dockerfile/device-image/Dockerfile-Dev b/devops/dockerfile/device-image/Dockerfile-Dev
@@ -19,6 +19,8 @@ RUN pip3 install -e ./
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'
 RUN pip3 install MNN==1.1.6
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 
 WORKDIR /fedml
 

diff --git a/devops/dockerfile/device-image/Dockerfile-Release b/devops/dockerfile/device-image/Dockerfile-Release
@@ -19,6 +19,8 @@ RUN pip3 install -e ./
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'
 RUN pip3 install MNN==1.1.6
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 
 WORKDIR /fedml
 

diff --git a/devops/dockerfile/device-image/Dockerfile-Test b/devops/dockerfile/device-image/Dockerfile-Test
@@ -19,6 +19,8 @@ RUN pip3 install -e ./
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'
 RUN pip3 install MNN==1.1.6
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 
 WORKDIR /fedml
 

diff --git a/devops/dockerfile/server-agent/Dockerfile-Dev b/devops/dockerfile/server-agent/Dockerfile-Dev
@@ -15,6 +15,8 @@ RUN pip3 install -r ./fedml/requirements.txt
 COPY ./python ./fedml/fedml-pip
 WORKDIR ./fedml/fedml-pip
 RUN pip3 install -e ./
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 #RUN pip3 install -e '.[tensorflow]'
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'

diff --git a/devops/dockerfile/server-agent/Dockerfile-Release b/devops/dockerfile/server-agent/Dockerfile-Release
@@ -15,6 +15,8 @@ RUN pip3 install -r ./fedml/requirements.txt
 COPY ./python ./fedml/fedml-pip
 WORKDIR ./fedml/fedml-pip
 RUN pip3 install -e ./
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 #RUN pip3 install -e '.[tensorflow]'
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'

diff --git a/devops/dockerfile/server-agent/Dockerfile-Test b/devops/dockerfile/server-agent/Dockerfile-Test
@@ -15,6 +15,8 @@ RUN pip3 install -r ./fedml/requirements.txt
 COPY ./python ./fedml/fedml-pip
 WORKDIR ./fedml/fedml-pip
 RUN pip3 install -e ./
+RUN pip3 install -U pydantic
+RUN pip3 install -U fastapi
 #RUN pip3 install -e '.[tensorflow]'
 #RUN pip3 install -e '.[jax]'
 #RUN pip3 install -e '.[mxnet]'

diff --git a/python/examples/deploy/dummy_failed/dummy_failed_scale_out/config.yaml b/python/examples/deploy/dummy_failed/dummy_failed_scale_out/config.yaml
@@ -0,0 +1,16 @@
+workspace: "./src"
+
+inference_image_name: "raphaeljin/fedml"
+enable_custom_image: true
+
+bootstrap: |
+  echo "Bootstrap start..."
+  cat serve_main.py
+  echo "Bootstrap finished"
+
+# Simulate a successful deployment
+job: |
+  python3 serve_main.py
+
+auto_detect_public_ip: true
+use_gpu: true
diff --git a/python/examples/deploy/dummy_failed/dummy_failed_scale_out/src/serve_main.py b/python/examples/deploy/dummy_failed/dummy_failed_scale_out/src/serve_main.py
@@ -0,0 +1,39 @@
+import os
+
+from fedml.serving import FedMLPredictor
+from fedml.serving import FedMLInferenceRunner
+import uuid
+import torch
+
+# Calculate the number of elements
+num_elements = 1_073_741_824 // 4  # using integer division for whole elements
+
+
+class DummyPredictor(FedMLPredictor):
+    def __init__(self):
+        super().__init__()
+        # Create a tensor with these many elements
+        tensor = torch.empty(num_elements, dtype=torch.float32)
+
+        # Move the tensor to GPU
+        tensor_gpu = tensor.cuda()
+
+        # for debug
+        with open("/tmp/dummy_gpu_occupier.txt", "w") as f:
+            f.write("GPU is occupied")
+
+        if os.getenv("FEDML_REPLICA_RANK") == "2":
+            # Simulate a failure
+            raise Exception("Simulated failure")
+            exit(1)
+
+        self.worker_id = uuid.uuid4()
+
+    def predict(self, request):
+        return {f"AlohaV0From{self.worker_id}": request}
+
+
+if __name__ == "__main__":
+    predictor = DummyPredictor()
+    fedml_inference_runner = FedMLInferenceRunner(predictor)
+    fedml_inference_runner.run()
diff --git a/python/examples/federate/cross_silo/mpi_fedavg_mnist_lr_example/config/fedml_config.yaml b/python/examples/federate/cross_silo/mpi_fedavg_mnist_lr_example/config/fedml_config.yaml
@@ -13,7 +13,6 @@ model_args:
 
 train_args:
   federated_optimizer: "FedOpt"
-  client_optimizer:
   server_optimizer: "FedOpt"
   client_id_list:
   client_num_in_total: 1000

diff --git a/python/examples/federate/cross_silo/mpi_fedavg_mnist_lr_example/mpi_host_file b/python/examples/federate/cross_silo/mpi_fedavg_mnist_lr_example/mpi_host_file
@@ -1 +1 @@
-fedml-a6000-node-1
+Dimitris-FedML.local
diff --git a/...ples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model/run_all.sh b/...ples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model/run_all.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+set -e
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/
+
+# The name of the current run.
+RUN_ID=$1
+if [ -z "${RUN_ID}" ]; then
+  echo "Need to provide the id of the run as a string."
+  exit 1
+fi
+
+# The number of workers.
+WORKER_NUM=$2
+if [ -z "${WORKER_NUM}" ]; then
+  echo "Need to provide the number of workers you want to run the experiment for."
+  exit 1
+fi
+
+# Spawn server process.
+echo "Starting server"
+python3 torch_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id $RUN_ID &
+sleep 3  # Sleep for 3s to give the server enough time to start
+
+# Spawn client(s) process.
+# Change the number next to seq for spawning more than 1 clients.
+for i in `seq $WORKER_NUM`; do
+    echo "Starting client $i"
+    python3 torch_client.py --cf config/fedml_config.yaml --rank $i --role client --run_id $RUN_ID &
+done
+
+# Enable CTRL+C to stop all background processes
+trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM
+
+# Wait for all background processes to complete
+wait
diff --git a/python/examples/launch/dump.rdb b/python/examples/launch/dump.rdb
diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py
@@ -34,7 +34,7 @@
 _global_training_type = None
 _global_comm_backend = None
 
-__version__ = "0.8.29.dev10"
+__version__ = "0.8.30"
 
 
 # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release

diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py
@@ -179,13 +179,12 @@ def cluster_killall(api_key=None) -> bool:
     return cluster.kill(cluster_names=(), api_key=api_key)
 
 
-def upload(data_path, api_key=None, service="R2", name=None, description=None, metadata=None, show_progress=False,
+def upload(data_path, api_key=None, tag_list=[], service="R2", name=None, description=None, metadata=None, show_progress=False,
            out_progress_to_err=True, progress_desc=None) -> FedMLResponse:
-    return storage.upload(data_path=data_path, api_key=api_key, name=name, description=description,
+    return storage.upload(data_path=data_path, api_key=api_key, name=name, description=description, tag_list =tag_list,
                           service=service, progress_desc=progress_desc, show_progress=show_progress,
                           out_progress_to_err=out_progress_to_err, metadata=metadata)
 
-
 def get_storage_user_defined_metadata(data_name, api_key=None) -> FedMLResponse:
     return storage.get_user_metadata(data_name=data_name, api_key=api_key)
 
@@ -194,7 +193,7 @@ def get_storage_metadata(data_name, api_key=None) -> FedMLResponse:
     return storage.get_metadata(api_key=api_key, data_name=data_name)
 
 
-def list_storage_obects(api_key=None) -> FedMLResponse:
+def list_storage_objects(api_key=None) -> FedMLResponse:
     return storage.list_objects(api_key=api_key)
 
 

diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py
@@ -10,28 +10,31 @@
 from fedml.api.fedml_response import FedMLResponse, ResponseCode
 
 
-# Todo (alaydshah): Add file size
 class StorageMetadata(object):
     def __init__(self, data: dict):
         self.dataName = data.get("datasetName", None)
         self.description = data.get("description", None)
         self.tags = data.get("description", None)
         self.createdAt = data.get("createTime", None)
         self.updatedAt = data.get("updateTime", None)
+        self.size = _get_size(data.get("fileSize",None))
+        self.tag_list = data.get("tags", None)
 
 
-# Todo (alaydshah): Add file size while creating objects. Store service name in metadata
+# Todo (alaydshah): Store service name in metadata
 # Todo (alaydshah): If data already exists, don't upload again. Instead suggest to use update command
 
-
-def upload(data_path, api_key, name, description, service, show_progress, out_progress_to_err, progress_desc,
+def upload(data_path, api_key, name, description, tag_list, service, show_progress, out_progress_to_err, progress_desc,
            metadata) -> FedMLResponse:
     api_key = authenticate(api_key)
 
     user_id, message = _get_user_id_from_api_key(api_key)
 
     if user_id is None:
         return FedMLResponse(code=ResponseCode.FAILURE, message=message)
+
+    if(not _check_data_path(data_path)):
+        return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path")
 
     archive_path, message = _archive_data(data_path)
     if not archive_path:
@@ -41,6 +44,7 @@ def upload(data_path, api_key, name, description, service, show_progress, out_pr
     name = os.path.splitext(os.path.basename(archive_path))[0] if name is None else name
     file_name = name + ".zip"
     dest_path = os.path.join(user_id, file_name)
+    file_size = os.path.getsize(archive_path)
 
     file_uploaded_url = store.upload_file_with_progress(src_local_path=archive_path, dest_s3_path=dest_path,
                                                         show_progress=show_progress,
@@ -53,9 +57,9 @@ def upload(data_path, api_key, name, description, service, show_progress, out_pr
     json_data = {
         "datasetName": name,
         "description": description,
-        "fileSize": "",
+        "fileSize": file_size,
         "fileUrl": file_uploaded_url,
-        "tagNameList": [],
+        "tagNameList": tag_list,
     }
 
     try:
@@ -228,6 +232,11 @@ def _get_storage_service(service):
     else:
         raise NotImplementedError(f"Service {service} not implemented")
 
+def _check_data_path(data_path):
+    if os.path.isdir(data_path) or os.path.isfile(data_path):
+        return True
+    return False
+
 
 def _archive_data(data_path: str) -> (str, str):
     src_local_path = os.path.abspath(data_path)
@@ -350,3 +359,20 @@ def _get_data_from_response(message: str, response: requests.Response) -> (Respo
             return ResponseCode.FAILURE, message, None
 
     return ResponseCode.SUCCESS, "Successfully parsed data from response", data
+
+def _get_size(size_in_bytes:str)->str:
+    size_str = ""
+    if(size_in_bytes):
+        size = int(size_in_bytes)
+        size_in_gb = size / (1024 * 1024 * 1024)
+        size_in_mb = size / (1024 * 1024)
+        size_in_kb = size / 1024
+        if(size_in_gb >= 1):
+            size_str = f"{size_in_gb:.2f} GB"
+        elif(size_in_mb >= 1):
+            size_str = f"{size_in_mb:.2f} MB"
+        elif(size_in_kb >= 1):
+            size_str = f"{size_in_kb:.2f} KB"
+        else:
+            size_str = f"{size} B"
+    return size_str 
diff --git a/python/fedml/cli/modules/storage.py b/python/fedml/cli/modules/storage.py
@@ -50,8 +50,10 @@ def validate_argument(ctx, param, value):
                                              "the data file or directory name.")
 @click.option("--description", "-d", type=str, help="Add description to your data to store. If not provided, "
                                                     "the description will be empty.")
-@click.option("--user_metadata", "-um", type=str, help="User-defined metadata in the form of a name-value (key-value) "
-                                                       "pair. Defaults to None.")
+@click.option("--user_metadata", "-um", type=str, help="User-defined metadata in the form of a dictionary, for instance, "
+                                                       " {'name':'value'} within double quotes. "" "
+                                                       "Defaults to None.")
+@click.option("--tags", "-t", type=str, help="Add tags to your data to store. Give tags in comma separated form like 'cv,unet,segmentation' If not provided, the tags will be empty.")
 @click.option('--service', "-s", type=click.Choice(['R2']), default="R2", help="Storage service for object storage. "
                                                                                "Only R2 is supported as of now")
 @click.option(
@@ -64,10 +66,11 @@ def validate_argument(ctx, param, value):
     default="release",
     help=version_help,
 )
-def upload(data_path: str, name: str, user_metadata: str, description: str, version: str, api_key: str, service):
+def upload(data_path: str, name: str, user_metadata: str, description: str, version: str, api_key: str, tags:str, service):
     metadata = _parse_metadata(user_metadata)
+    tag_list = _parse_tags(tags)
     fedml.set_env_version(version)
-    response = fedml.api.upload(data_path=data_path, api_key=api_key, name=name, service=service, show_progress=True,
+    response = fedml.api.upload(data_path=data_path, api_key=api_key, name=name, tag_list = tag_list, service=service, show_progress=True,
                                 description=description, metadata=metadata)
     if response.code == ResponseCode.SUCCESS:
         click.echo(f"Data uploaded successfully. | url: {response.data}")
@@ -89,16 +92,16 @@ def upload(data_path: str, name: str, user_metadata: str, description: str, vers
 )
 def list_data(version, api_key):
     fedml.set_env_version(version)
-    response = fedml.api.list_storage_obects(api_key=api_key)
+    response = fedml.api.list_storage_objects(api_key=api_key)
     if response.code == ResponseCode.SUCCESS:
         click.echo(f"Successfully fetched list of stored objects:")
         if not response.data:
             click.echo(f"No stored objects found for account linked with apikey: {api_key}")
             return
-        object_list_table = PrettyTable(["Data Name", "Description", "Created At", "Updated At"])
+        object_list_table = PrettyTable(["Data Name", "Data Size", "Description", "Data Tags","Created At", "Updated At"])
         for stored_object in response.data:
             object_list_table.add_row(
-                [stored_object.dataName, stored_object.description, stored_object.createdAt, stored_object.updatedAt])
+                [stored_object.dataName, stored_object.size, stored_object.description, stored_object.tag_list,stored_object.createdAt, stored_object.updatedAt])
         click.echo(object_list_table)
     else:
         click.echo(f"Failed to list stored objects for account linked with apikey {api_key}. "
@@ -156,8 +159,8 @@ def get_metadata(data_name, version, api_key):
             return
         click.echo(f"Successfully fetched metadata for object {data_name}:")
         # Todo (alaydshah): Add file size and tags
-        metadata_table = PrettyTable(["Data Name", "Description", "Created At", "Updated At"])
-        metadata_table.add_row([metadata.dataName, metadata.description, metadata.createdAt, metadata.updatedAt])
+        metadata_table = PrettyTable(["Data Name","Data Size","Description","Data Tags","Created At", "Updated At"])
+        metadata_table.add_row([metadata.dataName,metadata.size,metadata.description,metadata.tag_list,metadata.createdAt, metadata.updatedAt])
         click.echo(metadata_table)
         click.echo("")
     else:
@@ -237,3 +240,9 @@ def _parse_metadata(metadata: str):
         click.echo(
             f"Input metadata cannot be evaluated. Please make sure metadata is in the correct format. Error: {e}.")
         exit()
+
+def _parse_tags(tags:str):
+    if not tags:
+        return []
+    tag_list = tags.split(",")
+    return tag_list