Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test/v0.7.0 #1876

Merged
merged 7 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions devops/dockerfile/device-image/Dockerfile-Local
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Local-build variant of the FedML device (cloud server) image.
# Build context is the repository root; the local ./python source tree is
# installed in editable mode instead of a released pip package.
ARG VERSION=local
# NOTE(review): IS_BUILDING_GPU_IMAGE is declared but never referenced below — confirm it is still needed.
ARG IS_BUILDING_GPU_IMAGE=0
ARG BASE_IMAGE=docker.io/fedml/fedml-device-image:base
FROM ${BASE_IMAGE}

# Runner entry script and pip requirements copied from the devops scripts dir.
ADD ./devops/scripts/runner.sh ./fedml/runner.sh

ADD ./devops/scripts/requirements.txt ./fedml/requirements.txt

RUN chmod a+x ./fedml/runner.sh
RUN echo "Updating..."

RUN pip3 install -r ./fedml/requirements.txt

# Install the local FedML source tree in editable mode (dev workflow).
COPY ./python ./fedml/fedml-pip
WORKDIR ./fedml/fedml-pip
RUN pip3 install -e ./
#RUN pip3 install -e '.[tensorflow]'
#RUN pip3 install -e '.[jax]'
#RUN pip3 install -e '.[mxnet]'
RUN pip3 install MNN==1.1.6

WORKDIR /fedml

# Runtime defaults; ACCOUNT_ID and SERVER_DEVICE_ID are expected to be
# overridden at `docker run` time.
# NOTE(review): FEDML_RUNNER_CMD default "3dsad" looks like a placeholder — confirm intended value.
# FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST points the container at the on-premise
# platform on the Docker host.
ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_DEVICE_ID=0 \
FEDML_PACKAGE_NAME=package FEDML_PACKAGE_URL=s3_url \
FEDML_RUNNER_CMD=3dsad \
FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST=host.docker.internal

# Start redis, log the server daemon in as a cloud server, then hand off to the runner loop.
CMD bash ./start-redis.sh; python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_server -rc ${FEDML_RUNNER_CMD} -id ${SERVER_DEVICE_ID}; bash ./runner.sh
30 changes: 30 additions & 0 deletions devops/dockerfile/server-agent/Dockerfile-Local
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Local-build variant of the FedML server-agent (cloud agent) image.
# Shares the device base image; installs the local ./python source tree in
# editable mode instead of a released pip package.
ARG VERSION=local
# NOTE(review): IS_BUILDING_GPU_IMAGE is declared but never referenced below — confirm it is still needed.
ARG IS_BUILDING_GPU_IMAGE=0
ARG BASE_IMAGE=docker.io/fedml/fedml-device-image:base
FROM ${BASE_IMAGE}

# Runner entry script and pip requirements copied from the devops scripts dir.
ADD ./devops/scripts/runner.sh ./fedml/runner.sh

ADD ./devops/scripts/requirements.txt ./fedml/requirements.txt

RUN chmod a+x ./fedml/runner.sh
RUN echo "Updating..."

RUN pip3 install -r ./fedml/requirements.txt

# Install the local FedML source tree in editable mode (dev workflow).
COPY ./python ./fedml/fedml-pip
WORKDIR ./fedml/fedml-pip
RUN pip3 install -e ./
#RUN pip3 install -e '.[tensorflow]'
#RUN pip3 install -e '.[jax]'
#RUN pip3 install -e '.[mxnet]'

WORKDIR /fedml

# Runtime defaults; ACCOUNT_ID, SERVER_AGENT_ID and the AWS credentials are
# expected to be overridden at `docker run` time.
# FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST points the container at the on-premise
# platform on the Docker host.
ENV MODE=normal FEDML_VERSION=${VERSION} ACCOUNT_ID=0 SERVER_AGENT_ID=0 \
AWS_IAM_ACCESS_ID=0 \
AWS_IAM_ACCESS_KEY=0 \
AWS_REGION=0 \
FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST=host.docker.internal

# Start redis, install AWS credentials, log the server daemon in as a cloud agent,
# then hand off to the runner loop.
CMD bash ./start-redis.sh; ./set-aws-credentials.sh ${AWS_IAM_ACCESS_ID} ${AWS_IAM_ACCESS_KEY} ${AWS_REGION};python3 ./fedml-pip/fedml/computing/scheduler/master/server_daemon.py -t login -u ${ACCOUNT_ID} -k ${ACCOUNT_ID} -v ${FEDML_VERSION} -r cloud_agent -id ${SERVER_AGENT_ID};bash ./runner.sh
7 changes: 7 additions & 0 deletions devops/scripts/build-push-fedml-cloud-image.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

version=$1
image_version=$2

if [ ${version} == "dev" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Dev -t fedml/fedml-device-image:${version} .
Expand All @@ -24,5 +25,11 @@ elif [ ${version} == "release" ]; then

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Release -t fedml/fedml-server-agent:${version} .
docker push fedml/fedml-server-agent:${version}
elif [ ${version} == "local" ]; then
docker build --network=host -f ./devops/dockerfile/device-image/Dockerfile-Local -t fedml/fedml-device-image:${version} .
docker push fedml/fedml-device-image:${version}

docker build --network=host -f ./devops/dockerfile/server-agent/Dockerfile-Local -t fedml/fedml-server-agent:${image_version} .
docker push fedml/fedml-server-agent:${image_version}
fi

3 changes: 3 additions & 0 deletions python/examples/launch/hello_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ job_type: train # options: train, deploy, federate
# deploy subtype: none
job_subtype: generate_training

# containerize
containerize: false

# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
bootstrap: |
Expand Down
78 changes: 78 additions & 0 deletions python/examples/launch/hello_job_with_container.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Example launch job that runs inside a user-specified Docker container
# (see the `docker:` section below).

# Local directory where your source code resides.
# It should be the relative path to this job yaml file or the absolute path.
# If your job doesn't contain any source code, it can be empty.
workspace: hello_world

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
# Otherwise, you should exit with a zero code, e.g. exit 0.
# Support multiple lines, which can not be empty.
job: |
  echo "current job id: $FEDML_CURRENT_RUN_ID"
  echo "current edge id: $FEDML_CURRENT_EDGE_ID"
  echo "Hello, Here is the launch platform."
  echo "Current directory is as follows."
  pwd
  python3 hello_world.py
  #sleep 20
  #exit 1
  #echo "Current GPU information is as follows."
  #nvidia-smi # Print GPU information
  #gpustat
  #echo "Download the file from http://212.183.159.230/200MB.zip ..."
  #wget http://212.183.159.230/200MB.zip
  #rm ./200MB.zip*
  #echo "The downloading task has finished."
  # echo "Training the vision transformer model using PyTorch..."
  # python vision_transformer.py --epochs 1

# If you want to use the job created by the MLOps platform,
# just uncomment the following three, then set job_id and config_id to your desired job id and related config.
#job_args:
# job_id: 2070
# config_id: 111

# If you want to create the job with specific name, just uncomment the following line and set job_name to your desired job name
#job_name: cv_job

job_type: train # options: train, deploy, federate

# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training
# federate subtype: cross_silo, simulation, web, smart_phone
# deploy subtype: none
job_subtype: generate_training

# Container image the job runs in; registry credentials and port mappings
# are optional overrides.
docker:
  image: fedml/fedml-default-launch:cu12.1-u22.04
  #registry: docker.io
  #username: my_hub_user
  #password: my_hub_password
  #ports: [30001,3002,3003]


# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
bootstrap: |
  # pip install -r requirements.txt
  echo "Bootstrap finished."

# Resource requirements for provisioning.
computing:
  minimum_num_gpus: 1 # minimum # of GPUs to provision
  maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
  #allow_cross_cloud_resources: true # true, false
  #device_type: CPU # options: GPU, CPU, hybrid
  resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

# Arguments forwarded to the job (dataset, model, hyper-parameters).
data_args:
  dataset_name: mnist
  dataset_path: ./dataset
  dataset_type: csv

model_args:
  input_dim: '784'
  model_cache_path: /Users/alexliang/fedml_models
  model_name: lr
  output_dim: '10'

training_params:
  learning_rate: 0.004
2 changes: 1 addition & 1 deletion python/examples/launch/hello_world/hello_world.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
for iter_count in range(10):
acc += 0.01
loss -= 0.02
fedml.log_endpoint({"acc": acc, "loss": loss})
#fedml.log_endpoint({"acc": acc, "loss": loss})
time.sleep(0.1)

artifact = fedml.mlops.Artifact(name=f"general-file@{run_id}-{edge_id}", type=fedml.mlops.ARTIFACT_TYPE_NAME_GENERAL)
Expand Down
12 changes: 10 additions & 2 deletions python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18b7"
__version__ = "0.8.18a8"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down Expand Up @@ -458,7 +458,7 @@ def _get_backend_service():
# caller = getframeinfo(stack()[1][0])
# print(f"{caller.filename}:{caller.lineno} - _get_backend_service. version = {version}")
if version == "local":
return FEDML_BACKEND_SERVICE_URL_LOCAL
return f"http://{get_local_on_premise_platform_host()}:18080"
elif version == "dev":
return FEDML_BACKEND_SERVICE_URL_DEV
elif version == "test":
Expand All @@ -482,6 +482,14 @@ def _get_mqtt_service():
return FEDML_MQTT_DOMAIN_RELEASE


def set_local_on_premise_platform_host(local_on_premise_platform_host):
    """Record the on-premise Nexus AI platform host/IP in the process environment.

    The value is later read back (e.g. when the env version is "local") via
    get_local_on_premise_platform_host().
    """
    env_key = "FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"
    os.environ[env_key] = local_on_premise_platform_host


def get_local_on_premise_platform_host():
    """Return the host/IP of the local on-premise Nexus AI platform.

    Reads the FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST environment variable.
    Falls back to "127.0.0.1" (the CLI's --local_on_premise_platform default)
    instead of raising KeyError when the variable was never set — e.g. when
    the API is used with the "local" version without going through
    `fedml login` or set_local_on_premise_platform_host().
    """
    return os.environ.get("FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST", "127.0.0.1")


def _get_local_s3_like_service_url():
return FEDML_S3_DOMAIN_LOCAL

Expand Down
10 changes: 9 additions & 1 deletion python/fedml/cli/modules/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,16 @@
"--deploy_worker_num", "-dpn", default=1, type=int,
help="Deploy worker number will be started when logged in successfully.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num):
@click.option(
"--local_on_premise_platform",
"-lp",
type=str,
default="127.0.0.1",
help="The IP address for local on-premise Nexus AI Platform.",
)
def fedml_login(api_key, version, compute_node, server, provider, deploy_worker_num, local_on_premise_platform):
fedml.set_env_version(version)
fedml.set_local_on_premise_platform_host(local_on_premise_platform)

api_key = api_key[0] if len(api_key) > 0 else None
try:
Expand Down
6 changes: 5 additions & 1 deletion python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager
from dataclasses import dataclass, field, fields
from fedml.core.common.singleton import Singleton
from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
from typing import List
import threading
import json

run_docker_without_gpu = False


@dataclass
class DockerArgs:
Expand Down Expand Up @@ -407,7 +410,8 @@ def generate_launch_docker_command(docker_args: DockerArgs, run_id: int, edge_id
JobRunnerUtils.remove_cuda_visible_devices_lines(entry_file_full_path)
# docker command expects device ids in such format: '"device=0,2,3"'
device_str = f'"device={cuda_visible_gpu_ids_str}"'
docker_command.extend(["--gpus", f"'{device_str}'"])
if not run_docker_without_gpu:
docker_command.extend(["--gpus", f"'{device_str}'"])

# Add Port Mapping
for port in docker_args.ports:
Expand Down
6 changes: 4 additions & 2 deletions python/fedml/computing/scheduler/master/server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2123,8 +2123,10 @@ def callback_runner_id_status(self, topic, payload):
# Stop log processor for current run
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.use_local_process_as_cloud_server:
RunProcessUtils.kill_process(os.getpid())
raise Exception("Killed")
#RunProcessUtils.kill_process(os.getpid())
cloud_server_process = self.run_process_map.get(run_id_str, None)
if cloud_server_process is not None:
RunProcessUtils.kill_process(cloud_server_process.pid)
else:
self.stop_cloud_server()

Expand Down
13 changes: 8 additions & 5 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1109,10 +1109,13 @@ def callback_runner_id_status(self, topic, payload):
RunProcessUtils.kill_process(run_process.pid)

# Terminate the run docker container if exists
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
try:
container_name = JobRunnerUtils.get_run_container_name(run_id)
docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
logging.info(f"Terminating the run docker container {container_name} if exists...")
JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
except Exception as e:
logging.info(f"Exception when terminating docker container {traceback.format_exc()}.")

self.run_process_map.pop(run_id_str)

Expand Down Expand Up @@ -1318,7 +1321,7 @@ def bind_account_and_device_id(self, url, account_id, device_id, os_name, api_ke
"accountid": account_id,
"deviceid": device_id,
"type": os_name,
"status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
"processor": cpu_info,
"core_type": cpu_info,
"network": "",
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18b7",
version="0.8.18a8",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down
Loading