diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml deleted file mode 100644 index b4c3642b09..0000000000 --- a/.github/workflows/CI_build.yml +++ /dev/null @@ -1,47 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: CI-build - -# Controls when the workflow will run -on: - # Triggers the workflow on push or pull request events but only for the master branch - schedule: - # Nightly build at 12:12 A.M. - - cron: "0 10 */1 * *" - pull_request: - branches: [ master, dev/v0.7.0 ] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - build: - runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] - strategy: - fail-fast: false - matrix: - os: [ Linux, Windows ] - arch: [X64] - python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] - - timeout-minutes: 5 - steps: - - name: Checkout fedml - uses: actions/checkout@v3 - - - name: pip_install - run: | - cd python - pip install -e ./ - - - name: login - run: | - fedml logout - fedml login $API_KEY - - - name: pylint - run: | - cd python - echo "Pylint has been run successfully!" - diff --git a/.github/workflows/CI_deploy.yml b/.github/workflows/CI_deploy.yml deleted file mode 100644 index 982f65b3c5..0000000000 --- a/.github/workflows/CI_deploy.yml +++ /dev/null @@ -1,43 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: CI-deploy - -# Controls when the workflow will run -on: - # Triggers the workflow on push or pull request events but only for the master branch - schedule: - # Nightly build at 12:12 A.M. - - cron: "0 10 */1 * *" - pull_request: - branches: [ master, dev/v0.7.0 ] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - deploy: - runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] - strategy: - fail-fast: false - matrix: - os: [ Linux, Windows ] - arch: [X64] - python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] - - timeout-minutes: 5 - steps: - - name: Checkout fedml - uses: actions/checkout@v3 - - - name: pip_install - run: | - cd python - pip install -e ./ - - - name: serving_job_in_test_env - run: | - cd python - echo "Serving example has been tested successfully!" - python tests/test_deploy/test_deploy.py - diff --git a/.github/workflows/CI_federate.yml b/.github/workflows/CI_federate.yml deleted file mode 100644 index 1302771b1d..0000000000 --- a/.github/workflows/CI_federate.yml +++ /dev/null @@ -1,42 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: CI-federate - -# Controls when the workflow will run -on: - # Triggers the workflow on push or pull request events but only for the master branch - schedule: - # Nightly build at 12:12 A.M. - - cron: "0 10 */1 * *" - pull_request: - branches: [ master, dev/v0.7.0 ] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - federate: - strategy: - fail-fast: false - matrix: - os: [ Linux, Windows ] - arch: [X64] - python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] - - runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] - timeout-minutes: 5 - steps: - - name: Checkout fedml - uses: actions/checkout@v3 - - - name: pip_install - run: | - cd python - pip install -e ./ - - - name: federate_job_in_test_env - run: | - cd python - bash tests/test_federate/test_federate.sh - echo "Federate example has been tested successfully!" diff --git a/.github/workflows/CI_launch.yml b/.github/workflows/CI_launch.yml deleted file mode 100644 index 13519c41f2..0000000000 --- a/.github/workflows/CI_launch.yml +++ /dev/null @@ -1,43 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: CI-launch - -# Controls when the workflow will run -on: - # Triggers the workflow on push or pull request events but only for the master branch - schedule: - # Nightly build at 12:12 A.M. - - cron: "0 10 */1 * *" - pull_request: - branches: [ master, dev/v0.7.0 ] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - launch: - - strategy: - fail-fast: false - matrix: - os: [ Linux, Windows ] - arch: [X64] - python-version: ['python3.8','python3.9','python3.10','python3.11'] - - runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] - timeout-minutes: 5 - steps: - - name: Checkout fedml - uses: actions/checkout@v3 - - - name: pip_install - run: | - cd python - pip install -e ./ - - - name: launch_job_in_test_env - run: | - cd python - python tests/test_launch/test_launch.py - echo "Launch example has been tested successfully!" diff --git a/.github/workflows/CI_train.yml b/.github/workflows/CI_train.yml deleted file mode 100644 index 2acbcc12a0..0000000000 --- a/.github/workflows/CI_train.yml +++ /dev/null @@ -1,42 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: CI-train - -# Controls when the workflow will run -on: - # Triggers the workflow on push or pull request events but only for the master branch - schedule: - # Nightly build at 12:12 A.M. - - cron: "0 10 */1 * *" - pull_request: - branches: [ master, dev/v0.7.0 ] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - train: - runs-on: ["${{ matrix.python-version }}","${{ matrix.os }}"] - strategy: - fail-fast: false - matrix: - os: [ Linux, Windows ] - arch: [X64] - python-version: ['python3.8', 'python3.9', 'python3.10', 'python3.11'] - timeout-minutes: 5 - steps: - - name: Checkout fedml - uses: actions/checkout@v3 - - - name: pip_install - run: | - cd python - pip install -e ./ - - - name: training_job_in_test_env - run: | - cd python - python tests/test_train/test_train.py - echo "Train example has been tested successfully!" - diff --git a/.github/workflows/README.md b/.github/workflows/README.md deleted file mode 100644 index 668cb9b302..0000000000 --- a/.github/workflows/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# 1. Design - -![Design](image.png) - -## Design principles - -The CI tests need to be comprehensive, covering typical scenarios only, achievable within 5 minutes. - -# 2. Registry Self-Host Runners - -## 2.1 Linux Runners - -### Step1: Build linux images - -Build all the linux images for Self-Host Runners. -``` -cd registry-runners -bash build_linux_runners.sh -``` - -### Step2: Specify the token and key. -Find your GitHub runner token and your test-account apikey. - -For the argument YourGitHubRunnerToken, Navigate the path `Settings -> Actions -> Runners -> New self-hosted runner` to get. - -In the Configure section, you will find the similar line: -./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M to get YourGitHubRunnerToken to value of --token - -### Step3: Registry all the runners. -Registry by run `run_linux_runners.sh` script -``` -bash run_linux_runners.sh [YourGitRepo] [YourGitHubRunnerToken] [YourTestAccountApiKey] -``` -for example -``` -bash run_linux_runners.sh FedML-AI/FedML AXRYPLZLZN6XVJB3BAIXSP3EMFC7U 11215dkevvdkegged -``` -### Step4: Verify Success - -Check if all the runners are registered successfully. Navigate the following path. `Settings -> Actions -> Runners` to check that all your runners are active. - -## 2.2 Windows Runners - -### Step1: Install Anaconda packages -Install Anaconda or Miniconda on a Windows machine. Anaconda and Miniconda can manage your Python environments. - -### Step2: Create python enviroments -Create 4 python environments named `python38`、`python39`、`python310` and `python311` for different runners. -Specify the python version to install. -For example -``` -conda create -n python38 python==3.8 -``` -### Step3: Create directories -Create 4 directories named `actions-runner-python38`、`actions-runner-python39`、`actions-runner-python310` and `actions-runner-python311` for different runners. - -### Step4: Install the latest runner package. -Follow the insturction from navigating this path `Settings -> Actions -> Runners -> New self-hosted runner` to add a new Windows runner. Note that you only need to download、extract the files into the directories created in Step 3. Configuration and running will be done through a script later. - -### Step5: Registry all the runners. -Run the script from `./registry-runners/windows.ps1` to registry all the runners to your github. Replace the variables `$REPO`、`$ACCESS_TOKEN` and `$WORKPLACE` with actual values. Note that you can get your $ACCESS_TOKEN from the following path `Settings -> Actions -> Runners -> New self-hosted runner.`. -In the Configure section, you will find the similar line: `./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M` to get your `$ACCESS_TOKEN`. - -### Step6: Verify Success -Check if the runners are registered successfully by navigate to `Settings -> Actions -> Runners`. Make sure that all your runners are active. - -## 2.3 Mac Runners - -# 3. Bind Test Machines - -Bind the actual machine to run the test training job. Follow this document to bind your test machines. -https://docs.tensoropera.ai/share-and-earn - -Note that we need to bind our machines to the test environment. - -Specify the computing resource type to which you have bound your machines. Your job will be scheduled to that machine. - -# 4. Trigger - -Applying for a PR can trigger all tests automatically. - -Run a single test on a specific branch from the GitHub Actions tab. - -Schedule daily runs at a specific time by configuring your workflow YAML. You can check the results in the GitHub Actions tab. - -# 5. Add a new CI test - -Creating a new workflow YAML file, such as CI_launch.yaml or CI_train.yaml, allows you to add a CI test that is different from the current business. - -Adding a new CI test to the current business can be done by placing your test in the path python/tests/test_{business}/test_file.py and ensuring that your workflow YAML can run that Python test script. - -Ensuring your workflow YAML is configured correctly will enable it to run the new test automatically. - -# 6. TODO - -Implement the Mac runners. - diff --git a/.github/workflows/deprecated/build_wheels_and_releases.yml-backup b/.github/workflows/build_wheels_and_releases.yml-backup similarity index 100% rename from .github/workflows/deprecated/build_wheels_and_releases.yml-backup rename to .github/workflows/build_wheels_and_releases.yml-backup diff --git a/.github/workflows/deprecated/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml similarity index 100% rename from .github/workflows/deprecated/codeql-analysis.yml rename to .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/deprecated/python-package-conda.yml b/.github/workflows/deprecated/python-package-conda.yml deleted file mode 100644 index f3586044ab..0000000000 --- a/.github/workflows/deprecated/python-package-conda.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Python Package using Conda - -on: [push] - -jobs: - build-linux: - runs-on: ubuntu-latest - strategy: - max-parallel: 5 - - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: '3.10' - - name: Add conda to system path - run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH - - name: Install dependencies - run: | - conda env update --file environment.yml --name base - - name: Lint with flake8 - run: | - conda install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - conda install pytest - pytest diff --git a/.github/workflows/deprecated/full_e2e_test.yml-bakcup b/.github/workflows/full_e2e_test.yml-bakcup similarity index 100% rename from .github/workflows/deprecated/full_e2e_test.yml-bakcup rename to .github/workflows/full_e2e_test.yml-bakcup diff --git a/.github/workflows/image.png b/.github/workflows/image.png deleted file mode 100644 index 330e630c0a..0000000000 Binary files a/.github/workflows/image.png and /dev/null differ diff --git a/.github/workflows/deprecated/pylint.yml b/.github/workflows/pylint.yml similarity index 89% rename from .github/workflows/deprecated/pylint.yml rename to .github/workflows/pylint.yml index 402bf72895..cdc3800869 100644 --- a/.github/workflows/deprecated/pylint.yml +++ b/.github/workflows/pylint.yml @@ -28,16 +28,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: Analysing the code with pylint diff --git a/.github/workflows/registry-runners/build_linux_runners.sh b/.github/workflows/registry-runners/build_linux_runners.sh deleted file mode 100644 index fb4b6e1abc..0000000000 --- a/.github/workflows/registry-runners/build_linux_runners.sh +++ /dev/null @@ -1,12 +0,0 @@ -tag="0.1.0" - -platform="linux/amd64" - -echo "build python:3.11" -docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.11 -t fedml/action_runner_3.11_linux64:$tag -f ./Dockerfile . -echo "build python:3.10" -docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.10 -t fedml/action_runner_3.10_linux64:$tag -f ./Dockerfile . -echo "build python:3.9" -docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.9 -t fedml/action_runner_3.9_linux64:$tag -f ./Dockerfile . -echo "build python:3.8" -docker build --no-cache --platform $platform --build-arg BASE_IMAGE=python:3.8 -t fedml/action_runner_3.8_linux64:$tag -f ./Dockerfile . diff --git a/.github/workflows/registry-runners/build_test.sh b/.github/workflows/registry-runners/build_test.sh deleted file mode 100755 index 1e17dc6847..0000000000 --- a/.github/workflows/registry-runners/build_test.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t fedml/action_runner_3.11_linux64:0.1 -f ./Dockerfile . diff --git a/.github/workflows/registry-runners/run_linux_runners.sh b/.github/workflows/registry-runners/run_linux_runners.sh deleted file mode 100644 index fa70388de8..0000000000 --- a/.github/workflows/registry-runners/run_linux_runners.sh +++ /dev/null @@ -1,48 +0,0 @@ -REPO=$1 -ACCESS_TOKEN=$2 -API_KEY=$3 -DOCKER_PULL=false -ARCH=linux64 -TAG="0.1.0" - -if [ $# != 3 ]; then - echo "Please provide two arguments." - echo "./runner-start.sh [YourGitRepo][YourGitHubRunnerToken][API_KEY]" - exit -1 -fi - -# List of Docker container names -# containers=("fedml/action_runner_3.8_$ARCH:0.1.0" "fedml/action_runner_3.9_$ARCH:0.1.0" "fedml/action_runner_3.10_$ARCH:0.1.0" "fedml/action_runner_3.11_$ARCH:0.1.0") -containers=("action_runner_3.8_$ARCH" "action_runner_3.9_$ARCH" "action_runner_3.10_$ARCH" "action_runner_3.11_$ARCH") -python_versions=("python3.8" "python3.9" "python3.10" "python3.11") - - -# Iterate through each container -for container_index in "${!containers[@]}"; do - - container=${containers[$container_index]} - # Find the running container - if [ "$DOCKER_PULL" = "true" ]; then - echo "docker pull fedml/$container:$TAG" - docker pull fedml/$container:$TAG - fi - # docker stop `sudo docker ps |grep ${TAG}- |awk -F' ' '{print $1}'` - - running_container=$(docker ps -a | grep $container | awk -F ' ' '{print $1}') - - if [ -n "$running_container" ]; then - # Stop the running container - echo "Stopping running container: $container, $running_container" - docker stop "$running_container" - else - echo "No running container found for: $container" - fi - sleep 5 - # docker pull $container - ACT_NAME=${containers[$container_index]} - echo "docker run --rm --name $ACT_NAME --env API_KEY=$API_KEY --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -d fedml/${containers[$container_index]}:$TAG bash ./start.sh ${REPO} ${ACCESS_TOKEN} ${python_versions[$container_index]}" - docker run --rm --name $ACT_NAME --env API_KEY=$API_KEY --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -d fedml/${containers[$container_index]}:$TAG bash ./start.sh ${REPO} ${ACCESS_TOKEN} ${python_versions[$container_index]} - -done -echo "Script completed." - diff --git a/.github/workflows/registry-runners/windows.ps1 b/.github/workflows/registry-runners/windows.ps1 deleted file mode 100644 index 40f0f00b8f..0000000000 --- a/.github/workflows/registry-runners/windows.ps1 +++ /dev/null @@ -1,32 +0,0 @@ - -$REPO = "Qigemingziba/FedML" -$ACCESS_TOKEN = "AGMK3PY3QDYUXXXEB5LWI4DGOQIFW" -$WORKPLACE=$PWD - -Set-Location actions-runner-python38 -& conda activate python38 -./config.cmd --url https://github.com/$REPO --name windows-python38 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.8 -Start-Process run.cmd start -WindowStyle Hidden - -Set-Location $WORKPLACE - -Set-Location actions-runner-python39 -& conda activate python39 -./config.cmd --url https://github.com/$REPO --name windows-python39 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.9 -Start-Process run.cmd start -WindowStyle Hidden - -Set-Location $WORKPLACE - -Set-Location actions-runner-python310 -& conda activate python310 -./config.cmd --url https://github.com/$REPO --name windows-python310 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.10 -Start-Process run.cmd start -WindowStyle Hidden - -Set-Location $WORKPLACE - -Set-Location actions-runner-python311 -& conda activate python311 -./config.cmd --url https://github.com/$REPO --name windows-python311 --token $ACCESS_TOKEN --labels self-hosted,Windows,X64,python3.11 -Start-Process run.cmd start -WindowStyle Hidden - -Set-Location $WORKPLACE \ No newline at end of file diff --git a/.github/workflows/deprecated/runner.md b/.github/workflows/runner.md similarity index 100% rename from .github/workflows/deprecated/runner.md rename to .github/workflows/runner.md diff --git a/.github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml b/.github/workflows/smoke_test_cross_device_mnn_server_linux.yml similarity index 88% rename from .github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml rename to .github/workflows/smoke_test_cross_device_mnn_server_linux.yml index 10c9860d0f..c8fff7e4f1 100644 --- a/.github/workflows/deprecated/smoke_test_cross_device_mnn_server_linux.yml +++ b/.github/workflows/smoke_test_cross_device_mnn_server_linux.yml @@ -52,16 +52,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -70,9 +67,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - cd python - pip install -e ./ - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: Install MNN working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} @@ -84,6 +79,6 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/quick_start/beehive + cd quick_start/beehive timeout 60 bash run_server.sh || code=$?; if [[ $code -ne 124 && $code -ne 0 ]]; then exit $code; fi diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml b/.github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml similarity index 83% rename from .github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml rename to .github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml index ea0c4ed601..b1c29fcfd7 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_attack_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_fedavg_attack_linux.yml @@ -29,8 +29,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] - arch: [ X64 ] + os: [ ubuntu-latest] + arch: [X64] python-version: ['3.8'] client-index: ['0', '1', '2', '3', '4'] # exclude: @@ -38,7 +38,7 @@ jobs: # python-version: '3.8' # - os: windows-latest # python-version: '3.6' - runs-on: [ self-hosted ] + runs-on: [ self-hosted, Linux ] timeout-minutes: 15 steps: - name: Extract branch name @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,16 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - cd python - pip install -e ./ - # bash ./devops/srcipts/install-fedml.sh - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - attack working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -90,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -100,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id @@ -110,7 +104,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 3 $run_id @@ -120,7 +114,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_attack_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_attack_mnist_lr_example run_id=cross-silo-attack-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 4 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml b/.github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml similarity index 87% rename from .github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml rename to .github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml index 051c0418d2..67ee9e4a0f 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_cdp_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_fedavg_cdp_linux.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,13 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - cdp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -87,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -97,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_cdp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml b/.github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml similarity index 86% rename from .github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml rename to .github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml index b9348d7bf2..fac19d9552 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_defense_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_fedavg_defense_linux.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,13 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - defense working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -87,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -97,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id @@ -107,7 +104,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 3 $run_id @@ -117,7 +114,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/security/mqtt_s3_fedavg_defense_mnist_lr_example + cd examples/security/mqtt_s3_fedavg_defense_mnist_lr_example run_id=cross-silo-defense-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 4 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml b/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml similarity index 87% rename from .github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml rename to .github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml index f849c4db71..def8aca733 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_fedavg_ldp_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_fedavg_ldp_linux.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,13 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ldp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -87,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -97,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example + cd examples/privacy/mqtt_s3_fedavg_ldp_mnist_lr_example run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml b/.github/workflows/smoke_test_cross_silo_ho_linux.yml similarity index 89% rename from .github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml rename to .github/workflows/smoke_test_cross_silo_ho_linux.yml index 7d28a37292..e34a22cdbe 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_ho_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_ho_linux.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,13 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/quick_start/octopus + cd quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -87,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/quick_start/octopus + cd quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -97,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/quick_start/octopus + cd quick_start/octopus run_id=cross-silo-ho-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml b/.github/workflows/smoke_test_cross_silo_ho_win.yml similarity index 88% rename from .github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml rename to .github/workflows/smoke_test_cross_silo_ho_win.yml index d9239bcb99..b8376438d7 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_ho_win.yml +++ b/.github/workflows/smoke_test_cross_silo_ho_win.yml @@ -52,16 +52,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -70,25 +67,25 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/quick_start/octopus + cd quick_start/octopus .\run_server.bat ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/quick_start/octopus + cd quick_start/octopus .\run_client.bat 1 ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/quick_start/octopus + cd quick_start/octopus .\run_client.bat 2 ${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} \ No newline at end of file diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml b/.github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml similarity index 88% rename from .github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml rename to .github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml index ae06088dc7..d672e2a772 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_linux.yml +++ b/.github/workflows/smoke_test_cross_silo_lightsecagg_linux.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,13 +68,13 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - lightsecagg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -87,7 +84,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -97,7 +94,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example run_id=cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml b/.github/workflows/smoke_test_cross_silo_lightsecagg_win.yml similarity index 88% rename from .github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml rename to .github/workflows/smoke_test_cross_silo_lightsecagg_win.yml index 40d15a1f0f..8deab9acb2 100644 --- a/.github/workflows/deprecated/smoke_test_cross_silo_lightsecagg_win.yml +++ b/.github/workflows/smoke_test_cross_silo_lightsecagg_win.yml @@ -52,16 +52,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -70,25 +67,25 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example .\run_server.bat cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - cross-silo - ho working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example .\run_client.bat 1 cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - cross-silo - lightsecagg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/light_sec_agg_example + cd examples/cross_silo/light_sec_agg_example .\run_client.bat 2 cross-silo-lightsecagg-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} \ No newline at end of file diff --git a/.github/workflows/deprecated/smoke_test_flow_linux.yml b/.github/workflows/smoke_test_flow_linux.yml similarity index 92% rename from .github/workflows/deprecated/smoke_test_flow_linux.yml rename to .github/workflows/smoke_test_flow_linux.yml index 5293787a11..df876a632b 100644 --- a/.github/workflows/deprecated/smoke_test_flow_linux.yml +++ b/.github/workflows/smoke_test_flow_linux.yml @@ -43,16 +43,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -61,7 +58,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: server - Flow working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} diff --git a/.github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml b/.github/workflows/smoke_test_ml_engines_linux_jax.yml similarity index 87% rename from .github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml rename to .github/workflows/smoke_test_ml_engines_linux_jax.yml index cd4bd8d720..42a6d25ead 100644 --- a/.github/workflows/deprecated/smoke_test_ml_engines_linux_jax.yml +++ b/.github/workflows/smoke_test_ml_engines_linux_jax.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,14 +68,14 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python - name: server - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -88,7 +85,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -98,7 +95,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example run_id=jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml b/.github/workflows/smoke_test_ml_engines_linux_mxnet.yml similarity index 87% rename from .github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml rename to .github/workflows/smoke_test_ml_engines_linux_mxnet.yml index 5ce217ea4b..bf30fd1b1a 100644 --- a/.github/workflows/deprecated/smoke_test_ml_engines_linux_mxnet.yml +++ b/.github/workflows/smoke_test_ml_engines_linux_mxnet.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,7 +68,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python pip install mxnet==2.0.0b1 @@ -79,7 +76,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -89,7 +86,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -99,7 +96,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example run_id=mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml b/.github/workflows/smoke_test_ml_engines_linux_tf.yml similarity index 87% rename from .github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml rename to .github/workflows/smoke_test_ml_engines_linux_tf.yml index 3b7519dd97..9d69ba3774 100644 --- a/.github/workflows/deprecated/smoke_test_ml_engines_linux_tf.yml +++ b/.github/workflows/smoke_test_ml_engines_linux_tf.yml @@ -53,16 +53,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -71,14 +68,14 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python - name: server - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_server.sh $run_id @@ -88,7 +85,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 1 $run_id @@ -98,7 +95,7 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example run_id=tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} echo ${run_id} bash run_client.sh 2 $run_id diff --git a/.github/workflows/deprecated/smoke_test_ml_engines_win.yml b/.github/workflows/smoke_test_ml_engines_win.yml similarity index 90% rename from .github/workflows/deprecated/smoke_test_ml_engines_win.yml rename to .github/workflows/smoke_test_ml_engines_win.yml index 8913cc6bec..f1f3bfabd4 100644 --- a/.github/workflows/deprecated/smoke_test_ml_engines_win.yml +++ b/.github/workflows/smoke_test_ml_engines_win.yml @@ -46,16 +46,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -64,28 +61,28 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh cd $homepath/python pip install -e '.[tensorflow]' - name: server - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - tensorflow - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/tf_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id tf-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} @@ -141,21 +138,21 @@ jobs: - name: server - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - jax - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/jax_haiku_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id jax-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} @@ -211,20 +208,20 @@ jobs: - name: server - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '0' }} - name: client 1 - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 1 --role client --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '1' }} - name: client 2 - mxnet - fedavg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example + cd examples/cross_silo/mxnet_mqtt_s3_fedavg_mnist_lr_example python3 tf_client.py --cf config/fedml_config.yaml --rank 2 --role client --run_id mxnet-ml-engine-${{ format('{0}{1}{2}{3}', github.run_id, matrix.os, matrix.arch, matrix.python-version) }} if: ${{ matrix.client-index == '2' }} diff --git a/.github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml b/.github/workflows/smoke_test_pip_cli_sp_linux.yml similarity index 80% rename from .github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml rename to .github/workflows/smoke_test_pip_cli_sp_linux.yml index 006ecfb574..131d88de9b 100644 --- a/.github/workflows/deprecated/smoke_test_pip_cli_sp_linux.yml +++ b/.github/workflows/smoke_test_pip_cli_sp_linux.yml @@ -54,16 +54,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -72,20 +69,20 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - # - name: test "fedml login" and "fedml build" - # working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} - # run: | - # cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - # cd tests/smoke_test/cli - # bash login.sh - # bash build.sh + - name: test "fedml login" and "fedml build" + working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} + run: | + cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python + cd tests/smoke_test/cli + bash login.sh + bash build.sh - name: test simulation-sp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/quick_start/parrot + cd quick_start/parrot python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml @@ -93,40 +90,40 @@ jobs: working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_decentralized_mnist_lr_example + cd examples/simulation/sp_decentralized_mnist_lr_example python torch_fedavg_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_fednova_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_fednova_mnist_lr_example + cd examples/simulation/sp_fednova_mnist_lr_example python torch_fednova_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_fedopt_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_fedopt_mnist_lr_example + cd examples/simulation/sp_fedopt_mnist_lr_example python torch_fedopt_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_hierarchicalfl_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_hierarchicalfl_mnist_lr_example + cd examples/simulation/sp_hierarchicalfl_mnist_lr_example python torch_hierarchicalfl_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_turboaggregate_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_turboaggregate_mnist_lr_example + cd examples/simulation/sp_turboaggregate_mnist_lr_example python torch_turboaggregate_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - name: test sp - sp_vertical_mnist_lr_example working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd ${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }}/python - cd examples/federate/simulation/sp_vertical_mnist_lr_example + cd examples/simulation/sp_vertical_mnist_lr_example python torch_vertical_mnist_lr_step_by_step_example.py --cf fedml_config.yaml diff --git a/.github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml b/.github/workflows/smoke_test_pip_cli_sp_win.yml similarity index 90% rename from .github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml rename to .github/workflows/smoke_test_pip_cli_sp_win.yml index 3987f90f74..69dac083bb 100644 --- a/.github/workflows/deprecated/smoke_test_pip_cli_sp_win.yml +++ b/.github/workflows/smoke_test_pip_cli_sp_win.yml @@ -51,16 +51,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -69,7 +66,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: test "fedml login" and "fedml build" working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} @@ -80,6 +77,6 @@ jobs: - name: test simulation-sp working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | - cd examples/federate/quick_start/parrot + cd quick_start/parrot python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml diff --git a/.github/workflows/deprecated/smoke_test_security.yml b/.github/workflows/smoke_test_security.yml similarity index 91% rename from .github/workflows/deprecated/smoke_test_security.yml rename to .github/workflows/smoke_test_security.yml index 5d5c03ee38..6644a4b513 100644 --- a/.github/workflows/deprecated/smoke_test_security.yml +++ b/.github/workflows/smoke_test_security.yml @@ -54,16 +54,13 @@ jobs: echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then echo "running on master" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-master cd $path - git pull echo "dir=$path" >> $GITHUB_OUTPUT else echo "running on dev" - path=/home/fedml/FedML + path=/home/actions-runner/fedml-dev cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip @@ -72,7 +69,7 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: attack tests working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} diff --git a/.github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml b/.github/workflows/smoke_test_simulation_mpi_linux.yml similarity index 73% rename from .github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml rename to .github/workflows/smoke_test_simulation_mpi_linux.yml index b2e9676ae9..c48cc43149 100644 --- a/.github/workflows/deprecated/smoke_test_simulation_mpi_linux.yml +++ b/.github/workflows/smoke_test_simulation_mpi_linux.yml @@ -40,8 +40,8 @@ jobs: - os: ubuntu-latest mpi: mpich install-mpi: | - apt-get update - apt install -y mpich libmpich-dev + sudo apt-get update + sudo apt install -y mpich libmpich-dev # - os: ubuntu-latest # mpi: openmpi # install-mpi: sudo apt install -y openmpi-bin libopenmpi-dev @@ -50,12 +50,6 @@ jobs: shell: bash run: echo "branch=$(echo ${GITHUB_REF#refs/heads/})" >>$GITHUB_OUTPUT id: extract_branch - - name: Install MPI - if: matrix.mpi == 'mpich' - run: | - apt-get update - apt-get install -y mpich libmpich-dev - - id: fedml_source_code_home name: cd to master or dev branch and git pull shell: bash @@ -63,18 +57,15 @@ jobs: ls echo ${{ steps.extract_branch.outputs.branch }} if [[ ${{ steps.extract_branch.outputs.branch }} == "master" ]]; then - echo "running on master" - path=/home/fedml/FedML - cd $path - git pull - echo "dir=$path" >> $GITHUB_OUTPUT + echo "running on master" + path=/home/actions-runner/fedml-master + cd $path + echo "dir=$path" >> $GITHUB_OUTPUT else - echo "running on dev" - path=/home/fedml/FedML - cd $path - git pull - git checkout ${{ steps.extract_branch.outputs.branch }} - echo "dir=$path" >> $GITHUB_OUTPUT + echo "running on dev" + path=/home/actions-runner/fedml-dev + cd $path + echo "dir=$path" >> $GITHUB_OUTPUT fi - name: sync git repo to local pip working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} @@ -82,47 +73,47 @@ jobs: homepath=${{ format('{0}', steps.fedml_source_code_home.outputs.dir) }} echo $Homepath cd $homepath - # bash ./devops/scripts/sync-fedml-pip.sh + bash ./devops/scripts/sync-fedml-pip.sh - name: Test package - FedAvg working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | pwd cd python - cd examples/federate/simulation/mpi_torch_fedavg_mnist_lr_example + cd examples/simulation/mpi_torch_fedavg_mnist_lr_example sh run_custom_data_and_model_example.sh 4 - name: Test package - Base working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/simulation/mpi_base_framework_example + cd examples/simulation/mpi_base_framework_example sh run.sh 4 - name: Test package - Decentralized working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/simulation/mpi_decentralized_fl_example + cd examples/simulation/mpi_decentralized_fl_example sh run.sh 4 - name: Test package - FedOPT working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/simulation/mpi_fedopt_datasets_and_models_example + cd examples/simulation/mpi_fedopt_datasets_and_models_example sh run_step_by_step_example.sh 4 config/mnist_lr/fedml_config.yaml - name: Test package - FedProx working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/simulation/mpi_fedprox_datasets_and_models_example + cd examples/simulation/mpi_fedprox_datasets_and_models_example sh run_step_by_step_example.sh 4 config/mnist_lr/fedml_config.yaml - name: Test package - FedGAN working-directory: ${{ steps.fedml_source_code_home.outputs.dir }} run: | cd python - cd examples/federate/simulation/mpi_torch_fedgan_mnist_gan_example + cd examples/simulation/mpi_torch_fedgan_mnist_gan_example sh run_step_by_step_example.sh 4 \ No newline at end of file diff --git a/.github/workflows/registry-runners/Dockerfile b/devops/dockerfile/github-action-runner/Dockerfile similarity index 70% rename from .github/workflows/registry-runners/Dockerfile rename to devops/dockerfile/github-action-runner/Dockerfile index 5d3168853a..4e6648260f 100644 --- a/.github/workflows/registry-runners/Dockerfile +++ b/devops/dockerfile/github-action-runner/Dockerfile @@ -1,10 +1,9 @@ # base -ARG BASE_IMAGE=python:3.11 - -FROM ${BASE_IMAGE} +FROM fedml/fedml:latest-torch1.13.1-cuda11.6-cudnn8-devel # set the github runner version -ARG RUNNER_VERSION="2.317.0" +ARG RUNNER_VERSION="2.304.0" + # update the base packages and add a non-sudo user #RUN apt-get update -y && apt-get upgrade -y && useradd -m docker @@ -25,15 +24,18 @@ COPY start.sh start.sh # make the script executable RUN chmod +x start.sh + +RUN cp -f /usr/bin/python /usr/bin/python-backup && ln -s /usr/bin/python3 python + +RUN pip install scikit-learn + +RUN pip install tensorflow && pip install tensorflow_datasets && pip install jax[cpu] && pip install dm-haiku && pip install optax && pip install jaxlib + # since the config and run script for actions are not allowed to be run by root, # set the user to "docker" so all subsequent commands are run as the docker user #USER docker -RUN git clone https://github.com/Qigemingziba/FedML.git -RUN cd FedML && git pull && git checkout dev/v0.7.0 && cd python && pip3 install -e ./ -ENV REPO=Qigemingziba/FedML ACCESS_TOKEN=AGMK3P4W5EM5PXNYTZXXIMTGNF4MW +ENV REPO=FedML-AI/FedML ACCESS_TOKEN=1 # set the entrypoint to the start.sh script -CMD ./start.sh ${REPO} ${ACCESS_TOKEN} - - +CMD ./start.sh ${REPO} ${ACCESS_TOKEN} \ No newline at end of file diff --git a/devops/dockerfile/github-action-runner/README.md b/devops/dockerfile/github-action-runner/README.md new file mode 100644 index 0000000000..d02e29665b --- /dev/null +++ b/devops/dockerfile/github-action-runner/README.md @@ -0,0 +1,25 @@ +# Run self-host runner in your machine + +## Usage + +./runner-start.sh [YourGitRepo] [YourRunnerPrefix] [YourRunnerNum] [YourGitHubRunnerToken] [LocalDevSourceDir] [LocalReleaseSourceDir] [LocalDataDir] + +For the argument YourGitHubRunnerToken, you may navigate based the following path. + +Settings -> Actions -> Runners -> New self-hosted runner. + +In the Configure section, you should find the similar line: +./config.sh --url https://github.com/FedML-AI/FedML --token AXRYPL6G2VHVGDFDQQS5XA3ELYI6M + +set YourGitHubRunnerToken to value of --token + + +## Example + +Use the following commands to run 30 runners in the FedML-AI/FedML repo and run 6 runners in the FedML-AI/Front-End-Auto-Test repo: + +./runner-start.sh FedML-AI/FedML fedml-runner 30 AXRYPLZLZN6XVJB3BAIXSP3EMFC7U /home/fedml/FedML4GitHubAction-Dev /home/fedml/FedML4GitHubAction /home/fedml/fedml_data +./runner-start.sh FedML-AI/Front-End-Auto-Test webtest-runner 6 AXRYPL57ZD35ZGDWZKRKFHLEMGLTK /home/fedml/FedML4GitHubAction-Dev /home/fedml/FedML4GitHubAction /home/fedml/fedml_data + +./runner-start.sh FedML-AI/FedML fedml-runner 30 AXRYPL6CCBH24ZVRSUEAYTTEMKD56 /home/chaoyanghe/sourcecode/FedML4GitHubAction-Dev /home/chaoyanghe/sourcecode/FedML4GitHubAction /home/chaoyanghe/fedml_data +./runner-start.sh FedML-AI/Front-End-Auto-Test webtest-runner 6 AXRYPL57ZD35ZGDWZKRKFHLEMGLTK /home/chaoyanghe/sourcecode/FedML4GitHubAction-Dev /home/chaoyanghe/sourcecode/FedML4GitHubAction /home/chaoyanghe/fedml_data diff --git a/devops/dockerfile/github-action-runner/build.sh b/devops/dockerfile/github-action-runner/build.sh new file mode 100755 index 0000000000..5f6dae9615 --- /dev/null +++ b/devops/dockerfile/github-action-runner/build.sh @@ -0,0 +1,3 @@ +docker build -t fedml/github-action-runner:latest -f ./Dockerfile . +docker login +docker push fedml/github-action-runner:latest \ No newline at end of file diff --git a/devops/dockerfile/github-action-runner/runner-start.sh b/devops/dockerfile/github-action-runner/runner-start.sh new file mode 100644 index 0000000000..18a0c4f958 --- /dev/null +++ b/devops/dockerfile/github-action-runner/runner-start.sh @@ -0,0 +1,23 @@ +REPO=$1 +TAG=$2 +NUM=$3 +ACCESS_TOKEN=$4 +LOCAL_DEV_SOURCE_DIR=$5 +LOCAL_RELEASE_SOURCE_DIR=$6 +LOCAL_DATA_DIR=$7 + +if [ $# != 7 ]; then + echo "Please provide five arguments." + echo "./runner-start.sh [YourGitRepo] [YourRunnerPrefix] [YourRunnerNum] [YourGitHubRunnerToken] [LocalDevSourceDir] [LocalReleaseSourceDir] [LocalDataDir]" + exit -1 +fi + +sudo docker stop `sudo docker ps |grep ${TAG}- |awk -F' ' '{print $1}'` +sudo docker pull fedml/github-action-runner:latest + +for((i=1;i<=$NUM;i++)); +do +ACT_NAME=$TAG-$i +sudo docker rm $ACT_NAME +sudo docker run --name $ACT_NAME --env REPO=$REPO --env ACCESS_TOKEN=$ACCESS_TOKEN -v $LOCAL_DEV_SOURCE_DIR:/home/actions-runner/fedml-dev -v $LOCAL_RELEASE_SOURCE_DIR:/home/actions-runner/fedml-master -v $LOCAL_DATA_DIR:/home/fedml/fedml_data -v $LOCAL_DATA_DIR:/home/actions-runner/fedml_data -d fedml/github-action-runner:latest +done \ No newline at end of file diff --git a/.github/workflows/registry-runners/start.sh b/devops/dockerfile/github-action-runner/start.sh similarity index 76% rename from .github/workflows/registry-runners/start.sh rename to devops/dockerfile/github-action-runner/start.sh index b65b0f1272..917d1cfe16 100644 --- a/.github/workflows/registry-runners/start.sh +++ b/devops/dockerfile/github-action-runner/start.sh @@ -2,15 +2,13 @@ ORGANIZATION=$1 ACCESS_TOKEN=$2 -PYTHON_VERSION=$3 echo $ORGANIZATION echo $ACCESS_TOKEN -echo $PYTHON_VERSION cd /home/fedml/actions-runner -RUNNER_ALLOW_RUNASROOT="1" ./config.sh --url https://github.com/${ORGANIZATION} --token ${ACCESS_TOKEN} --labels self-hosted,Linux,X64,$PYTHON_VERSION +RUNNER_ALLOW_RUNASROOT="1" ./config.sh --url https://github.com/${ORGANIZATION} --token ${ACCESS_TOKEN} cleanup() { echo "Removing runner..." diff --git a/devops/scripts/install-fedml.sh b/devops/scripts/install-fedml.sh deleted file mode 100644 index cafcfa3ac7..0000000000 --- a/devops/scripts/install-fedml.sh +++ /dev/null @@ -1,2 +0,0 @@ -cd python -pip install -e ./ \ No newline at end of file diff --git a/devops/scripts/sync-fedml-pip.sh b/devops/scripts/sync-fedml-pip.sh index 6b24ac52e7..0d909fff76 100755 --- a/devops/scripts/sync-fedml-pip.sh +++ b/devops/scripts/sync-fedml-pip.sh @@ -24,7 +24,7 @@ else fi fi -mkdir -p ./fedml/fedml_data -cp -Rf ./fedml/fedml_data_host/* ./fedml/fedml_data +mkdir -p /home/fedml/fedml_data +cp -Rf /home/fedml/fedml_data_host/* /home/fedml/fedml_data exit 0 diff --git a/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md b/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md index a1fa30b6f2..c693d8d863 100644 --- a/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md +++ b/python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/README.md @@ -26,7 +26,7 @@ For info on `trpc_master_config_path` refer to `python/examples/cross_silo/cuda_ Example is provided at: -`python/examples/federate/cross_silo/cuda_rpc_fedavg_mnist_lr_example/one_line` +`python/examples/cross_silo/cuda_rpc_fedavg_mnist_lr_example/one_line` ### Training Script At the client side, the client ID (a.k.a rank) starts from 1. diff --git a/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml b/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml deleted file mode 100644 index 21e1f2e33e..0000000000 --- a/python/examples/launch/examples/launch/hello_world/launch_config/fedml_config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -containerize: false -data_args: - dataset_name: mnist - dataset_path: ./dataset - dataset_type: csv -environment_args: - bootstrap: fedml_bootstrap_generated.sh -model_args: - input_dim: '784' - model_cache_path: /Users/alexliang/fedml_models - model_name: lr - output_dim: '10' -training_params: - learning_rate: 0.004 diff --git a/python/examples/launch/hello_world/hello_world.py b/python/examples/launch/hello_world/hello_world.py index 2f68f99055..71ffaf7c16 100644 --- a/python/examples/launch/hello_world/hello_world.py +++ b/python/examples/launch/hello_world/hello_world.py @@ -1,5 +1,6 @@ import os import time + import fedml if __name__ == "__main__": diff --git a/python/examples/launch/serve_job_mnist.yaml b/python/examples/launch/serve_job_mnist.yaml index bd8b52ca6c..98c1570a4f 100755 --- a/python/examples/launch/serve_job_mnist.yaml +++ b/python/examples/launch/serve_job_mnist.yaml @@ -35,4 +35,4 @@ computing: maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card #allow_cross_cloud_resources: true # true, false #device_type: CPU # options: GPU, CPU, hybrid - resource_type: RTX-4090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type \ No newline at end of file + resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type \ No newline at end of file diff --git a/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml b/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml deleted file mode 100644 index 188c19dde6..0000000000 --- a/python/examples/train/mnist_train/examples/train/mnist_train/launch_config/fedml_config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -containerize: false -environment_args: - bootstrap: fedml_bootstrap_generated.sh diff --git a/python/examples/train/mnist_train/train.py b/python/examples/train/mnist_train/train.py deleted file mode 100644 index 611a15c2b6..0000000000 --- a/python/examples/train/mnist_train/train.py +++ /dev/null @@ -1,98 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -import torchvision -import torchvision.transforms as transforms -from torch.utils.data import DataLoader -import fedml -# Set random seed for reproducibility -torch.manual_seed(42) - -# Define hyperparameters -batch_size = 64 -learning_rate = 0.001 -num_epochs = 3 - -# Prepare dataset and data loaders -transform = transforms.Compose([ - transforms.ToTensor(), # Convert image to tensor, normalize to [0, 1] - transforms.Normalize((0.5,), (0.5,)) # Normalize with mean and std deviation of 0.5 -]) - -train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True) -train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) - -test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True) -test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) - -# Define a simple convolutional neural network model -class SimpleCNN(nn.Module): - def __init__(self): - super(SimpleCNN, self).__init__() - self.conv1 = nn.Conv2d(1, 16, kernel_size=5, padding=2) - self.conv2 = nn.Conv2d(16, 32, kernel_size=5, padding=2) - self.fc1 = nn.Linear(32 * 7 * 7, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = torch.relu(self.conv1(x)) - x = torch.max_pool2d(x, kernel_size=2, stride=2) - x = torch.relu(self.conv2(x)) - x = torch.max_pool2d(x, kernel_size=2, stride=2) - x = x.view(-1, 32 * 7 * 7) - x = torch.relu(self.fc1(x)) - x = self.fc2(x) - return x - -model = SimpleCNN() - -# Define loss function and optimizer -criterion = nn.CrossEntropyLoss() -optimizer = optim.Adam(model.parameters(), lr=learning_rate) - -# Train the model -for epoch in range(num_epochs): - - # Evaluate the model on the test set during training - model.eval() - with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - acc = 100 * correct / total - fedml.mlops.log_metric({"epoch":epoch, "acc": acc}) - - model.train() - for images, labels in train_loader: - # Forward pass - outputs = model(images) - loss = criterion(outputs, labels) - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - optimizer.step() - -# Final evaluation on the test set -model.eval() -with torch.no_grad(): - correct = 0 - total = 0 - for images, labels in test_loader: - outputs = model(images) - _, predicted = torch.max(outputs.data, 1) - total += labels.size(0) - correct += (predicted == labels).sum().item() - - acc = 100 * correct / total - print('Final Test Accuracy: {:.2f} %'.format(acc)) - fedml.mlops.log_metric({"epoch":num_epochs, "acc": acc}) - -fedml.mlops.log_model(f"model-file@test", "./simple_cnn.pth") -# # Save the model parameters -# torch.save(model.state_dict(), 'simple_cnn.pth') -# print('Model saved to simple_cnn.pth') diff --git a/python/examples/train/mnist_train/train.yaml b/python/examples/train/mnist_train/train.yaml deleted file mode 100644 index f9a5cc5ab5..0000000000 --- a/python/examples/train/mnist_train/train.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# Local directory where your source code resides. -# It should be the relative path to this job yaml file or the absolute path. -# If your job doesn't contain any source code, it can be empty. -workspace: . - -# Running entry commands which will be executed as the job entry point. -# If an error occurs, you should exit with a non-zero code, e.g. exit 1. -# Otherwise, you should exit with a zero code, e.g. exit 0. -# Support multiple lines, which can not be empty. -job: | - echo "current job id: $FEDML_CURRENT_RUN_ID" - echo "current edge id: $FEDML_CURRENT_EDGE_ID" - echo "Hello, Here is the launch platform." - echo "Current directory is as follows." - pwd - python3 train.py - echo "training job finished." - -# If you want to use the job created by the MLOps platform, -# just uncomment the following three, then set job_id and config_id to your desired job id and related config. -#job_args: -# job_id: 2070 -# config_id: 111 - -# If you want to create the job with specific name, just uncomment the following line and set job_name to your desired job name -#job_name: cv_job - -job_type: train # options: train, deploy, federate - -# train subtype: general_training, single_machine_training, cluster_distributed_training, cross_cloud_training -# federate subtype: cross_silo, simulation, web, smart_phone -# deploy subtype: none -job_subtype: generate_training - -# containerize -containerize: false - -# Bootstrap shell commands which will be executed before running entry commands. -# Support multiple lines, which can be empty. -bootstrap: | - # pip install -r requirements.txt - echo "Bootstrap finished." - -computing: - minimum_num_gpus: 1 # minimum # of GPUs to provision - maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card - #allow_cross_cloud_resources: true # true, false - #device_type: CPU # options: GPU, CPU, hybrid - resource_type: RTX-4090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type - diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index c2fc2e3a0f..c96d65adc5 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -1,7 +1,6 @@ import logging import platform -import multiprocess import multiprocess as multiprocessing import os import random @@ -38,7 +37,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.8.51b1" +__version__ = "0.9.0" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release @@ -461,26 +460,6 @@ def _init_multiprocessing(): multiprocessing.set_start_method("fork", force=True) -def get_multiprocessing_context(): - if platform.system() == "Windows": - return multiprocessing.get_context("spawn") - else: - return multiprocessing.get_context("fork") - - -def get_process(target=None, args=None): - if platform.system() == "Windows": - return multiprocessing.Process(target=target, args=args) - else: - #return multiprocessing.Process(target=target, args=args) - #multiprocessing.set_start_method("spawn", force=True) - #return multiprocess.context.SpawnContext.Process(target=target, args=args) - #multiprocessing.Manager().current_process().authkey = str.encode("abc") - new_process = multiprocessing.get_context("fork").Process(target=target, args=args) - #new_process.authkey = str.encode("abc") - return new_process - - def set_env_version(version): set_env_kv("FEDML_ENV_VERSION", version) load_env() diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index ff2b0c7307..b03c72b675 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -278,9 +278,6 @@ def model_deploy(name, endpoint_name, endpoint_id, local, master_ids, worker_ids def model_run(endpoint_id, json_string): model_module.run(endpoint_id, json_string) -def get_endpoint(endpoint_id): - return model_module.get_endpoint(endpoint_id) - def endpoint_delete(endpoint_id): model_module.delete_endpoint(endpoint_id) diff --git a/python/fedml/api/modules/model.py b/python/fedml/api/modules/model.py index 93892fc5d1..a02e674f47 100644 --- a/python/fedml/api/modules/model.py +++ b/python/fedml/api/modules/model.py @@ -320,19 +320,6 @@ def run(endpoint_id: str, json_string: str) -> bool: click.echo("Failed to run model.") return False -def get_endpoint(endpoint_id: str): - api_key = get_api_key() - if api_key == "": - click.echo(''' - Please use one of the ways below to login first: - (1) CLI: `fedml login $api_key` - (2) API: fedml.api.fedml_login(api_key=$api_key) - ''') - return False - - endpoint_detail_result = FedMLModelCards.get_instance().query_endpoint_detail_api(user_api_key=api_key, - endpoint_id=endpoint_id) - return endpoint_detail_result def delete_endpoint(endpoint_id: str) -> bool: api_key = get_api_key() diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 50ca315a10..b8237d93ba 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -167,7 +167,7 @@ def autoscaler_reconcile_after_interval(self): # Get cached token for authorization of autoscale request cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) if cached_token is None: - # logging.error(f"Failed to get the cached token for endpoint {e_id}.") + logging.error(f"Failed to get the cached token for endpoint {e_id}.") continue req_header = { @@ -229,7 +229,7 @@ def monitor_replicas_number(): cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id) if cached_token is None: - # logging.error(f"Failed to get the cached token for endpoint {endpoint_id}.") + logging.error(f"Failed to get the cached token for endpoint {endpoint_id}.") return req_header = { @@ -339,10 +339,6 @@ def monitor_replicas_perf(edge_id, mqtt_mgr=None): def monitor_slave_run_process_status(self): try: count = 0 - try: - client_data_interface.FedMLClientDataInterface.get_instance().create_job_table() - except Exception as e: - pass job_list = client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db() for job in job_list.job_list: count += 1 @@ -452,10 +448,6 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None try: ComputeCacheManager.get_instance().set_redis_params() count = 0 - try: - server_data_interface.FedMLServerDataInterface.get_instance().create_job_table() - except Exception as e: - pass job_list = server_data_interface.FedMLServerDataInterface.get_instance().get_jobs_from_db() for job in job_list.job_list: count += 1 diff --git a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py index 6dd575f307..05cc342e36 100644 --- a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py @@ -135,15 +135,13 @@ def save_run_process(run_id, process_id, data_dir, info_dir, pass @staticmethod - def kill_process(process_id, exclude_current_pid=False): + def kill_process(process_id): try: process = psutil.Process(process_id) if process is None: return child_processes = process.children(recursive=True) for sub_process in child_processes: - if exclude_current_pid and sub_process.pid == os.getpid(): - continue if platform.system() == 'Windows': os.system("taskkill /PID {} /T /F".format(sub_process.pid)) else: diff --git a/python/fedml/computing/scheduler/comm_utils/sys_utils.py b/python/fedml/computing/scheduler/comm_utils/sys_utils.py index 065482c23b..aaa37bc4db 100644 --- a/python/fedml/computing/scheduler/comm_utils/sys_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/sys_utils.py @@ -114,8 +114,6 @@ def get_sys_runner_info(): except: pass - enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() - if enable_simulation_gpu: gpu_count = simulation_gpu_count gpu_total_mem = "80G" @@ -130,26 +128,9 @@ def get_sys_runner_info(): gpu_count, gpu_vendor, cpu_count, gpu_device_name -def get_simulation_gpu_env(): - _enable_simulation_gpu = enable_simulation_gpu - _simulation_gpu_count = simulation_gpu_count - - env_enable_simulation_gpu = os.getenv("FEDML_ENABLE_SIMULATION_GPU", None) - if env_enable_simulation_gpu is not None: - _enable_simulation_gpu = True if env_enable_simulation_gpu == "1" or env_enable_simulation_gpu == 1 else False - - env_simulation_gpu_count = os.getenv("FEDML_SIMULATION_GPU_COUNT", None) - if env_simulation_gpu_count is not None: - _simulation_gpu_count = int(env_simulation_gpu_count) - - return _enable_simulation_gpu, _simulation_gpu_count - - # GPU list: [GPU(ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, # gpu_name, serial, display_mode, display_active, temperature)] def get_gpu_list(): - enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() - if enable_simulation_gpu: ret_gpu_list = [ {'ID': 0, 'uuid': 'GPU-dab987f0-be09-294a-96d6-f9afeef49877', 'load': 1.0, @@ -203,8 +184,6 @@ def get_gpu_list(): def get_available_gpu_id_list(limit=1) -> List[int]: - enable_simulation_gpu, simulation_gpu_count = get_simulation_gpu_env() - if enable_simulation_gpu: available_gpu_ids = [0, 1, 2, 3, 4, 5, 6, 7] if simulation_gpu_count > 8: diff --git a/python/fedml/computing/scheduler/master/base_master_agent.py b/python/fedml/computing/scheduler/master/base_master_agent.py index 30cf5da1c9..3aff523c24 100755 --- a/python/fedml/computing/scheduler/master/base_master_agent.py +++ b/python/fedml/computing/scheduler/master/base_master_agent.py @@ -23,9 +23,7 @@ def __init__(self): def login( self, user_id, api_key=None, device_id=None, - os_name=None, role=None, runner_cmd=None, - communication_manager=None, sender_message_queue=None, - status_center_queue=None, sender_message_event=None + os_name=None, role=None, runner_cmd=None ): # Login account login_result = FedMLAccountManager.get_instance().login( @@ -50,31 +48,20 @@ def login( # Initialize the protocol manager # noinspection PyBoardException try: - self._initialize_protocol_manager( - communication_manager=communication_manager, - sender_message_queue=sender_message_queue, - status_center_queue=status_center_queue, - sender_message_event=sender_message_event) + self._initialize_protocol_manager() except Exception as e: FedMLAccountManager.write_login_failed_file(is_client=False) self.protocol_mgr.stop() raise e # Start the protocol manager to process the messages from MLOps and slave agents. - if communication_manager is None: - self.protocol_mgr.start() - - return login_result + self.protocol_mgr.start() @staticmethod def logout(): GeneralConstants.cleanup_run_process(None, is_master=True) sys_utils.cleanup_all_fedml_server_api_processes() - def stop(self, kill_process=False): - if self.protocol_mgr is not None: - self.protocol_mgr.stop(kill_process=kill_process) - def _create_protocol_manager(self, role, login_result): if self.protocol_mgr is not None: return @@ -82,11 +69,7 @@ def _create_protocol_manager(self, role, login_result): login_result, agent_config=login_result.agent_config) self.protocol_mgr.run_as_edge_server_and_agent = True \ if role == FedMLAccountManager.ROLE_EDGE_SERVER else False - self.protocol_mgr.run_as_cloud_agent = True \ - if role == FedMLAccountManager.ROLE_CLOUD_AGENT or role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER \ - else False - self.use_local_process_as_cloud_server = True \ - if role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER else self.use_local_process_as_cloud_server + self.protocol_mgr.run_as_cloud_agent = True if role == FedMLAccountManager.ROLE_CLOUD_AGENT else False self.protocol_mgr.run_as_cloud_server = True if role == FedMLAccountManager.ROLE_CLOUD_SERVER else False self.protocol_mgr.args = login_result self.protocol_mgr.edge_id = login_result.edge_id @@ -96,20 +79,12 @@ def _create_protocol_manager(self, role, login_result): self.protocol_mgr.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent self.protocol_mgr.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server - def _initialize_protocol_manager( - self, communication_manager=None, sender_message_queue=None, - status_center_queue=None, sender_message_event=None - ): + def _initialize_protocol_manager(self): # Init local database self._init_database() # Initialize the master protocol - self.protocol_mgr.set_parent_agent(self) - self.protocol_mgr.initialize( - communication_manager=communication_manager, - sender_message_queue=sender_message_queue, - status_center_queue=status_center_queue, - sender_message_event=sender_message_event) + self.protocol_mgr.initialize() # Report the IDLE status to MLOps self.mlops_metrics.report_server_training_status( @@ -134,9 +109,6 @@ def _init_logs(self, agent_args, edge_id): in_args.server_agent_id = edge_id MLOpsRuntimeLog.get_instance(in_args).init_logs() - def get_protocol_manager(self): - return self.protocol_mgr - @abstractmethod def _get_log_file_dir(self): pass @@ -152,17 +124,3 @@ def _init_database(self): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None - - def start_master_server_instance(self, payload): - self.protocol_mgr.start_master_server_instance(payload) - - def generate_agent_instance(self): - return FedMLBaseMasterAgent() - - def process_job_complete_status(self, run_id, topic, payload): - if self.protocol_mgr is None: - return - if topic in self.protocol_mgr.get_subscribed_topics(): - message_handler = self.protocol_mgr.get_listener_handler(topic) - if message_handler is not None: - message_handler(topic, payload) diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner.py b/python/fedml/computing/scheduler/master/base_master_job_runner.py index fdfff143aa..9ebab258bb 100755 --- a/python/fedml/computing/scheduler/master/base_master_job_runner.py +++ b/python/fedml/computing/scheduler/master/base_master_job_runner.py @@ -1,3 +1,4 @@ + import json import logging import multiprocessing @@ -6,9 +7,6 @@ import os import time import traceback - -import setproctitle - from ..scheduler_entry.constants import Constants from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ..master.server_constants import ServerConstants @@ -21,6 +19,7 @@ from fedml.utils.debugging import debug from ..scheduler_core.status_center import JobStatus from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from multiprocessing import Process, Queue from ..scheduler_core.general_constants import GeneralConstants from ..scheduler_core.scheduler_base_job_runner import FedMLSchedulerBaseJobRunner, RunnerError, RunnerCompletedError from abc import ABC, abstractmethod @@ -44,13 +43,13 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id is_master_runner=True ) - self.run_edge_id_status_queue = multiprocessing.Manager().Queue() - self.run_metrics_queue = multiprocessing.Manager().Queue() - self.run_events_queue = multiprocessing.Manager().Queue() - self.run_artifacts_queue = multiprocessing.Manager().Queue() - self.run_logs_queue = multiprocessing.Manager().Queue() - self.run_edge_device_info_queue = multiprocessing.Manager().Queue() - self.run_edge_device_info_global_queue = multiprocessing.Manager().Queue() + self.run_edge_id_status_queue = Queue() + self.run_metrics_queue = Queue() + self.run_events_queue = Queue() + self.run_artifacts_queue = Queue() + self.run_logs_queue = Queue() + self.run_edge_device_info_queue = Queue() + self.run_edge_device_info_global_queue = Queue() self.run_extend_queue_list = None self.async_check_timeout = 0 self.enable_async_cluster = False @@ -69,12 +68,9 @@ def run( edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, - status_center_queue=None, process_name=None + status_center_queue=None ): - if process_name is not None: - setproctitle.setproctitle(process_name) - - print(f"Master job runner process id {os.getpid()}, name {process_name}, run id {self.run_id}") + print(f"Master job runner process id {os.getpid()}, run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -173,8 +169,7 @@ def run_impl( run_id, self.request_json, edge_id=self.edge_id, is_server_job=True, sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue, - process_name=GeneralConstants.get_launch_master_user_process_name(run_id, self.edge_id) + status_center_queue=status_center_queue ) # Check if the run status is normal @@ -236,12 +231,9 @@ def run_server_job( edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, - status_center_queue=None, process_name=None + status_center_queue=None ): - if process_name is not None: - setproctitle.setproctitle(process_name) - - print(f"Server runner process id {os.getpid()}, name {process_name}. run id {self.run_id}") + print(f"Server runner process id {os.getpid()}, run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -413,9 +405,9 @@ def _generate_job_runner_instance(self, args, run_id=None, request_json=None, ag return None def start_runner_process( - self, run_id, request_json, edge_id=None, is_server_job=False, - sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, process_name=None + self, run_id, request_json, edge_id=None, is_server_job=False, + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, ): server_runner = self._generate_job_runner_instance( self.args, run_id=run_id, request_json=request_json, @@ -433,26 +425,14 @@ def start_runner_process( server_runner.edge_id_status_queue = self.run_edge_id_status_queue server_runner.edge_device_info_queue = self.run_edge_device_info_queue self.run_extend_queue_list = self._generate_extend_queue_list() - if platform.system() == "Windows": - self.run_process = multiprocessing.Process( - target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( - self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, - self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, - self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, - self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue, - process_name, - ) - ) - else: - self.run_process = fedml.get_process( - target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( - self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, - self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, - self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, - self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue, - process_name, - ) + self.run_process = Process( + target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( + self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, + self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, + self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, + self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue ) + ) self.run_process.start() ServerConstants.save_run_process(run_id, self.run_process.pid) return self.run_process @@ -464,7 +444,7 @@ def put_run_edge_device_info_to_queue(self, run_id, edge_id, device_info): if int(edge_id) in edge_ids or str(edge_id) in edge_ids: run_id_str = str(run_id) if self.run_edge_device_info_queue is None: - self.run_edge_device_info_queue = multiprocessing.Manager().Queue() + self.run_edge_device_info_queue = Queue() self.run_edge_device_info_queue.put(device_info) def should_continue_run_job(self, run_id): @@ -592,7 +572,7 @@ def callback_run_logs(self, topic, payload): run_id = str(topic).split('/')[-1] run_id_str = str(run_id) if self.run_logs_queue is None: - self.run_logs_queue = multiprocessing.Manager().Queue() + self.run_logs_queue = Queue() self.run_logs_queue.put(payload) def callback_run_metrics(self, topic, payload): @@ -600,7 +580,7 @@ def callback_run_metrics(self, topic, payload): run_id = str(topic).split('/')[-1] run_id_str = str(run_id) if self.run_metrics_queue is None: - self.run_metrics_queue = multiprocessing.Manager().Queue() + self.run_metrics_queue = Queue() self.run_metrics_queue.put(payload) # def send_training_request_to_edges(self, active_edge_info_dict): @@ -730,3 +710,6 @@ def should_process_async_cluster(self): def get_client_id_list(self, server_edge_id_list): return server_edge_id_list + + + diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py index 39f7438696..6831c9d034 100755 --- a/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py +++ b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py @@ -1,39 +1,27 @@ import base64 import json import logging -import multiprocessing -import platform import time from abc import ABC from multiprocessing import Process - -import fedml from .cloud_server_manager import FedMLCloudServerManager -from ..comm_utils.run_process_utils import RunProcessUtils from ..scheduler_core.scheduler_base_job_runner_manager import FedMLSchedulerBaseJobRunnerManager -from ..scheduler_core.account_manager import FedMLAccountManager class FedMLBaseMasterJobRunnerManager(FedMLSchedulerBaseJobRunnerManager, ABC): def __init__(self): FedMLSchedulerBaseJobRunnerManager.__init__(self) - if not hasattr(self, "master_agent_instance_map"): - self.master_agent_instance_map = dict() # Override def start_job_runner( self, run_id, request_json, args=None, edge_id=None, is_server_job=False, sender_message_queue=None, listener_message_queue=None, status_center_queue=None, - communication_manager=None, master_agent_instance=None, should_start_cloud_server=False, - use_local_process_as_cloud_server=False, cuda_visible_gpu_ids_str=None, process_name=None + should_start_cloud_server=False, use_local_process_as_cloud_server=False, + cuda_visible_gpu_ids_str=None ): if should_start_cloud_server: - self._start_cloud_server( - args, run_id, request_json, edge_id=edge_id, - use_local_process_as_cloud_server=use_local_process_as_cloud_server, - sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue, communication_manager=communication_manager, - master_agent_instance=master_agent_instance, process_name=process_name) + self._start_cloud_server(args, run_id, request_json, edge_id=edge_id, + use_local_process_as_cloud_server=use_local_process_as_cloud_server) return run_id_str = str(run_id) @@ -45,58 +33,34 @@ def start_job_runner( run_id, request_json, edge_id=edge_id, is_server_job=is_server_job, sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue, - process_name=process_name + status_center_queue=status_center_queue ) def stop_job_runner( self, run_id, args=None, server_id=None, request_json=None, - run_as_cloud_agent=False, run_as_cloud_server=False, - use_local_process_as_cloud_server=False + run_as_cloud_agent=False, run_as_cloud_server=False ): super().stop_job_runner(run_id) if run_as_cloud_agent or run_as_cloud_server: - if not use_local_process_as_cloud_server: - stopping_process = Process( - target=FedMLCloudServerManager.stop_cloud_server, - args=(run_id, server_id, args.agent_config)) - stopping_process.start() - - run_id_str = str(run_id) - if self.master_agent_instance_map.get(run_id_str, None) is not None: - self.master_agent_instance_map.get(run_id_str).stop(kill_process=True) - self.master_agent_instance_map.pop(run_id_str) - - if use_local_process_as_cloud_server: - time.sleep(1) - RunProcessUtils.kill_process(self.cloud_run_process_map[run_id_str].pid) + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) + stopping_process.start() def complete_job_runner( self, run_id, args=None, server_id=None, request_json=None, - run_as_cloud_agent=False, run_as_cloud_server=False, - use_local_process_as_cloud_server=False + run_as_cloud_agent=False, run_as_cloud_server=False ): super().complete_job_runner(run_id) if run_as_cloud_agent or run_as_cloud_server: - if not use_local_process_as_cloud_server: - stopping_process = Process( - target=FedMLCloudServerManager.stop_cloud_server, - args=(run_id, server_id, args.agent_config)) - stopping_process.start() - - run_id_str = str(run_id) - if self.master_agent_instance_map.get(run_id_str, None) is not None: - self.master_agent_instance_map.get(run_id_str).stop(kill_process=True) - self.master_agent_instance_map.pop(run_id_str) + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) + stopping_process.start() def _start_cloud_server( self, args, run_id, request_json, edge_id=None, - use_local_process_as_cloud_server=False, - sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, communication_manager=None, - master_agent_instance=None, process_name=None + use_local_process_as_cloud_server=False ): run_id_str = str(run_id) cloud_server_mgr = FedMLCloudServerManager( @@ -107,49 +71,19 @@ def _start_cloud_server( self.cloud_run_process_map[run_id_str] = Process(target=cloud_server_mgr.start_cloud_server_process_entry) self.cloud_run_process_map[run_id_str].start() else: - cloud_device_id = request_json.get("cloudServerDeviceId", "0") - server_id = request_json.get("server_id", 0) message_bytes = json.dumps(request_json).encode("ascii") base64_bytes = base64.b64encode(message_bytes) - payload = base64_bytes.decode("ascii") - self.master_agent_instance_map[str(run_id)] = master_agent_instance - - logging.info("start the master server: {}".format(payload)) + runner_cmd_encoded = base64_bytes.decode("ascii") + cloud_device_id = request_json.get("cloudServerDeviceId", "0") - if platform.system() == "Windows": - self.run_process = multiprocessing.Process( - target=cloud_server_mgr.start_local_master_server, - args=(args.account_id, args.api_key, args.os_name, args.version, - cloud_device_id, run_id, payload, - communication_manager, sender_message_queue, - status_center_queue, master_agent_instance, process_name)) - else: - self.cloud_run_process_map[run_id_str] = fedml.get_process( - target=cloud_server_mgr.start_local_master_server, - args=(args.account_id, args.api_key, args.os_name, args.version, - cloud_device_id, run_id, payload, - communication_manager, sender_message_queue, - status_center_queue, master_agent_instance, process_name)) + logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) + self.cloud_run_process_map[run_id_str] = Process( + target=cloud_server_mgr.start_local_cloud_server, + args=(args.account_id, args.version, cloud_device_id, runner_cmd_encoded)) self.cloud_run_process_map[run_id_str].start() time.sleep(1) - def start_local_master_server( - self, user, api_key, os_name, version, cloud_device_id, run_id, payload, - communication_manager=None, sender_message_queue=None, status_center_queue=None, - master_agent_instance=None - ): - if master_agent_instance is None: - return - master_agent_instance.login( - user, api_key=api_key, device_id=cloud_device_id, os_name=os_name, - role=FedMLAccountManager.ROLE_CLOUD_SERVER, - communication_manager=None, - sender_message_queue=None, - status_center_queue=None) - self.master_agent_instance_map[str(run_id)] = master_agent_instance - master_agent_instance.start_master_server_instance(payload) - def callback_run_logs(self, run_id, topic, payload): run_id_str = str(run_id) if self.job_runners.get(run_id_str, None) is not None: @@ -159,12 +93,3 @@ def callback_run_metrics(self, run_id, topic, payload): run_id_str = str(run_id) if self.job_runners.get(run_id_str, None) is not None: self.job_runners[run_id_str].callback_run_metrics(topic, payload) - - def callback_proxy_unknown_messages(self, run_id, topic, payload): - run_id_str = str(run_id) - master_agent = self.master_agent_instance_map.get(run_id_str, None) - if master_agent is None: - return - master_agent.process_job_complete_status(run_id, topic, payload) - - diff --git a/python/fedml/computing/scheduler/master/base_master_protocol_manager.py b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py index 05529f8c8e..1c4cbba4f4 100755 --- a/python/fedml/computing/scheduler/master/base_master_protocol_manager.py +++ b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py @@ -2,8 +2,6 @@ import base64 import json import logging -import time - import fedml from ..comm_utils.constants import SchedulerConstants from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog @@ -143,7 +141,6 @@ def on_agent_communication_connected(self, mqtt_client_object): def callback_start_train(self, topic=None, payload=None): # Fetch config from MLOps # noinspection PyBroadException - try: MLOpsConfigs.fetch_all_configs() except Exception: @@ -200,7 +197,7 @@ def callback_start_train(self, topic=None, payload=None): self.run_edge_ids[run_id_str] = edge_id_list # report server running status to master agent - if not self.run_as_cloud_server and not self.run_as_cloud_agent: + if not self.run_as_cloud_server: self.mlops_metrics.report_server_id_status( run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, server_id=self.edge_id, server_agent_id=self.edge_id, running_json=payload) @@ -215,9 +212,7 @@ def callback_start_train(self, topic=None, payload=None): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), - communication_manager=self.get_listener_communication_manager(), - process_name=GeneralConstants.get_launch_master_job_process_name(run_id, self.edge_id) + status_center_queue=self.get_status_queue() ) process = self._get_job_runner_manager().get_runner_process(run_id) @@ -228,17 +223,12 @@ def callback_start_train(self, topic=None, payload=None): elif self.run_as_cloud_agent: self.init_job_task(request_json) - server_id = request_json.get("server_id", self.edge_id) self._get_job_runner_manager().start_job_runner( run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), - communication_manager=self.get_listener_communication_manager(), - master_agent_instance=self.generate_agent_instance(), - should_start_cloud_server=True, - use_local_process_as_cloud_server=self.use_local_process_as_cloud_server, - process_name=GeneralConstants.get_launch_master_job_process_name(run_id, server_id) + status_center_queue=self.get_status_queue(), should_start_cloud_server=True, + use_local_process_as_cloud_server=self.use_local_process_as_cloud_server ) process = self._get_job_runner_manager().get_runner_process(run_id, is_cloud_server=True) @@ -247,7 +237,6 @@ def callback_start_train(self, topic=None, payload=None): elif self.run_as_cloud_server: self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) self.start_request_json = json.dumps(request_json) - server_id = request_json.get("server_id", self.edge_id) run_id = request_json["runId"] run_id_str = str(run_id) @@ -259,12 +248,10 @@ def callback_start_train(self, topic=None, payload=None): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), - communication_manager=self.get_listener_communication_manager(), - process_name=GeneralConstants.get_launch_master_job_process_name(run_id, server_id) + status_center_queue=self.get_status_queue() ) - self.send_status_msg_to_edges(edge_id_list, run_id, server_id) + self.send_status_msg_to_edges(edge_id_list, run_id, self.edge_id) def callback_stop_train(self, topic, payload, use_payload=None): # Print the payload @@ -292,16 +279,6 @@ def callback_stop_train(self, topic, payload, use_payload=None): server_agent_id = self.edge_id topic_stop_train_to_cloud_server = f"mlops/flserver_agent_{server_id}/stop_train" self.message_center.send_message(topic_stop_train_to_cloud_server, payload) - - time.sleep(2) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, server_id) - self._get_job_runner_manager().stop_job_runner( - run_id, args=self.args, server_id=server_id, request_json=None, - run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server, - use_local_process_as_cloud_server=self.use_local_process_as_cloud_server) - self.generate_status_report(run_id, server_id, server_agent_id=server_agent_id). \ - report_server_id_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED, - edge_id=server_id, server_id=server_id) return # Reset all edge status and server status @@ -327,11 +304,7 @@ def callback_complete_job(self, topic, payload): self._process_job_complete_status(run_id, server_id, request_json) def _process_job_complete_status(self, run_id, server_id, complete_payload): - # Complete the job runner - self._get_job_runner_manager().complete_job_runner( - run_id, args=self.args, server_id=server_id, request_json=complete_payload, - run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server, - use_local_process_as_cloud_server=self.use_local_process_as_cloud_server) + pass def callback_run_logs(self, topic, payload): run_id = str(topic).split('/')[-1] @@ -417,12 +390,6 @@ def callback_request_job_status(self, topic, payload): def callback_request_device_status_in_job(self, topic, payload): self.response_device_status_in_job(topic, payload) - def callback_proxy_unknown_messages(self, run_id, topic, payload): - self._get_job_runner_manager().callback_proxy_unknown_messages(run_id, topic, payload) - - def process_extra_queues(self, extra_queues): - self.rebuild_status_center(extra_queues[0]) - def generate_protocol_manager(self): message_status_runner = self._generate_protocol_manager_instance( self.args, agent_config=self.agent_config @@ -509,8 +476,6 @@ def init_job_task(self, request_json): self.setup_listener_for_run_logs(run_id) def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): - if self.run_as_cloud_agent: - return edge_status_topic = "fl_client/flclient_agent_" + str(server_id) + "/status" payload = {"run_id": run_id, "init_all_edge_id_list": edge_ids, "init_server_id": server_id} self.callback_edge_status(edge_status_topic, json.dumps(payload)) @@ -521,9 +486,6 @@ def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): self.subscribe_msg(edge_status_topic) def remove_listeners_for_edge_status(self, edge_ids=None): - if self.run_as_cloud_agent: - return - if edge_ids is None: edge_ids = self.request_json["edgeids"] @@ -580,7 +542,7 @@ def send_status_check_msg(self, run_id, edge_id, server_id, context=None): def send_status_msg_to_edges(self, edge_id_list, run_id, server_id, context=None): # Send status message to all edges for edge_id in edge_id_list: - self.send_status_check_msg(run_id, edge_id, server_id, context=context) + self.send_status_check_msg(run_id, edge_id, self.edge_id, context=context) def report_exception_status(self, run_id): self.mlops_metrics.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION) @@ -592,9 +554,3 @@ def get_start_train_topic_with_edge_id(edge_id): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None - - def start_master_server_instance(self, payload): - super().on_agent_communication_connected(None) - - self.receive_message_json(self.topic_start_train, payload) - diff --git a/python/fedml/computing/scheduler/master/cloud_server_manager.py b/python/fedml/computing/scheduler/master/cloud_server_manager.py index 3669cb32bc..040a0f38a3 100755 --- a/python/fedml/computing/scheduler/master/cloud_server_manager.py +++ b/python/fedml/computing/scheduler/master/cloud_server_manager.py @@ -2,14 +2,10 @@ import json import logging import os -import platform import traceback -import setproctitle - import fedml from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program -from fedml.computing.scheduler.scheduler_core.account_manager import FedMLAccountManager class FedMLCloudServerManager: @@ -35,37 +31,14 @@ def __init__(self, args, run_id=None, edge_id=None, request_json=None, agent_con self.cloud_server_name = None @staticmethod - def start_local_cloud_server(user, api_key, os_name, version, cloud_device_id, runner_cmd_encoded): - if platform.system() != "Windows": - os.setsid() - + def start_local_cloud_server(user, version, cloud_device_id, runner_cmd_encoded): print(f"start cloud server, device id {cloud_device_id}, runner cmd {runner_cmd_encoded}") pip_source_dir = os.path.dirname(__file__) login_cmd = os.path.join(pip_source_dir, "server_login.py") run_cmd = f"{get_python_program()} -W ignore {login_cmd} -t login -r cloud_server -u {str(user)} " \ - f"-k {api_key} -v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" + f"-v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" os.system(run_cmd) - def start_local_master_server( - self, user, api_key, os_name, version, cloud_device_id, run_id, payload, - communication_manager=None, sender_message_queue=None, status_center_queue=None, - master_agent_instance=None, process_name=None - ): - if process_name is not None: - setproctitle.setproctitle(process_name) - - logging.info(f"Local master server pid: {os.getpid()}") - if platform.system() != "Windows": - os.setsid() - - master_agent_instance.login( - user, api_key=api_key, device_id=cloud_device_id, os_name=os_name, - role=FedMLAccountManager.ROLE_CLOUD_SERVER, runner_cmd=payload, - communication_manager=None, sender_message_queue=None, - status_center_queue=None) - - master_agent_instance.stop() - def start_cloud_server_process_entry(self): try: self.start_cloud_server_process() diff --git a/python/fedml/computing/scheduler/master/master_protocol_manager.py b/python/fedml/computing/scheduler/master/master_protocol_manager.py index 1adda439c6..ca9621e41d 100755 --- a/python/fedml/computing/scheduler/master/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/master/master_protocol_manager.py @@ -7,9 +7,8 @@ class FedMLLaunchMasterProtocolManager(FedMLBaseMasterProtocolManager, ABC): def __init__(self, args, agent_config=None): FedMLBaseMasterProtocolManager.__init__(self, args, agent_config=agent_config) - self.message_center_name = "launch_master_agent" - # Override + # Override def generate_topics(self): super().generate_topics() @@ -36,6 +35,9 @@ def _init_extra_items(self): def print_connected_info(self): super().print_connected_info() - def generate_agent_instance(self): - from .master_agent import FedMLLaunchMasterAgent - return FedMLLaunchMasterAgent() + # Override + def _process_job_complete_status(self, run_id, server_id, complete_payload): + # Complete the job runner + self._get_job_runner_manager().complete_job_runner( + run_id, args=self.args, server_id=server_id, request_json=complete_payload, + run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server) diff --git a/python/fedml/computing/scheduler/master/server_login.py b/python/fedml/computing/scheduler/master/server_login.py index be7b73103f..8dd0696bc8 100755 --- a/python/fedml/computing/scheduler/master/server_login.py +++ b/python/fedml/computing/scheduler/master/server_login.py @@ -41,5 +41,4 @@ def logout(): master_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id, os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd) else: - master_agent.stop() master_agent.logout() diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 32f5ebdeab..ab6bc4c895 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -1,12 +1,12 @@ import copy import json import logging -import multiprocessing import os import time import queue import traceback from abc import ABC +from multiprocessing import Queue import fedml from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs @@ -50,7 +50,7 @@ def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id self.replica_controller = None self.deployed_replica_payload = None self.slave_deployment_results_map = dict() - self.deployment_result_queue = multiprocessing.Manager().Queue() + self.deployment_result_queue = Queue() self.is_fresh_endpoint = True # Override diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py index efa56f4db5..9e0d51b588 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -264,8 +264,7 @@ def callback_start_deployment(self, topic, payload): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), - process_name=GeneralConstants.get_deploy_master_job_process_name(run_id, self.edge_id) + status_center_queue=self.get_status_queue() ) process = self._get_job_runner_manager().get_runner_process(run_id) if process is not None: diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_client.py b/python/fedml/computing/scheduler/model_scheduler/model_device_client.py new file mode 100755 index 0000000000..05f43afc5f --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/model_device_client.py @@ -0,0 +1,98 @@ + +import copy +import logging +import multiprocessing +import time +import traceback +from multiprocessing import Process +from ..scheduler_core.account_manager import FedMLAccountManager +from .worker_agent import FedMLDeployWorkerAgent + + +class FedMLModelDeviceClientRunner: + def __init__(self, args, current_device_id, os_name, is_from_docker, service_config, infer_host="127.0.0.1"): + self.agent_process = None + self.agent_runner = None + self.agent_process_event = None + self.args = copy.deepcopy(args) + self.service_config = service_config + self.unique_device_id = None + self.current_device_id = current_device_id + self.os_name = os_name + self.is_from_docker = is_from_docker + self.edge_id = None + self.infer_host = infer_host + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + + def get_edge_id(self): + return self.edge_id + + def start(self): + self.agent_runner = FedMLModelDeviceClientRunner(self.args, self.current_device_id, self.os_name, + self.is_from_docker, self.service_config) + self.agent_runner.infer_host = self.infer_host + self.agent_runner.redis_addr = self.redis_addr + self.agent_runner.redis_port = self.redis_port + self.agent_runner.redis_password = self.redis_password + if self.agent_process_event is None: + self.agent_process_event = multiprocessing.Event() + self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args,)) + self.edge_id = self.bind_device() + self.agent_process.start() + + def run_entry(self, process_event, in_args): + # print(f"Model worker process id {os.getpid()}") + + self.agent_process_event = process_event + + worker_agent = FedMLDeployWorkerAgent() + + while not self.agent_process_event.is_set(): + try: + try: + worker_agent.logout() + except Exception as e: + pass + + worker_agent.login( + in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, + os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM + ) + except Exception as e: + logging.info("Restart model device client: {}".format(traceback.format_exc())) + pass + finally: + try: + worker_agent.logout() + except Exception as e: + pass + time.sleep(15) + + try: + self.stop() + except Exception as e: + pass + + def check_runner_stop_event(self): + if self.agent_process_event is not None and self.agent_process_event.is_set(): + logging.info("Received stopping event.") + raise Exception("Runner stopped") + + def stop(self): + FedMLDeployWorkerAgent.logout() + + if self.agent_process_event is not None: + self.agent_process_event.set() + + def bind_device(self): + # Login account + login_result = FedMLAccountManager.get_instance().login( + self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, + os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM + ) + if login_result is not None: + return login_result.edge_id + else: + return None diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_server.py b/python/fedml/computing/scheduler/model_scheduler/model_device_server.py new file mode 100755 index 0000000000..b2ecd144b1 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/model_device_server.py @@ -0,0 +1,97 @@ + +import copy +import logging +import multiprocessing +import time +import traceback +from multiprocessing import Process +from ..scheduler_core.account_manager import FedMLAccountManager +from .master_agent import FedMLDeployMasterAgent + + +class FedMLModelDeviceServerRunner: + def __init__(self, args, current_device_id, os_name, is_from_docker, service_config, infer_host="127.0.0.1"): + self.agent_process = None + self.agent_runner = None + self.agent_process_event = None + self.args = copy.deepcopy(args) + self.service_config = service_config + self.unique_device_id = None + self.current_device_id = current_device_id + self.os_name = os_name + self.is_from_docker = is_from_docker + self.edge_id = None + self.infer_host = infer_host + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + + def get_edge_id(self): + return self.edge_id + + def start(self): + self.agent_runner = FedMLModelDeviceServerRunner(self.args, self.current_device_id, self.os_name, + self.is_from_docker, self.service_config) + self.agent_runner.infer_host = self.infer_host + self.agent_runner.redis_addr = self.redis_addr + self.agent_runner.redis_port = self.redis_port + self.agent_runner.redis_password = self.redis_password + if self.agent_process_event is None: + self.agent_process_event = multiprocessing.Event() + self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args)) + self.edge_id = self.bind_device() + self.agent_process.start() + + def run_entry(self, process_event, in_args): + # print(f"Model master process id {os.getpid()}") + + self.agent_process_event = process_event + master_agent = FedMLDeployMasterAgent() + + while not self.agent_process_event.is_set(): + try: + try: + master_agent.logout() + except Exception as e: + pass + + master_agent.login( + in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, + os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM + ) + except Exception as e: + logging.info("Restart model device server: {}".format(traceback.format_exc())) + pass + finally: + try: + master_agent.logout() + except Exception as e: + pass + time.sleep(15) + + try: + self.stop() + except Exception as e: + pass + + def check_runner_stop_event(self): + if self.agent_process_event is not None and self.agent_process_event.is_set(): + logging.info("Received stopping event.") + raise Exception("Runner stopped") + + def stop(self): + FedMLDeployMasterAgent.logout() + + if self.agent_process_event is not None: + self.agent_process_event.set() + + def bind_device(self): + # Login account + login_result = FedMLAccountManager.get_instance().login( + self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, + os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM + ) + if login_result is not None: + return login_result.edge_id + else: + return None diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py index 9204291c48..b1d0bebc47 100755 --- a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -12,7 +12,6 @@ from .device_model_msg_object import FedMLModelMsgObject from .device_client_constants import ClientConstants from .device_client_data_interface import FedMLClientDataInterface -from ..scheduler_core.general_constants import GeneralConstants from ..slave.base_slave_protocol_manager import FedMLBaseSlaveProtocolManager from .worker_job_runner_manager import FedMLDeployJobRunnerManager from .device_mqtt_inference_protocol import FedMLMqttInference @@ -164,8 +163,7 @@ def callback_start_deployment(self, topic, payload): run_id, request_json, args=self.args, edge_id=self.edge_id, sender_message_queue=self.message_center.get_sender_message_queue(), listener_message_queue=self.get_listener_message_queue(), - status_center_queue=self.get_status_queue(), - process_name=GeneralConstants.get_deploy_slave_job_process_name(run_id, self.edge_id) + status_center_queue=self.get_status_queue() ) process = self._get_job_runner_manager().get_runner_process(run_id) if process is not None: diff --git a/python/fedml/computing/scheduler/scheduler_core/account_manager.py b/python/fedml/computing/scheduler/scheduler_core/account_manager.py index 8d73a42679..3b80511d12 100755 --- a/python/fedml/computing/scheduler/scheduler_core/account_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/account_manager.py @@ -25,7 +25,6 @@ class FedMLAccountManager(Singleton): ROLE_CLOUD_SERVER = "cloud_server" ROLE_EDGE_DEVICE = "client" ROLE_GPU_PROVIDER = "gpu_supplier" - ROLE_GPU_MASTER_SERVER = "gpu_master_server" ROLE_DEPLOY_MASTER_ON_PREM = "md.on_premise_device.master" ROLE_DEPLOY_WORKER_ON_PREM = "md.on_premise_device" @@ -34,7 +33,6 @@ class FedMLAccountManager(Singleton): DEVICE_ID_SUFFIX_CLOUD_SERVER = ".Public.Server" DEVICE_ID_SUFFIX_EDGE_DEVICE = ".Edge.Device" DEVICE_ID_SUFFIX_GPU_PROVIDER = ".Edge.GPU.Supplier" - DEVICE_ID_SUFFIX_GPU_MASTER_SERVER = ".Edge.GPU.MasterServer" DEVICE_ID_SUFFIX_DEPLOY = "MDA" DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM = ".OnPremise.Master.Device" DEVICE_ID_SUFFIX_DEPLOY_WORKER_ON_PREM = ".OnPremise.Device" @@ -43,7 +41,8 @@ class FedMLAccountManager(Singleton): DEVICE_ID_DOCKER_HUB_TAG = ".DockerHub" def __init__(self): - pass + if not hasattr(self, "agent_args"): + self.agent_args = None @staticmethod def get_instance(): @@ -51,7 +50,7 @@ def get_instance(): def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, runner_cmd=None): # Build the agent args - agent_args = self.build_agent_args( + self.build_agent_args( user_id, api_key=api_key, device_id=device_id, os_name=os_name, role=role, runner_cmd=runner_cmd ) @@ -94,8 +93,8 @@ def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, ru # noinspection PyBroadException try: edge_id, user_name, extra_url, general_edge_id = FedMLAccountManager.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], agent_args.account_id, - agent_args.unique_device_id, agent_args.os_name, + service_config["ml_ops_config"]["EDGE_BINDING_URL"], self.agent_args.account_id, + self.agent_args.unique_device_id, self.agent_args.os_name, api_key=api_key, role=role ) if edge_id > 0: @@ -119,13 +118,13 @@ def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, ru return None # Fill the bound result to agent args. - agent_args = self.fill_argent_args( - agent_args, log_server_url=log_server_url, server_id=edge_id, + self.fill_argent_args( + log_server_url=log_server_url, server_id=edge_id, edge_id=edge_id, general_edge_id=general_edge_id, user_name=user_name, extra_url=extra_url, agent_config=service_config) - return agent_args + return self.agent_args def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, role=None, runner_cmd=None): # Generate the suffix for device based on the role @@ -145,9 +144,6 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_EDGE_DEVICE elif role == FedMLAccountManager.ROLE_GPU_PROVIDER: device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_GPU_PROVIDER - elif role == FedMLAccountManager.ROLE_GPU_MASTER_SERVER: - device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_GPU_MASTER_SERVER - is_master = True elif role == FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM: device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM is_master = True @@ -158,31 +154,32 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, # Build the agent args version = fedml.get_env_version() - agent_args = AgentArgs() - agent_args.role = role - agent_args.account_id = user_id - agent_args.api_key = api_key - agent_args.current_running_dir = GeneralConstants.get_deploy_fedml_home_dir(is_master=is_master) \ + if self.agent_args is None: + self.agent_args = AgentArgs() + self.agent_args.role = role + self.agent_args.account_id = user_id + self.agent_args.api_key = api_key + self.agent_args.current_running_dir = GeneralConstants.get_deploy_fedml_home_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_fedml_home_dir(is_master=is_master) sys_name = platform.system() if sys_name == "Darwin": sys_name = "MacOS" - agent_args.os_name = sys_name if os_name is None or os_name == "" else os_name - agent_args.version = version - agent_args.log_file_dir = GeneralConstants.get_deploy_log_file_dir(is_master=is_master) \ + self.agent_args.os_name = sys_name if os_name is None or os_name == "" else os_name + self.agent_args.version = version + self.agent_args.log_file_dir = GeneralConstants.get_deploy_log_file_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_log_file_dir(is_master=is_master) is_from_docker = False if device_id is not None and device_id != "0": - agent_args.current_device_id = device_id + self.agent_args.current_device_id = device_id else: data_dir = GeneralConstants.get_deploy_data_dir(is_master=is_master) \ if is_deploy else GeneralConstants.get_launch_data_dir(is_master=is_master) is_gpu_provider = True if role == FedMLAccountManager.ROLE_GPU_PROVIDER else False - agent_args.current_device_id = FedMLAccountManager.get_device_id( + self.agent_args.current_device_id = FedMLAccountManager.get_device_id( data_dir=data_dir, use_machine_id=is_gpu_provider) - agent_args.device_id = agent_args.current_device_id - agent_args.config_version = version - agent_args.cloud_region = "" + self.agent_args.device_id = self.agent_args.current_device_id + self.agent_args.config_version = version + self.agent_args.cloud_region = "" # Check if it is running in the fedml docker hub is_from_fedml_docker_hub = False @@ -194,29 +191,26 @@ def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, # Build unique device id docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_TAG if is_from_docker else "" docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_HUB_TAG if is_from_fedml_docker_hub else docker_tag - unique_device_id = f"{agent_args.current_device_id}@{agent_args.os_name}" \ + unique_device_id = f"{self.agent_args.current_device_id}@{self.agent_args.os_name}" \ f"{docker_tag}{device_id_suffix}" if role == FedMLAccountManager.ROLE_CLOUD_SERVER: - unique_device_id = agent_args.current_device_id + unique_device_id = self.agent_args.current_device_id # Set the unique device id - agent_args.is_from_docker = is_from_docker or is_from_fedml_docker_hub - agent_args.unique_device_id = unique_device_id - agent_args.runner_cmd = runner_cmd - - return agent_args + self.agent_args.is_from_docker = is_from_docker or is_from_fedml_docker_hub + self.agent_args.unique_device_id = unique_device_id + self.agent_args.runner_cmd = runner_cmd def fill_argent_args( - self, agent_args, log_server_url=None, server_id=None, edge_id=None, + self, log_server_url=None, server_id=None, edge_id=None, user_name=None, extra_url=None, general_edge_id=None, agent_config=None): - agent_args.log_server_url = log_server_url - agent_args.server_id = server_id - agent_args.edge_id = edge_id - agent_args.user_name = user_name - agent_args.extra_url = extra_url - agent_args.general_edge_id = general_edge_id - agent_args.agent_config = agent_config - return agent_args + self.agent_args.log_server_url = log_server_url + self.agent_args.server_id = server_id + self.agent_args.edge_id = edge_id + self.agent_args.user_name = user_name + self.agent_args.extra_url = extra_url + self.agent_args.general_edge_id = general_edge_id + self.agent_args.agent_config = agent_config @staticmethod def write_login_failed_file(is_client=True): diff --git a/python/fedml/computing/scheduler/scheduler_core/general_constants.py b/python/fedml/computing/scheduler/scheduler_core/general_constants.py index 0ab6f79577..8c60b17bdf 100755 --- a/python/fedml/computing/scheduler/scheduler_core/general_constants.py +++ b/python/fedml/computing/scheduler/scheduler_core/general_constants.py @@ -65,20 +65,6 @@ class GeneralConstants: FEDML_OTA_CMD_RESTART = "restart" FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT = "MODEL_END_POINT" - FEDML_PROCESS_NAME_PREFIX = "fedml-process-" - FEDML_LAUNCH_MASTER_JOB_RUNNER_TAG = "launch-master-job-runner" - FEDML_LAUNCH_SLAVE_JOB_RUNNER_TAG = "launch-slave-job-runner" - FEDML_LAUNCH_MASTER_USER_JOB_TAG = "launch-master-user-job" - FEDML_DEPLOY_MASTER_JOB_RUNNER_TAG = "deploy-master-job-runner" - FEDML_DEPLOY_SLAVE_JOB_RUNNER_TAG = "deploy-slave-job-runner" - FEDML_DEPLOY_MASTER_USER_JOB_TAG = "deploy-master-user-job" - FEDML_MESSAGE_CENTER_LISTENER_TAG = "message-center-listener" - FEDML_MESSAGE_CENTER_SENDER_TAG = "message-center-sender" - FEDML_STATUS_CENTER_TAG = "status-center" - FEDML_LOG_PROCESS_TAG = "log" - FEDML_MONITOR_PROCESS_TAG = "monitor" - - FEDML_TOPIC_STATUS_CENTER_STOP = "anywhere/status_center/stop" @staticmethod def get_package_unzip_dir(package_download_dir): @@ -230,65 +216,3 @@ def get_topic_complete_job(server_id): def get_payload_complete_job(run_id, server_id): payload_complete_job = {"runId": run_id, "serverId": server_id} return payload_complete_job - - @staticmethod - def get_process_name(process_tag, run_id=None, edge_id=None): - return f'{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{process_tag}'\ - f'{"-run-" + str(run_id) if run_id is not None and int(run_id) != 0 else ""}'\ - f'{"-edge-" + str(edge_id) if edge_id is not None else ""}' - - @staticmethod - def get_process_name_with_prefix(process_prefix, run_id=None, edge_id=None): - return f"{process_prefix}-run-{run_id}-edge-{edge_id}" - - @staticmethod - def get_launch_master_job_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_LAUNCH_MASTER_JOB_RUNNER_TAG, run_id, edge_id) - - @staticmethod - def get_launch_slave_job_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_LAUNCH_SLAVE_JOB_RUNNER_TAG, run_id, edge_id) - - @staticmethod - def get_launch_master_user_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_LAUNCH_MASTER_USER_JOB_TAG, run_id, edge_id) - - @staticmethod - def get_deploy_master_job_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_DEPLOY_MASTER_JOB_RUNNER_TAG, run_id, edge_id) - - @staticmethod - def get_deploy_slave_job_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_DEPLOY_SLAVE_JOB_RUNNER_TAG, run_id, edge_id) - - @staticmethod - def get_deploy_master_user_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_DEPLOY_MASTER_USER_JOB_TAG, run_id, edge_id) - - @staticmethod - def get_log_process_name(run_id, edge_id): - return GeneralConstants.get_process_name( - GeneralConstants.FEDML_LOG_PROCESS_TAG, run_id, edge_id) - - @staticmethod - def get_message_center_listener_process_name(message_center_name): - return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_MESSAGE_CENTER_LISTENER_TAG}-{message_center_name}" - - @staticmethod - def get_message_center_sender_process_name(message_center_name): - return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_MESSAGE_CENTER_SENDER_TAG}-{message_center_name}" - - @staticmethod - def get_status_center_process_name(status_center_tag): - return f"{GeneralConstants.FEDML_PROCESS_NAME_PREFIX}{GeneralConstants.FEDML_STATUS_CENTER_TAG}-{status_center_tag}" - - @staticmethod - def get_monitor_process_name(monitor_tag, run_id, edge_id): - return GeneralConstants.get_process_name( - f"{GeneralConstants.FEDML_MONITOR_PROCESS_TAG}-{monitor_tag}", run_id, edge_id) diff --git a/python/fedml/computing/scheduler/scheduler_core/message_center.py b/python/fedml/computing/scheduler/scheduler_core/message_center.py index 5f414d1873..dbe11700a0 100755 --- a/python/fedml/computing/scheduler/scheduler_core/message_center.py +++ b/python/fedml/computing/scheduler/scheduler_core/message_center.py @@ -1,20 +1,16 @@ import json import logging import os -import platform import threading import time import traceback import uuid import multiprocessing +from multiprocessing import Process, Queue import queue from os.path import expanduser -import setproctitle - -import fedml from fedml.core.distributed.communication.mqtt.mqtt_manager import MqttManager -from .general_constants import GeneralConstants from ..slave.client_constants import ClientConstants from ....core.mlops.mlops_metrics import MLOpsMetrics from operator import methodcaller @@ -24,7 +20,6 @@ class FedMLMessageCenter(object): FUNC_SETUP_MESSAGE_CENTER = "setup_message_center" FUNC_REBUILD_MESSAGE_CENTER = "rebuild_message_center" - FUNC_PROCESS_EXTRA_QUEUES = "process_extra_queues" ENABLE_SAVE_MESSAGE_TO_FILE = True PUBLISH_MESSAGE_RETRY_TIMEOUT = 60 * 1000.0 PUBLISH_MESSAGE_RETRY_COUNT = 3 @@ -32,12 +27,11 @@ class FedMLMessageCenter(object): MESSAGE_SENT_SUCCESS_RECORDS_FILE = "message-sent-success-records.log" MESSAGE_RECEIVED_RECORDS_FILE = "message-received-records.log" - def __init__(self, agent_config=None, sender_message_queue=None, - listener_message_queue=None, sender_message_event=None): + def __init__(self, agent_config=None, sender_message_queue=None, listener_message_queue=None): self.sender_agent_config = agent_config self.listener_agent_config = agent_config self.sender_message_queue = sender_message_queue - self.message_event = sender_message_event + self.message_event = None self.message_center_process = None self.sender_mqtt_mgr = None self.sender_mlops_metrics = None @@ -136,33 +130,21 @@ def release_sender_mqtt_mgr(self): def get_sender_message_queue(self): return self.sender_message_queue - def get_sender_message_event(self): - return self.message_event - def start_sender(self, message_center_name=None): - self.sender_message_queue = multiprocessing.Manager().Queue() + self.sender_message_queue = Queue() self.message_event = multiprocessing.Event() self.message_event.clear() - process_name = GeneralConstants.get_message_center_sender_process_name(message_center_name) message_center = FedMLMessageCenter(agent_config=self.sender_agent_config, sender_message_queue=self.sender_message_queue) - if platform.system() == "Windows": - self.message_center_process = multiprocessing.Process( - target=message_center.run_sender, args=( - self.message_event, self.sender_message_queue, - message_center_name, process_name - ) - ) - else: - self.message_center_process = fedml.get_process( - target=message_center.run_sender, args=( - self.message_event, self.sender_message_queue, - message_center_name, process_name - ) + self.message_center_process = Process( + target=message_center.run_sender, args=( + self.message_event, self.sender_message_queue, + message_center_name ) + ) self.message_center_process.start() - def stop_message_center(self): + def stop(self): if self.message_event is not None: self.message_event.set() @@ -174,10 +156,6 @@ def check_message_stop_event(self): logging.info("Received message center stopping event.") raise MessageCenterStoppedException("Message center stopped (for sender)") - if self.listener_message_event is not None and self.listener_message_event.is_set(): - logging.info("Received message center stopping event.") - raise MessageCenterStoppedException("Message center stopped (for listener)") - def send_message(self, topic, payload, run_id=None): message_entity = FedMLMessageEntity(topic=topic, payload=payload, run_id=run_id) self.sender_message_queue.put(message_entity.get_message_body()) @@ -215,13 +193,7 @@ def retry_sending_undelivered_message(self): # Save the message self.save_message_record(message_entity.run_id, message_entity.device_id, sent_message_record) - def run_sender(self, message_event, message_queue, message_center_name, process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - if platform.system() != "Windows": - os.setsid() - + def run_sender(self, message_event, message_queue, message_center_name): self.message_event = message_event self.sender_message_queue = message_queue self.message_center_name = message_center_name @@ -276,16 +248,10 @@ def run_sender(self, message_event, message_queue, message_center_name, process_ self.release_sender_mqtt_mgr() - def get_protocol_communication_manager(self): - return None - def setup_listener_mqtt_mgr(self): if self.listener_mqtt_mgr is not None: return - # self.listener_mqtt_mgr = self.get_protocol_communication_manager() - # return - self.listener_mqtt_mgr = MqttManager( self.listener_agent_config["mqtt_config"]["BROKER_HOST"], self.listener_agent_config["mqtt_config"]["BROKER_PORT"], @@ -298,11 +264,7 @@ def setup_listener_mqtt_mgr(self): self.listener_mqtt_mgr.connect() self.listener_mqtt_mgr.loop_start() - def get_listener_communication_manager(self): - return self.listener_mqtt_mgr - def release_listener_mqtt_mgr(self): - #return try: if self.listener_mqtt_mgr is not None: self.listener_mqtt_mgr.loop_stop() @@ -325,9 +287,6 @@ def remove_message_listener(self, topic): self.listener_topics.remove(topic) self.listener_handler_funcs.pop(topic) - def get_listener_handler(self, topic): - return self.listener_handler_funcs.get(topic) - def get_message_runner(self): return None @@ -335,42 +294,29 @@ def get_listener_message_queue(self): return self.listener_message_queue def setup_listener_message_queue(self): - self.listener_message_queue = multiprocessing.Manager().Queue() + self.listener_message_queue = Queue() - def start_listener( - self, sender_message_queue=None, listener_message_queue=None, - sender_message_event=None, agent_config=None, message_center_name=None, extra_queues=None - ): + def start_listener(self, sender_message_queue=None, listener_message_queue=None, agent_config=None, message_center_name=None): if self.listener_message_center_process is not None: return if listener_message_queue is None: if self.listener_message_queue is None: - self.listener_message_queue = multiprocessing.Manager().Queue() + self.listener_message_queue = Queue() else: self.listener_message_queue = listener_message_queue self.listener_message_event = multiprocessing.Event() self.listener_message_event.clear() self.listener_agent_config = agent_config - message_runner = self + message_runner = self.get_message_runner() message_runner.listener_agent_config = agent_config - process_name = GeneralConstants.get_message_center_listener_process_name(message_center_name) - if platform.system() == "Windows": - self.listener_message_center_process = multiprocessing.Process( - target=message_runner.run_listener_dispatcher, args=( - self.listener_message_event, self.listener_message_queue, - self.listener_handler_funcs, sender_message_queue, - sender_message_event, message_center_name, extra_queues, process_name - ) - ) - else: - self.listener_message_center_process = fedml.get_process( - target=message_runner.run_listener_dispatcher, args=( - self.listener_message_event, self.listener_message_queue, - self.listener_handler_funcs, sender_message_queue, - sender_message_event, message_center_name, extra_queues, process_name - ) + self.listener_message_center_process = Process( + target=message_runner.run_listener_dispatcher, args=( + self.listener_message_event, self.listener_message_queue, + self.listener_handler_funcs, sender_message_queue, + message_center_name ) + ) self.listener_message_center_process.start() def check_listener_message_stop_event(self): @@ -403,22 +349,13 @@ def unsubscribe_msg(self, topic): self.listener_mqtt_mgr.unsubscribe_msg(topic) def run_listener_dispatcher( - self, listener_message_event, listener_message_queue, - listener_funcs, sender_message_queue, sender_message_event, - message_center_name, extra_queues, process_name=None + self, message_event, message_queue, listener_funcs, sender_message_queue, + message_center_name ): - if process_name is not None: - setproctitle.setproctitle(process_name) - - if platform.system() != "Windows": - os.setsid() - - self.listener_message_event = listener_message_event - self.listener_message_queue = listener_message_queue + self.listener_message_event = message_event + self.listener_message_queue = message_queue self.listener_handler_funcs = listener_funcs self.message_center_name = message_center_name - self.sender_message_queue = sender_message_queue - self.message_event = sender_message_event self.setup_listener_mqtt_mgr() @@ -427,9 +364,6 @@ def run_listener_dispatcher( else: methodcaller(FedMLMessageCenter.FUNC_REBUILD_MESSAGE_CENTER, sender_message_queue)(self) - if extra_queues is not None: - methodcaller(FedMLMessageCenter.FUNC_PROCESS_EXTRA_QUEUES, extra_queues)(self) - while True: message_entity = None try: @@ -444,7 +378,7 @@ def run_listener_dispatcher( # Get the message from the queue try: - message_body = listener_message_queue.get(block=False, timeout=0.1) + message_body = message_queue.get(block=False, timeout=0.1) except queue.Empty as e: # If queue is empty, then break loop message_body = None if message_body is None: @@ -468,11 +402,6 @@ def run_listener_dispatcher( message_handler_func_name = self.listener_handler_funcs.get(message_entity.topic, None) if message_handler_func_name is not None: methodcaller(message_handler_func_name, message_entity.topic, message_entity.payload)(self) - else: - if hasattr(self, "callback_proxy_unknown_messages") and \ - self.callback_proxy_unknown_messages is not None: - self.callback_proxy_unknown_messages( - message_entity.run_id, message_entity.topic, message_entity.payload) except Exception as e: if message_entity is not None: logging.info( diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py index 7175032375..6e0010f556 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py @@ -9,8 +9,6 @@ import traceback import zipfile import queue - -import fedml from ..comm_utils.constants import SchedulerConstants from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs from ..scheduler_entry.constants import Constants @@ -75,7 +73,6 @@ def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id self.user_name = None self.general_edge_id = None self.message_center = None - self.status_center = None self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { "${FEDSYS.RUN_ID}": "", "${FEDSYS.PRIVATE_LOCAL_DATA}": "", @@ -211,15 +208,9 @@ def retrieve_and_unzip_package(self, package_name, package_url): # Open a process to download the package so that we can avoid the request is blocked and check the timeout. from multiprocessing import Process completed_event = multiprocessing.Event() - info_queue = multiprocessing.Manager().Queue() - if platform.system() == "Windows": - download_process = multiprocessing.Process( - target=self.download_package_proc, - args=(package_url, local_package_file, completed_event, info_queue)) - else: - download_process = fedml.get_process( - target=self.download_package_proc, - args=(package_url, local_package_file, completed_event, info_queue)) + info_queue = multiprocessing.Queue() + download_process = Process(target=self.download_package_proc, + args=(package_url, local_package_file, completed_event, info_queue)) download_process.start() allowed_block_download_time = 60 download_finished = False @@ -615,8 +606,7 @@ def job_error_processor(self, error_list): def start_runner_process( self, run_id, edge_id, request_json, cuda_visible_gpu_ids_str=None, - sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, process_name=None + sender_message_queue=None, status_center_queue=None ): return None @@ -650,8 +640,8 @@ def rebuild_message_status_center(self, sender_message_queue, listener_message_q self.mlops_metrics.set_messenger(self.message_center) self.mlops_metrics.run_id = self.run_id - self.status_center = FedMLStatusCenter.rebuild_status_center_from_queue(status_queue) + status_center = FedMLStatusCenter.rebuild_status_center_from_queue(status_queue) if self.status_reporter is None: self.status_reporter = MLOpsMetrics() - self.status_reporter.set_messenger(self.status_center) + self.status_reporter.set_messenger(status_center) self.status_reporter.run_id = self.run_id diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py index ad32f78631..dcc4045699 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py @@ -20,7 +20,7 @@ def start_job_runner( self, run_id, request_json, args=None, edge_id=None, is_server_job=False, sender_message_queue=None, listener_message_queue=None, status_center_queue=None, should_start_cloud_server=False, use_local_process_as_cloud_server=False, - cuda_visible_gpu_ids_str=None, process_name=None + cuda_visible_gpu_ids_str=None ): run_id_str = str(run_id) self.job_runners[run_id_str] = self._generate_job_runner_instance( @@ -29,11 +29,9 @@ def start_job_runner( ) self.job_runners[run_id_str].start_runner_process( run_id, request_json, edge_id=edge_id, - cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, sender_message_queue=sender_message_queue, listener_message_queue=listener_message_queue, - status_center_queue=status_center_queue, - process_name=process_name + status_center_queue=status_center_queue ) def stop_job_runner(self, run_id): diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py index 9970b1d3f6..19bb7e9882 100755 --- a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py @@ -2,13 +2,11 @@ import json import logging import multiprocessing -import os import sys import time import traceback import uuid import fedml -from ..comm_utils.run_process_utils import RunProcessUtils from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager from ....core.mlops.mlops_metrics import MLOpsMetrics @@ -47,7 +45,6 @@ def __init__(self, args, agent_config=None, is_master=False): self.mlops_metrics = None self.status_reporter = None self.user_name = args.user_name - self.parent_agent = None fedml._init_multiprocessing() @@ -61,54 +58,38 @@ def add_protocol_handler(self): # self.add_message_listener(self.topic_start_train, self.callback_start_train) pass - def initialize( - self, communication_manager=None, sender_message_queue=None, - status_center_queue=None, sender_message_event=None - ): + def initialize(self): # Generate the message topics self.generate_topics() # Setup MQTT connection - if communication_manager is None: - self.communication_mgr = MqttManager( - self.agent_config["mqtt_config"]["BROKER_HOST"], - self.agent_config["mqtt_config"]["BROKER_PORT"], - self.agent_config["mqtt_config"]["MQTT_USER"], - self.agent_config["mqtt_config"]["MQTT_PWD"], - self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], - f"FedML_Agent_Daemon_@{self.user_name}@_@{self.current_device_id}@_@{str(uuid.uuid4())}@", - self.topic_last_will, - json.dumps({"ID": self.edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) - else: - self.communication_mgr = communication_manager + self.communication_mgr = MqttManager( + self.agent_config["mqtt_config"]["BROKER_HOST"], + self.agent_config["mqtt_config"]["BROKER_PORT"], + self.agent_config["mqtt_config"]["MQTT_USER"], + self.agent_config["mqtt_config"]["MQTT_PWD"], + self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], + f"FedML_Agent_Daemon_@{self.user_name}@_@{self.current_device_id}@_@{str(uuid.uuid4())}@", + self.topic_last_will, + json.dumps({"ID": self.edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) + ) # Add the message listeners for all topics self.add_protocol_handler() # Start the message center to process edge related messages. - if sender_message_queue is None: - self.setup_message_center() - sender_message_event = self.message_center.get_sender_message_event() - else: - self.rebuild_message_center(sender_message_queue) + self.setup_message_center() # Setup the message listener queue self.setup_listener_message_queue() # Start the status center to process edge related status. - if status_center_queue is None: - self.start_status_listener_center(sender_message_event=sender_message_event) - else: - self.set_status_queue(status_center_queue) - self.rebuild_status_center(status_center_queue) + self.start_status_listener_center() # Start the message center for listener self.start_listener(sender_message_queue=self.message_center.get_sender_message_queue(), - sender_message_event=sender_message_event, agent_config=self.agent_config, - message_center_name=self.message_center_name, - extra_queues=[self.get_status_queue()]) + message_center_name=self.message_center_name) # Init extra items, e.g. database, recovery, etc. self._init_extra_items() @@ -116,11 +97,11 @@ def initialize( # Setup MQTT connected listener self.communication_mgr.add_connected_listener(self.on_agent_communication_connected) self.communication_mgr.add_disconnected_listener(self.on_agent_communication_disconnected) + self.communication_mgr.connect() def start(self): # Start MQTT message loop try: - self.communication_mgr.connect() self.communication_mgr.loop_forever() except Exception as e: if str(e) == "Restarting after upgraded...": @@ -129,8 +110,6 @@ def start(self): logging.info("Server tracing: {}".format(traceback.format_exc())) finally: - logging.info(f"Protocol manager is about to exit, pid: {os.getpid()}") - FedMLAccountManager.write_login_failed_file(is_client=not self.is_master_agent) self.stop() @@ -141,7 +120,7 @@ def start(self): clean_process_group=False) sys.exit(1) - def stop(self, kill_process=False): + def stop(self): if self.communication_mgr is not None: # noinspection PyBroadException try: @@ -153,10 +132,7 @@ def stop(self, kill_process=False): self.communication_mgr.loop_stop() self.communication_mgr.disconnect() - if kill_process: - self.post_status_center_stopping_message() - self.release_message_center() - RunProcessUtils.kill_process(os.getppid(), exclude_current_pid=True) + self.release_message_center() @abstractmethod def _init_extra_items(self): @@ -220,37 +196,20 @@ def rebuild_message_center(self, message_center_queue): def release_message_center(self): try: - self.stop_message_center() - if self.message_center is not None: - self.message_center.stop_message_center() + self.message_center.stop() self.message_center = None except Exception as e: logging.error( - f"Failed to release the message center with Exception {e}. " - f"Traceback: {traceback.format_exc()}") - pass - - def release_status_center(self): - try: - self.stop_status_center() - - if self.status_center is not None: - self.status_center.stop_status_center() - self.status_center = None - - except Exception as e: - logging.error( - f"Failed to release the status center with Exception {e}. " + f"Failed to release slave communication manager with Exception {e}. " f"Traceback: {traceback.format_exc()}") pass - def start_status_listener_center(self, sender_message_event=None): + def start_status_listener_center(self): self.start_status_center( sender_message_center_queue=self.message_center.get_sender_message_queue(), listener_message_center_queue=self.get_listener_message_queue(), - sender_message_event=sender_message_event, is_slave_agent=not self.is_master_agent ) @@ -272,9 +231,6 @@ def rebuild_status_center(self, status_center_queue): self.status_reporter.edge_id = self.edge_id self.status_reporter.server_agent_id = self.server_agent_id - def process_extra_queues(self, extra_queues): - pass - def generate_status_report(self, run_id, edge_id, server_agent_id=None): status_reporter = MLOpsMetrics() status_reporter.set_messenger(self, send_message_func=self.send_status_message) @@ -310,29 +266,6 @@ def get_status_runner(self): return None - def get_protocol_communication_manager(self): - return self.communication_mgr - - def get_protocol_sender_message_queue(self): - return self.message_center.get_sender_message_queue() - - def get_protocol_sender_message_event(self): - return self.message_center.get_sender_message_event() - - def get_protocol_status_center_queue(self): - return self.get_status_queue() - - def get_subscribed_topics(self): - return self.subscribed_topics - def send_agent_active_msg(self, edge_id): active_msg = {"ID": edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE} self.message_center.send_message_json(self.topic_active, json.dumps(active_msg)) - - def post_status_center_stopping_message(self, run_id=None): - topic_status_center_stopping = GeneralConstants.FEDML_TOPIC_STATUS_CENTER_STOP - payload = {"run_id": run_id} - self.status_reporter.send_message(topic_status_center_stopping, json.dumps(payload)) - - def set_parent_agent(self, parent_agent): - self.parent_agent = parent_agent diff --git a/python/fedml/computing/scheduler/scheduler_core/status_center.py b/python/fedml/computing/scheduler/scheduler_core/status_center.py index b1462d7ea9..97c2115e76 100755 --- a/python/fedml/computing/scheduler/scheduler_core/status_center.py +++ b/python/fedml/computing/scheduler/scheduler_core/status_center.py @@ -1,16 +1,10 @@ import logging -import os -import platform import time from enum import Enum, unique import multiprocessing +from multiprocessing import Process, Queue import queue - -import setproctitle - -import fedml -from .general_constants import GeneralConstants from .message_common import FedMLMessageEntity, FedMLStatusEntity from .message_center import FedMLMessageCenter import traceback @@ -87,7 +81,6 @@ class FedMLStatusCenter(object): TOPIC_SLAVE_JOB_LAUNCH_SUFFIX = "/start_train" TOPIC_SLAVE_JOB_STOP_PREFIX = "flserver_agent/" TOPIC_SLAVE_JOB_STOP_SUFFIX = "/stop_train" - TOPIC_STATUS_CENTER_STOP_PREFIX = GeneralConstants.FEDML_TOPIC_STATUS_CENTER_STOP ALLOWED_MAX_JOB_STATUS_CACHE_NUM = 1000 def __init__(self, message_queue=None): @@ -112,42 +105,25 @@ def get_status_runner(self): return None def start_status_center(self, sender_message_center_queue=None, - listener_message_center_queue=None, - sender_message_event=None, - is_slave_agent=False): - self.status_queue = multiprocessing.Manager().Queue() + listener_message_center_queue=None, is_slave_agent=False): + self.status_queue = Queue() self.status_event = multiprocessing.Event() self.status_event.clear() self.status_sender_message_center_queue = sender_message_center_queue self.status_listener_message_center_queue = listener_message_center_queue - self.status_runner = self - process_name = GeneralConstants.get_status_center_process_name( - f'{"deploy" if self.is_deployment_status_center else "launch"}_' - f'{"slave" if is_slave_agent else "master"}_agent') + self.status_runner = self.get_status_runner() target_func = self.status_runner.run_status_dispatcher if not is_slave_agent else \ self.status_runner.run_status_dispatcher_in_slave - if platform.system() == "Windows": - self.status_center_process = multiprocessing.Process( - target=target_func, args=( - self.status_event, self.status_queue, self.status_sender_message_center_queue, - self.status_listener_message_center_queue, sender_message_event, process_name - ) - ) - else: - self.status_center_process = fedml.get_process( - target=target_func, args=( - self.status_event, self.status_queue, self.status_sender_message_center_queue, - self.status_listener_message_center_queue, sender_message_event, process_name - ) + self.status_center_process = Process( + target=target_func, args=( + self.status_event, self.status_queue, self.status_sender_message_center_queue, + self.status_listener_message_center_queue ) + ) self.status_center_process.start() - def stop_status_center(self): - if self.status_event is not None: - self.status_event.set() - - def check_status_stop_event(self): + def check_message_stop_event(self): if self.status_event is not None and self.status_event.is_set(): logging.info("Received status center stopping event.") raise StatusCenterStoppedException("Status center stopped (for sender)") @@ -166,9 +142,6 @@ def send_status_message(self, topic, payload): def get_status_queue(self): return self.status_queue - def set_status_queue(self, status_queue): - self.status_queue = status_queue - def status_center_process_master_status(self, topic, payload): pass @@ -183,14 +156,7 @@ def rebuild_status_center(self, status_queue): def run_status_dispatcher(self, status_event, status_queue, sender_message_center_queue, - listener_message_center_queue, - sender_message_event, process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - if platform.system() != "Windows": - os.setsid() - + listener_message_center_queue): # Save the parameters self.status_event = status_event self.status_queue = status_queue @@ -203,11 +169,10 @@ def run_status_dispatcher(self, status_event, status_queue, self.rebuild_message_center(sender_message_center_queue) message_center = FedMLMessageCenter( sender_message_queue=sender_message_center_queue, - listener_message_queue=listener_message_center_queue, - sender_message_event=sender_message_event + listener_message_queue=listener_message_center_queue ) - if status_queue is not None: + if sender_message_center_queue is not None: self.rebuild_status_center(status_queue) # Init status manager instances @@ -218,7 +183,7 @@ def run_status_dispatcher(self, status_event, status_queue, # Check if we should stop status dispatcher try: - self.check_status_stop_event() + self.check_message_stop_event() except StatusCenterStoppedException as e: break @@ -238,12 +203,6 @@ def run_status_dispatcher(self, status_event, status_queue, message_entity = FedMLMessageEntity(message_body=message_body) status_entity = FedMLStatusEntity(status_msg_body=message_body) - if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_STATUS_CENTER_STOP_PREFIX): - # Process the stop message for message center and status center - message_center.stop_message_center() - self.stop_status_center() - continue - # Generate status manager instance run_id_str = str(status_entity.run_id) run_id_int = int(status_entity.run_id) @@ -293,14 +252,7 @@ def run_status_dispatcher(self, status_event, status_queue, def run_status_dispatcher_in_slave(self, status_event, status_queue, sender_message_center_queue, - listener_message_center_queue, - sender_message_event, process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - if platform.system() != "Windows": - os.setsid() - + listener_message_center_queue): # Save the parameters self.status_event = status_event self.status_queue = status_queue @@ -313,11 +265,10 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue, self.rebuild_message_center(sender_message_center_queue) message_center = FedMLMessageCenter( sender_message_queue=sender_message_center_queue, - listener_message_queue=listener_message_center_queue, - sender_message_event=sender_message_event + listener_message_queue=listener_message_center_queue ) - if status_queue is not None: + if sender_message_center_queue is not None: self.rebuild_status_center(status_queue) # Init status manager instances @@ -329,7 +280,7 @@ def run_status_dispatcher_in_slave(self, status_event, status_queue, # Check if we should stop status dispatcher try: - self.check_status_stop_event() + self.check_message_stop_event() except StatusCenterStoppedException as e: break diff --git a/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py index ec98cc7906..e045458db5 100755 --- a/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py +++ b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py @@ -136,14 +136,12 @@ def process_job_completed_status(self, master_id, status): # self.stop_cloud_server() # self.remove_listener_for_run_metrics(self.run_id) # self.remove_listener_for_run_logs(self.run_id) - self.message_center.receive_message( GeneralConstants.get_topic_complete_job(master_id), json.dumps(GeneralConstants.get_payload_complete_job(self.run_id, master_id))) - if self.status_center.is_deployment_status_center: - if status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: - self.report_deployment_status(self.run_id, GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) + if self.status_center.is_deployment_status_center and status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: + self.report_deployment_status(self.run_id, GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) def process_job_exception_status(self, master_id, status): # Report exception job status @@ -189,17 +187,16 @@ def process_job_status_consensus(self, run_id, master_id, status): status = self.get_entire_job_status() # Set the device status based on the job status - if self.edge_status_dict is not None: - for edge_id_item, edge_status_item in self.edge_status_dict.items(): - if edge_id_item == "server": - continue - - # Calc the device status based on the job status - consensus_device_status = FedMLStatusManager.get_device_consensus_status_in_job( - status, edge_status_item) - if consensus_device_status is not None: - self.message_reporter.report_client_training_status( - edge_id_item, consensus_device_status, run_id=run_id, update_db=False) + for edge_id_item, edge_status_item in self.edge_status_dict.items(): + if edge_id_item == "server": + continue + + # Calc the device status based on the job status + consensus_device_status = FedMLStatusManager.get_device_consensus_status_in_job( + status, edge_status_item) + if consensus_device_status is not None: + self.message_reporter.report_client_training_status( + edge_id_item, consensus_device_status, run_id=run_id, update_db=False) # Save the job status to local storage FedMLServerDataInterface.get_instance().save_job_status(run_id, master_id, status, status) diff --git a/python/fedml/computing/scheduler/slave/base_slave_agent.py b/python/fedml/computing/scheduler/slave/base_slave_agent.py index 9876ac9912..01c0a39195 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_agent.py +++ b/python/fedml/computing/scheduler/slave/base_slave_agent.py @@ -24,9 +24,7 @@ def __init__(self): def login( self, userid, api_key=None, device_id=None, - os_name=None, need_to_check_gpu=False, role=None, - communication_manager=None, sender_message_queue=None, - status_center_queue=None, sender_message_event=None + os_name=None, need_to_check_gpu=False, role=None ): # Preprocess the login args if need_to_check_gpu: @@ -35,7 +33,7 @@ def login( print("We can't find any gpu device on your machine. \n" "With the gpu_supplier(-g) option, you need to check if your machine " "has nvidia GPUs and installs CUDA related drivers.") - return None + return # Login account login_result = FedMLAccountManager.get_instance().login( @@ -59,22 +57,17 @@ def login( # Initialize the protocol manager # noinspection PyBoardException try: - self._initialize_protocol_manager( - communication_manager=communication_manager, - sender_message_queue=sender_message_queue, - status_center_queue=status_center_queue, - sender_message_event=sender_message_event) + self._initialize_protocol_manager() except Exception as e: FedMLAccountManager.write_login_failed_file(is_client=True) self.protocol_mgr.stop() raise e - return login_result - - def start(self): # Start the protocol manager to process the messages from MLOps and slave agents. self.protocol_mgr.start() + return login_result + @staticmethod def logout(): GeneralConstants.cleanup_run_process(None) @@ -91,20 +84,12 @@ def _create_protocol_manager(self, login_result): self.protocol_mgr.user_name = login_result.user_name self.protocol_mgr.agent_config = login_result.agent_config - def _initialize_protocol_manager( - self, communication_manager=None, sender_message_queue=None, - status_center_queue=None, sender_message_event=None - ): + def _initialize_protocol_manager(self): # Init local database self._init_database() # Initialize the master protocol - self.protocol_mgr.set_parent_agent(self) - self.protocol_mgr.initialize( - communication_manager=communication_manager, - sender_message_queue=sender_message_queue, - status_center_queue=status_center_queue, - sender_message_event=sender_message_event) + self.protocol_mgr.initialize() # Start the client API process self._start_slave_api() @@ -137,9 +122,6 @@ def _start_slave_api(self): should_capture_stderr=False ) - def get_protocol_manager(self): - return self.protocol_mgr - @abstractmethod def _get_log_file_dir(self): pass @@ -155,8 +137,3 @@ def _init_database(self): @abstractmethod def _generate_protocol_manager_instance(self, args, agent_config=None): return None - - def save_deploy_ids(self, deploy_master_edge_id=None, deploy_slave_edge_id=None): - self.protocol_mgr.save_deploy_ids( - deploy_master_edge_id=deploy_master_edge_id, deploy_slave_edge_id=deploy_slave_edge_id) - diff --git a/python/fedml/computing/scheduler/slave/base_slave_job_runner.py b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py index 0486b131a6..5e530dbba7 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_job_runner.py +++ b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py @@ -7,9 +7,6 @@ import traceback from abc import ABC, abstractmethod -import setproctitle - -import fedml from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon from .client_data_interface import FedMLClientDataInterface @@ -50,12 +47,8 @@ def __repr__(self): ) def run(self, process_event, completed_event, run_extend_queue_list, - sender_message_center, listener_message_queue, status_center_queue, - process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - print(f"Client runner process id {os.getpid()}, name {process_name}, run id {self.run_id}") + sender_message_center, listener_message_queue, status_center_queue): + print(f"Client runner process id {os.getpid()}, run id {self.run_id}") if platform.system() != "Windows": os.setsid() @@ -251,7 +244,7 @@ def reset_devices_status(self, edge_id, status): def start_runner_process( self, run_id, request_json, edge_id=None, sender_message_queue=None, listener_message_queue=None, - status_center_queue=None, cuda_visible_gpu_ids_str=None, process_name=None + status_center_queue=None, cuda_visible_gpu_ids_str=None ): client_runner = self._generate_job_runner_instance( self.args, run_id=run_id, request_json=request_json, @@ -266,17 +259,9 @@ def start_runner_process( client_runner.server_id = request_json.get("server_id", "0") self.run_extend_queue_list = self._generate_extend_queue_list() logging.info("start the runner process.") - - if platform.system() == "Windows": - self.run_process = multiprocessing.Process( - target=client_runner.run, args=( - self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, - sender_message_queue, listener_message_queue, status_center_queue, process_name - )) - else: - self.run_process = fedml.get_process(target=client_runner.run, args=( - self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, - sender_message_queue, listener_message_queue, status_center_queue, process_name - )) + self.run_process = Process(target=client_runner.run, args=( + self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, + sender_message_queue, listener_message_queue, status_center_queue + )) self.run_process.start() return self.run_process diff --git a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py index 534ee2f7d0..447bd05cd9 100755 --- a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py +++ b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py @@ -62,6 +62,8 @@ def __init__(self, args, agent_config=None): self.server_id = args.server_id self.model_device_server_id = None self.model_device_client_edge_id_list = None + self.model_device_server = None + self.model_device_client_list = None @abstractmethod def generate_topics(self): @@ -145,9 +147,15 @@ def add_subscribe_topic(self, topic): self.subscribed_topics.append(topic) def stop(self): - if self.model_device_client_edge_id_list is not None: - self.model_device_client_edge_id_list.clear() - self.model_device_client_edge_id_list = None + if self.model_device_server is not None: + self.model_device_server.stop() + self.model_device_server = None + + if self.model_device_client_list is not None: + for model_client in self.model_device_client_list: + model_client.stop() + self.model_device_client_list.clear() + self.model_device_client_list = None super().stop() @@ -257,8 +265,6 @@ def callback_start_train(self, topic, payload): # Report the run status with finished status and return self.generate_status_report(run_id, edge_id, server_agent_id=server_agent_id).report_client_id_status( edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, run_id=run_id) - - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) return logging.info( f"Run started, available gpu ids: {JobRunnerUtils.get_instance().get_available_gpu_id_list(edge_id)}") @@ -276,7 +282,6 @@ def callback_start_train(self, topic, payload): listener_message_queue=self.get_listener_message_queue(), status_center_queue=self.get_status_queue(), cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, - process_name=GeneralConstants.get_launch_slave_job_process_name(run_id, edge_id) ) run_process = self._get_job_runner_manager().get_runner_process(run_id) if run_process is not None: diff --git a/python/fedml/computing/scheduler/slave/client_data_interface.py b/python/fedml/computing/scheduler/slave/client_data_interface.py index 74bf7a64a3..0e9e84381a 100755 --- a/python/fedml/computing/scheduler/slave/client_data_interface.py +++ b/python/fedml/computing/scheduler/slave/client_data_interface.py @@ -343,15 +343,6 @@ def handle_database_compatibility(self): self.close_job_db() - def check_if_table_exist(self, current_db_cursor): - results = current_db_cursor.execute("select * from sqlite_master where type='table' and name='jobs';") - if results is None: - return False - result_len = 0 - for row in results: - result_len += 1 - return False if result_len == 0 else True - def get_agent_status(self, edge_id=0): self.open_job_db() enabled = 1 diff --git a/python/fedml/computing/scheduler/slave/client_login.py b/python/fedml/computing/scheduler/slave/client_login.py index 7a1c759410..95c772a225 100755 --- a/python/fedml/computing/scheduler/slave/client_login.py +++ b/python/fedml/computing/scheduler/slave/client_login.py @@ -1,11 +1,11 @@ import argparse import os import fedml -from fedml.computing.scheduler.slave.united_agents import FedMLUnitedAgent +from fedml.computing.scheduler.slave.slave_agent import FedMLLaunchSlaveAgent def logout(): - FedMLUnitedAgent.get_instance().logout() + FedMLLaunchSlaveAgent.logout() if __name__ == "__main__": @@ -18,7 +18,6 @@ def logout(): parser.add_argument("--version", "-v", type=str, default="release") parser.add_argument("--local_server", "-ls", type=str, default="127.0.0.1") parser.add_argument("--role", "-r", type=str, default="client") - parser.add_argument("--runner_cmd", "-rc", type=str, default="{}") parser.add_argument("--device_id", "-id", type=str, default="0") parser.add_argument("--os_name", "-os", type=str, default="") parser.add_argument("--api_key", "-k", type=str, default="") @@ -37,10 +36,9 @@ def logout(): fedml.set_local_on_premise_platform_port(args.local_on_premise_platform_port) fedml.set_env_version(args.version) - united_agents = FedMLUnitedAgent.get_instance() + slave_agent = FedMLLaunchSlaveAgent() if args.type == 'login': - united_agents.login( - args.api_key, api_key=args.api_key, device_id=args.device_id, - os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd) + slave_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id, + os_name=args.os_name, role=args.role) else: - united_agents.logout() + FedMLLaunchSlaveAgent.logout() diff --git a/python/fedml/computing/scheduler/slave/slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py index 449cd7c29c..a1067a0d96 100755 --- a/python/fedml/computing/scheduler/slave/slave_protocol_manager.py +++ b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py @@ -1,15 +1,18 @@ - +import copy +import json import os +import fedml from ..comm_utils.job_cleanup import JobCleanup from .base_slave_protocol_manager import FedMLBaseSlaveProtocolManager from .launch_job_runner_manager import FedMLLaunchJobRunnerManager +from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner +from ..model_scheduler.model_device_client import FedMLModelDeviceClientRunner class FedMLLaunchSlaveProtocolManager(FedMLBaseSlaveProtocolManager): def __init__(self, args, agent_config=None): FedMLBaseSlaveProtocolManager.__init__(self, args, agent_config=agent_config) - self.message_center_name = "launch_slave_agent" # Override def generate_topics(self): @@ -31,8 +34,7 @@ def _get_job_runner_manager(self): def _process_connection_ready(self): from fedml.core.mlops import sync_deploy_id sync_deploy_id( - self.edge_id, self.model_device_server_id, self.model_device_client_edge_id_list, - message_center=self.message_center) + self.edge_id, self.model_device_server.edge_id, self.model_device_client_edge_id_list) # Override def _process_connection_lost(self): @@ -45,19 +47,59 @@ def _init_extra_items(self): # Sync the data when startup JobCleanup.get_instance().sync_data_on_startup(self.args.edge_id) - # Start the monitor process - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, self.args.agent_config["mqtt_config"]) - - def save_deploy_ids(self, deploy_master_edge_id=None, deploy_slave_edge_id=None): - if deploy_master_edge_id is not None: - self.model_device_server_id = deploy_master_edge_id - - if deploy_slave_edge_id is not None: - if self.model_device_client_edge_id_list is None: - self.model_device_client_edge_id_list = list() - self.model_device_client_edge_id_list.append(deploy_slave_edge_id) + # Get the environment variables + infer_host = os.getenv("FEDML_INFER_HOST", None) + infer_redis_addr = os.getenv("FEDML_INFER_REDIS_ADDR", None) + infer_redis_port = os.getenv("FEDML_INFER_REDIS_PORT", None) + infer_redis_password = os.getenv("FEDML_INFER_REDIS_PASSWORD", None) + model_client_num = os.getenv("FEDML_MODEL_WORKER_NUM", None) + + # Start deploy master agent and slave agent + in_args = copy.deepcopy(self.args) + if self.model_device_client_edge_id_list is None: + self.model_device_client_edge_id_list = list() + if self.model_device_client_list is None: + model_client_num = 1 if model_client_num is None else int(model_client_num) + self.model_device_client_list = list() + for client_index in range(model_client_num): + model_device_client = FedMLModelDeviceClientRunner( + in_args, f"{in_args.current_device_id}_{client_index + 1}", in_args.os_name, + in_args.is_from_docker, self.agent_config) + if infer_host is not None: + model_device_client.infer_host = infer_host + if infer_redis_addr is not None: + model_device_client.redis_addr = infer_redis_addr + if infer_redis_port is not None: + model_device_client.redis_port = infer_redis_port + if infer_redis_password is not None: + model_device_client.redis_password = infer_redis_password + model_device_client.start() + self.model_device_client_list.append(model_device_client) + self.model_device_client_edge_id_list.append(model_device_client.get_edge_id()) + + self.args = copy.deepcopy(in_args) + if self.model_device_server is None: + self.model_device_server = FedMLModelDeviceServerRunner(in_args, in_args.current_device_id, + in_args.os_name, in_args.is_from_docker, + self.agent_config) + if infer_host is not None: + self.model_device_server.infer_host = infer_host + if infer_redis_addr is not None: + self.model_device_server.redis_addr = infer_redis_addr + if infer_redis_port is not None: + self.model_device_server.redis_port = infer_redis_port + if infer_redis_password is not None: + self.model_device_server.redis_password = infer_redis_password + + self.model_device_server.start() + self.model_device_server_id = self.model_device_server.get_edge_id() # Save the deployed master and worker id list to the environment variable. os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server_id) os.environ["FEDML_DEPLOY_WORKER_IDS"] = str(self.model_device_client_edge_id_list) + + # Start the monitor process + self.args = copy.deepcopy(in_args) + self.mlops_metrics.stop_device_realtime_perf() + self.mlops_metrics.report_device_realtime_perf(self.args, self.args.agent_config["mqtt_config"]) + pass \ No newline at end of file diff --git a/python/fedml/computing/scheduler/slave/united_agents.py b/python/fedml/computing/scheduler/slave/united_agents.py deleted file mode 100755 index 3c8549c06a..0000000000 --- a/python/fedml/computing/scheduler/slave/united_agents.py +++ /dev/null @@ -1,75 +0,0 @@ -from fedml.computing.scheduler.model_scheduler.master_agent import FedMLDeployMasterAgent -from fedml.computing.scheduler.model_scheduler.worker_agent import FedMLDeployWorkerAgent -from fedml.computing.scheduler.scheduler_core.account_manager import FedMLAccountManager -from fedml.computing.scheduler.slave.slave_agent import FedMLLaunchSlaveAgent -from fedml.computing.scheduler.master.master_agent import FedMLLaunchMasterAgent -from fedml.core.common.singleton import Singleton - - -class FedMLUnitedAgent(Singleton): - - @staticmethod - def get_instance(): - return FedMLUnitedAgent() - - def logout(self): - FedMLLaunchSlaveAgent.logout() - - def login(self, userid, api_key=None, device_id=None, - os_name=None, need_to_check_gpu=False, role=None, runner_cmd=None): - # Create the launch master/slave and deploy master/slave agents. - launch_slave_agent = FedMLLaunchSlaveAgent() - launch_master_agent = FedMLLaunchMasterAgent() - deploy_slave_agent = FedMLDeployWorkerAgent() - deploy_master_agent = FedMLDeployMasterAgent() - - # Login with the launch slave role - login_result = launch_slave_agent.login( - api_key, api_key=api_key, device_id=device_id, - os_name=os_name, role=role - ) - - # Get the communication manager, sender message queue - shared_communication_mgr = launch_slave_agent.get_protocol_manager().get_protocol_communication_manager() - shared_slave_sender_message_queue = launch_slave_agent.get_protocol_manager().get_protocol_sender_message_queue() - shared_slave_sender_message_event = launch_slave_agent.get_protocol_manager().get_protocol_sender_message_event() - - # Login with the launch master role based on - # the shared communication manager, sender message center - launch_master_agent.login( - api_key, api_key=api_key, device_id=login_result.device_id, - os_name=os_name, runner_cmd=runner_cmd, - role=FedMLAccountManager.ROLE_GPU_MASTER_SERVER, - communication_manager=shared_communication_mgr, - sender_message_queue=None - ) - - # Get the status center queue - shared_slave_status_center_queue = launch_slave_agent.get_protocol_manager().get_protocol_status_center_queue() - shared_master_status_center_queue = launch_master_agent.get_protocol_manager().get_protocol_status_center_queue() - shared_master_sender_message_queue = launch_master_agent.get_protocol_manager().get_protocol_sender_message_queue() - shared_master_sender_message_event = launch_master_agent.get_protocol_manager().get_protocol_sender_message_event() - - # Login with the deployment master role based on - # the shared communication manager, sender message center, status center - deploy_master_login_result = deploy_master_agent.login( - userid, api_key=api_key, device_id=login_result.device_id, - os_name=os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM, - communication_manager=shared_communication_mgr - ) - - # Login with the deployment slave role based on - # the shared communication manager, sender message center, status center - deploy_slave_login_result = deploy_slave_agent.login( - userid, api_key=api_key, device_id=login_result.device_id, - os_name=os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM, - communication_manager=shared_communication_mgr - ) - - # Set the deployment ids to launch agent so that we can report the related device info to MLOps. - launch_slave_agent.save_deploy_ids( - deploy_master_edge_id=deploy_master_login_result.edge_id, - deploy_slave_edge_id=deploy_slave_login_result.edge_id) - - # Start the slave agent to connect to servers and loop forever. - launch_slave_agent.start() diff --git a/python/fedml/core/mlops/__init__.py b/python/fedml/core/mlops/__init__.py index 121c8e26bb..148427fe1f 100644 --- a/python/fedml/core/mlops/__init__.py +++ b/python/fedml/core/mlops/__init__.py @@ -1453,14 +1453,12 @@ def release_resources(run_id, device_id): MLOpsConstants.MSG_TOPIC_LAUNCH_RELEASE_GPU_IDS, json.dumps(payload)) -def sync_deploy_id(device_id, master_deploy_id, worker_deploy_id_list, message_center=None): - payload = {"device_id": device_id, "master_deploy_id": master_deploy_id, "worker_deploy_ids": worker_deploy_id_list} - if message_center is None: - fedml_args = get_fedml_args() - setup_log_mqtt_mgr() - MLOpsStore.mlops_log_mqtt_mgr.send_message_json( - MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) - else: - message_center.send_message( MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) +def sync_deploy_id(device_id, master_deploy_id, worker_deploy_id_list): + fedml_args = get_fedml_args() + setup_log_mqtt_mgr() + + payload = {"device_id": device_id, "master_deploy_id": master_deploy_id, "worker_deploy_ids": worker_deploy_id_list} + MLOpsStore.mlops_log_mqtt_mgr.send_message_json( + MLOpsConstants.MSG_TOPIC_LAUNCH_SYNC_DEPLOY_IDS, json.dumps(payload)) diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 61da372d97..4bb41df73f 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -1,7 +1,6 @@ import json import logging import os -import platform import time import traceback import uuid @@ -9,15 +8,12 @@ import multiprocessing import psutil -import setproctitle -import fedml from fedml.computing.scheduler.comm_utils import sys_utils from .device_info_report_protocol import FedMLDeviceInfoReportProtocol from .mlops_utils import MLOpsUtils from .system_stats import SysStats from ...computing.scheduler.comm_utils.job_monitor import JobMonitor -from ...computing.scheduler.scheduler_core.general_constants import GeneralConstants from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager @@ -32,17 +28,6 @@ ROLE_ENDPOINT_REPLICA_NUM = 8 ROLE_ENDPOINT_REPLICA_PERF = 9 -ROLE_DEVICE_JOB_TOTAL_MONITOR_STR = "device_job_total" -ROLE_DEVICE_INFO_REPORTER_STR = "device_info" -ROLE_ENDPOINT_MASTER_STR = "endpoint_master" -ROLE_ENDPOINT_SLAVE_STR = "endpoint_slave" -ROLE_RUN_MASTER_STR = "run_master" -ROLE_RUN_SLAVE_STR = "run_slave" -ROLE_ENDPOINT_LOGS_STR = "endpoint_logs" -ROLE_AUTO_SCALER_STR = "autoscaler" -ROLE_ENDPOINT_REPLICA_NUM_STR = "endpoint_replica_num" -ROLE_ENDPOINT_REPLICA_PERF_STR = "endpoint_replica_perf" - class MLOpsDevicePerfStats(object): def __init__(self): @@ -91,161 +76,58 @@ def setup_realtime_stats_process(self, sys_args): self.device_realtime_stats_event.clear() perf_stats.device_realtime_stats_event = self.device_realtime_stats_event - if platform.system() == "Windows": - self.device_realtime_stats_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client, - GeneralConstants.get_monitor_process_name( - ROLE_DEVICE_INFO_REPORTER_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.device_realtime_stats_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client, - GeneralConstants.get_monitor_process_name( - ROLE_DEVICE_INFO_REPORTER_STR, perf_stats.run_id, perf_stats.edge_id))) + self.device_realtime_stats_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client)) self.device_realtime_stats_process.start() if self.enable_job_total_monitor: - if platform.system() == "Windows": - self.job_total_monitor_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client, - GeneralConstants.get_monitor_process_name( - ROLE_DEVICE_JOB_TOTAL_MONITOR_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.job_total_monitor_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client, - GeneralConstants.get_monitor_process_name( - ROLE_DEVICE_JOB_TOTAL_MONITOR_STR, perf_stats.run_id, perf_stats.edge_id))) + self.job_total_monitor_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client)) self.job_total_monitor_process.start() else: if self.is_client: - # Register endpoint master process - if platform.system() == "Windows": - self.monitor_endpoint_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_endpoint_master_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_endpoint_master_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER)) self.monitor_endpoint_master_process.start() - # Register endpoint slave process - if platform.system() == "Windows": - self.monitor_endpoint_slave_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_SLAVE, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_endpoint_slave_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_SLAVE, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) - self.monitor_endpoint_slave_process.start() - - # Register run slave process - if platform.system() == "Windows": - self.monitor_run_slave_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE, True, - GeneralConstants.get_monitor_process_name( - ROLE_RUN_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_run_slave_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE, True, - GeneralConstants.get_monitor_process_name( - ROLE_RUN_SLAVE_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_run_slave_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE)) self.monitor_run_slave_process.start() - # Register endpoint logs process - if platform.system() == "Windows": - self.monitor_endpoint_logs_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_LOGS_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_endpoint_logs_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_LOGS_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_endpoint_logs_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS)) self.monitor_endpoint_logs_process.start() # Register auto-scaler process - if platform.system() == "Windows": - self.monitor_auto_scaler_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER, True, - GeneralConstants.get_monitor_process_name( - ROLE_AUTO_SCALER_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_auto_scaler_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER, True, - GeneralConstants.get_monitor_process_name( - ROLE_AUTO_SCALER_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_auto_scaler_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER)) self.monitor_auto_scaler_process.start() # Register replica number report channel - if platform.system() == "Windows": - self.monitor_replica_num_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_REPLICA_NUM_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_replica_num_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_REPLICA_NUM_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_replica_num_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM)) self.monitor_replica_num_process.start() # Register replica performance report channel - if platform.system() == "Windows": - self.monitor_replica_perf_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_REPLICA_PERF_STR, perf_stats.run_id, perf_stats.edge_id))) - - else: - self.monitor_replica_perf_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF, True, - GeneralConstants.get_monitor_process_name( - ROLE_ENDPOINT_REPLICA_PERF_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_replica_perf_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF)) self.monitor_replica_perf_process.start() else: - if platform.system() == "Windows": - self.monitor_run_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_MASTER, False, - GeneralConstants.get_monitor_process_name( - ROLE_RUN_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) - else: - self.monitor_run_master_process = fedml.get_process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_MASTER, False, - GeneralConstants.get_monitor_process_name( - ROLE_RUN_MASTER_STR, perf_stats.run_id, perf_stats.edge_id))) + self.monitor_run_master_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_MASTER)) self.monitor_run_master_process.start() - def report_device_realtime_stats_entry(self, sys_event, role, is_client=False, process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - # print(f"Report device realtime stats, process id {os.getpid()}, name {process_name}") + def report_device_realtime_stats_entry(self, sys_event, role, is_client=False): + # print(f"Report device realtime stats, process id {os.getpid()}") self.device_realtime_stats_event = sys_event mqtt_mgr = MqttManager( diff --git a/python/fedml/core/mlops/mlops_job_perfs.py b/python/fedml/core/mlops/mlops_job_perfs.py index 429e32ff1d..fe3d921558 100644 --- a/python/fedml/core/mlops/mlops_job_perfs.py +++ b/python/fedml/core/mlops/mlops_job_perfs.py @@ -1,25 +1,19 @@ import json import logging import os -import platform import time import traceback import uuid import multiprocess as multiprocessing import psutil -import setproctitle -import fedml from .mlops_utils import MLOpsUtils from .system_stats import SysStats -from ...computing.scheduler.scheduler_core.general_constants import GeneralConstants from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager class MLOpsJobPerfStats(object): - JOB_PERF_PROCESS_TAG = "job_perf" - def __init__(self): self.job_stats_process = None self.job_stats_event = None @@ -144,26 +138,16 @@ def setup_job_stats_process(self, sys_args): self.job_stats_event.clear() perf_stats.job_stats_event = self.job_stats_event perf_stats.job_process_id_map = self.job_process_id_map - if platform.system() == "Windows": - self.job_stats_process = multiprocessing.Process( - target=perf_stats.report_job_stats_entry, - args=(self.job_stats_event, GeneralConstants.get_monitor_process_name( - MLOpsJobPerfStats.JOB_PERF_PROCESS_TAG, perf_stats.run_id, perf_stats.edge_id))) - else: - self.job_stats_process = fedml.get_process( - target=perf_stats.report_job_stats_entry, - args=(self.job_stats_event, GeneralConstants.get_monitor_process_name( - MLOpsJobPerfStats.JOB_PERF_PROCESS_TAG, perf_stats.run_id, perf_stats.edge_id))) + + self.job_stats_process = multiprocessing.Process(target=perf_stats.report_job_stats_entry, + args=(self.job_stats_event,)) self.job_stats_process.start() def report_job_stats(self, sys_args): self.setup_job_stats_process(sys_args) - def report_job_stats_entry(self, sys_event, process_name): - if process_name is not None: - setproctitle.setproctitle(process_name) - - # print(f"Report job realtime stats, process id {os.getpid()}, name {process_name}") + def report_job_stats_entry(self, sys_event): + # print(f"Report job realtime stats, process id {os.getpid()}") self.job_stats_event = sys_event mqtt_mgr = MqttManager( diff --git a/python/fedml/core/mlops/mlops_runtime_log_daemon.py b/python/fedml/core/mlops/mlops_runtime_log_daemon.py index bf136a36c9..ff06dc91b3 100644 --- a/python/fedml/core/mlops/mlops_runtime_log_daemon.py +++ b/python/fedml/core/mlops/mlops_runtime_log_daemon.py @@ -1,19 +1,16 @@ import argparse import logging import os -import platform import shutil import threading import time import multiprocess as multiprocessing import requests -import setproctitle import yaml import fedml from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils -from fedml.computing.scheduler.scheduler_core.general_constants import GeneralConstants from fedml.core.mlops.mlops_utils import MLOpsLoggingUtils from ...core.mlops.mlops_configs import MLOpsConfigs @@ -258,11 +255,8 @@ def should_ignore_log_line(log_line): return False - def log_process(self, process_event, process_name=None): - if process_name is not None: - setproctitle.setproctitle(process_name) - - logging.info(f"Log uploading process id {os.getpid()}, run id {self.run_id}, name {process_name}, edge id {self.device_id}") + def log_process(self, process_event): + logging.info(f"Log uploading process id {os.getpid()}, run id {self.run_id}, edge id {self.device_id}") self.log_process_event = process_event only_push_artifact = False @@ -424,8 +418,6 @@ def set_log_source(self, source): self.log_source = source def start_log_processor(self, log_run_id, log_device_id, log_source=None, log_file_prefix=None): - if log_run_id == "-1" or int(log_run_id) <= 0: - return log_processor = MLOpsRuntimeLogProcessor(self.args.using_mlops, log_run_id, log_device_id, self.log_file_dir, self.log_server_url, @@ -439,13 +431,8 @@ def start_log_processor(self, log_run_id, log_device_id, log_source=None, log_fi self.log_process_event_map[event_map_id] = multiprocessing.Event() self.log_process_event_map[event_map_id].clear() log_processor.log_process_event = self.log_process_event_map[event_map_id] - process_name = GeneralConstants.get_log_process_name(log_run_id, log_device_id) - if platform.system() == "Windows": - log_child_process = multiprocessing.Process( - target=log_processor.log_process, args=(self.log_process_event_map[event_map_id], process_name)) - else: - log_child_process = fedml.get_process( - target=log_processor.log_process, args=(self.log_process_event_map[event_map_id], process_name)) + log_child_process = multiprocessing.Process(target=log_processor.log_process, + args=(self.log_process_event_map[event_map_id],)) # process = threading.Thread(target=log_processor.log_process) # process.start() if log_child_process is not None: diff --git a/python/setup.py b/python/setup.py index 262fc060c4..4757c10a17 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,7 +20,7 @@ def finalize_options(self): requirements = [ 'GPUtil', - 'PyYAML==5.3.1', + 'PyYAML', 'aiohttp>=3.8.1', 'attrdict', 'attrs', @@ -69,8 +69,7 @@ def finalize_options(self): 'python-dotenv', 'protobuf>=3.20.2,<4.0dev', 'typer<0.10.0,>=0.3.0', - 'fastapi-cli==0.0.1', - 'setproctitle' + 'fastapi-cli==0.0.1' ] requirements_extra_mpi = [ @@ -127,7 +126,7 @@ def finalize_options(self): setup( name="fedml", - version="0.8.51b1", + version="0.9.0", author="FedML Team", author_email="ch@fedml.ai", description="A research and production integrated edge-cloud library for " diff --git a/python/tests/cross-silo/run_cross_silo.sh b/python/tests/cross-silo/run_cross_silo.sh index 0beaaffc52..2ccdbff15b 100644 --- a/python/tests/cross-silo/run_cross_silo.sh +++ b/python/tests/cross-silo/run_cross_silo.sh @@ -1,10 +1,10 @@ #!/bin/bash set -e WORKSPACE=$(pwd) -# PROJECT_HOME=$WORKSPACE/../../ -# cd $PROJECT_HOME +PROJECT_HOME=$WORKSPACE/../../ +cd $PROJECT_HOME -cd examples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model +cd examples/cross_silo/mqtt_s3_fedavg_mnist_lr_example/custom_data_and_model # run client(s) RUN_ID="$(python -c "import uuid; print(uuid.uuid4().hex)")" diff --git a/python/tests/smoke_test/cli/build.sh b/python/tests/smoke_test/cli/build.sh index de956692f1..98fdb05244 100644 --- a/python/tests/smoke_test/cli/build.sh +++ b/python/tests/smoke_test/cli/build.sh @@ -16,7 +16,7 @@ # --help Show this message and exit. # build client package -cd ../../../examples/federate/cross_silo/mqtt_s3_fedavg_mnist_lr_example/one_line +cd ../../../examples/cross_silo/mqtt_s3_fedavg_mnist_lr_example/one_line echo "$PWD" SOURCE=client @@ -30,4 +30,4 @@ SOURCE=server ENTRY=torch_server.py CONFIG=config DEST=./mlops -fedml build -t server -sf $SOURCE -ep $ENTRY -cf $CONFIG -df $DEST +fedml build -t server -sf $SOURCE -ep $ENTRY -cf $CONFIG -df $DEST \ No newline at end of file diff --git a/python/tests/test_deploy/test_deploy.py b/python/tests/test_deploy/test_deploy.py deleted file mode 100644 index d7243c68de..0000000000 --- a/python/tests/test_deploy/test_deploy.py +++ /dev/null @@ -1,39 +0,0 @@ -import os.path -import time -import fedml -# Login -API_KEY = os.getenv("API_KEY") -fedml.set_env_version("test") -fedml.set_local_on_premise_platform_port(18080) -error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) -if error_code != 0: - raise Exception("API Key is invalid!") - -# Yaml file -cur_dir = os.path.dirname(__file__) -fedml_dir = os.path.dirname(cur_dir) -python_dir = os.path.dirname(fedml_dir) -yaml_file = os.path.join(python_dir, "examples", "launch", "serve_job_mnist.yaml") - -# Launch job -launch_result_dict = {} -launch_result_status = {} - -launch_result = fedml.api.launch_job(yaml_file) -print("Endpoint id is", launch_result.inner_id) - -cnt = 0 -while 1: - try: - r = fedml.api.get_endpoint(endpoint_id=launch_result.inner_id) - except Exception as e: - raise Exception(f"FAILED to get endpoint:{launch_result.inner_id}. {e}") - if r.status == "DEPLOYED": - print("Deployment has been successfully!") - break - elif r.status == "FAILED": - raise Exception("FAILED to deploy.") - time.sleep(1) - cnt += 1 - if cnt %3 ==0: - print('Deployment status is', r.status) \ No newline at end of file diff --git a/python/tests/test_federate/test_federate.sh b/python/tests/test_federate/test_federate.sh deleted file mode 100644 index ebfcb60330..0000000000 --- a/python/tests/test_federate/test_federate.sh +++ /dev/null @@ -1,26 +0,0 @@ - -WORKSPACE=`pwd` -echo $WORKSPACE -cd $WORKSPACE/examples/federate/quick_start/parrot -python torch_fedavg_mnist_lr_one_line_example.py --cf fedml_config.yaml -python torch_fedavg_mnist_lr_custum_data_and_model_example.py --cf fedml_config.yaml - -cd $WORKSPACE/examples/federate/simulation/sp_decentralized_mnist_lr_example -python torch_fedavg_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - -cd $WORKSPACE/examples/federate/simulation/sp_fednova_mnist_lr_example -python torch_fednova_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - -cd $WORKSPACE/examples/federate/simulation/sp_fedopt_mnist_lr_example -python torch_fedopt_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - -cd $WORKSPACE/examples/federate/simulation/sp_hierarchicalfl_mnist_lr_example -python torch_hierarchicalfl_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - - -cd $WORKSPACE/examples/federate/simulation/sp_turboaggregate_mnist_lr_example -python torch_turboaggregate_mnist_lr_step_by_step_example.py --cf fedml_config.yaml - - -cd $WORKSPACE/examples/federate/simulation/sp_vertical_mnist_lr_example -python torch_vertical_mnist_lr_step_by_step_example.py --cf fedml_config.yaml diff --git a/python/tests/test_launch/test_launch.py b/python/tests/test_launch/test_launch.py deleted file mode 100644 index a6b6ffb9cf..0000000000 --- a/python/tests/test_launch/test_launch.py +++ /dev/null @@ -1,50 +0,0 @@ -import os.path -import time -import fedml -from fedml.api.constants import RunStatus - -API_KEY = os.getenv("API_KEY") -# Login -fedml.set_env_version("test") -fedml.set_local_on_premise_platform_port(18080) -error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) -if error_code != 0: - raise Exception("API Key is invalid!") - -# Yaml file -cur_dir = os.path.dirname(__file__) -fedml_dir = os.path.dirname(cur_dir) -python_dir = os.path.dirname(fedml_dir) -yaml_file = os.path.join(python_dir, "examples", "launch", "hello_job.yaml") - -# Launch job - -launch_result = fedml.api.launch_job(yaml_file) - -# launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster") -if launch_result.result_code != 0: - raise Exception(f"Failed to launch job. Reason: {launch_result.result_message}") - -# check job status -while 1: - time.sleep(1) - # if - # if launch_result_status[run_id] == RunStatus.FINISHED: - # continue - log_result = fedml.api.run_logs(launch_result.run_id, 1, 5) - if log_result is None or log_result.run_status is None: - raise Exception(f"Failed to get job status.") - - print(f"run_id: {launch_result.run_id} run_status: {log_result.run_status}") - - if log_result.run_status in [RunStatus.ERROR, RunStatus.FAILED]: - log_result = fedml.api.run_logs(launch_result.run_id, 1, 100) - if log_result is None or log_result.run_status is None: - raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} and failed to get run logs.") - - raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} run logs: {log_result.log_line_list}") - if log_result.run_status == RunStatus.FINISHED: - print(f"Job finished successfully.") - break - - diff --git a/python/tests/test_train/test_train.py b/python/tests/test_train/test_train.py deleted file mode 100644 index 039d3b81d2..0000000000 --- a/python/tests/test_train/test_train.py +++ /dev/null @@ -1,49 +0,0 @@ -import os.path -import time -import fedml -from fedml.api.constants import RunStatus - -API_KEY = os.getenv("API_KEY") -# Login -fedml.set_env_version("test") -fedml.set_local_on_premise_platform_port(18080) -error_code, error_msg = fedml.api.fedml_login(api_key=API_KEY) -if error_code != 0: - raise Exception("API Key is invalid!") - -# Yaml file -cur_dir = os.path.dirname(__file__) -fedml_dir = os.path.dirname(cur_dir) -python_dir = os.path.dirname(fedml_dir) -yaml_file = os.path.join(python_dir, "examples", "train", "mnist_train", "train.yaml") - -# Launch job - -launch_result = fedml.api.launch_job(yaml_file) - -# launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster") -if launch_result.result_code != 0: - raise Exception(f"Failed to launch job. Reason: {launch_result.result_message}") - -# check job status -while 1: - time.sleep(1) - # if - # if launch_result_status[run_id] == RunStatus.FINISHED: - # continue - log_result = fedml.api.run_logs(launch_result.run_id, 1, 5) - if log_result is None or log_result.run_status is None: - raise Exception(f"Failed to get job status.") - - print(f"run_id: {launch_result.run_id} run_status: {log_result.run_status}") - - if log_result.run_status in [RunStatus.ERROR, RunStatus.FAILED]: - log_result = fedml.api.run_logs(launch_result.run_id, 1, 100) - if log_result is None or log_result.run_status is None: - raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} and failed to get run logs.") - - raise Exception(f"run_id:{launch_result.run_id} run_status:{log_result.run_status} run logs: {log_result.log_line_list}") - if log_result.run_status == RunStatus.FINISHED: - print(f"Job finished successfully.") - break -