# .github/workflows/memcheck.yml

name: Memory Check

on:
  # Tests must run for a PR to be valid and to pass the merge queue.
  # On main we want to see at a glance that every commit passes; any deviation
  # should help when bisecting errors.
  # The merge-queue run is what shows up on main and gives it that clear
  # pass/fail history.
  merge_group:
    branches: [main, alpha*, beta*, rc*]
  pull_request:
    branches: ["*"]

env:
  # Parent directory of the client and node data directories below.
  ANT_DATA_PATH: /home/runner/.local/share/autonomi
  # Client data directory; its logs/ subdirectory is scanned by the client memory check.
  CLIENT_DATA_PATH: /home/runner/.local/share/autonomi/client
  # Searched by the replication check and the node memory check (<node>/logs/*).
  NODE_DATA_PATH: /home/runner/.local/share/autonomi/node
  # Root and log directory of the extra node that is stopped and restarted mid-run.
  RESTART_TEST_NODE_DATA_PATH: /home/runner/.local/share/autonomi/restart_node

jobs:
  memory-check:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository at the commit under test.
      - name: Checkout code
        uses: actions/checkout@v4

      # Print the checked-out commit so it is visible in the job log.
      - name: Check we're on the right commit
        run: git log -1 --oneline

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      # Caching is best-effort: continue-on-error lets the job proceed if this step fails.
      - uses: Swatinem/rust-cache@v2
        continue-on-error: true

      # ripgrep (`rg`) is what the later steps use to search the captured output and logs.
      # Refresh the package index first: installing against the index baked into
      # the runner image can fail once the referenced package versions are superseded.
      - name: install ripgrep
        shell: bash
        run: |
          sudo apt-get update
          sudo apt-get install -y ripgrep

      # Release builds of the node (antnode) and client (ant) binaries that the
      # steps below run from target/release.
      - name: Build binaries
        timeout-minutes: 30
        run: >-
          cargo build --release
          --features local
          --bin antnode
          --bin ant

      # Start the local test network through the maidsafe testnet action,
      # pointing it at the node binary built above.
      - name: Start a local network
        uses: maidsafe/ant-local-testnet-action@main
        with:
          action: start
          enable-evm-testnet: true
          node-path: target/release/antnode
          platform: ubuntu-latest
          build: true

      # Echo ANT_PEERS into the job log.
      # NOTE(review): presumably exported by the start action above — confirm.
      # This step only prints the value; it does not fail when it is empty.
      - name: Check ANT_PEERS was set
        shell: bash
        run: echo "The ANT_PEERS variable has been set to $ANT_PEERS"

      # Launch one extra node in the background with its own root/log directory;
      # it is stopped and started again later in the job.
      - name: Start a node instance to be restarted
        env:
          ANT_LOG: "all"
        run: |
          mkdir -p $RESTART_TEST_NODE_DATA_PATH
          ./target/release/antnode \
            --root-dir $RESTART_TEST_NODE_DATA_PATH \
            --log-output-dest $RESTART_TEST_NODE_DATA_PATH \
            --local \
            --rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" &
          # Give the background node time to come up before the next step.
          sleep 10

      # Test payload used by both upload steps.
      - name: Download 95mb file to be uploaded with the safe client
        shell: bash
        run: |
          wget https://sn-node.s3.eu-west-2.amazonaws.com/the-test-data.zip

      # Appended to GITHUB_ENV so that every later step sees SECRET_KEY.
      - name: export default secret key
        shell: bash
        run: |
          echo "SECRET_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" >> $GITHUB_ENV

      # Upload the test file publicly; stdout and stderr are both captured in ./upload_output.
      - name: File upload
        timeout-minutes: 15
        env:
          ANT_LOG: "v"
        run: |
          ./target/release/ant --log-output-dest=data-dir file upload --public "./the-test-data.zip" > ./upload_output 2>&1

      # Runs even when the upload failed, so the captured output always reaches the job log.
      - name: showing the upload terminal output
        if: always()
        shell: bash
        run: |
          cat upload_output

      # Pull the address printed after "At address: " out of the captured upload
      # output and export it as UPLOAD_ADDRESS for the download step.
      # rg flags: -o prints only the matched text; -r '$1' replaces that text
      # with capture group 1, i.e. the hex address alone.
      - name: parse address
        run: |
          UPLOAD_ADDRESS=$(rg "At address: ([0-9a-f]*)" -o -r '$1' ./upload_output)
          echo "UPLOAD_ADDRESS=$UPLOAD_ADDRESS" >> $GITHUB_ENV
        shell: bash

      # Uploading the same file from a different client must not incur any payment or any new uploads.
      # Note: rg exits with an error immediately if it fails to find a matching pattern.
      # The first client's data directory is moved aside and an empty one is created
      # in its place, so this upload runs against a fresh client directory.
      - name: Start a different client to upload the same file
        run: |
          pwd
          ls -l $ANT_DATA_PATH
          mv $CLIENT_DATA_PATH $ANT_DATA_PATH/client_first
          ls -l $ANT_DATA_PATH
          ls -l $ANT_DATA_PATH/client_first
          ls -l $ANT_DATA_PATH/client_first/logs
          mkdir $ANT_DATA_PATH/client
          ls -l $ANT_DATA_PATH
          cp ./the-test-data.zip ./the-test-data_1.zip
          # Upload the copy made above; stdout and stderr are captured in ./second_upload.
          ./target/release/ant --log-output-dest data-dir file upload "./the-test-data_1.zip" > ./second_upload 2>&1
        env:
          ANT_LOG: "all"
        timeout-minutes: 25

      # Runs even when the second upload failed, so its output always reaches the job log.
      - name: showing the second upload terminal output
        if: always()
        shell: bash
        run: |
          cat second_upload

      # The pid is read from the antnode.pid file in the restart node's root directory.
      - name: Stop the restart node
        run: |
          kill $(cat $RESTART_TEST_NODE_DATA_PATH/antnode.pid)

      # Relaunch the node in the background with the same root and log directory
      # it was first started with, so it comes back up over its existing on-disk state.
      - name: Start the restart node again
        run: |
          ./target/release/antnode \
            --root-dir $RESTART_TEST_NODE_DATA_PATH \
            --log-output-dest $RESTART_TEST_NODE_DATA_PATH \
            --local \
            --rewards-address "0x03B770D9cD32077cC0bF330c13C114a87643B124" &
          # Give the background node time to come up before its logs are inspected.
          sleep 10
        env:
          ANT_LOG: "all"

      # Records are encrypted, and the seed changes after a restart.
      # Currently there will be `Existing record found` but NO `Existing record loaded`,
      # because decryption fails (a different seed is used).
      # rg exits non-zero when nothing matches, which fails this step.
      - name: Assert we've reloaded some chunks
        run: rg "Existing record found" $RESTART_TEST_NODE_DATA_PATH

      - name: Wait at least 1min for replication to happen # it is throttled to once/30s.
        run: sleep 60

      - name: Verify data replication using rg
        shell: bash
        timeout-minutes: 1
        # For each log message: get the per-file counts plus stats, pick out the
        # "N matches" stats line, then keep only the number.
        # NOTE(review): the three counts are only echoed; nothing in this script
        # compares them against an expected level of replication.
        run: |
          sending_list_count=$(rg "Sending a replication list" $NODE_DATA_PATH -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Sent $sending_list_count replication lists"
          received_list_count=$(rg "Received replication list from" $NODE_DATA_PATH -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Received $received_list_count replication lists"
          fetching_attempt_count=$(rg "FetchingKeysForReplication" $NODE_DATA_PATH -c --stats | \
            rg "(\d+) matches" | rg "\d+" -o)
          echo "Carried out $fetching_attempt_count fetching attempts"
        if: always()

      # Download the file uploaded earlier, using the address exported as UPLOAD_ADDRESS.
      - name: File Download
        timeout-minutes: 2
        env:
          ANT_LOG: "v"
        run: |
          ./target/release/ant --log-output-dest=data-dir file download ${{ env.UPLOAD_ADDRESS }} ./downloaded_resources

      # Informational only: prints the number of antnode processes still alive.
      # continue-on-error keeps a failure here from failing the job.
      - name: Check nodes running
        shell: bash
        timeout-minutes: 1
        continue-on-error: true
        run: pgrep antnode | wc -l
        if: always()

      # Always runs, so the network is stopped even when an earlier step failed.
      - name: Stop the local network and upload logs
        if: always()
        uses: maidsafe/ant-local-testnet-action@main
        with:
          action: stop
          log_file_prefix: safe_test_logs_memcheck
          platform: ubuntu-latest
          build: true

      - name: Check node memory usage
        shell: bash
        # The resources file and churning chunk_size we upload may change, and with it mem consumption.
        # This is set to a value high enough to allow for some variation depending on
        # resources and node location in the network, but hopefully low enough to catch
        # any wild memory issues.
        # Any changes to this value should be carefully considered and tested!
        # As we have a bootstrap node acting as an access point for churning nodes and client,
        # the memory usage here will be significantly higher than in the benchmark test,
        # where we don't have a bootstrap node.
        run: |
          node_peak_mem_limit_mb="300" # mb

          # Highest "memory_used_mb" value reported in any node log.
          peak_mem_usage=$(
            rg '"memory_used_mb":[^,]*' $NODE_DATA_PATH/*/logs/* -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/{print $2}' |
            sort -n |
            tail -n 1
          )
          echo "Node memory usage: $peak_mem_usage MB"

          # bc does the comparison because the logged value may not be an integer.
          if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then
            # Report both the measured peak and the limit it exceeded.
            echo "Node memory usage exceeded threshold: $peak_mem_usage MB > $node_peak_mem_limit_mb MB"
            exit 1
          fi
        if: always()

      - name: Check client memory usage
        shell: bash
        # Limits here are lower than in the benchmark tests as there is less going on.
        # The --glob patterns are quoted so the shell hands them to rg verbatim
        # instead of expanding them against files in the working directory.
        run: |
          client_peak_mem_limit_mb="1024" # mb
          client_avg_mem_limit_mb="512" # mb

          # Highest "memory_used_mb" value reported in the client logs.
          peak_mem_usage=$(
            rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob 'ant.*' -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/{print $2}' |
            sort -n |
            tail -n 1
          )
          echo "Peak memory usage: $peak_mem_usage MB"
          if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then
            echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB"
            exit 1
          fi

          # Sum of all reported values, rounded to an integer for the shell arithmetic below.
          total_mem=$(
            rg '"memory_used_mb":[^,]*' $CLIENT_DATA_PATH/logs --glob 'ant.*' -o --no-line-number --no-filename |
            awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}'
          )
          # Number of samples: taken from the "N matches" line of rg --stats.
          num_of_times=$(
            rg "\"memory_used_mb\"" $CLIENT_DATA_PATH/logs --glob 'ant.*' -c --stats |
            rg "(\d+) matches" |
            rg "\d+" -o
          )
          echo "num_of_times: $num_of_times"
          echo "Total memory is: $total_mem"
          average_mem=$(($total_mem/$(($num_of_times))))
          echo "Average memory is: $average_mem"

          if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then
            echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB"
            exit 1
          fi

      # Handling times are logged at Trace level,
      # whereas the local_network startup tool sets the logging level to Debug.
      #
      # - name: Check node swarm_driver handling statistics
      #   shell: bash
      #   # With the latest improvements, swarm_driver will be in high chance
      #   # has no super long handling (longer than 1s).
      #   # As the `rg` cmd will fail the shell directly if no entry find,
      #   # hence not covering it.
      #   # Be aware that if do need to looking for handlings longer than second, it shall be:
      #   #   rg "SwarmCmd handled in [^m,µ,n]*s:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats
      #   run: |
      #     num_of_times=$(
      #       rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats |
      #       rg "(\d+) matches" |
      #       rg "\d+" -o
      #     )
      #     echo "Number of long cmd handling times: $num_of_times"
      #     total_long_handling_ms=$(
      #       rg "SwarmCmd handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename |
      #       awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}'
      #     )
      #     echo "Total cmd long handling time is: $total_long_handling_ms ms"
      #     average_handling_ms=$(($total_long_handling_ms/$(($num_of_times))))
      #     echo "Average cmd long handling time is: $average_handling_ms ms"
      #     total_long_handling=$(($total_long_handling_ms))
      #     total_num_of_times=$(($num_of_times))
      #     num_of_times=$(
      #       rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -c --stats |
      #       rg "(\d+) matches" |
      #       rg "\d+" -o
      #     )
      #     echo "Number of long event handling times: $num_of_times"
      #     total_long_handling_ms=$(
      #       rg "SwarmEvent handled in [0-9.]+ms:" $NODE_DATA_PATH/*/logs/* --glob antnode.* -o --no-line-number --no-filename |
      #       awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}'
      #     )
      #     echo "Total event long handling time is: $total_long_handling_ms ms"
      #     average_handling_ms=$(($total_long_handling_ms/$(($num_of_times))))
      #     echo "Average event long handling time is: $average_handling_ms ms"
      #     total_long_handling=$(($total_long_handling_ms+$total_long_handling))
      #     total_num_of_times=$(($num_of_times+$total_num_of_times))
      #     average_handling_ms=$(($total_long_handling/$(($total_num_of_times))))
      #     echo "Total swarm_driver long handling times is: $total_num_of_times"
      #     echo "Total swarm_driver long handling duration is: $total_long_handling ms"
      #     echo "Total average swarm_driver long handling duration is: $average_handling_ms ms"

      # Best-effort: continue-on-error keeps a failure here from failing the job.
      - name: Move restart_node log to the working directory
        if: always()
        continue-on-error: true
        timeout-minutes: 1
        run: |
          ls -l $RESTART_TEST_NODE_DATA_PATH
          mv $RESTART_TEST_NODE_DATA_PATH/antnode.log ./restart_node.log

      # Publish restart_node.log from the working directory as a build artifact.
      # Best-effort and always attempted, so the log is available after failed runs too.
      # Pinned to a major version rather than the moving `main` branch, so an
      # upstream change cannot silently alter this workflow.
      - name: Upload restart_node log
        uses: actions/upload-artifact@v4
        with:
          name: memory_check_restart_node_log
          path: restart_node.log
        continue-on-error: true
        if: always()