Skip to content

Commit

Permalink
test: collect detailed logs for tests in datadog[infeng-752] (#9637)
Browse files Browse the repository at this point in the history
  • Loading branch information
djanicekpach authored Jul 17, 2024
1 parent 274d763 commit bbd6f8a
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 11 deletions.
119 changes: 119 additions & 0 deletions .circleci/datadog/ci-local-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Config file taken from https://github.com/DataDog/datadog-agent/blob/main/pkg/config/config_template.yaml
# These values are overridden by environment variables:
# api_key
# dd_site
# dd_url
# DD_TAGS
# DD_EXTRA_TAGS
# DD_ENV
# APM is explicitly disabled in this file (see apm_config.enabled) for cost reasons.


##################################
## Log collection Configuration ##
##################################

## @param logs_enabled - boolean - optional - default: false
## @env DD_LOGS_ENABLED - boolean - optional - default: false
## Enable Datadog Agent log collection by setting logs_enabled to true.
#
logs_enabled: true

## @param logs_config - custom object - optional
## Log collection settings. Every option below must stay indented under
## logs_config; at column 0 it would be read as an unrelated top-level key
## and logs_config itself would be empty.
## See https://docs.datadoghq.com/agent/logs/
#
logs_config:

  ## @param container_collect_all - boolean - optional - default: false
  ## @env DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL - boolean - optional - default: false
  ## Enable container log collection for all the containers (see ac_exclude to filter out containers)
  #
  container_collect_all: true


####################################
## Trace Collection Configuration ##
####################################

## @param apm_config - custom object - optional
## Settings for trace (APM) collection.
## See https://docs.datadoghq.com/agent/apm/
#
apm_config:

  ## @param enabled - boolean - optional - default: true
  ## @env DD_APM_ENABLED - boolean - optional - default: true
  ## The APM Agent is on by default; it is turned off here for cost reasons.
  ## This key must stay nested under apm_config to apply to APM.
  #
  enabled: false

######################################
## Process Collection Configuration ##
######################################

# @param process_config - custom object - optional
# Settings for Process data collection. Every collector below is switched off.
# Each `enabled` key must stay nested under its parent object; flattened to
# column 0 they would all collide as one duplicate top-level `enabled` key.
# See https://docs.datadoghq.com/graphing/infrastructure/process/

process_config:

  # @param process_collection - custom object - optional
  # Specifies settings for collecting processes.
  process_collection:
    # @param enabled - boolean - optional - default: false
    # Enables collection of information about running processes.
    enabled: false

  # @param container_collection - custom object - optional
  # Specifies settings for collecting containers.
  container_collection:
    # @param enabled - boolean - optional - default: true
    # Enables collection of information about running containers.
    enabled: false

  # Deprecated - use `process_collection.enabled` and `container_collection.enabled` instead
  # @param enabled - string - optional - default: "false"
  # @env DD_PROCESS_CONFIG_ENABLED - string - optional - default: "false"
  # A string indicating the enabled state of the Process Agent:
  #  * "false"    : The Agent collects only containers information.
  #  * "true"     : The Agent collects containers and processes information.
  #  * "disabled" : The Agent process collection is disabled.
  #
  # "disabled" is used so this legacy switch agrees with the two explicit
  # `enabled: false` settings above; "false" would ask for container
  # collection, which contradicts `container_collection.enabled: false`.
  # NOTE(review): which key wins when both are set depends on the Agent
  # version - confirm against the installed Agent 7 release.
  enabled: "disabled"

  # @param process_discovery - custom object - optional
  # Specifies custom settings for the `process_discovery` object.
  process_discovery:
    # @param enabled - boolean - optional - default: true
    # Toggles the `process_discovery` check. If enabled, this check gathers information about running integrations.
    enabled: false

    # @param interval - duration - optional - default: 4h - minimum: 10m
    # How often the process discovery check should run (set to the documented minimum).
    interval: 10m


###########################
## Logging Configuration ##
###########################

## @param log_level - string - optional - default: info
## @env DD_LOG_LEVEL - string - optional - default: info
## Lowest severity the Datadog Agent writes to its own log.
## Accepted values: trace, debug, info, warn, error, critical, and off.
## Keep the value quoted: the 'off' level is only valid when quoted.
#
log_level: 'debug'

## @param log_file - string - optional
## @env DD_LOG_FILE - string - optional
## File the Datadog Agent writes its own log to.
## NOTE(review): the .txt suffix keeps this file out of the `*.log` globs used
## for test-log collection - presumably intentional; confirm before renaming.
## See https://docs.datadoghq.com/agent/guide/agent-log-files/
#
log_file: '/tmp/artifacts/logs/dd-agent-log.txt'


13 changes: 13 additions & 0 deletions .circleci/datadog/e2e-log-settings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Datadog Agent log sources for e2e CI runs: each entry tails the files
# matching `path` and tags them with `service` and `source`.
# `<SERVICE_NAME>` is a placeholder; the CI install step substitutes it with
# sed before writing this file into the agent's conf.d directory, so it must
# stay spelled exactly as is.
logs:
  - type: file
    path: "/tmp/artifacts/logs/*.log"
    service: "<SERVICE_NAME>"
    source: "determined-task-logs"
  - type: file
    path: "/tmp/devcluster/*.log"
    service: "<SERVICE_NAME>"
    source: "devcluster-logs"
  - type: file
    path: "/tmp/priority_scheduler/*.log"
    service: "<SERVICE_NAME>"
    source: "devcluster-priority-scheduler-logs"
95 changes: 84 additions & 11 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ commands:
description: Collect logs from the cluster tasks.
steps:
- run:
when: on_fail
name: "Ensure necessary Python packages are available."
when: always
command: |
pkg_names="fire determined"
for pkg_name in $pkg_names; do
Expand All @@ -161,8 +161,8 @@ commands:
fi
done
- run:
when: on_fail
name: "Collect logs and calculate statistics"
when: always
command: |
target_dir="<< parameters.store_path >>"
mkdir -p $target_dir
Expand Down Expand Up @@ -626,6 +626,50 @@ commands:
type: boolean
default: true
steps:
- when:
    # Install the agent only when the job waits for a master on localhost
    # that is not a managed devcluster.
    condition:
      and:
        - equal: [<<parameters.master-host>>, 'localhost']
        - <<parameters.wait-for-master>>
        - not: <<parameters.managed-devcluster>>
    steps:
      - run:
          name: Install DataDog agent
          command: |
            # Opt-in: nothing happens unless the job environment enables monitoring.
            if [ "$AIS_DD_ENABLE_MONITORING" == "true" ]; then
              # Build the tag list by concatenation so that line-continuation
              # indentation can never leak whitespace into the tag string.
              host_tags="test.mark:<<parameters.mark>>"
              host_tags+=",ci.pipeline_id:${CIRCLE_PIPELINE_ID}"
              host_tags+=",ci.workflow_id:${CIRCLE_WORKFLOW_ID}"
              host_tags+=",ci.job_num:${CIRCLE_BUILD_NUM}"
              host_tags+=",ci.username:${CIRCLE_USERNAME}"
              host_tags+=",git.tag:${CIRCLE_TAG}"
              host_tags+=",git.commit:${CIRCLE_SHA1}"
              host_tags+=",git.repo:${CIRCLE_PROJECT_REPONAME}"
              host_tags+=",ci.totalNodes:${CIRCLE_NODE_TOTAL}"
              host_tags+=",ci.nodeIdx:${CIRCLE_NODE_INDEX}"
              host_tags+=",git.pr_num:${CIRCLE_PR_NUMBER}"

              # The agent's log file and the collected test logs live here.
              sudo mkdir -p /tmp/artifacts/logs
              sudo chmod -R a+rw /tmp/artifacts/logs

              # Fetch the installer first and fail loudly on HTTP/network
              # errors, instead of handing bash an empty or error-page script.
              install_script="$(curl -fsSL https://s3.amazonaws.com/dd-agent/scripts/install_script_agent7.sh)" || {
                echo "Failed to download the DataDog agent install script" >&2
                exit 1
              }
              DD_ENV="ci-${CIRCLE_JOB}" \
              DD_HOST_TAGS="$host_tags" \
              DD_SERVICE="determined-pytest-<<parameters.mark>>" \
                bash -c "$install_script"

              # config files for the agent have an expected file structure
              sudo mkdir -p /etc/datadog-agent/conf.d/determined-master.d/
              # Write through `sudo tee`: a plain `>`/`>>` redirection is opened
              # by the unprivileged shell, not by sudo, so it would need the
              # agent config (which holds the API key) to be world-writable.
              sudo tee -a /etc/datadog-agent/datadog.yaml \
                < .circleci/datadog/ci-local-config.yaml > /dev/null
              sed -e "s/<SERVICE_NAME>/determined-pytest-<<parameters.mark>>/g" \
                .circleci/datadog/e2e-log-settings.yaml \
                | sudo tee /etc/datadog-agent/conf.d/determined-master.d/conf.yaml > /dev/null

              # restart agent with config
              # (dd-agent joins the docker group before the restart)
              sudo usermod -a -G docker dd-agent
              sudo systemctl stop datadog-agent
              sudo systemctl start datadog-agent
              # Give the agent a moment to come up before querying its status.
              sleep 5
              sudo datadog-agent status
            fi
# Wait for master before splitting tests, since so many splits depend on
# asking master for its configuration in order to apply skipifs.
- when:
Expand Down Expand Up @@ -662,24 +706,38 @@ commands:
echo "No Determined master listening on '<<parameters.master-scheme>>://<<parameters.master-host>>:<<parameters.master-port>>'"
fi
cat /tmp/all-relevant-files | circleci tests run --command="DD_CIVISIBILITY_AGENTLESS_ENABLED=true \
DD_SITE='datadoghq.com' \
DD_ENV='ci-<<parameters.mark>>' DD_SERVICE='determined-pytest-<<parameters.mark>>' \
tags="test.mark:<<parameters.mark>>,\
ci.pipeline_id:${CIRCLE_PIPELINE_ID},\
ci.workflow_id:${CIRCLE_WORKFLOW_ID},\
ci.job_num:${CIRCLE_BUILD_NUM},\
ci.username:${CIRCLE_USERNAME},\
git.tag:${CIRCLE_TAG},\
git.commit:${CIRCLE_SHA1},\
ci.totalNodes:${CIRCLE_NODE_TOTAL},\
ci.nodeIdx:${CIRCLE_NODE_INDEX},\
git.pr_num:${CIRCLE_PR_NUMBER}"
CMD="DD_CIVISIBILITY_AGENTLESS_ENABLED=true \
DD_TAGS='${tags}' \
DD_ENV='ci-<<parameters.mark>>' \
DD_SERVICE='determined-pytest-<<parameters.mark>>' \
DET_MASTER_CERT_FILE=<<parameters.master-cert>> \
DET_MASTER_CERT_NAME=<<parameters.master-cert-name>> \
IS_CIRCLECI_JOB=1 XDG_CONFIG_HOME=/tmp \
xargs pytest --capture=tee-sys -vv \
-m '<<parameters.mark>>' \
--durations=0 \
--ddtrace \
--master-scheme="<<parameters.master-scheme>>" \
--master-host="<<parameters.master-host>>" \
--master-port="<<parameters.master-port>>" \
--master-scheme='<<parameters.master-scheme>>' \
--master-host='<<parameters.master-host>>' \
--master-port='<<parameters.master-port>>' \
-o junit_family=xunit1 \
--junit-xml="<<parameters.junit-path>>" \
<<parameters.extra-pytest-flags>>" \
--verbose --split-by=timings
--junit-xml='<<parameters.junit-path>>' \
<<parameters.extra-pytest-flags>>"
echo "$CMD"
cat /tmp/all-relevant-files | circleci tests run --command="$CMD" \
--verbose --split-by=timings
pytest_status=$?
echo Pytest exited with $pytest_status
exit $pytest_status
Expand All @@ -694,6 +752,21 @@ commands:
master_address: "<<parameters.master-scheme>>://<<parameters.master-host>>:<<parameters.master-port>>"
- store_artifacts:
path: /tmp/artifacts/logs
- when:
    # Same gate as the install step: localhost master that the job waits for
    # and that is not a managed devcluster.
    condition:
      and:
        - equal: [<<parameters.master-host>>, 'localhost']
        - <<parameters.wait-for-master>>
        - not: <<parameters.managed-devcluster>>
    steps:
      # We don't know how long Circle leaves these machines running in the
      # background. Take down the agent for safety.
      - run:
          name: Stop DataDog agent
          when: always
          command: |
            # Nothing to stop unless monitoring was enabled for this job.
            [ "$AIS_DD_ENABLE_MONITORING" == "true" ] || exit 0
            # Best effort: a failure to stop the agent must not fail the job.
            sudo systemctl stop datadog-agent || true

run-det-deploy-tests:
parameters:
Expand Down

0 comments on commit bbd6f8a

Please sign in to comment.