Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DBNode] - Make repairs actually repair data #1849

Merged
merged 15 commits into from
Aug 6, 2019
11 changes: 5 additions & 6 deletions scripts/docker-integration-tests/cold_writes_simple/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ function write_data {
}
}')


if [[ $respCode -eq "200" ]]; then
return 0
else
Expand Down Expand Up @@ -63,17 +62,17 @@ function read_all {
}

echo "Write data for 'now - 2 * bufferPast' (testing cold writes from memory)"
write_data "coldWritesNoIndex" "foo" "$(($(date +"%s") - 60 * 10 * 2))" 12.3456789
write_data "coldWritesRepairAndNoIndex" "foo" "$(($(date +"%s") - 60 * 10 * 2))" 12.3456789

echo "Expect to read 1 datapoint"
read_all "coldWritesNoIndex" "foo" 1
read_all "coldWritesRepairAndNoIndex" "foo" 1

echo "Write data for 'now - 2 * blockSize' (testing compaction to disk)"
write_data "coldWritesNoIndex" "foo" "$(($(date +"%s") - 60 * 60 * 2))" 98.7654321
write_data "coldWritesRepairAndNoIndex" "foo" "$(($(date +"%s") - 60 * 60 * 2))" 98.7654321

echo "Wait until cold writes are flushed"
ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ -n "$(docker-compose -f ${COMPOSE_FILE} exec dbnode01 find /var/lib/m3db/data/coldWritesNoIndex -name "*1-checkpoint.db")" ]'
'[ -n "$(docker-compose -f ${COMPOSE_FILE} exec dbnode01 find /var/lib/m3db/data/coldWritesRepairAndNoIndex -name "*1-checkpoint.db")" ]'

echo "Restart DB (test bootstrapping cold writes)"
docker-compose -f ${COMPOSE_FILE} restart dbnode01
Expand All @@ -83,4 +82,4 @@ ATTEMPTS=10 TIMEOUT=2 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:9002/health | jq .bootstrapped)" == true ]'

echo "Expect to read 2 datapoints"
read_all "coldWritesNoIndex" "foo" 2
read_all "coldWritesRepairAndNoIndex" "foo" 2
90 changes: 80 additions & 10 deletions scripts/docker-integration-tests/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,70 @@ function setup_single_m3db_node {
wait_for_db_init
}

# Creates a three-node M3DB cluster (replicationFactor 3, 3 shards) through the
# coordinator's database-create endpoint, then waits for the placement, the
# namespaces and all three dbnodes to become ready.
#
# Environment overrides (default in parentheses):
#   DBNODE_HOST         placement address; when set it is used for ALL three
#                       hosts (dbnode01 / dbnode02 / dbnode03)
#   DBNODE_PORT         placement port shared by all hosts (9000)
#   DBNODE_HEALTH_PORT  health port; when set it is used for ALL three nodes
#                       (9012 / 9022 / 9032)
#   COORDINATOR_PORT    coordinator API port on 0.0.0.0 (7201)
#
# Depends on retry_with_backoff and wait_for_namespaces, which are defined
# outside this function.
function setup_three_m3db_nodes {
  local dbnode_host_1=${DBNODE_HOST:-dbnode01}
  local dbnode_host_2=${DBNODE_HOST:-dbnode02}
  local dbnode_host_3=${DBNODE_HOST:-dbnode03}
  local dbnode_port=${DBNODE_PORT:-9000}
  local dbnode_host_1_health_port=${DBNODE_HEALTH_PORT:-9012}
  local dbnode_host_2_health_port=${DBNODE_HEALTH_PORT:-9022}
  local dbnode_host_3_health_port=${DBNODE_HEALTH_PORT:-9032}
  local coordinator_port=${COORDINATOR_PORT:-7201}

  echo "Wait for API to be available"
  ATTEMPTS=100 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
    '[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/namespace | jq ".namespaces | length")" == "0" ]'

  echo "Adding placement and agg namespace"
  # The payload is a single-quoted JSON document; the '"${var}"' sequences
  # briefly leave the single quotes to splice in the shell variables.
  curl -vvvsSf -X POST "0.0.0.0:${coordinator_port}/api/v1/database/create" -d '{
    "type": "cluster",
    "namespaceName": "agg",
    "retentionTime": "6h",
    "num_shards": 3,
    "replicationFactor": 3,
    "hosts": [
      {
        "id": "m3db_local_1",
        "isolation_group": "rack-a",
        "zone": "embedded",
        "weight": 1024,
        "address": "'"${dbnode_host_1}"'",
        "port": '"${dbnode_port}"'
      },
      {
        "id": "m3db_local_2",
        "isolation_group": "rack-b",
        "zone": "embedded",
        "weight": 1024,
        "address": "'"${dbnode_host_2}"'",
        "port": '"${dbnode_port}"'
      },
      {
        "id": "m3db_local_3",
        "isolation_group": "rack-c",
        "zone": "embedded",
        "weight": 1024,
        "address": "'"${dbnode_host_3}"'",
        "port": '"${dbnode_port}"'
      }
    ]
  }'

  echo "Wait until placement is init'd"
  # Only the first instance is probed here; the per-node bootstrap checks
  # below cover the remaining nodes.
  ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
    '[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/placement | jq .placement.instances.m3db_local_1.id)" == \"m3db_local_1\" ]'

  wait_for_namespaces

  echo "Wait until bootstrapped"
  # Same health probe for each node, in node order.
  local health_port
  for health_port in \
    "${dbnode_host_1_health_port}" \
    "${dbnode_host_2_health_port}" \
    "${dbnode_host_3_health_port}"; do
    ATTEMPTS=100 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
      '[ "$(curl -sSf 0.0.0.0:'"${health_port}"'/health | jq .bootstrapped)" == true ]'
  done
}

function wait_for_db_init {
local dbnode_host=${DBNODE_HOST:-dbnode01}
local dbnode_port=${DBNODE_PORT:-9000}
Expand Down Expand Up @@ -80,6 +144,16 @@ function wait_for_db_init {
ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/placement | jq .placement.instances.m3db_local.id)" == \"m3db_local\" ]'

wait_for_namespaces

echo "Wait until bootstrapped"
ATTEMPTS=100 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${dbnode_health_port}"'/health | jq .bootstrapped)" == true ]'
}

function wait_for_namespaces {
local coordinator_port=${COORDINATOR_PORT:-7201}

echo "Wait until agg namespace is init'd"
ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/namespace | jq .registry.namespaces.agg.indexOptions.enabled)" == true ]'
Expand All @@ -94,19 +168,19 @@ function wait_for_db_init {
ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/namespace | jq .registry.namespaces.unagg.indexOptions.enabled)" == true ]'

echo "Adding coldWritesNoIndex namespace"
echo "Adding coldWritesRepairAndNoIndex namespace"
curl -vvvsSf -X POST 0.0.0.0:${coordinator_port}/api/v1/services/m3db/namespace -d '{
"name": "coldWritesNoIndex",
"name": "coldWritesRepairAndNoIndex",
"options": {
"bootstrapEnabled": true,
"flushEnabled": true,
"writesToCommitLog": true,
"cleanupEnabled": true,
"snapshotEnabled": true,
"repairEnabled": false,
"repairEnabled": true,
"coldWritesEnabled": true,
"retentionOptions": {
"retentionPeriodDuration": "8h",
"retentionPeriodDuration": "4h",
"blockSizeDuration": "1h",
"bufferFutureDuration": "10m",
"bufferPastDuration": "10m",
Expand All @@ -116,12 +190,8 @@ function wait_for_db_init {
}
}'

echo "Wait until coldWritesNoIndex namespace is init'd"
echo "Wait until coldWritesRepairAndNoIndex namespace is init'd"
ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/namespace | jq .registry.namespaces.coldWritesNoIndex.coldWritesEnabled)" == true ]'

echo "Wait until bootstrapped"
ATTEMPTS=100 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \
'[ "$(curl -sSf 0.0.0.0:'"${dbnode_health_port}"'/health | jq .bootstrapped)" == true ]'
'[ "$(curl -sSf 0.0.0.0:'"${coordinator_port}"'/api/v1/namespace | jq .registry.namespaces.coldWritesRepairAndNoIndex.coldWritesEnabled)" == true ]'
}

60 changes: 60 additions & 0 deletions scripts/docker-integration-tests/repair/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Compose topology for the repair integration test: three dbnode containers
# built from the same image and config file, plus one coordinator.
version: "3.5"
services:
  dbnode01:
    expose:
      - "9000-9004"
      - "2379-2380"
    ports:
      # Host 9012/9013 -> container 9002/9003.
      - "0.0.0.0:9012:9002"
      - "0.0.0.0:9013:9003"
    networks:
      - backend
    image: "m3dbnode_integration:${REVISION}"
    environment:
      # Distinguishes this node from the others sharing the same config file.
      - M3DB_HOST_ID=m3db_local_1
    volumes:
      - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml"
  dbnode02:
    expose:
      - "9000-9004"
      - "2379-2380"
    ports:
      # Host 9022/9023 -> container 9002/9003.
      - "0.0.0.0:9022:9002"
      - "0.0.0.0:9023:9003"
    networks:
      - backend
    image: "m3dbnode_integration:${REVISION}"
    environment:
      - M3DB_HOST_ID=m3db_local_2
    volumes:
      - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml"
  dbnode03:
    expose:
      - "9000-9004"
      - "2379-2380"
    ports:
      # Host 9032/9033 -> container 9002/9003.
      - "0.0.0.0:9032:9002"
      - "0.0.0.0:9033:9003"
    networks:
      - backend
    image: "m3dbnode_integration:${REVISION}"
    environment:
      - M3DB_HOST_ID=m3db_local_3
    volumes:
      - "./m3dbnode.yml:/etc/m3dbnode/m3dbnode.yml"
  coordinator01:
    expose:
      - "7201"
      - "7203"
      - "7204"
    ports:
      - "0.0.0.0:7201:7201"
      - "0.0.0.0:7203:7203"
      - "0.0.0.0:7204:7204"
    networks:
      - backend
    image: "m3coordinator_integration:${REVISION}"
    volumes:
      # Mounts the whole test directory as the coordinator config directory.
      - "./:/etc/m3coordinator/"
networks:
  backend:
46 changes: 46 additions & 0 deletions scripts/docker-integration-tests/repair/m3coordinator.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Coordinator configuration for the repair integration test.
listenAddress:
  type: "config"
  value: "0.0.0.0:7201"

logging:
  level: info

metrics:
  scope:
    prefix: "coordinator"
  prometheus:
    handlerPath: /metrics
    listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is resolved
  sanitization: prometheus
  samplingRate: 1.0
  extended: none

limits:
  perQuery:
    maxFetchedSeries: 100

clusters:
  - namespaces:
      - namespace: agg
        type: aggregated
        retention: 10h
        resolution: 15s
      - namespace: unagg
        type: unaggregated
        retention: 10h
    client:
      config:
        service:
          env: default_env
          zone: embedded
          service: m3db
          cacheDir: /var/lib/m3kv
          etcdClusters:
            - zone: embedded
              endpoints:
                # etcd client endpoint on the dbnode01 container.
                - dbnode01:2379
      writeConsistencyLevel: majority
      readConsistencyLevel: unstrict_majority

tagOptions:
  idScheme: quoted
86 changes: 86 additions & 0 deletions scripts/docker-integration-tests/repair/m3dbnode.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# dbnode configuration shared by all three nodes of the repair integration
# test; the node identity comes from the M3DB_HOST_ID environment variable.
db:
  logging:
    level: info

  tracing:
    backend: jaeger
    jaeger:
      reporter:
        localAgentHostPort: jaeger:6831
      sampler:
        type: const
        param: 1

  metrics:
    prometheus:
      handlerPath: /metrics
    sanitization: prometheus
    samplingRate: 1.0
    extended: detailed

  listenAddress: 0.0.0.0:9000
  clusterListenAddress: 0.0.0.0:9001
  httpNodeListenAddress: 0.0.0.0:9002
  httpClusterListenAddress: 0.0.0.0:9003
  debugListenAddress: 0.0.0.0:9004

  hostID:
    resolver: environment
    envVarName: M3DB_HOST_ID

  client:
    writeConsistencyLevel: majority
    readConsistencyLevel: unstrict_majority

  gcPercentage: 100

  writeNewSeriesAsync: true
  writeNewSeriesLimitPerSecond: 1048576
  writeNewSeriesBackoffDuration: 2ms

  bootstrap:
    # Intentionally disable peers bootstrapper to ensure it doesn't interfere with test.
    bootstrappers:
      - filesystem
      - commitlog
      - uninitialized_topology
    commitlog:
      returnUnfulfilledForCorruptCommitLogFiles: false

  cache:
    series:
      policy: lru
    postingsList:
      size: 262144

  commitlog:
    flushMaxBytes: 524288
    flushEvery: 1s
    queue:
      calculationType: fixed
      size: 2097152

  fs:
    filePathPrefix: /var/lib/m3db

  config:
    service:
      env: default_env
      zone: embedded
      service: m3db
      cacheDir: /var/lib/m3kv
      etcdClusters:
        - zone: embedded
          endpoints:
            - dbnode01:2379
    seedNodes:
      # Only m3db_local_1 is listed as a seed node.
      initialCluster:
        - hostID: m3db_local_1
          endpoint: http://dbnode01:2380

  # Enable repairs.
  repair:
    enabled: true
    throttle: 1ms
    checkInterval: 1ms

Loading