Skip to content

Commit

Permalink
network proxy-based failure testing
Browse files Browse the repository at this point in the history
- Lots of detail is in src/test/regress/mitmscripts/README
- Create a new target, make check-failure, which runs tests
- Tells travis how to install everything and run the tests
  • Loading branch information
lithp committed Jul 6, 2018
1 parent c6cf40e commit a54f9a6
Show file tree
Hide file tree
Showing 14 changed files with 1,565 additions and 2 deletions.
11 changes: 11 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
sudo: required
dist: trusty
language: c
python:
- "3.5"
cache:
apt: true
directories:
Expand All @@ -27,10 +29,19 @@ before_install:
- setup_apt
- curl https://install.citusdata.com/community/deb.sh | sudo bash
- nuke_pg
- pyenv versions
- pyenv global 3.6
- sudo apt-get install python3-pip
- sudo pip3 install --upgrade pip
- python --version
- python3 --version
install:
- install_uncrustify
- install_pg
- install_custom_pg
- pip3 install --user mitmproxy==3.0.4
- pip3 install --user construct==2.9.45
- mitmproxy --version
# download and install HLL manually, as custom builds won't satisfy deps
# only install if performing non-11 build
- |
Expand Down
6 changes: 5 additions & 1 deletion src/test/regress/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ output_files := $(patsubst $(citus_abs_srcdir)/output/%.source,expected/%.out, $
# intermediate, for muscle memory backward compatibility.
check: check-full
# check-full triggers all tests that ought to be run routinely
check-full: check-multi check-multi-mx check-multi-task-tracker-extra check-worker check-follower-cluster
check-full: check-multi check-multi-mx check-multi-task-tracker-extra check-worker check-follower-cluster check-failure

# using pg_regress_multi_check unnecessarily starts up multiple nodes, which isn't needed
# for check-worker. But that's harmless besides a few cycles.
Expand Down Expand Up @@ -79,6 +79,10 @@ check-follower-cluster: all
$(pg_regress_multi_check) --load-extension=citus --follower-cluster \
-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/multi_follower_schedule $(EXTRA_TESTS)

# check-failure: run the network-failure regression suite. --mitmproxy makes
# pg_regress_multi route one worker's connections through mitmproxy so tests
# can inject failures (see src/test/regress/mitmscripts/README).
check-failure: all
	$(pg_regress_multi_check) --load-extension=citus --mitmproxy \
	-- $(MULTI_REGRESS_OPTS) --schedule=$(citus_abs_srcdir)/failure_schedule $(EXTRA_TESTS)

# Remove generated regress inputs/outputs and the scratch cluster directory.
clean distclean maintainer-clean:
	rm -f $(output_files) $(input_files)
	rm -rf tmp_check/
20 changes: 20 additions & 0 deletions src/test/regress/Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Pipenv manifest for the failure-testing environment: `pipenv install` then
# `pipenv shell` in src/test/regress (see mitmscripts/README).
[[source]]

name = "pypi"
url = "https://pypi.python.org/simple"
verify_ssl = true


[packages]

# Pinned: the proxy script integrates deeply with mitmproxy internals, so
# other versions may not work (README recommends exactly 3.0.4).
mitmproxy = "==3.0.4"
construct = "*"


[dev-packages]



[requires]

# NOTE(review): .travis.yml sets `pyenv global 3.6` and the README recommends
# Python 3.6, but this requires 3.5 -- confirm which version is intended.
python_version = "3.5"
328 changes: 328 additions & 0 deletions src/test/regress/Pipfile.lock

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions src/test/regress/expected/failure_setup.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------

(1 row)

-- add the workers
SELECT master_add_node('localhost', :worker_1_port); -- the second worker
master_add_node
---------------------------------------------------
(1,1,localhost,57637,default,f,t,primary,default)
(1 row)

SELECT master_add_node('localhost', :worker_2_port + 2); -- the first worker, behind a mitmproxy
master_add_node
---------------------------------------------------
(2,2,localhost,57640,default,f,t,primary,default)
(1 row)

50 changes: 50 additions & 0 deletions src/test/regress/expected/failure_test_helpers.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- By default Citus makes lots of connections in the background which fill up the log
-- By tweaking these settings you can make sure you only capture packets related to what
-- you're doing
ALTER SYSTEM SET citus.distributed_deadlock_detection_factor TO -1;
ALTER SYSTEM SET citus.recover_2pc_interval TO -1;
ALTER SYSTEM set citus.enable_statistics_collection TO false;
SELECT pg_reload_conf();
pg_reload_conf
----------------
t
(1 row)

-- Add some helper functions for sending commands to mitmproxy
-- citus.mitmproxy(text): send one command string (e.g. 'conn.allow()') to the
-- mitmproxy control fifo and return the proxy's response, one row per line.
-- The fifo path is read from the citus.mitmfifo GUC, which must be set first.
CREATE FUNCTION citus.mitmproxy(text) RETURNS TABLE(result text) AS $$
DECLARE
command ALIAS FOR $1;
BEGIN
-- ON COMMIT DROP scratch tables: COPY needs a table on each side of the fifo
CREATE TEMPORARY TABLE mitmproxy_command (command text) ON COMMIT DROP;
CREATE TEMPORARY TABLE mitmproxy_result (res text) ON COMMIT DROP;

INSERT INTO mitmproxy_command VALUES (command);

-- COPY TO writes the command into the fifo; COPY FROM then blocks until the
-- proxy writes its response back into the same fifo
EXECUTE format('COPY mitmproxy_command TO %L', current_setting('citus.mitmfifo'));
EXECUTE format('COPY mitmproxy_result FROM %L', current_setting('citus.mitmfifo'));

RETURN QUERY SELECT * FROM mitmproxy_result;
END;
$$ LANGUAGE plpgsql;
-- citus.clear_network_traffic(): discard all packets the proxy has captured
-- so far by sending it a recorder.reset() command.
CREATE FUNCTION citus.clear_network_traffic() RETURNS void AS $$
BEGIN
PERFORM citus.mitmproxy('recorder.reset()');
RETURN; -- return void
END;
$$ LANGUAGE plpgsql;
-- citus.dump_network_traffic(): fetch the packets captured since the last
-- dump/reset as rows of (conn, source, message) by sending recorder.dump()
-- to the proxy fifo and COPYing its response back.
CREATE FUNCTION citus.dump_network_traffic()
RETURNS TABLE(conn int, source text, message text) AS $$
BEGIN
-- ON COMMIT DROP scratch tables used as the COPY endpoints for the fifo
CREATE TEMPORARY TABLE mitmproxy_command (command text) ON COMMIT DROP;
CREATE TEMPORARY TABLE mitmproxy_result (
conn int, source text, message text
) ON COMMIT DROP;

INSERT INTO mitmproxy_command VALUES ('recorder.dump()');

-- write the command, then block reading the proxy's reply from the fifo
EXECUTE format('COPY mitmproxy_command TO %L', current_setting('citus.mitmfifo'));
EXECUTE format('COPY mitmproxy_result FROM %L', current_setting('citus.mitmfifo'));

RETURN QUERY SELECT * FROM mitmproxy_result;
END;
$$ LANGUAGE plpgsql;
5 changes: 5 additions & 0 deletions src/test/regress/failure_schedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# import this file (from psql you can use \i) to use mitmproxy manually
test: failure_test_helpers

# this should only be run by pg_regress_multi, you don't need it
test: failure_setup
1 change: 1 addition & 0 deletions src/test/regress/mitmscripts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__
169 changes: 169 additions & 0 deletions src/test/regress/mitmscripts/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
Automated Failure testing
=========================

Automated Failure Testing works by inserting a network proxy (mitmproxy) between the
citus coordinator and one of the workers (connections to the other worker are left
unchanged). The proxy is configurable, and sits on a fifo waiting for commands. When it
receives a command over the fifo, it reconfigures itself and sends back a response.
Regression tests which use automated failure testing communicate with mitmproxy by running
special UDFs which talk to said fifo. The tests send commands such as "fail any connection
which contains the string 'COMMIT'" and then run SQL queries and assert that the
coordinator has reasonable behavior when the specified failures occur.

Contents of this file:
I. Getting Started
II. Running mitmproxy manually
III. citus.mitmproxy() command strings
IV. Recording Network Traffic

# I. Getting Started

First off, to use this you'll need mitmproxy; I recommend version 3.0.4, and I also
recommend running it with python 3.6. This script integrates pretty deeply with mitmproxy
so other versions might fail to work.

I highly recommend using pipenv to install mitmproxy. It lets you easily manage isolated
environments (instead of installing python packages globally). If you've heard of
virtualenv, pipenv is that but much easier to use.

Once you've installed it:

$ cd src/test/regress
$ pipenv --python 3.6
$ pipenv install # there's already a Pipfile.lock in src/test/regress with packages
$ pipenv shell # this enters the virtual environment, putting mitmproxy onto $PATH

That's all you need to do to run the failure tests:

$ make check-failure

# II. Running mitmproxy manually

$ mkfifo /tmp/mitm.fifo # first, you need a fifo
$ cd src/test/regress
$ pipenv shell
$ mitmdump --rawtcp -p 9702 --mode reverse:localhost:9700 -s mitmscripts/fluent.py --set fifo=/tmp/mitm.fifo

The specific port numbers will be different depending on your setup. The above string
means mitmdump will accept connections on port 9702 and forward them to the worker
listening on port 9700.

Now, open psql and run:

# UPDATE pg_dist_node SET nodeport = 9702 WHERE nodeport = 9700;

Again, the specific port numbers depend on your setup.

# \i src/test/regress/sql/failure_test_helpers.sql

The above file creates some UDFs and also disables a few citus features which make
connections in the background.

You also want to tell the UDFs how to talk to mitmproxy (careful, this must be an absolute
path):

# SET citus.mitmfifo = '/tmp/mitm.fifo';

(nb: this GUC does not appear in shared_library_init.c, Postgres allows setting and
reading GUCs which have not been defined by any extension)

You're all ready! If it worked, you should be able to run this:

# SELECT citus.mitmproxy('conn.allow()');
mitmproxy
-----------

(1 row)

# III. citus.mitmproxy() command strings

Command strings specify a pipeline. Each connection is handled individually, and the
pipeline is called once for every packet which is sent. For example, given this string:

`conn.onQuery().after(2).kill()` -> kill a connection if three Query packets are seen

- onQuery() is a filter. It only passes Query packets (packets which the frontend sends
to the backend which specify a query which is to be run) onto the next step of the
pipeline.

- after(2) is another filter, it ignores the first two packets which are sent to it, then
sends the following packets to the next step of the pipeline.

- kill() is an action, when a packet reaches it the connection containing that packet will
be killed.

## Actions

There are 5 actions you can take on connections:

conn.allow() - the default, allows all connections to execute unmodified
conn.kill() - kills all connections immediately after the first packet is sent
conn.reset() - kill() calls shutdown(SHUT_WR), shutdown(SHUT_RD), close(). This is a very
graceful way to close the socket. reset() causes a RST packet to be sent
and forces the connection closed in something more resembling an error.
conn.cancel(pid) - This doesn't cause any changes at the network level. Instead it sends
a SIGINT to pid and introduces a short delay, with hopes that the
signal will be received before the delay ends. You can use it to write
cancellation tests.

The previous actions all work on a per-connection basis. Meaning, each connection is
tracked individually. A command such as `conn.onQuery().kill()` will only kill the
connection on which the Query packet was seen. A command such as
`conn.onQuery().after(2).kill()` will never trigger if each Query is sent on a different
connection, even if you send dozens of Query packets.

The final action works a bit differently:

conn.killall() - the killall() command kills this and all subsequent connections. Any
packets sent once it triggers will have their connections killed.

## Filters

conn.onQuery().kill() - kill a connection once a "Query" packet is seen
conn.onCopyData().kill() - kill a connection once a "CopyData" packet is seen

The list of supported packets can be found in ./structs.py, and the list of packets which
could be supported can be found at:
https://www.postgresql.org/docs/current/static/protocol-message-formats.html

You can also inspect the contents of packets:

conn.onQuery(query="COMMIT").kill() - you can look into the actual query which is sent and
match on its contents (this is always a regex)
conn.onQuery(query="^COMMIT").kill() - the query must start with COMMIT
conn.onQuery(query="pg_table_size\(") - you must escape parens, since you're in a regex

after(n) matches after the n-th packet has been sent:

conn.after(2).kill() - Kill connections when the third packet is sent down them

There's also a low-level filter which runs a regex against the raw content of the packet:

conn.matches(b"^Q").kill() - this is another way of writing conn.onQuery(). Note the 'b',
it's always required.

## Chaining:

Filters and actions can be arbitrarily chained:

conn.matches(b"^Q").after(2).kill() - kill any connection when the third Query is sent

# IV. Recording Network Traffic

There are also some special commands. This proxy also records every packet and lets you
inspect them:

recorder.dump() - emits a list of captured packets in COPY text format
recorder.reset() - empties the data structure containing the captured packets

Both of those calls empty the structure containing the packets; a call to dump() will only
return the packets which were captured since the last call to .dump() or reset()

Back when you called `\i sql/failure_test_helpers.sql` you created some UDFs which make
using these strings easier. Here are some commands you can run from psql, or from inside
failure tests:

citus.clear_network_traffic() - this empties the buffer containing captured packets
citus.dump_network_traffic() - this returns a little table and pretty-prints information
on all the packets captured since the last call to
clear_network_traffic() or dump_network_traffic()
Loading

0 comments on commit a54f9a6

Please sign in to comment.