From ae1a2705266b36c6611eecf4541b0c6608f8cb02 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 15 Jan 2019 16:22:05 +1030 Subject: [PATCH] gossipd: handle overflowing query properly (avoid slow 100% CPU reports) Don't do this: (gdb) bt #0 0x00007f37ae667c40 in ?? () from /lib/x86_64-linux-gnu/libz.so.1 #1 0x00007f37ae668b38 in ?? () from /lib/x86_64-linux-gnu/libz.so.1 #2 0x00007f37ae669907 in deflate () from /lib/x86_64-linux-gnu/libz.so.1 #3 0x00007f37ae674c65 in compress2 () from /lib/x86_64-linux-gnu/libz.so.1 #4 0x000000000040cfe3 in zencode_scids (ctx=0xc1f118, scids=0x2599bc49 "\a\325{", len=176320) at gossipd/gossipd.c:218 #5 0x000000000040d0b3 in encode_short_channel_ids_end (encoded=0x7fff8f98d9f0, max_bytes=65490) at gossipd/gossipd.c:236 #6 0x000000000040dd28 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290511, number_of_blocks=8) at gossipd/gossipd.c:576 #7 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290511, number_of_blocks=16) at gossipd/gossipd.c:595 #8 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290495, number_of_blocks=32) at gossipd/gossipd.c:596 #9 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290495, number_of_blocks=64) at gossipd/gossipd.c:595 #10 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=128) at gossipd/gossipd.c:596 #11 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=256) at gossipd/gossipd.c:595 #12 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=512) at gossipd/gossipd.c:595 #13 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17290431, number_of_blocks=1024) at gossipd/gossipd.c:595 #14 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=2047) at gossipd/gossipd.c:596 #15 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=4095) at gossipd/gossipd.c:595 #16 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=8191) at gossipd/gossipd.c:595 #17 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=16382) at gossipd/gossipd.c:595 #18 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=32764) at gossipd/gossipd.c:595 #19 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=65528) at gossipd/gossipd.c:595 #20 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=131056) at gossipd/gossipd.c:595 #21 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=262112) at gossipd/gossipd.c:595 #22 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=524225) at gossipd/gossipd.c:595 #23 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=1048450) at gossipd/gossipd.c:595 #24 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=2096900) at gossipd/gossipd.c:595 #25 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=4193801) at gossipd/gossipd.c:595 #26 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=8387603) at gossipd/gossipd.c:595 #27 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=17289408, number_of_blocks=16775207) at gossipd/gossipd.c:595 #28 0x000000000040ddee in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=33550414) at gossipd/gossipd.c:596 #29 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=67100829) at gossipd/gossipd.c:595 #30 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=134201659) at gossipd/gossipd.c:595 #31 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=268403318) at gossipd/gossipd.c:595 #32 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=536806636) at gossipd/gossipd.c:595 #33 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=1073613273) at gossipd/gossipd.c:595 #34 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=2147226547) at gossipd/gossipd.c:595 #35 0x000000000040ddc6 in queue_channel_ranges (peer=0x3868fc8, first_blocknum=514201, number_of_blocks=4294453094) at gossipd/gossipd.c:595 #36 0x000000000040df26 in handle_query_channel_range (peer=0x3868fc8, msg=0x37e0678 "\001\ao\342\214\n\266\361\263r\301\246\242F\256c\367O\223\036\203e\341Z\b\234h\326\031") at gossipd/gossipd.c:625 The cause was that converting a block number to an scid truncates it at 24 bits. When we look through the index from (truncated number) to (real end number) we get every channel, which is too large to encode, so we iterate again. This fixes both that problem, and also the issue that we'd end up dividing into many empty sections until we get to the highest block number. Instead, we just tack the empty blocks on to then end of the final query. Reported-by: George Vaccaro Signed-off-by: Rusty Russell --- CHANGELOG.md | 2 ++ gossipd/gossipd.c | 53 +++++++++++++++++++++++++++++++------------- tests/test_gossip.py | 10 ++++----- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d65ced9ef617..858b07217934 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,8 @@ changes. ### Fixed +- Protocol: handling `query_channel_range` for large numbers of blocks + (eg. 4 billion) was slow due to a bug. ### Security diff --git a/gossipd/gossipd.c b/gossipd/gossipd.c index d04300a5c910..5f4e504b0b54 100644 --- a/gossipd/gossipd.c +++ b/gossipd/gossipd.c @@ -659,9 +659,14 @@ static void reply_channel_range(struct peer *peer, /*~ When we need to send an array of channels, it might go over our 64k packet * size. If it doesn't, we recurse, splitting in two, etc. Each message * indicates what blocks it contains, so the recipient knows when we're - * finished. */ + * finished. + * + * tail_blocks is the empty blocks at the end, in case they asked for all + * blocks to 4 billion. + */ static void queue_channel_ranges(struct peer *peer, - u32 first_blocknum, u32 number_of_blocks) + u32 first_blocknum, u32 number_of_blocks, + u32 tail_blocks) { struct routing_state *rstate = peer->daemon->rstate; u8 *encoded = encode_short_channel_ids_start(tmpctx); @@ -704,7 +709,8 @@ static void queue_channel_ranges(struct peer *peer, /* If we can encode that, fine: send it */ if (encode_short_channel_ids_end(&encoded, max_encoded_bytes)) { - reply_channel_range(peer, first_blocknum, number_of_blocks, + reply_channel_range(peer, first_blocknum, + number_of_blocks + tail_blocks, encoded); return; } @@ -717,22 +723,26 @@ static void queue_channel_ranges(struct peer *peer, first_blocknum); return; } - status_debug("queue_channel_ranges full: splitting %u+%u and %u+%u", + status_debug("queue_channel_ranges full: splitting %u+%u and %u+%u(+%u)", first_blocknum, number_of_blocks / 2, first_blocknum + number_of_blocks / 2, - number_of_blocks - number_of_blocks / 2); - queue_channel_ranges(peer, first_blocknum, number_of_blocks / 2); + number_of_blocks - number_of_blocks / 2, + tail_blocks); + queue_channel_ranges(peer, first_blocknum, number_of_blocks / 2, 0); queue_channel_ranges(peer, first_blocknum + number_of_blocks / 2, - number_of_blocks - number_of_blocks / 2); + number_of_blocks - number_of_blocks / 2, + tail_blocks); } /*~ The peer can ask for all channels is a series of blocks. We reply with one * or more messages containing the short_channel_ids. */ static u8 *handle_query_channel_range(struct peer *peer, const u8 *msg) { + struct routing_state *rstate = peer->daemon->rstate; struct bitcoin_blkid chain_hash; - u32 first_blocknum, number_of_blocks; + u32 first_blocknum, number_of_blocks, tail_blocks; + struct short_channel_id last_scid; if (!fromwire_query_channel_range(msg, &chain_hash, &first_blocknum, &number_of_blocks)) { @@ -751,14 +761,25 @@ static u8 *handle_query_channel_range(struct peer *peer, const u8 *msg) return NULL; } - /* This checks for 32-bit overflow! */ - if (first_blocknum + number_of_blocks < first_blocknum) { - return towire_errorfmt(peer, NULL, - "query_channel_range overflow %u+%u", - first_blocknum, number_of_blocks); - } - - queue_channel_ranges(peer, first_blocknum, number_of_blocks); + /* If they ask for number_of_blocks UINTMAX, and we have to divide + * and conquer, we'll do a lot of unnecessary work. Cap it at the + * last value we have, then send an empty reply. */ + if (uintmap_last(&rstate->chanmap, &last_scid.u64)) { + u32 last_block = short_channel_id_blocknum(&last_scid); + + /* u64 here avoids overflow on number_of_blocks + UINTMAX for example */ + if ((u64)first_blocknum + number_of_blocks > last_block) { + tail_blocks = first_blocknum + number_of_blocks + - last_block - 1; + number_of_blocks -= tail_blocks; + } else + tail_blocks = 0; + } else + tail_blocks = 0; + + queue_channel_ranges(peer, first_blocknum, number_of_blocks, + tail_blocks); return NULL; } diff --git a/tests/test_gossip.py b/tests/test_gossip.py index 0231033e8bd4..cbb66ed51457 100644 --- a/tests/test_gossip.py +++ b/tests/test_gossip.py @@ -4,7 +4,6 @@ import json import logging import os -import pytest import struct import subprocess import time @@ -548,7 +547,6 @@ def check_gossip(n): @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1") -@pytest.mark.xfail(strict=True) def test_gossip_query_channel_range(node_factory, bitcoind): l1, l2, l3, l4 = node_factory.line_graph(4, opts={'log-level': 'io'}, fundchannel=False) @@ -662,8 +660,8 @@ def test_gossip_query_channel_range(node_factory, bitcoind): first=0, num=1000000) - # Turns out it sends huge number of empty replies here. - l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 21) + # Turns out it sends: 0+53, 53+26, 79+13, 92+7, 99+3, 102+2, 104+1, 105+999895 + l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 8) # It should definitely have split assert ret['final_first_block'] != 0 or ret['final_num_blocks'] != 1000000 @@ -673,11 +671,11 @@ def test_gossip_query_channel_range(node_factory, bitcoind): assert ret['short_channel_ids'][1] == scid23 l2.daemon.wait_for_log('queue_channel_ranges full: splitting') - # Test overflow case doesn't split forever; should only get 32 for this. + # Test overflow case doesn't split forever; should still only get 8 for this ret = l1.rpc.dev_query_channel_range(id=l2.info['id'], first=1, num=4294967294) - l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 32) + l1.daemon.wait_for_logs([r'\[IN\] 0108'] * 8) # And no more! time.sleep(1)