diff --git a/ChangeLog.txt b/ChangeLog.txt index 4f0b4b7..62e8ac8 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -6,6 +6,18 @@ # file, you can obtain one at https://mozilla.org/MPL/2.0/. # +Version 1.2.1-beta1 - 2021-08-23 +-------------------------- + +* Change behaviour of internal sniffer channel when overloaded to better + report and manage dropping/sampling of packets + * This fixes a bug where the 'Total Packets processed' statistic did not + include any packets dropped during overload +* Improve `log-interval` statistics and packet drop logging +* EXPERIMENTAL: New sampling mode option (disabled by default) + * sampling can be enabled at a specified rate for a specified time + when packet drops exceed a specified threshold (see config options) + Version 1.2.0 - 2021-05-07 -------------------------- diff --git a/doc/user-guide/compactor-capture-from-network.adoc.in b/doc/user-guide/compactor-capture-from-network.adoc.in index eaffae3..8b30a37 100644 --- a/doc/user-guide/compactor-capture-from-network.adoc.in +++ b/doc/user-guide/compactor-capture-from-network.adoc.in @@ -37,3 +37,18 @@ *-L, --log-network-stats-period* _arg_:: Every _arg_ seconds, log basic statistics on packet collection to the system log. The default value of 0 disables this logging. + +*--sampling-threshold* _arg_:: + A threshold for the percentage of traffic dropped on the internal channels above which + sampling will be enabled (if *sampling-rate* is greater than 0). The + default value is 10. + +*--sampling-rate* _arg_:: + The rate (1 in _arg_ packets) to be applied when sampling mode is enabled (this is an + experimental feature). The rate is applied for *sampling-time* seconds and then sampling + is disabled. After this, depending on the traffic rate, sampling may be enabled again if the drops rise + above the *sampling-threshold*. The default value of 0 disables this option. + +*--sampling-time* _arg_:: + The period of time to apply sampling mode for. To avoid accidentally setting a low value + that could result in instability this must be at least 10s. The default value is 100. \ No newline at end of file diff --git a/doc/user-guide/installation.adoc.in b/doc/user-guide/installation.adoc.in index 0119740..5511b9b 100644 --- a/doc/user-guide/installation.adoc.in +++ b/doc/user-guide/installation.adoc.in @@ -4,11 +4,14 @@ Binary install packages are available from a https://launchpad.net[Launchpad] PPA (Personal Package Archive) -`ppa:dns-stats/compactor` for +`ppa:dns-stats/compactor-bionic` for -* http://releases.ubuntu.com/16.04/[Ubuntu 16.04 LTS 'Xenial Xerus'] +* http://releases.ubuntu.com/18.04/[Ubuntu 18.04 LTS 'Bionic Beaver'] -==== Ubuntu 16.04 LTS 'Xenial Xerus' +Experimental packages for Ubuntu 20.04 are also available for early testing +from `ppa:dns-stats/compactor-focal`. + +==== Ubuntu packages _compactor_ and _inspector_ are supplied in separate packages named `dns-stats-compactor` and `dns-stats-inspector`. diff --git a/doc/user-guide/overview.adoc.in b/doc/user-guide/overview.adoc.in index a93e866..6bfb4d6 100644 --- a/doc/user-guide/overview.adoc.in +++ b/doc/user-guide/overview.adoc.in @@ -114,7 +114,7 @@ with the following changes: ** _BlockStatistics_: *** _compactor-non-dns-packets_ (-1): count of the number of received packets that could not be interpreted as DNS packets. -*** _compactor-out-of-order-packets (-2): count of the number of received packets that +*** _compactor-out-of-order-packets_ (-2): count of the number of received packets that were not received by _compactor_ in strict chronological order. *** _compactor-missing-pairs_ (-3): count of output query/response pairs not written by _compactor_ because they could not be processed quickly enough. @@ -122,6 +122,7 @@ with the following changes: by _compactor_ because they could not be processed quickly enough. *** _compactor-missing-non-dns_ (-5): count of output ignored PCAP packets not written by _compactor_ because they could not be processed quickly enough. +*** _compactor-packets_ (-6): total packets received by compactor [IMPORTANT] ==== diff --git a/doc/user-guide/running.adoc.in b/doc/user-guide/running.adoc.in index 0ac0b8f..aab3275 100644 --- a/doc/user-guide/running.adoc.in +++ b/doc/user-guide/running.adoc.in @@ -297,32 +297,42 @@ messages. | Comment | ERROR -| C-DNS overflow. Dropping address event(s). -| An address event is a countable event associated with a client address, such as -a TCP reset or ICMP event. These are happening faster than _compactor_ can -record them, so at least one has been dropped. +| Dropping on these channels: Sniffer C-DNS +| Packets are arriving faster than _compactor_ can +process them, so packets are being dropped internally on the specified +channels. More information is available via the *--log-network-stats-period* logging output. | ERROR -| C-DNS overflow. Dropping query/response(s). -| DNS query/response matches are being generated faster than they can be -recorded to the C-DNS output. At least one of these matches has been dropped. - -| ERROR -| Ignored PCAP overflow. Dropping packet(s). +| Ignored PCAP channel overflow. Dropping packet(s). | More ignored packets - that is, packets that do not appear to be DNS related or which are malformed - are arriving than can be processed and recorded to the ignored PCAP output. At least one has been dropped. | ERROR -| Raw PCAP overflow. Dropping packet(s). +| Raw PCAP channel overflow. Dropping packet(s). | More input packets are arriving than can be processed and recorded to the raw PCAP output. At least one has been dropped. +| WARNING +| Sampling mode switched on for 100s with rate of 1 in 10 +| Drops on at least one internal channel are occurring at a rate higher +than specified by the *--sampling-threshold* option so sampling is enabled. + +| WARNING +| Sampling mode extended as drops still occurring +| Drops above the threshold level are still occurring after the specified +*--sampling-time* period. + +| WARNING +| Sampling mode switched off because time limit expired and not dropping above threshold +| Drops are now below the sampling threshold and so sampling is disabled after the specified +time limit. + | INFO | Total packet count, etc. | Basic statistics on the ongoing network capture requested by the -*--log-network-stats-period* option. +*--log-network-stats-period* option (see below). | INFO | Starting network capture @@ -338,6 +348,17 @@ to the raw PCAP output. At least one has been dropped. stopping collection. |=== +Example statistics produced by enabling the *log-network-stats-period* option: + +---- +*Stats interval: average rate 480027 pps over 1s + LIBPCAP : recv/OS drop/IF drop 481002/ 0/ 0 + Sniffer : recv/dropped/queue 480701/ 0/ 917 + Sampling: recv/discard 480027/ 0 + C-DNS : recv/dropped/queue 480027/ 0/ 0 + PCAP out: raw drop/ignored drop 0/ 0/ +---- + Other error messages are reporting an internal error. === _compactor_ performance considerations diff --git a/etc/compactor.conf.in b/etc/compactor.conf.in index 808cd16..dedfc3f 100644 --- a/etc/compactor.conf.in +++ b/etc/compactor.conf.in @@ -56,6 +56,16 @@ # Log basic collection stats to syslog every n seconds. 0 (default) == never. # log-network-stats-period=0 +# (Sampling is an experimental feature) +# Sampling threshold is percentage of traffic dropped above which sampling will be enabled. Default is 10. +# sampling-threshold=10 + +# Sampling rate (1 in n) to be applied if packets dropped internally. 0 (default) == none. +# sampling-rate=0 + +# Apply sampling for n seconds for before re-checking for dropped packets. Default is 100. +# sampling-time=100 + # Output options. # Output file rotation period, in seconds. diff --git a/src/channel.hpp b/src/channel.hpp index 2122ce1..0cc6721 100644 --- a/src/channel.hpp +++ b/src/channel.hpp @@ -1,5 +1,5 @@ /* - * Copyright 2016-2017 Internet Corporation for Assigned Names and Numbers. + * Copyright 2016-2021 Internet Corporation for Assigned Names and Numbers. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -71,6 +71,15 @@ class Channel return closed_; } + /** + * \brief Return current queue size + */ + unsigned get_length() + { + std::lock_guard lock(m_); + return queue_.size(); + } + /** * \brief Add a new item to the channel. * diff --git a/src/compactor.cpp b/src/compactor.cpp index 87d4a23..55f766e 100644 --- a/src/compactor.cpp +++ b/src/compactor.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -270,17 +271,31 @@ static void sniff_loop(BaseSniffers* sniffer, { bool seen_raw_overflow = false; bool seen_ignored_overflow = false; - bool seen_ae_overflow = false; bool do_raw_pcap = !config.raw_pcap_pattern.empty(); bool do_ignored_pcap = !config.ignored_pcap_pattern.empty(); bool do_decode = config.debug_qr || config.debug_dns || config.report_info || !config.output_pattern.empty(); bool do_match = config.debug_qr || config.report_info || !config.output_pattern.empty(); - cno::system_clock::time_point last_timestamp; - cno::system_clock::time_point next_stats_log; - cno::system_clock::time_point last_stats_log_timestamp; + cno::system_clock::time_point last_recv_timestamp; // timestamp of last recieved packet + cno::system_clock::time_point next_statslog_timestamp; // time when next log output of stats is due + cno::system_clock::time_point last_statslog_timestamp; // time when last log output of stats was done + cno::system_clock::time_point next_drop_check_timestamp; // next time we should inspect the drop stats + cno::system_clock::time_point sampling_end_timestamp; // next time sampling should be stopped + PacketStatistics last_stats = stats; + PacketStatistics last_drop_check_stats = stats; + + struct pcap_stat last_pcap_stats; + sniffer->pcap_stats(last_pcap_stats); + + BaseSniffers::Stats sniffer_stats; + BaseSniffers::Stats last_sniffer_stats; + sniffer->sniffer_stats(last_sniffer_stats); + BaseSniffers::Stats last_drop_check_sniffer_stats = last_sniffer_stats; + + bool drops_last_check = false; + bool sampling = false; auto dns_sink = [&](std::unique_ptr& dns) @@ -303,11 +318,6 @@ static void sniff_loop(BaseSniffers* sniffer, if ( !output.cbor->put(cbi, false) ) { ++stats.output_cbor_drop_count; - if ( !seen_ae_overflow ) - { - LOG_ERROR << "C-DNS overflow. Dropping address event(s)"; - seen_ae_overflow = true; - } } } }; @@ -322,7 +332,7 @@ static void sniff_loop(BaseSniffers* sniffer, ++stats.output_ignored_pcap_drop_count; if ( !seen_ignored_overflow ) { - LOG_ERROR << "Ignored PCAP overflow. Dropping packet(s)"; + LOG_ERROR << "Ignored PCAP channel overflow. Dropping packet(s)"; seen_ignored_overflow = true; } } @@ -343,9 +353,9 @@ static void sniff_loop(BaseSniffers* sniffer, ++stats.raw_packet_count; - if ( last_timestamp > pcap->timestamp ) + if ( last_recv_timestamp > pcap->timestamp ) ++stats.out_of_order_packet_count; - last_timestamp = pcap->timestamp; + last_recv_timestamp = pcap->timestamp; if ( do_raw_pcap ) { @@ -354,71 +364,150 @@ static void sniff_loop(BaseSniffers* sniffer, ++stats.output_raw_pcap_drop_count; if ( !seen_raw_overflow ) { - LOG_ERROR << "Raw PCAP overflow. Dropping packet(s)"; + LOG_ERROR << "Raw PCAP channel overflow. Dropping packet(s)"; seen_raw_overflow = true; } } } - if ( do_decode ) - { - bool ignored = false; - - try + // If we're dropping packets, respond by sampling. + if ( sampling && (stats.raw_packet_count % config.sampling_rate) != 0 ) { + ++stats.discarded_cbor_count; + } + else { + if ( do_decode ) { - packet_stream.process_packet(pcap); + bool ignored = false; + + try + { + packet_stream.process_packet(pcap); + } + catch (const unhandled_packet& e) + { + ignored = true; + ++stats.unhandled_packet_count; + } + catch (const malformed_packet& e) + { + ignored = true; + ++stats.malformed_message_count; + } + + if ( ignored ) + ignored_sink(pcap); } - catch (const unhandled_packet& e) - { - ignored = true; - ++stats.unhandled_packet_count; + } + + // SAMPLING - check for drops or timeout on sampling mode very second + if ( next_drop_check_timestamp <= last_recv_timestamp ) + { + next_drop_check_timestamp = last_recv_timestamp + std::chrono::seconds(1); + sniffer->sniffer_stats(sniffer_stats); + seen_raw_overflow = seen_ignored_overflow = false; + + uint64_t new_sniffs = sniffer_stats.pkts_sniffed - last_drop_check_sniffer_stats.pkts_sniffed; + uint64_t new_raw = stats.raw_packet_count - last_drop_check_stats.raw_packet_count; + uint64_t new_sniff_drops = sniffer_stats.pkts_dropped - last_drop_check_sniffer_stats.pkts_dropped; + uint64_t new_cbor_drops = stats.output_cbor_drop_count - last_drop_check_stats.output_cbor_drop_count; + bool sniff_dropping = new_sniff_drops > new_sniffs * (config.sampling_threshold/100.0); + bool cbor_dropping = new_cbor_drops > new_raw * (config.sampling_threshold/100.0); + + // If seeing drops, only trigger off these two queues for now + if ( new_sniff_drops > 0 || new_cbor_drops > 0 ) + { + LOG_ERROR << "Dropping on these channels: " << (new_sniff_drops!=0?"Sniffer ":"") + << (new_cbor_drops!=0?"C-DNS":""); } - catch (const malformed_packet& e) - { - ignored = true; - ++stats.malformed_message_count; + if ( sniff_dropping || cbor_dropping ) { + if (config.sampling_rate > 0) { + if ( !sampling ) { + if (!drops_last_check ) { + // register the drops and wait for next check + drops_last_check = true; + } else { + sampling = true; + sampling_end_timestamp = last_recv_timestamp + std::chrono::seconds(config.sampling_time); + LOG_WARN << "Sampling mode switched on for " << config.sampling_time + << "s with rate of 1 in " << config.sampling_rate; + } + } + // sampling but still dropping, push the end of the timer out + else if ( sampling_end_timestamp <= last_recv_timestamp ) { + sampling_end_timestamp = last_recv_timestamp + std::chrono::seconds(config.sampling_time); + LOG_WARN << "Sampling mode extended as drops still occurring"; + } + } + } else if ( sampling && sampling_end_timestamp <= last_recv_timestamp ) { + sampling = false; + drops_last_check = false; + LOG_WARN << "Sampling mode switched off because time limit expired and not dropping above threshold."; } - if ( ignored ) - ignored_sink(pcap); + // Add the number of drops in the sniffer to the raw packet count. + stats.raw_packet_count += new_sniff_drops; + last_drop_check_sniffer_stats = sniffer_stats; + last_drop_check_stats = last_stats; } + + // Output interval stats to log if ( config.log_network_stats_period > 0 ) { - if ( next_stats_log.time_since_epoch().count() == 0 ) + if ( next_statslog_timestamp.time_since_epoch().count() == 0 ) { - next_stats_log = last_timestamp + std::chrono::seconds(config.log_network_stats_period); - last_stats_log_timestamp = last_timestamp; + next_statslog_timestamp = last_recv_timestamp + std::chrono::seconds(config.log_network_stats_period); + last_statslog_timestamp = last_recv_timestamp; } - else if ( next_stats_log <= last_timestamp ) + else if ( next_statslog_timestamp <= last_recv_timestamp ) { - cno::seconds period = cno::duration_cast(last_timestamp - last_stats_log_timestamp); - - LOG_INFO << - "Total " << stats.raw_packet_count - last_stats.raw_packet_count << - " (" << (stats.raw_packet_count - last_stats.raw_packet_count) / period.count() << " pkt/s), " << - "Dropped raw/ignored/C-DNS packets " << - stats.output_raw_pcap_drop_count -last_stats.output_raw_pcap_drop_count << "/" << - stats.output_ignored_pcap_drop_count - last_stats.output_ignored_pcap_drop_count << "/" << - stats.output_cbor_drop_count - last_stats.output_cbor_drop_count; - - struct pcap_stat pcap_stat; - if ( sniffer->stats(pcap_stat) ) - { - LOG_INFO << "PCAP recv " << pcap_stat.ps_recv << - " drop " << pcap_stat.ps_drop << - " drop at iface " << pcap_stat.ps_ifdrop; - } - next_stats_log = last_timestamp + cno::seconds(config.log_network_stats_period); - last_stats_log_timestamp = last_timestamp; - last_stats = stats; + int w = 10; //output width, big enough for interval numbers up to 5 billion pps + cno::seconds period = cno::duration_cast(last_recv_timestamp - last_statslog_timestamp); + + struct pcap_stat pcap_stats; + sniffer->pcap_stats(pcap_stats); + LOG_INFO << "*Stats interval: average rate " << std::setw(w) + << (stats.raw_packet_count - last_stats.raw_packet_count) / period.count() << " pps over " + << config.log_network_stats_period << "s"; + + // Output stats directly from libpcap + LOG_INFO << " LIBPCAP : recv/OS drop/IF drop " << std::setw(w) + << pcap_stats.ps_recv - last_pcap_stats.ps_recv << "/" << std::setw(w) + << pcap_stats.ps_drop - last_pcap_stats.ps_drop << "/" << std::setw(w) + << pcap_stats.ps_ifdrop - last_pcap_stats.ps_ifdrop; + // Output info from PacketStatists and the sniffer for this interval + sniffer->sniffer_stats(sniffer_stats); + LOG_INFO << " Sniffer : recv/dropped/queue " << std::setw(w) + << sniffer_stats.pkts_sniffed - last_sniffer_stats.pkts_sniffed << "/" << std::setw(w) + << sniffer_stats.pkts_dropped - last_sniffer_stats.pkts_dropped << "/" << std::setw(w) + << sniffer_stats.channel_length; + if (config.sampling_rate > 0) { + LOG_INFO << " Sampling: recv/discard " << std::setw(w) + << (stats.raw_packet_count - last_stats.raw_packet_count) - + (sniffer_stats.pkts_dropped - last_sniffer_stats.pkts_dropped) << "/" << std::setw(w) + << stats.discarded_cbor_count - last_stats.discarded_cbor_count; } + LOG_INFO << " C-DNS : recv/dropped/queue " << std::setw(w) + << stats.processed_message_count - last_stats.processed_message_count << "/" << std::setw(w) + << stats.output_cbor_drop_count - last_stats.output_cbor_drop_count << "/" << std::setw(w) + << output.cbor->get_length(); + LOG_INFO << " PCAP out: raw drop/ignored drop " << std::setw(w) + << stats.output_raw_pcap_drop_count - last_stats.output_raw_pcap_drop_count << "/" << std::setw(w) + << stats.output_ignored_pcap_drop_count - last_stats.output_ignored_pcap_drop_count << "/" << std::setw(w); + LOG_INFO << ""; + + // Update time/state + next_statslog_timestamp = last_recv_timestamp + cno::seconds(config.log_network_stats_period); + last_statslog_timestamp = last_recv_timestamp; + last_stats = stats; + last_pcap_stats = pcap_stats; + last_sniffer_stats = sniffer_stats; } } } // Retrieve PCAP stats, if available. struct pcap_stat pcap_stat; - if ( sniffer->stats(pcap_stat) ) + if ( sniffer->pcap_stats(pcap_stat) ) { stats.pcap_recv_count += pcap_stat.ps_recv; stats.pcap_drop_count += pcap_stat.ps_drop; @@ -447,41 +536,42 @@ static void tap_loop(DnsTap& dnstap, const Configuration& config, PacketStatistics& stats) { - cno::system_clock::time_point last_timestamp; - cno::system_clock::time_point next_stats_log; - cno::system_clock::time_point last_stats_log_timestamp; + cno::system_clock::time_point last_recv_timestamp; + cno::system_clock::time_point next_statslog_timestamp; + cno::system_clock::time_point last_statslog_timestamp; PacketStatistics last_stats = stats; auto sink = [&](std::unique_ptr& dns) { ++stats.raw_packet_count; ++stats.processed_message_count; - if ( last_timestamp > dns->timestamp ) + if ( last_recv_timestamp > dns->timestamp ) ++stats.out_of_order_packet_count; stats.malformed_message_count = dnstap.malformed_message_count(); - last_timestamp = dns->timestamp; + last_recv_timestamp = dns->timestamp; if ( config.debug_dns ) std::cout << *dns; matcher.add(std::move(dns)); if ( config.log_network_stats_period > 0 ) { - if ( next_stats_log.time_since_epoch().count() == 0 ) + if ( next_statslog_timestamp.time_since_epoch().count() == 0 ) { - next_stats_log = last_timestamp + std::chrono::seconds(config.log_network_stats_period); - last_stats_log_timestamp = last_timestamp; + next_statslog_timestamp = last_recv_timestamp + std::chrono::seconds(config.log_network_stats_period); + last_statslog_timestamp = last_recv_timestamp; } - else if ( next_stats_log <= last_timestamp ) + else if ( next_statslog_timestamp <= last_recv_timestamp ) { - cno::seconds period = cno::duration_cast(last_timestamp - last_stats_log_timestamp); - - LOG_INFO << - "Total " << stats.raw_packet_count - last_stats.raw_packet_count << - " (" << (stats.raw_packet_count - last_stats.raw_packet_count) / period.count() << " pkt/s), " << - "Dropped C-DNS packets " << - stats.output_cbor_drop_count - last_stats.output_cbor_drop_count; - next_stats_log = last_timestamp + cno::seconds(config.log_network_stats_period); - last_stats_log_timestamp = last_timestamp; + cno::seconds period = cno::duration_cast(last_recv_timestamp - last_statslog_timestamp); + + LOG_INFO << "*Stats interval: average rate " << std::setw(10) + << (stats.raw_packet_count - last_stats.raw_packet_count) / period.count() << " pps over " + << config.log_network_stats_period << "s"; + LOG_INFO << " C-DNS : recv/dropped " << std::setw(10) + << stats.processed_message_count - last_stats.processed_message_count << "/" << std::setw(10) + << stats.output_cbor_drop_count - last_stats.output_cbor_drop_count << "/" << std::setw(10); + next_statslog_timestamp = last_recv_timestamp + cno::seconds(config.log_network_stats_period); + last_statslog_timestamp = last_recv_timestamp; last_stats = stats; } } @@ -589,7 +679,6 @@ static int run_configuration(const po::variables_map& vm, sniff_config.set_chan_max_size(config.max_channel_size); PacketStatistics stats{}; - bool seen_qr_overflow = false; QueryResponseMatcher matcher( [&](std::shared_ptr qr) @@ -612,11 +701,6 @@ static int run_configuration(const po::variables_map& vm, if ( !output.cbor->put(cbi, false) ) { ++stats.output_cbor_drop_count; - if ( !seen_qr_overflow ) - { - LOG_ERROR << "C-DNS overflow. Dropping query/response(s)"; - seen_qr_overflow = true; - } } } }); @@ -811,6 +895,7 @@ static int run_configuration(const po::variables_map& vm, stats.dump_stats(std::cout); } + LOG_INFO << "Compactor shutdown complete"; return res; } @@ -888,6 +973,24 @@ int main(int ac, char *av[]) << "Error:\tIPv6 prefix length must be in range 0 to 128.\n"; return 1; } + if ( configuration.sampling_threshold > 100 || configuration.sampling_threshold == 0 ) + { + std::cerr + << "Error:\tSampling threshold must be in range 1 to 100.\n"; + return 1; + } + if ( configuration.sampling_rate == 1 ) + { + std::cerr + << "Error:\tSampling rate cannot be set to 1.\n"; + return 1; + } + if ( configuration.sampling_time < 10 ) + { + std::cerr + << "Error:\tSampling time must be greater than 10.\n"; + return 1; + } // Disable collection stats logging and disable logging // the hostname if reading from file. @@ -896,6 +999,7 @@ int main(int ac, char *av[]) configuration.log_network_stats_period = 0; configuration.omit_hostid = true; } + LOG_INFO << "Compactor initializing..."; // To enable a SIGHUP to not lose data, file compression // must survive the restart. That means compression @@ -945,5 +1049,6 @@ int main(int ac, char *av[]) #if ENABLE_DNSTAP google::protobuf::ShutdownProtobufLibrary(); #endif + LOG_INFO << "Compactor shutdown complete"; return 0; } diff --git a/src/configuration.cpp b/src/configuration.cpp index 4f0b0b4..a915bab 100644 --- a/src/configuration.cpp +++ b/src/configuration.cpp @@ -304,9 +304,10 @@ Configuration::Configuration() max_block_items(5000), max_output_size(0), report_info(false), log_network_stats_period(0), + sampling_threshold(10), sampling_rate(0), sampling_time(100), debug_dns(false), debug_qr(false), omit_hostid(false), omit_sysid(false), start_end_times_from_data(false), - max_channel_size(10000), + max_channel_size(300000), client_address_prefix_ipv4(DEFAULT_IPV4_PREFIX_LENGTH), client_address_prefix_ipv6(DEFAULT_IPV6_PREFIX_LENGTH), server_address_prefix_ipv4(DEFAULT_IPV4_PREFIX_LENGTH), @@ -477,6 +478,15 @@ Configuration::Configuration() ("log-network-stats-period,L", po::value(&log_network_stats_period)->default_value(0), "log network collection stats period.") + ("sampling-threshold", + po::value(&sampling_threshold)->default_value(10), + "sampling threshold - percentage of traffic dropped.") + ("sampling-rate", + po::value(&sampling_rate)->default_value(0), + "sampling rate applied when dropping packets.") + ("sampling-time", + po::value(&sampling_time)->default_value(100), + "time to sample for before checking for drops.") ; } diff --git a/src/configuration.hpp b/src/configuration.hpp index 749ac27..ec9a271 100644 --- a/src/configuration.hpp +++ b/src/configuration.hpp @@ -852,6 +852,21 @@ class Configuration */ unsigned int log_network_stats_period; + /** + * \brief sampling threshold above which to enable sampling + */ + unsigned int sampling_threshold; + + /** + * \brief sampling rate to use if enabled + */ + unsigned int sampling_rate; + + /** + * \brief minimum time to apply sampling for + */ + unsigned int sampling_time; + /** * \brief output text summary of individual DNS messages. */ diff --git a/src/log.cpp b/src/log.cpp index a28de3c..03d91c5 100644 --- a/src/log.cpp +++ b/src/log.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "log.hpp" @@ -26,7 +27,11 @@ namespace logging = boost::log; namespace src = boost::log::sources; namespace sinks = boost::log::sinks; namespace keywords = boost::log::keywords; +namespace expr = boost::log::expressions; +#ifdef __APPLE__ +void init_logging() {} +#else void init_logging() { using sink_t = sinks::synchronous_sink; @@ -43,7 +48,21 @@ void init_logging() // Set the straightforward level translator for the "Severity" attribute of type int backend->set_severity_mapper(sinks::syslog::direct_severity_mapping("Severity")); + boost::shared_ptr< sink_t > frontend(new sink_t(backend)); + + // This makes the sink to write log records that look like this: + // 1: [info] An info severity message + // 2: [error] An error severity message + frontend->set_formatter + ( + expr::format("[%1%] %2%") + % logging::trivial::severity + % expr::smessage + ); + // Wrap it into the frontend and register in the core. // The backend requires synchronization in the frontend. - core->add_sink(boost::make_shared(backend)); + core->add_sink(frontend); + } +#endif diff --git a/src/packetstatistics.hpp b/src/packetstatistics.hpp index 2add37f..b1ea3f4 100644 --- a/src/packetstatistics.hpp +++ b/src/packetstatistics.hpp @@ -1,5 +1,5 @@ /* - * Copyright 2016-2017 Internet Corporation for Assigned Names and Numbers. + * Copyright 2016-2021 Internet Corporation for Assigned Names and Numbers. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -27,57 +27,57 @@ using PacketStatistics = struct PacketStatistics_s { /** - * \brief count of packets received. + * \brief count of packets received (sniffed and dropped by sniffer). */ uint64_t raw_packet_count; /** - * \brief count of DNS packets received out of time order. + * \brief count of DNS packets received out of time order (after any sampling applied). */ uint64_t out_of_order_packet_count; /** - * \brief count of unhandled packets received. + * \brief count of unhandled packets received (after any sampling applied). */ uint64_t unhandled_packet_count; /** - * \brief count of well-formed DNS messages received. + * \brief count of well-formed DNS messages received (after any sampling applied). * * C-DNS standard quantity. */ uint64_t processed_message_count; /** - * \brief count of total query/response pairs output. + * \brief count of total query/response pairs output (after any sampling applied). * * C-DNS standard quantity. */ uint64_t qr_pair_count; /** - * \brief count of queries with no matching response. + * \brief count of queries with no matching response (after any sampling applied). * * C-DNS standard quantity. */ uint64_t query_without_response_count; /** - * \brief count of responses with no matching query. + * \brief count of responses with no matching query (after any sampling applied). * * C-DNS standard quantity. */ uint64_t response_without_query_count; /** - * \brief cout of discarded messages due to OPCODE not in collection list. + * \brief cout of discarded messages due to OPCODE not in collection list (after any sampling applied). * * C-DNS standard quantity. */ uint64_t discarded_opcode_count; /** - * \brief count of malformed DNS packets received. + * \brief count of malformed DNS packets received (after any sampling applied). * * C-DNS standard quantity. */ @@ -113,6 +113,11 @@ using PacketStatistics = struct PacketStatistics_s */ uint64_t output_cbor_drop_count; + /** + * \brief count of CBOR items discarded due to sampling. + */ + uint64_t discarded_cbor_count; + /** * \brief Dump the stats to the stream provided * diff --git a/src/sniffers.cpp b/src/sniffers.cpp index ed334aa..3402e57 100644 --- a/src/sniffers.cpp +++ b/src/sniffers.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2016-2019 Internet Corporation for Assigned Names and Numbers. + * Copyright 2016-2019, 2021 Internet Corporation for Assigned Names and Numbers. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -131,8 +131,9 @@ namespace { } } -BaseSniffers::BaseSniffers(unsigned chan_max_size) - : max_fd_(0), select_timeout_(1000), packets_(chan_max_size) +BaseSniffers::BaseSniffers(unsigned chan_max_size, bool block) + : max_fd_(0), select_timeout_(1000), packets_(chan_max_size), + block_put_(block), packets_sniffed_(0), packets_dropped_(0) { FD_ZERO(&fdset_); } @@ -157,7 +158,14 @@ Tins::Packet BaseSniffers::next_packet() return Tins::Packet(); } -bool BaseSniffers::stats(struct pcap_stat& stats) +void BaseSniffers::sniffer_stats(struct Stats& stats) +{ + stats.pkts_sniffed = packets_sniffed_; + stats.pkts_dropped = packets_dropped_; + stats.channel_length = packets_.get_length(); +} + +bool BaseSniffers::pcap_stats(struct pcap_stat& stats) { bool res = true; std::unique_lock lock(m_); @@ -168,7 +176,7 @@ bool BaseSniffers::stats(struct pcap_stat& stats) { struct pcap_stat istat; - if ( pcap_stats(h, &istat) == 0 ) + if ( ::pcap_stats(h, &istat) == 0 ) { stats.ps_recv += istat.ps_recv; stats.ps_drop += istat.ps_drop; @@ -232,9 +240,11 @@ void BaseSniffers::packet_read_thread() { case 1: read_one = true; + ++packets_sniffed_; try { - packets_.put(make_packet(h, hdr, data)); + if ( !packets_.put(make_packet(h, hdr, data), block_put_) ) + ++packets_dropped_; } catch (Tins::exception_base&) { @@ -242,7 +252,8 @@ void BaseSniffers::packet_read_thread() // packets - packets where transport level decode fails - // back to the application as RawPDU. There they will be // treated as ignored and logged if appropriate. - packets_.put(Tins::Packet(new Tins::RawPDU(reinterpret_cast(data), hdr->caplen), hdr->ts, DONT_COPY_PDU)); + if ( !packets_.put(Tins::Packet(new Tins::RawPDU(reinterpret_cast(data), hdr->caplen), hdr->ts, DONT_COPY_PDU), block_put_) ) + ++packets_dropped_; } break; @@ -348,7 +359,7 @@ NetworkSniffers::NetworkSniffers(const std::vector& interfaces, FileSniffer::FileSniffer(const std::string& fname, const SniffersConfiguration& config) - : BaseSniffers(config.chan_max_size()) + : BaseSniffers(config.chan_max_size(), true) { char errbuf[PCAP_ERRBUF_SIZE]; pcap_t* handle = pcap_open_offline(fname.c_str(), errbuf); diff --git a/src/sniffers.hpp b/src/sniffers.hpp index 9a930e3..dbd4068 100644 --- a/src/sniffers.hpp +++ b/src/sniffers.hpp @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Internet Corporation for Assigned Names and Numbers. + * Copyright 2016-2021 Internet Corporation for Assigned Names and Numbers. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this @@ -225,13 +225,36 @@ class SniffersConfiguration class BaseSniffers { public: + /** + * \struct Stats + * \brief Statistics on sniffing. + */ + struct Stats + { + /** + * \brief Total number of packets sniffed. + */ + uint64_t pkts_sniffed; + + /** + * \brief Total number of packets dropped. + */ + uint64_t pkts_dropped; + + /** + * \brief Snapshot channel length + */ + unsigned channel_length; + }; + /** * \brief The default constructor. * * \param chan_max_size maximum size of channel delivering packets. + * \param block block when adding packets to processing queue. */ - explicit BaseSniffers(unsigned chan_max_size = 1000); - + // SAMPLING + explicit BaseSniffers(unsigned chan_max_size = 1000, bool block = false); /** * \brief Destructor. */ @@ -246,12 +269,19 @@ class BaseSniffers Tins::Packet next_packet(); /** - * \brief Get stats on the sniffers. + * \brief Get sniffer stats. + * + * \param stats a sniffer stats structure. + */ + void sniffer_stats(struct Stats& stats); + + /** + * \brief Get PCAP stats on the sniffers. * * \param stats a PCAP stats structure. * \returns `true` if stats updated. */ - bool stats(struct pcap_stat& stats); + bool pcap_stats(struct pcap_stat& stats); /** * \brief Break out of the collection loop. @@ -324,6 +354,21 @@ class BaseSniffers * \brief background thread collecting packets. */ std::thread t_; + + /** + * \brief block when adding sniffed packet to processing queue. + */ + bool block_put_; + + /** + * \brief count of packets sniffed. + */ + std::atomic packets_sniffed_; + + /** + * \brief count of packets dropped. + */ + std::atomic packets_dropped_; };