ovs-post-process

#!/usr/bin/env python3
# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python

import sys
import os
import lzma
import re
import copy
from pathlib import Path
from multiprocessing import Process
TOOLBOX_HOME = os.environ.get('TOOLBOX_HOME')
if TOOLBOX_HOME is None:
    print("This script requires libraries that are provided by the toolbox project.")
    print("Toolbox can be acquired from https://github.com/perftool-incubator/toolbox and")
    print("then use 'export TOOLBOX_HOME=/path/to/toolbox' so that it can be located.")
    exit(1)
else:
    p = Path(TOOLBOX_HOME) / 'python'
    if not p.exists() or not p.is_dir():
        print("ERROR: <TOOLBOX_HOME>/python ('%s') does not exist!" % (p))
        exit(2)
    sys.path.append(str(p))
from toolbox.metrics import log_sample
from toolbox.metrics import finish_samples

def conntrack_post_process(filename: str):
    print('ovs-post-process-conntrack')
    # Made for the file conntrack-stats-show.txt.xz
    file_name = filename
    with lzma.open(file_name, 'rt') as file:
        desc = {'source': 'ovs-dpctl', 'type': 'ct-stats-show', 'class': 'count'}
        proto_tcp = {'protocol': 'tcp'}
        proto_udp = {'protocol': 'udp'}
        proto_icmp = {'protocol': 'icmp'}
        proto_other = {'protocol': 'other'}
        tcp_sample = {'end': 0, 'value': 0}
        udp_sample = {'end': 0, 'value': 0}
        icmp_sample = {'end': 0, 'value': 0}
        other_sample = {'end': 0, 'value': 0}
        file_id_ovs = 'ovs-conntrack'
        for line in file:
            if 'DATE' in line:
                # The date stamp indicates a new sample to log
                # This is from date +%s.%N to nanosecond precision
                # DATE:1620741229.909499801
                time_stamp_float = float(line.split(':')[1])
                end_stamp_ms = int(time_stamp_float * 1000)
                tcp_sample['end'] = end_stamp_ms
                udp_sample['end'] = end_stamp_ms
                icmp_sample['end'] = end_stamp_ms
                other_sample['end'] = end_stamp_ms
            elif 'TCP' in line:
                # TCP: 2587
                tcp_sample['value'] = int(line.split(':')[1])
                log_sample(file_id_ovs, desc, proto_tcp, tcp_sample)
            elif 'UDP' in line:
                # UDP: 74
                udp_sample['value'] = int(line.split(':')[1])
                log_sample(file_id_ovs, desc, proto_udp, udp_sample)
            elif 'ICMP' in line:
                # ICMP: 3
                icmp_sample['value'] = int(line.split(':')[1])
                log_sample(file_id_ovs, desc, proto_icmp, icmp_sample)
            elif 'Other' in line:
                # Other: 2
                other_sample['value'] = int(line.split(':')[1])
                log_sample(file_id_ovs, desc, proto_other, other_sample)
            else:
                continue

    finish_samples()

def dpctl_memory_show_process(filename: str):
    print('ovs-post-process-dpctl-memory-show')
    file_name = filename
    with lzma.open(file_name,'rt') as file:
        desc = {'source': 'ovs-appctl', 'class': 'count', 'type': 'mem-show'}
        handler = {'source' : 'handlers'}
        ofconns = {'source' : 'ofconns'}
        ports = {'source' : 'ports'}
        revalidators = {'source' : 'revalidators'}
        rules = {'source' : 'rules'}
        udpif_keys = {'source' : 'udpif-keys'}
        handler_smpl = {'end': 0, 'value': 0}
        ofconns_smpl = {'end': 0, 'value': 0}
        ports_smpl = {'end': 0, 'value': 0}
        revalidators_smpl = {'end': 0, 'value': 0}
        rules_smpl = {'end': 0, 'value': 0}
        udpif_smpl = {'end': 0, 'value': 0}
        file_id = 'ovs-memory-show'
        for line in file:
            if 'DATE' in line:
                # Parses output of date +%s.%N
                time_stamp_float = float(line.split(':')[1])
                end_stamp_ms = int(time_stamp_float * 1000)
                handler_smpl['end'] = end_stamp_ms
                ofconns_smpl['end'] = end_stamp_ms
                ports_smpl['end'] = end_stamp_ms
                revalidators_smpl['end'] = end_stamp_ms
                rules_smpl['end'] = end_stamp_ms
                udpif_smpl['end'] = end_stamp_ms
            elif 'handlers'in line:
                # handlers:59 ofconns:2 ports:49 revalidators:21 rules:6664 udpif keys:643
                data = line.split(' ')
                try:
                    handler_smpl['value'] = int(data[0].split(':')[1])
                    ofconns_smpl['value'] = int(data[1].split(':')[1])
                    ports_smpl['value'] = int(data[2].split(':')[1])
                    revalidators_smpl['value'] = int(data[3].split(':')[1])
                    rules_smpl['value'] = int(data[4].split(':')[1])
                    udpif_smpl['value'] = int(data[7].split(':')[1])
                except IndexError as e:
                    print(e,data)
                    continue
                log_sample(file_id, desc, handler, handler_smpl)
                log_sample(file_id, desc, ofconns, ofconns_smpl)
                log_sample(file_id, desc, ports, ports_smpl)
                log_sample(file_id, desc, revalidators, revalidators_smpl)
                log_sample(file_id, desc, rules, rules_smpl)
                log_sample(file_id, desc, udpif_keys, udpif_smpl)
            else:
                continue

    finish_samples()


def compute_and_log_stats(br_name: str, new_br_data: dict, old_br_data: dict):
    # This function is responsible for calculating the rate of change, This is done with the parsed date stamp
    # rate = (new - old)/time_delta
    # example of br_data that is being parsed in this function
    '''
    {
          "timestamp": 1621349568.9893348,
          "LOCAL": {
            "rx_pkts": 2813750994,
            "rx_bytes": 1575488822034,
            "rx_drop": 0,
            "rx_errs": 0,
            "rx_frame": 0,
            "rx_over": 0,
            "rx_crc": 0,
            "tx_pkts": 2869366564,
            "tx_bytes": 1832211829840,
            "tx_drop": 0,
            "tx_errs": 0,
            "tx_coll": 0
          },
          "ens7f0": {
            "rx_pkts": 3661435731,
            "rx_bytes": 2013820599332,
            "rx_drop": 0,
            "rx_errs": 0,
            "rx_frame": 0,
            "rx_over": 0,
            "rx_crc": 0,
            "tx_pkts": 3782767418,
            "tx_bytes": 2312518145390,
            "tx_drop": 0,
            "tx_errs": 0,
            "tx_coll": 0
          },
    '''
    file_id = 'ovs-port-counters'
    log_desc = {'source': 'ovs-ofctl', 'class': 'throughput'}
    if len(old_br_data.keys()) != len(new_br_data.keys()):
        print("Number of ports changed between measurements on %s" % br_name)

    time_delta = new_br_data['timestamp'] - old_br_data['timestamp']
    for intf_name in new_br_data.keys():
        if 'timestamp' in intf_name:
            continue
        for counter, val in new_br_data[intf_name].items():
            log_names = {'bridge' : br_name , 'interface': intf_name}
            # e.g. tx_bytes
            direction,name = counter.split('_')
            log_names['direction'] = direction
            if 'pkts' in name:
                log_desc['type'] = 'packets-sec'
            elif 'bytes' in name:
                log_desc['type'] = 'Gbps'
            else:
                log_desc['type'] = 'errors-sec'
                log_names['type'] = name

            try:
                rate = (val - old_br_data[intf_name][counter]) / time_delta
                # unify on Gbps for network transfer
                if log_desc['type'] == 'Gbps':
                    rate = (rate * 8) / 1_000_000_000
                sample = {'end': int(new_br_data['timestamp'] * 1000),'value': float(rate)}
                log_sample(file_id, log_desc, log_names, sample)
            except KeyError:
                # if a port appears, it will be in the new data and not the old
                # catch the key error, will only happen once per port appearance
                pass

def appctl_dpif_netdev_pmd_perf_show(filename: str):
    print('ovs-post-process-appctl-dpif-netdev-pmd-perf-show')
    file_id = 'ovs-pmd'

    with lzma.open(filename,'rt') as file:
        epoch_ms = None

        # Find a timestamp, then find a PMD section, then parse a PMD subsection
        for line in file:

            line = re.sub('\n', '', line)

            # Things we recognize but don't need
            if match_other := re.search(r'^' +
                '^Time:.*$|' +
                '^  Iterations:.*$|' +
                '^  - Used TSC cycles:.*$|' +
                '^  Tx batches:.*$|' +
                '^  Datapath passes:.*$|' +
                '^$', line):
                continue

            #'Measurement duration: 3.058 s'
            if match_duration := re.search(r'^Measurement duration:\s+(\d+\.\d+)\s+s', line):
                duration_sec = float(match_duration.group(1))
                continue

            #'DATE:1677729845.091742739'
            if match_date := re.search(r'^DATE:(\d+.\d+)', line):
                epoch_ms = int(1000 * float(match_date.group(1)))
                # Once a new timestamp is found, all previous matches are invalid
                pmd, core, node = None, None, None
                continue

            if epoch_ms:
                # We have a timestamp, look for a PMD subsection like:
                # 'pmd thread numa_id 0 core_id 4:'
                if match_pmd := re.search(r'^pmd thread numa_id (\d+) core_id (\d+):', line):
                    node, core = int(match_pmd.group(1)), int(match_pmd.group(2))
                    pmd = str(node) + '-' + str(core)
                    continue

                if pmd:
                    # We have a pmd section, find a subsection (datapth, iterations, packets) for this pmd:
                    # 
                    # NOTE: A 'pmd-stats-clear' is run after each 'pmd-stats-show' in order to get proper percent-busy
                    # and percent-idle for PMD used-cycles.  If the clear is not done, those stats will be cumulative since
                    # OVS was started which is not what we want.
                    #
                    # It is not possible to compute percent-busy and percent-idle for PMD usage by comparing idle-iterations (count)
                    # to busy-iterations (count).  The time spent in idle vs a busy iteration is completely different, and we need
                    # the percent of time, not the percent of iterations.t
                    #
                    # The side affect of this is that one cannot diff a stat which is a 'count' from one sample to the next because
                    # the count gets reset to 0 just after very sample. This would typically be done for other tools, getting
                    # a delta between samples (delta = work done) then dividing by the time difference (throughput = work/time).
                    # However, there is a 'Measurement duration: 3.053 s' in the stats log, so one can take the current sample
                    # count stat (like Tx packets) and divide by the Measurement duration.

                    desc = { 'source': 'ovs-pmd', 'class': 'throughput' }
                    names = { 'id': str(pmd), 'node': str(node), 'core': str(core) }
                    sample = { 'end': epoch_ms }

                    #'  - PHWOL hits:                  0  (  0.0 %)'
                    #'  - MFEX Opt hits:               0  (  0.0 %)'
                    #'  - Simple Match hits:           0  (  0.0 %)'
                    #'  - EMC hits:              1569581  ( 50.8 %)'
                    #'  - SMC hits:                    0  (  0.0 %)'
                    #'  - Megaflow hits:         1522168  ( 49.2 %, 1.00 subtbl lookups/hit)'
                    if match_dp := re.search(r'^\s\s\-\s(.+)\shits:\s*(\d+)\s*\(\s*(\d+\.\d+)\s%.*\)', line):
                        desc['type'] = 'datapath-hits-sec'
                        names['dp'] = match_dp.group(1)
                        names['dp'] = re.sub(' ', '-', names['dp'])
                        sample['value'] = float(match_dp.group(2));
                        if duration_sec:
                            sample['value'] /= duration_sec
                            log_sample(file_id, desc, names, sample)
                        else:
                            print('WARNING: did not have duration for hits-sec calc')
                        continue

                    #'  - Upcalls:                     0  (  0.0 %, 0.0 us/upcall)'
                    #'  - Lost upcalls:                0  (  0.0 %)'
                    if match_dp := re.search(r'^\s\s\-\s(Upcalls|Lost\supcalls):\s*(\d+)\s*\(\s*(\d+\.\d+)\s%(.*\))', line):
                        desc['type'] = 'datapath-hits-sec'
                        names['dp'] = match_dp.group(1)
                        names['dp'] = re.sub(' ', '-', names['dp'])
                        sample['value'] = float(match_dp.group(2));
                        if duration_sec:
                            sample['value'] /= duration_sec
                            log_sample(file_id, desc, names, sample)
                        else:
                            print('WARNING: did not have duration for hits-sec calc')
                        continue

                    #'  Rx packets:              3091749  (1013 Kpps, 2952 cycles/pkt)'
                    #'  Tx packets:              3091749  (1013 Kpps)'
                    #'  Rx packets:                    0'
                    #'  Tx packets:                    0'
                    if match_packets := re.search(r'^\s\s(\w+)\spackets:\s*(\d+)(.*)', line):
                        desc['type'] = 'kpps'
                        names['direction'] = match_packets.group(1)
                        if match_packets.group(2) == '0':
                            sample['value'] = 0
                            log_sample(file_id, desc, names, sample)
                        else:
                            if match_packets_kpps := re.search(r'\s*(\d+)\sKpps.*', match_packets.group(3)):
                                sample['value'] = match_packets_kpps.group(1)
                                log_sample(file_id, desc, names, sample)
                            else:
                                print('WARNING: found non-zero packets but no match for Kpps:')
                                print(line)
                        continue

                    #'  - idle iterations:         13275  (  0.1 % of used cycles)'
                    #'  - busy iterations:        103311  ( 99.9 % of used cycles)'
                    if match_busy_idle := re.search(r'^\s\s\-\s(busy|idle)\siterations:\s*\d+\s*\(\s*(\d+\.\d+)\s%\sof\sused\scycles\)', line):
                        desc['type'] = 'pmd-' + match_busy_idle.group(1) # 'busy' or 'idle'
                        sample['value'] = float(match_busy_idle.group(2)) / 100
                        log_sample(file_id, desc, names, sample)
                        continue

                    print('WARNING: ********** could not find match for [' + line + ']')
                continue
                print('pmd not defined')
            continue
            print('epoch_ms not defined')
    finish_samples()


def ofctl_port_counters(filename: str):
    print('ovs-post-process-ofctl-dump-ports')
    # assume that number of ports per bridge will change during the run
    # we don't control the coming and going of pods
    prev_bridges = {}
    bridges = {}
    curr_date = 0

    with lzma.open(filename,'rt') as file:
        curr_bridge = ''
        curr_port = ''
        for line in file:
            if 'DATE' in line:
                s = line.split(':')[1]
                curr_date = float(s)
            elif '[ovs-ofctl' in line:
                # [ovs-ofctl dump-ports br-int]\n
                br_name = line.split(' ')[-1].strip(']\n')
                curr_bridge = br_name
                # We have found a second or subsequent bridge port dump
                if br_name in bridges:
                    if br_name not in prev_bridges:
                        # alias the existing data structure with the prev_bridges data structure
                        prev_bridges[br_name] = bridges[br_name]
                    else:
                        # stats are already there, time to calc and log the stats
                        compute_and_log_stats(br_name,bridges[br_name], prev_bridges[br_name])
                        prev_bridges[br_name] = bridges[br_name]

                # init the structure with a clean dictionary
                bridges[br_name] = {}
                bridges[br_name]['timestamp'] = curr_date
            elif 'OFPST_PORT' in line:
                continue

            elif 'port' in line:
                # port_id can be a number or a string, rx_stats is the start of the port counters
                port_id,rx_stats = line.split(':')
                # parsing this string:   port "92035bf4585978e"
                port_id = port_id.split(' ')[-1].strip('"')
                curr_port = port_id
                bridges[curr_bridge][curr_port] = {}
                # rx pkts=105713661, bytes=15600717866, drop=2, errs=0, frame=0, over=0, crc=0
                rx_stats = rx_stats.split(',')
                for stat in rx_stats:
                    try:
                        key, val = stat.split('=')
                    except ValueError as e:
                        print(e,filename,line)
                    if '?' in val:
                        val = 0
                    if 'rx pkts' in key:
                        key = 'pkts'
                    key = key.strip()
                    bridges[curr_bridge][curr_port]['rx_'+ key] = int(val)

            elif 'tx pkts' in line:
                tx_stats = line.split(',')
                for stat in tx_stats:
                    key, val = stat.split('=')
                    if '?' in val:
                        val = 0
                    if 'tx pkts' in key:
                        key = 'pkts'
                    key = key.strip()
                    bridges[curr_bridge][curr_port]['tx_'+ key] = int(val)

            elif 'duration' in line:
                continue

            else:
                continue

    finish_samples()

def dpctl_datapath_stats(filename: str):
    print("ovs-post-process-dpctl-datapath-stats")
    dp_names = []
    flow_desc_count = {'source': 'ovs-dpctl', 'type': 'flows-count', 'class': 'count'}
    lookups_desc_rate = {'source': 'ovs-dpctl', 'type': 'lookups-sec', 'class': 'count'}
    masks_desc_rate = {'source': 'ovs-dpctl', 'type': 'masks-sec', 'class': 'count'}
    cache_desc_rate = {'source': 'ovs-dpctl', 'type': 'cache-sec', 'class': 'count'}
    file_id = 'dpctl-datapath-stats'
    prev_stamp = 0
    curr_stamp = 0
    time_delta_sec = 0
    cache_hit_sample = {'end': 0, 'value': 0}
    prev_cache_hit = 0
    lookups_hit_sample = {'end': 0, 'value': 0}
    prev_lookups_hit = 0
    lookups_miss_sample = {'end': 0, 'value': 0}
    prev_lookups_miss = 0
    lookups_lost_sample = {'end': 0, 'value': 0}
    prev_lookups_lost = 0
    masks_hit_sample = {'end': 0, 'value': 0}
    prev_masks_hit = 0
    masks_total_sample = {'end': 0, 'value': 0}
    prev_masks_total = 0
    flows_sample = {'end': 0, 'value': 0}
    with lzma.open(filename,'rt') as file:
        for line in file:
            if 'DATE' in line:
                # Parses the output of date +%s.%N
                # We are tracking rate as well as sample end times from this
                # stamp. using float on this value converts it to a time
                # in microseconds 1668029405.978062
                prev_stamp = curr_stamp
                stamp = float(line.split(':')[1])
                end_stamp_ms = int(stamp * 1000)

                # Chop off floating precision so rate calculations are done in seconds
                curr_stamp = int(stamp)
                time_delta_sec = curr_stamp - prev_stamp

                cache_hit_sample['end'] = end_stamp_ms
                lookups_hit_sample['end'] = end_stamp_ms
                lookups_miss_sample['end'] = end_stamp_ms
                lookups_lost_sample['end'] = end_stamp_ms
                masks_hit_sample['end'] = end_stamp_ms
                masks_total_sample['end'] = end_stamp_ms
                flows_sample['end'] = end_stamp_ms
            elif '@' in line:
            # We have found a datapath
                name = line.split(':')[0]
                if name not in dp_names:
                    dp_names.append(name)

            elif 'lookups:' in line:
                #  lookups: hit:74624770 missed:2867652 lost:0
                pairs = line[2:].split(' ')
                curr_lookups_hit = int(pairs[1].split(':')[1])
                curr_lookups_miss = int(pairs[2].split(':')[1])
                curr_lookups_lost = int(pairs[3].split(':')[1])

                lookups_hit_sample['value'] = (curr_lookups_hit - prev_lookups_hit) / time_delta_sec
                lookups_miss_sample['value'] = (curr_lookups_miss - prev_lookups_miss) / time_delta_sec
                lookups_lost_sample['value'] = (curr_lookups_lost - prev_lookups_lost) / time_delta_sec

                prev_lookups_hit = curr_lookups_hit
                prev_lookups_miss = curr_lookups_miss
                prev_lookups_lost = curr_lookups_lost

                log_sample(file_id, lookups_desc_rate, {'interface': dp_names[-1], 'action': 'hit'}, lookups_hit_sample)
                log_sample(file_id, lookups_desc_rate, {'interface': dp_names[-1], 'action': 'miss'}, lookups_miss_sample)
                log_sample(file_id, lookups_desc_rate, {'interface': dp_names[-1], 'action': 'lost'}, lookups_lost_sample)

            elif 'flows:' in line:
                #  flows: 554
                pairs = line.split(':')
                curr_flows = int(pairs[1])


                flows_sample['value'] = curr_flows
                log_sample(file_id, flow_desc_count, {'interface': dp_names[-1], 'counter': 'flows'}, flows_sample)


            elif 'masks:' in line:
                #  masks: hit:512424506 total:64 hit/pkt:6.61
                pairs = line[2:].split(' ')
                curr_masks_hit = int(pairs[1].split(':')[1])
                curr_masks_total = int(pairs[2].split(':')[1])

                masks_hit_sample['value'] = (curr_masks_hit - prev_masks_hit) / time_delta_sec
                masks_total_sample['value'] = (curr_masks_total - prev_masks_total) / time_delta_sec

                prev_masks_hit = curr_masks_hit
                prev_masks_total = curr_masks_total

                log_sample(file_id, masks_desc_rate, {'interface': dp_names[-1], 'action': 'hit'}, masks_hit_sample)
                log_sample(file_id, masks_desc_rate, {'interface': dp_names[-1], 'action': 'total'}, masks_total_sample)

            # startswith is slower, but coding around the false positives is uglier
            elif line.startswith('  cache:'):
                #  cache: hit:46193519 hit-rate:59.61%
                pairs = line[2:].split(' ')
                curr_cache_hit = int(pairs[1].split(':')[1])

                cache_hit_sample['value'] = (curr_cache_hit - prev_cache_hit) / time_delta_sec
                prev_cache_hit = curr_cache_hit
                log_sample(file_id, cache_desc_rate, {'interface': dp_names[-1], 'action': 'hit'}, cache_hit_sample)

            elif 'port 0' in line:
                # Use port 0 as token to remove the datapath name from the print
                # string in case there are multiple datapaths
                dp_names.pop()

            else:
                # don't care, keep going
                continue


    finish_samples()

def upcall_stats(filename: str):
    print("ovs-post-process-upcall-stats")
    upcall_flow_count = {'source': 'ovs-appctl', 'type': 'upcall-flow', 'class': 'count'}
    upcall_avg_count = {'source': 'ovs-appctl', 'type': 'upcall-flow-avg', 'class': 'count'}
    upcall_max_count = {'source': 'ovs-appctl', 'type': 'upcall-flow-max', 'class': 'count'}
    upcall_limit_count = {'source': 'ovs-appctl', 'type': 'upcall-flow-limit', 'class': 'count'}
    upcall_duration_count = {'source': 'ovs-appctl', 'type': 'upcall-flow-dump-duration-ms', 'class': 'count'}
    file_id = 'upcall-datapath-stats'
    dp_names = []
    flow_sample = {'end': 0,'value':0}
    avg_sample = {'end': 0,'value':0}
    max_sample = {'end': 0,'value':0}
    limit_sample = {'end': 0,'value':0}
    duration_sample = {'end': 0,'value':0}
    with lzma.open(filename,'rt') as file:
        for line in file:
            if 'DATE' in line:
                # Parses  output of date +%s.%N
                s = float(line.split(':')[1])
                end_stamp_ms = int(s * 1000)
                flow_sample['end'] = end_stamp_ms
                avg_sample['end'] = end_stamp_ms
                max_sample['end'] = end_stamp_ms
                limit_sample['end'] = end_stamp_ms
                duration_sample['end'] = end_stamp_ms
            elif '@' in line:
            # We have found a datapath
                name = line.split(':')[0]
                if name not in dp_names:
                    dp_names.append(name)

            elif 'flows' in line:
                #  flows         : (current 538) (avg 548) (max 747) (limit 200000)
                pairs = line.split(':')[1].strip()
                pairs = pairs.split(') (')
                curr_flow = int(pairs[0].split(' ')[1])
                curr_avg = int(pairs[1].split(' ')[1])
                curr_max = int(pairs[2].split(' ')[1])
                # remove trailing close paren on numeric limit value
                curr_limit = int(pairs[3].split(' ')[1].strip(')'))

                flow_sample['value'] = curr_flow
                avg_sample['value'] = curr_avg
                max_sample['value'] = curr_max
                limit_sample['value'] = curr_limit

                log_sample(file_id, upcall_flow_count,{'interface':dp_names[-1],'counter':'flow'},flow_sample)
                log_sample(file_id, upcall_avg_count,{'interface':dp_names[-1],'counter':'avg'},avg_sample)
                log_sample(file_id, upcall_max_count,{'interface':dp_names[-1],'counter':'max'},max_sample)
                log_sample(file_id, upcall_limit_count,{'interface':dp_names[-1],'counter':'limit'},limit_sample)


            elif 'duration' in line:
                #  dump duration : 10ms
                pairs = line.split(':')[1].strip()
                # slice 'ms' off the value
                curr_duration = int(pairs[:-2])
                duration_sample['value'] = curr_duration
                log_sample(file_id, upcall_duration_count,{'interface':dp_names[-1],'counter':'duration'},duration_sample)

            elif 'ufid' in line:
                # use ufid as a per-datapath token
                dp_names.pop()

            else:
                continue

    finish_samples()

def extract_inside_parens(paren_str: str) -> str:
    left = paren_str.find('(')
    right = paren_str.find(')')
    if left == -1 or right == -1:
        return 'err'
    # slice off left paren
    left = left + 1
    return paren_str[left:right]

def dpctl_dump_flows(filename: str):
    print('ovs-post-process-dpctl-dump-flows')
    dpctl_dump_desc = {'source' : 'ovs-dpctl','class': 'count'}
    curr_stamp = 0
    prev_stamp = 0
    curr_expired_flow = 0
    prev_expired_flow = 0
    curr_new_flow = 0
    prev_new_flow = 0
    time_delta_sec = 0
    file_id = 'dpctl-dump-flows'
    ip_proto_lookup_table = {'0': 'NA', '1': 'ICMP',\
        '2': 'IGMP', '6': 'IGMP', '17': 'UDP', '47': 'GRE'}

    # lower case hex string for comparision
    eth_type_lookup_table = {'0x0800': 'IPv4','0x86dd': 'IPv6',\
        '0x8808': 'Ethernet Flow Control','0x88cc': 'LLDP','0x0806': 'ARP'}

    ufids = {}

    with lzma.open(filename,'rt') as file:
        for line in file:
            if 'DATE' in line:
                # Time keeping
                prev_stamp = curr_stamp
                s = float(line.split(':')[1])
                end_stamp_ms = int(s * 1000)
                curr_stamp = int(s)
                time_delta_sec = curr_stamp - prev_stamp

                # check expired flows
                # flow freshnes is stamped with curr_stamp when a flow appears in the file
                # if a flow is in the ufids dictionary and its stamp
                # doesn't match the previous stamp then it probably matches
                # the prev_prev stamp, so it didn't get updated and needs to be
                # removed from the ufids dictionary
                for ufid_key in list(ufids.keys()):
                    if ufids[ufid_key]['freshness'] != prev_stamp:
                        curr_expired_flow += 1
                        del(ufids[ufid_key])

                # since we are using DATE to track new expired flows, a 0 for the time stamp is no good.
                # when DATE occurs in the file it is a new sample, so we need to record the
                # new / expired flows from the previous sample, and get ready to parse the new
                # sample
                if prev_stamp == 0:
                    time_stamp = curr_stamp
                else:
                    time_stamp = prev_stamp

                dpctl_dump_desc['type'] = 'ufid-new-flows-sec'
                new_flow_rate = (curr_new_flow - prev_new_flow) / time_delta_sec
                log_sample(file_id, dpctl_dump_desc, {'counter': 'new-flows-sec'},{'end':end_stamp_ms,'value':new_flow_rate})
                prev_new_flow = curr_new_flow

                dpctl_dump_desc['type'] = 'ufid-expired-flows-sec'
                expired_flow_rate = (curr_expired_flow - prev_expired_flow) / time_delta_sec
                log_sample(file_id, dpctl_dump_desc, {'counter': 'expired-flows-sec'},{'end':end_stamp_ms,'value':expired_flow_rate})
                prev_expired_flow = curr_expired_flow

            elif 'ufid:' in line:
                line = line.split(',')
                # set the ufid from the file as a key for the dict
                ufid_key = line[0].split(':')[1]
                # This is a new flow, use current stamp
                if ufid_key not in ufids:
                    ufids[ufid_key] = {}
                    ufids[ufid_key]['freshness'] = curr_stamp
                    ufids[ufid_key]['prev_packet_count'] = -1
                    ufids[ufid_key]['prev_byte_count'] = -1
                    curr_new_flow += 1
                else:
                    # House Keeping on existing flows
                    ufids[ufid_key]['freshness'] = curr_stamp
                    ufids[ufid_key]['prev_packet_count'] = ufids[ufid_key]['curr_packet_count']
                    ufids[ufid_key]['prev_byte_count'] = ufids[ufid_key]['curr_byte_count']

                # list comprehensions - search through the array of values for
                # metrics, returns an array, split that on the ':'
                # then grab the value ['packets:2'] -> ['packets','2']
                curr_packet = int([x for x in line if 'packets' in x][0].split(':')[1])
                curr_byte = int([x for x in line if 'byte' in x][0].split(':')[1])
                datapath = [x for x in line if 'dp:' in x][0].split(':')[1]
                in_port = [x for x in line if 'in_port' in x][0]
                eth_type = [x for x in line if 'eth_type' in x][0]
                last_used = [x for x in line if 'used:' in x][0]
                ufids[ufid_key]['curr_packet_count'] = curr_packet
                ufids[ufid_key]['curr_byte_count'] = curr_byte
                ufids[ufid_key]['datapath'] = datapath
                ufids[ufid_key]['in_port'] = extract_inside_parens(in_port)
                ufids[ufid_key]['eth_type'] = extract_inside_parens(eth_type).lower()

                '''TODO - this section, we just need the basics right now
                if eth_type == '0x0800'
                    ipv4 = [x for x in line if 'ipv4' in x][0]
                    # src=10.131.0.16/255.255.255.240,dst=192.168.111.24,proto=6,tos=0/0,ttl=64,frag=no
                    ip4_match = extract_inside_parens(ipv4)
                    ip4_csv = ip4_match.split(',')
                    ip4_src = ipaddress.ip_network([x for x in ip4_csv if 'src=' in x][0].strip('src='))
                    ip4_dst = ipaddress.ip_address([x for x in ip4_csv if 'dst=' in x][0].strip('dst='))
                    ip4_proto_raw = [x for x in ip4_csv if 'proto=' in x][0].strip('proto=')
                    ip4_proto = ip_proto_lookup_table[ip4_proto_raw]
                if eth_type == '0x86dd':
                    ipv6 = [x for x in line if 'ipv6' in x][0]
                    ip6_match = extract_inside_parens(ipv6)
                    ip6_csv = ip6_match.split(',')
                ufid:41f86fcb-5a71-46a1-8a2c-b18ac103ffd9, recirc_id(0x953b99),dp_hash(0/0),skb_priority(0/0),in_port(0707684fc6fdb77),skb_mark(0/0),ct_state(0x2a/0x3f),ct_zone(0/0),ct_mark(0/0),ct_label(0/0x3),eth(src=0a:58:75:0c:67:b1,dst=0a:58:06:cc:5b:0e),eth_type(0x86dd),ipv6(src=fd01:0:0:1::/ffff:ffff:ffff:ffff::,dst=fd01:0:0:2::16,label=0/0,proto=6,tclass=0/0x3,hlimit=64,frag=no),tcp(src=0/0,dst=0/0),tcp_flags(0/0), packets:1, bytes:826, used:4.972s, flags:P., dp:ovs, actions:ct_clear,set(tunnel(tun_id=0x4,ipv6_dst=fc00:1000::6,ttl=64,tp_dst=6081,geneve({class=0x102,type=0x80,len=4,0x30005}),flags(df|csum|key))),set(eth(src=0a:58:7d:50:4d:fc,dst=0a:58:85:e2:e6:5d)),set(ipv6(hlimit=63)),genev_sys_6081
                ufid:8acc72d6-b3cb-4051-93e3-4d1177a97a0d, recirc_id(0x953be4),dp_hash(0/0),skb_priority(0/0),in_port(6e769d62d8f9785),skb_mark(0/0),ct_state(0x2a/0x3f),ct_zone(0/0),ct_mark(0/0),ct_label(0/0x1),eth(src=00:00:00:00:00:00/00:00:00:00:00:00,dst=0a:58:05:b9:5e:9f),eth_type(0x86dd),ipv6(src=::/::,dst=fd01:0:0:1::25,label=0/0,proto=0/0,tclass=0/0,hlimit=0/0,frag=no), packets:4, bytes:6396, used:3.145s, flags:P., dp:ovs, actions:1b771892fabde8c
                '''
                # math on packets / bytes
                packet_rate = (ufids[ufid_key]['curr_packet_count'] - ufids[ufid_key]['prev_packet_count']) / time_delta_sec
                bytes_to_Gbits = ((ufids[ufid_key]['curr_byte_count'] - ufids[ufid_key]['prev_byte_count']) * 8.0) / float(1_000_000_000)
                bit_rate = bytes_to_Gbits / time_delta_sec

                # Since we only want to log samples with 2 data points check if the prev_counts are still unset
                if ufids[ufid_key]['prev_packet_count'] == -1 and ufids[ufid_key]['prev_byte_count'] == -1:
                    continue

                # log samples
                dpctl_dump_desc['type'] = 'ufid-packets-sec'
                log_sample(file_id,dpctl_dump_desc,{'id':ufid_key},{'end': end_stamp_ms,'value':packet_rate})
                dpctl_dump_desc['type'] = 'ufid-Gbps'
                log_sample(file_id,dpctl_dump_desc,{'id':ufid_key},{'end': end_stamp_ms,'value':bit_rate})
            else:
                continue

    finish_samples()

def main():
    print("ovs-post-process-multiprocess-launcher")
    conntrack_process = Process(target=conntrack_post_process, args=('appctl-dpctl-ct-stats-show.txt.xz',), name='conntrack')
    conntrack_process.start()

    mem_show_process = Process(target=dpctl_memory_show_process, args=('appctl-memory-show.txt.xz',), name='dpctl-show')
    mem_show_process.start()

    pmd_counters_process = Process(target=appctl_dpif_netdev_pmd_perf_show, args=('appctl-dpif-netdev-pmd-perf-show.txt.xz',), name='appctl-dpif-netdev-pmd-perf-show')
    pmd_counters_process.start()

    port_counters_process = Process(target=ofctl_port_counters, args=('ofctl-dump-ports.txt.xz',), name='ofctl-dump-ports')
    port_counters_process.start()

    dpctl_stats_process = Process(target=dpctl_datapath_stats, args=('appctl-dpctl-show.txt.xz',), name='dpctl-dp-stats')
    dpctl_stats_process.start()

    upcall_stats_process = Process(target=upcall_stats, args=('appctl-upcall-show.txt.xz',), name='upcall-stats')
    upcall_stats_process.start()

    dpctl_dump_process = Process(target=dpctl_dump_flows, args=('appctl-dpctl-dump-flows.txt.xz',), name='dpctl-dump-flows')
    dpctl_dump_process.start()

    # Always join last
    conntrack_process.join()
    mem_show_process.join()
    port_counters_process.join()
    dpctl_stats_process.join()
    upcall_stats_process.join()
    dpctl_dump_process.join()
    pmd_counters_process.join()

if __name__ == '__main__':
    exit(main())