Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TESTING: Enrich dns indicators #112

Merged
merged 18 commits into from
Apr 26, 2015
Merged
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 40 additions & 21 deletions winnower.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,19 @@
import json
import pygeoip
import re
import sys

from netaddr import IPAddress, IPRange, IPSet
from sortedcontainers import SortedDict

from logger import get_logger
import logging

logger = get_logger('winnower')

# from http://en.wikipedia.org/wiki/Reserved_IP_addresses:
reserved_ranges = IPSet(['0.0.0.0/8', '100.64.0.0/10', '127.0.0.0/8', '192.88.99.0/24',
'198.18.0.0/15', '198.51.100.0/24', '203.0.113.0/24', '233.252.0.0/24'])
gi_org = SortedDict()
geo_data = pygeoip.GeoIP('data/GeoIP.dat', pygeoip.MEMORY_CACHE)


def load_gi_org(filename):
Expand All @@ -43,32 +42,49 @@ def org_by_addr(address):


def maxhits(dns_records):
max = 0
hmax = 0
hostname = None
for record in dns_records:
if record['count'] > max:
max = record['count']
#logger.info("examining %s" % record)
if record['count'] > hmax:
hmax = record['count']
hostname = record['rrname'].rstrip('.')
return hostname


def enrich_IPv4(address, geo_data, dnsdb=None):
def maxhits_rdata(dns_records):
    """Return the first rdata value of the record with the highest count.

    Iterates over ``dns_records`` (an iterable of mappings that each carry
    a ``'count'`` and an ``'rdata'`` sequence) and keeps the first rdata
    entry of the record whose ``'count'`` is strictly the largest seen so
    far. Any trailing dots are stripped from the returned value.

    Returns ``None`` when there are no records, when no record has a count
    above zero, or when every record has an empty rdata sequence.
    """
    hmax = 0
    hostname = None
    for record in dns_records:
        rdata = record['rdata']
        # A record without any rdata cannot supply a value; skip it rather
        # than fail with IndexError on rdata[0].
        if not rdata:
            continue
        # Strict comparison: on ties the earliest record wins.
        if record['count'] > hmax:
            hmax = record['count']
            hostname = rdata[0].rstrip('.')
    return hostname


def enrich_IPv4(address, dnsdb=None, hostname=None):
as_num, as_name = org_by_addr(address)
country = geo_data.country_code_by_addr('%s' % address)
if dnsdb:
hostname = maxhits(dnsdb.query_rdata_ip('%s' % address))
rhost = maxhits(dnsdb.query_rdata_ip('%s' % address))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the first "logical issue" - we should be gathering the PTR records here, so it should actually be a dnsdb.query_rrset using the in-addr.arpa address of the IP address. I have changed the order of the hostname and rhost variables to reflect what they are on the CSV so it can be clearer what is expected of this function.

We are going for maxhits all right. It should be rare that there is more than 1 PTR record associated with an in-addr.arpa. We might get a CNAME that points to a PTR, and we should just let the CNAME go in for now.

else:
hostname = None
return (as_num, as_name, country, None, hostname)
rhost = None
return (as_num, as_name, country, hostname, rhost)


def enrich_FQDN(address, date, dnsdb):
records = dnsdb.query_rrset(address, rrtype='A')
records = filter_date(records, date)
ip_addr = maxhits(records)
yesterday = dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(days=1)
yesterday_str = yesterday.strftime('%Y-%m-%d')
records = filter_date(records, yesterday_str)
ip_addr = maxhits_rdata(records)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here is the second, and hardest, "logical issue". We should not be getting maxhits in this case. We should be expanding to ALL THE IP addresses that the domain resolves to on that specific date. So 1 FQDN line could "expand" to N IPv4 lines.

This will obviously mess up the way the enrichment process is built right now.

if ip_addr:
logger.info('Mapped %s to %s' % (address, ip_addr))
return ip_addr
# logger.info('Mapped %s to %s on %s' % (address, ip_addr, date))
ip_addr_data = enrich_IPv4(IPAddress(ip_addr), dnsdb, address)
return (ip_addr,) + ip_addr_data
else:
return None


def filter_date(records, date):
Expand Down Expand Up @@ -113,15 +129,15 @@ def winnow(in_file, out_file, enr_file):
server = config.get('Winnower', 'dnsdb_server')
api = config.get('Winnower', 'dnsdb_api')
enrich_ip = config.get('Winnower', 'enrich_ip')
if enrich_ip == '1':
if enrich_ip == '1' or enrich_ip == 'True':
enrich_ip = True
logger.info('Enriching IPv4 indicators: TRUE')
else:
enrich_ip = False
logger.info('Enriching IPv4 indicators: FALSE')

enrich_dns = config.get('Winnower', 'enrich_dns')
if enrich_dns == '1':
if enrich_dns == '1' or enrich_dns == 'True':
enrich_dns = True
logger.info('Enriching DNS indicators: TRUE')
else:
Expand All @@ -132,7 +148,7 @@ def winnow(in_file, out_file, enr_file):

# handle the case where we aren't using DNSDB
dnsdb = dnsdb_query.DnsdbClient(server, api)
if len(dnsdb.query_rdata_name('google.com')) == 0:
if api == 'YOUR_API_KEY_HERE' or len(dnsdb.query_rdata_name('google.com')) == 0:
dnsdb = None
logger.info('Invalid DNSDB configuration found')

Expand All @@ -142,33 +158,36 @@ def winnow(in_file, out_file, enr_file):
# TODO: make these locations configurable?
logger.info('Loading GeoIP data')
gi_org = load_gi_org('data/GeoIPASNum2.csv')
geo_data = pygeoip.GeoIP('data/GeoIP.dat', pygeoip.MEMORY_CACHE)

wheat = []
enriched = []

logger.info('Beginning winnowing process')
for each in crop:
(addr, addr_type, direction, source, note, date) = each
# this should be refactored into appropriate functions
if addr_type == 'IPv4' and is_ipv4(addr):
#logger.info('Enriching %s' % addr)
ipaddr = IPAddress(addr)
if not reserved(ipaddr):
wheat.append(each)
if enrich_ip:
e_data = (addr, addr_type, direction, source, note, date) + enrich_IPv4(ipaddr, geo_data, dnsdb)
e_data = (addr, addr_type, direction, source, note, date) + enrich_IPv4(ipaddr, dnsdb)
enriched.append(e_data)
else:
e_data = (addr, addr_type, direction, source, note, date) + enrich_IPv4(ipaddr, geo_data)
e_data = (addr, addr_type, direction, source, note, date) + enrich_IPv4(ipaddr)
enriched.append(e_data)
else:
logger.error('Found invalid address: %s from: %s' % (addr, source))
elif addr_type == 'FQDN' and is_fqdn(addr):
#logger.info('Enriching %s' % addr)
wheat.append(each)
if enrich_dns and dnsdb:
e_data = (addr, addr_type, direction, source, note, date, enrich_FQDN(addr, date, dnsdb))
enriched.append(e_data)
# print "Enriching %s" % addr
e_data = enrich_FQDN(addr, date, dnsdb)
if e_data:
e_data = (e_data[0], "IPv4", direction, source, note, date) + e_data[1:]
enriched.append(e_data)
else:
logger.error('Could not determine address type for %s listed as %s' % (addr, addr_type))

Expand Down