Skip to content

Commit

Permalink
Merge pull request #34 from clearcode/master
Browse files Browse the repository at this point in the history
Modify replay tracking to use site ID in query string when using replay-tracking option.
  • Loading branch information
diosmosis committed Feb 26, 2013
2 parents d976a30 + 8d642de commit 7e93e75
Showing 1 changed file with 50 additions and 22 deletions.
72 changes: 50 additions & 22 deletions misc/log-analytics/import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,10 @@ class DynamicResolver(object):

def __init__(self):
    """
    Create the resolver with an empty cache.

    When the replay_tracking option is enabled, the result of the
    'SitesManager.getAllSites' API call is stored under the 'sites'
    cache key so that site IDs can later be checked without another
    API round-trip.
    """
    self._cache = {}
    if not config.options.replay_tracking:
        return
    # Preload the sites already known to Piwik.
    self._cache['sites'] = piwik.call_api('SitesManager.getAllSites')

def _get_site_id_from_hit_host(self, hit):
main_url = 'http://' + hit.host
Expand Down Expand Up @@ -939,24 +943,43 @@ def _resolve(self, hit):
stats.piwik_sites.add(site_id)
return site_id

def _resolve_when_replay_tracking(self, hit):
    """
    Resolve a replayed tracking hit using the site ID from its arguments.

    Reads hit.args['idsite'] and looks it up in self._cache['sites'].
    If the site ID is present, it is recorded in stats.piwik_sites and
    the tuple (site_id, main_url) is returned, where main_url is taken
    from the cached site entry. Otherwise (None, None) is returned.

    A missing 'idsite' entry in hit.args propagates as a lookup error,
    exactly as before.
    """
    site_id = hit.args['idsite']
    sites = self._cache['sites']
    # Test membership on the mapping itself: calling .keys() first builds
    # a fresh list on Python 2 and turns every lookup into a linear scan.
    if site_id not in sites:
        return (None, None)
    stats.piwik_sites.add(site_id)
    return (site_id, sites[site_id]['main_url'])

def resolve(self, hit):
"""
Return the site ID from the cache if found, otherwise call _resolve.
If replay_tracking option is enabled, call _resolve_when_replay_tracking.
"""
try:
site_id = self._cache[hit.host]
except KeyError:
logging.debug(
'Site ID for hostname %s not in cache', hit.host
)
site_id = self._resolve(hit)
logging.debug('Site ID for hostname %s: %s', hit.host, site_id)
self._cache[hit.host] = site_id
return (site_id, 'http://' + hit.host)
if config.options.replay_tracking:
# We only consider requests with piwik.php which don't need host to be imported
return self._resolve_when_replay_tracking(hit)
else:
try:
site_id = self._cache[hit.host]
except KeyError:
logging.debug(
'Site ID for hostname %s not in cache', hit.host
)
site_id = self._resolve(hit)
logging.debug('Site ID for hostname %s: %s', hit.host, site_id)
self._cache[hit.host] = site_id
return (site_id, 'http://' + hit.host)


def check_format(self, format):
if 'host' not in format.regex.groupindex and not config.options.log_hostname:
if config.options.replay_tracking:
pass
elif 'host' not in format.regex.groupindex and not config.options.log_hostname:
fatal_error(
"the selected log format doesn't include the hostname: you must "
"specify the Piwik site ID with the --idsite argument"
Expand Down Expand Up @@ -1068,7 +1091,7 @@ def _get_hit_args(self, hit):
site_id, main_url = resolver.resolve(hit)
if site_id is None:
# This hit doesn't match any known Piwik site.
stats.piwik_sites_ignored.add(hit.host)
stats.piwik_sites_ignored.add(getattr(hit, 'host', 'unrecognized host to site ID %s' % hit.args.get('idsite')))
stats.count_lines_no_site.increment()
return

Expand Down Expand Up @@ -1411,6 +1434,21 @@ def invalid_line(line, reason):

if timezone:
hit.date -= datetime.timedelta(hours=timezone/100)
if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if not hit.query_string or not hit.path.lower().endswith('piwik.php'):
continue
else:
query_arguments = urlparse.parse_qs(hit.query_string)
if not "idsite" in query_arguments:
invalid_line(line, 'missing idsite')
continue
else:
try:
hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems())
except UnicodeDecodeError:
invalid_line(line, 'invalid encoding')
continue

# Check if the hit must be excluded.
if all((method(hit) for method in self.check_methods)):
Expand All @@ -1420,16 +1458,6 @@ def invalid_line(line, reason):
Recorder.add_hits(hits)
hits = []

if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if hit.query_string and hit.path.lower().endswith('piwik.php'):
query_arguments = urlparse.parse_qs(hit.query_string)
if "idsite" in query_arguments:
try:
hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems())
except UnicodeDecodeError:
invalid_line(line, 'invalid encoding')
continue
# add last chunk of hits
if len(hits) > 0:
Recorder.add_hits(hits)
Expand Down

0 comments on commit 7e93e75

Please sign in to comment.