Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed replay-tracking option to parse different sites ID. #34

Merged
merged 2 commits into from
Feb 26, 2013
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 50 additions & 22 deletions misc/log-analytics/import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,10 @@ class DynamicResolver(object):

def __init__(self):
self._cache = {}
if config.options.replay_tracking:
# get existing sites
self._cache['sites'] = piwik.call_api(
'SitesManager.getAllSites')

def _get_site_id_from_hit_host(self, hit):
main_url = 'http://' + hit.host
Expand Down Expand Up @@ -933,24 +937,43 @@ def _resolve(self, hit):
stats.piwik_sites.add(site_id)
return site_id

def _resolve_when_replay_tracking(self, hit):
"""
If parsed site ID found in the _cache['sites'] return site ID and main_url,
otherwise return (None, None) tuple.
"""
site_id = hit.args['idsite']
if site_id in self._cache['sites'].keys():
stats.piwik_sites.add(site_id)
return (site_id, self._cache['sites'][site_id]['main_url'])
else:
return (None, None)

def resolve(self, hit):
"""
Return the site ID from the cache if found, otherwise call _resolve.
If replay_tracking option is enabled, call _resolve_when_replay_tracking.
"""
try:
site_id = self._cache[hit.host]
except KeyError:
logging.debug(
'Site ID for hostname %s not in cache', hit.host
)
site_id = self._resolve(hit)
logging.debug('Site ID for hostname %s: %s', hit.host, site_id)
self._cache[hit.host] = site_id
return (site_id, 'http://' + hit.host)
if config.options.replay_tracking:
# We only consider requests with piwik.php which don't need host to be imported
return self._resolve_when_replay_tracking(hit)
else:
try:
site_id = self._cache[hit.host]
except KeyError:
logging.debug(
'Site ID for hostname %s not in cache', hit.host
)
site_id = self._resolve(hit)
logging.debug('Site ID for hostname %s: %s', hit.host, site_id)
self._cache[hit.host] = site_id
return (site_id, 'http://' + hit.host)


def check_format(self, format):
if 'host' not in format.regex.groupindex and not config.options.log_hostname:
if config.options.replay_tracking:
pass
elif 'host' not in format.regex.groupindex and not config.options.log_hostname:
fatal_error(
"the selected log format doesn't include the hostname: you must "
"specify the Piwik site ID with the --idsite argument"
Expand Down Expand Up @@ -1062,7 +1085,7 @@ def _get_hit_args(self, hit):
site_id, main_url = resolver.resolve(hit)
if site_id is None:
# This hit doesn't match any known Piwik site.
stats.piwik_sites_ignored.add(hit.host)
stats.piwik_sites_ignored.add(getattr(hit, 'host', 'unrecognized host to site ID %s' % hit.args.get('idsite')))
stats.count_lines_no_site.increment()
return

Expand Down Expand Up @@ -1405,6 +1428,21 @@ def invalid_line(line, reason):

if timezone:
hit.date -= datetime.timedelta(hours=timezone/100)
if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if not hit.query_string or not hit.path.lower().endswith('piwik.php'):
continue
else:
query_arguments = urlparse.parse_qs(hit.query_string)
if not "idsite" in query_arguments:
invalid_line(line, 'missing idsite')
continue
else:
try:
hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems())
except UnicodeDecodeError:
invalid_line(line, 'invalid encoding')
continue

# Check if the hit must be excluded.
if all((method(hit) for method in self.check_methods)):
Expand All @@ -1414,16 +1452,6 @@ def invalid_line(line, reason):
Recorder.add_hits(hits)
hits = []

if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if hit.query_string and hit.path.lower().endswith('piwik.php'):
query_arguments = urlparse.parse_qs(hit.query_string)
if "idsite" in query_arguments:
try:
hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems())
except UnicodeDecodeError:
invalid_line(line, 'invalid encoding')
continue
# add last chunk of hits
if len(hits) > 0:
Recorder.add_hits(hits)
Expand Down