From 7040c18c25af02789a85b632798bc0517ed46685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomek=20W=C3=B3jcik?= Date: Mon, 25 Feb 2013 15:25:25 +0100 Subject: [PATCH 1/2] Fixed replay-tracking option to parse different sites ID. --- misc/log-analytics/import_logs.py | 72 +++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 265a0685510..bafb96501ff 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -873,6 +873,10 @@ class DynamicResolver(object): def __init__(self): self._cache = {} + if config.options.replay_tracking: + # get existing sites + self._cache['sites'] = piwik.call_api( + 'SitesManager.getAllSites') def _get_site_id_from_hit_host(self, hit): main_url = 'http://' + hit.host @@ -933,24 +937,43 @@ def _resolve(self, hit): stats.piwik_sites.add(site_id) return site_id + def _resolve_when_replay_tracking(self, hit): + """ + If parsed site ID found in the _cache['sites'] return site ID and main_url, + otherwise return (None, None) tuple. + """ + site_id = hit.args['idsite'] + if site_id in self._cache['sites'].keys(): + stats.piwik_sites.add(site_id) + return (site_id, self._cache['sites'][site_id]['main_url']) + else: + return (None, None) + def resolve(self, hit): """ Return the site ID from the cache if found, otherwise call _resolve. + If replay_tracking option is enabled, call _resolve_when_replay_tracking. """ - try: - site_id = self._cache[hit.host] - except KeyError: - logging.debug( - 'Site ID for hostname %s not in cache', hit.host - ) - site_id = self._resolve(hit) - logging.debug('Site ID for hostname %s: %s', hit.host, site_id) - self._cache[hit.host] = site_id - return (site_id, 'http://' + hit.host) + if config.options.replay_tracking: + # We only consider requests with piwik.php which don't need host to be imported + return self._resolve_when_replay_tracking(hit) + else: + try: + site_id = self._cache[hit.host] + except KeyError: + logging.debug( + 'Site ID for hostname %s not in cache', hit.host + ) + site_id = self._resolve(hit) + logging.debug('Site ID for hostname %s: %s', hit.host, site_id) + self._cache[hit.host] = site_id + return (site_id, 'http://' + hit.host) def check_format(self, format): - if 'host' not in format.regex.groupindex and not config.options.log_hostname: + if config.options.replay_tracking: + pass + elif 'host' not in format.regex.groupindex and not config.options.log_hostname: fatal_error( "the selected log format doesn't include the hostname: you must " "specify the Piwik site ID with the --idsite argument" @@ -1062,7 +1085,7 @@ def _get_hit_args(self, hit): site_id, main_url = resolver.resolve(hit) if site_id is None: # This hit doesn't match any known Piwik site. - stats.piwik_sites_ignored.add(hit.host) + stats.piwik_sites_ignored.add(getattr(hit, 'host', 'unrecognized host to site ID %s' % hit.args.get('idsite'))) stats.count_lines_no_site.increment() return @@ -1405,6 +1428,21 @@ def invalid_line(line, reason): if timezone: hit.date -= datetime.timedelta(hours=timezone/100) + if config.options.replay_tracking: + # we need a query string and we only consider requests with piwik.php + if not hit.query_string and not hit.path.lower().endswith('piwik.php'): + continue + else: + query_arguments = urlparse.parse_qs(hit.query_string) + if not "idsite" in query_arguments: + invalid_line(line, 'missing idsite') + continue + else: + try: + hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems()) + except UnicodeDecodeError: + invalid_line(line, 'invalid encoding') + continue # Check if the hit must be excluded. if all((method(hit) for method in self.check_methods)): @@ -1414,16 +1452,6 @@ def invalid_line(line, reason): Recorder.add_hits(hits) hits = [] - if config.options.replay_tracking: - # we need a query string and we only consider requests with piwik.php - if hit.query_string and hit.path.lower().endswith('piwik.php'): - query_arguments = urlparse.parse_qs(hit.query_string) - if "idsite" in query_arguments: - try: - hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems()) - except UnicodeDecodeError: - invalid_line(line, 'invalid encoding') - continue # add last chunk of hits if len(hits) > 0: Recorder.add_hits(hits) From 8d642de42efd40898441fd2223d1af0d8c457357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomek=20W=C3=B3jcik?= Date: Mon, 25 Feb 2013 16:38:49 +0100 Subject: [PATCH 2/2] Fixed if condition. --- misc/log-analytics/import_logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index bafb96501ff..b6fd91b5040 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -1430,7 +1430,7 @@ def invalid_line(line, reason): hit.date -= datetime.timedelta(hours=timezone/100) if config.options.replay_tracking: # we need a query string and we only consider requests with piwik.php - if not hit.query_string and not hit.path.lower().endswith('piwik.php'): + if not hit.query_string or not hit.path.lower().endswith('piwik.php'): continue else: query_arguments = urlparse.parse_qs(hit.query_string)