From a881af47d897d52acce027d02672ca45b5fcaaa6 Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 18:40:21 +0800 Subject: [PATCH 01/22] make scheduler handle serialized request properly --- crawler/crawling/distributed_scheduler.py | 94 ++++++++------------- crawler/requirements.txt | 52 ++++++------ crawler/tests/test_distributed_scheduler.py | 42 +++++++-- 3 files changed, 97 insertions(+), 91 deletions(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 14cf2818..706ebb36 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -7,6 +7,7 @@ from scrapy.http import Request from scrapy.conf import settings from scrapy.utils.python import to_unicode +from scrapy.utils.reqser import request_to_dict, request_from_dict import redis import random @@ -391,7 +392,7 @@ def enqueue_request(self, request): if not request.dont_filter and self.dupefilter.request_seen(request): self.logger.debug("Request not added back to redis") return - req_dict = self.request_to_dict(request) + req_dict = request_to_dict(request, self.spider) if not self.is_blacklisted(req_dict['meta']['appid'], req_dict['meta']['crawlid']): @@ -436,28 +437,6 @@ def enqueue_request(self, request): .format(appid=req_dict['meta']['appid'], id=req_dict['meta']['crawlid'])) - def request_to_dict(self, request): - ''' - Convert Request object to a dict. - modified from scrapy.utils.reqser - ''' - req_dict = { - # urls should be safe (safe_string_url) - 'url': to_unicode(request.url), - 'method': request.method, - 'headers': dict(request.headers), - 'body': request.body, - 'cookies': request.cookies, - 'meta': request.meta, - '_encoding': request._encoding, - 'priority': request.priority, - 'dont_filter': request.dont_filter, - # callback/errback are assumed to be a bound instance of the spider - 'callback': None if request.callback is None else request.callback.__name__, - 'errback': None if request.errback is None else request.errback.__name__, - } - return req_dict - def find_item(self): ''' Finds an item from the throttled queues @@ -504,50 +483,47 @@ def next_request(self): if item: self.logger.debug(u"Found url to crawl {url}" \ .format(url=item['url'])) - try: - req = Request(item['url']) - except ValueError: - # need absolute url - # need better url validation here - req = Request('http://' + item['url']) - - try: - if 'callback' in item and item['callback'] is not None: - req.callback = getattr(self.spider, item['callback']) - except AttributeError: - self.logger.warn("Unable to find callback method") - - try: - if 'errback' in item and item['errback'] is not None: - req.errback = getattr(self.spider, item['errback']) - except AttributeError: - self.logger.warn("Unable to find errback method") - if 'meta' in item: - item = item['meta'] - - # defaults not in schema - if 'curdepth' not in item: - item['curdepth'] = 0 - if "retry_times" not in item: - item['retry_times'] = 0 - - for key in list(item.keys()): - req.meta[key] = item[key] + # item is a serialized request + req = request_from_dict(item) + else: + # item is feeded from outside and is not a serialized request, + # parse it manually + req = self.request_from_feed(item) # extra check to add items to request - if 'useragent' in item and item['useragent'] is not None: - req.headers['User-Agent'] = item['useragent'] - if 'cookie' in item and item['cookie'] is not None: - if isinstance(item['cookie'], dict): - req.cookies = item['cookie'] - elif 
isinstance(item['cookie'], basestring): - req.cookies = self.parse_cookie(item['cookie']) + if 'useragent' in req.meta and req.meta['useragent'] is not None: + req.headers['User-Agent'] = req.meta['useragent'] return req return None + def request_from_feed(self, item): + try: + req = Request(item['url']) + except ValueError: + # need absolute url + # need better url validation here + req = Request('http://' + item['url']) + + # defaults not in schema + if 'curdepth' not in item: + item['curdepth'] = 0 + if "retry_times" not in item: + item['retry_times'] = 0 + + for key in list(item.keys()): + req.meta[key] = item[key] + + # extra check to add items to request + if 'cookie' in item and item['cookie'] is not None: + if isinstance(item['cookie'], dict): + req.cookies = item['cookie'] + elif isinstance(item['cookie'], basestring): + req.cookies = self.parse_cookie(item['cookie']) + return req + def parse_cookie(self, string): ''' Parses a cookie string like returned in a Set-Cookie header diff --git a/crawler/requirements.txt b/crawler/requirements.txt index 58214803..172a6aae 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -1,40 +1,40 @@ -attrs==16.3.0 # Updated from 16.1.0 -cffi==1.9.1 # Updated from 1.7.0 +attrs==17.2.0 # Updated from 16.3.0 +cffi==1.10.0 # Updated from 1.9.1 ConcurrentLogHandler==0.9.1 -cryptography==1.8.1 # Updated from 1.5 -cssselect==1.0.1 # Updated from 0.9.2 +cryptography==2.0.3 # Updated from 1.8.1 +cssselect==1.0.1 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 # Updated from 0.15.2 -idna==2.5 # Updated from 2.1 -ipaddress==1.0.18 # Updated from 1.0.16 -kafka-python==1.3.3 # Updated from 1.3.2 -kazoo==2.2.1 -lxml==3.7.3 # Updated from 3.6.4 +future==0.16.0 +idna==2.6 # Updated from 2.5 +ipaddress==1.0.18 +kafka-python==1.3.4 # Updated from 1.3.3 +kazoo==2.4.0 # Updated from 2.2.1 +lxml==3.8.0 # Updated from 3.7.3 mock==2.0.0 nose==1.3.7 -parsel==1.1.0 # Updated from 1.0.3 -pbr==2.0.0 # Updated from 1.10.0 -pyasn1==0.2.3 # Updated from 0.1.9 -pyasn1-modules==0.0.8 -pycparser==2.17 # Updated from 2.14 +parsel==1.2.0 # Updated from 1.1.0 +pbr==3.1.1 # Updated from 2.0.0 +pyasn1==0.3.2 # Updated from 0.2.3 +pyasn1-modules==0.0.11 # Updated from 0.0.8 +pycparser==2.18 # Updated from 2.17 PyDispatcher==2.0.5 -pyOpenSSL==16.2.0 # Updated from 16.1.0 -python-json-logger==0.1.7 # Updated from 0.1.5 +pyOpenSSL==17.2.0 # Updated from 16.2.0 +python-json-logger==0.1.8 # Updated from 0.1.7 PyYAML==3.12 queuelib==1.4.2 redis==2.10.5 -requests==2.13.0 # Updated from 2.11.1 -requests-file==1.4.1 # Updated from 1.4 +requests==2.18.3 # Updated from 2.13.0 +requests-file==1.4.2 # Updated from 1.4.1 retrying==1.3.3 -Scrapy==1.3.3 +Scrapy==1.4.0 # Updated from 1.3.3 ../utils # scutils==1.3.0dev0 -service-identity==16.0.0 +service-identity==17.0.0 # Updated from 16.0.0 six==1.10.0 -testfixtures==4.13.5 # Updated from 4.10.0 -tldextract==2.0.2 # Updated from 2.0.1 -Twisted==17.1.0 # Updated from 16.4.0 +testfixtures==5.1.1 # Updated from 4.13.5 +tldextract==2.1.0 # Updated from 2.0.2 +Twisted==17.5.0 # Updated from 17.1.0 ujson==1.35 -w3lib==1.17.0 # Updated from 1.16.0 -zope.interface==4.3.3 # Updated from 4.2.0 +w3lib==1.18.0 # Updated from 1.17.0 +zope.interface==4.4.2 # Updated from 4.3.3 # Generated with piprot 0.9.7 diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index 75db7b18..aaf3e42a 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -7,6 +7,7 
@@ from mock import MagicMock from crawling.distributed_scheduler import DistributedScheduler from scrapy.http import Request +from scrapy.utils.reqser import request_to_dict from scutils.redis_throttled_queue import RedisThrottledQueue @@ -139,6 +140,21 @@ def test_find_item(self): self.assertEqual(self.scheduler.find_item(), None) # should also not raise exception +class TestDistributedSchedulerRequestFromFeed(ThrottleMixin, TestCase): + def test_request_from_feed(self): + self.req = self.get_request() + feed = { + "url": "http://ex.com", + "crawlid": "abc123", + "appid": "myapp", + "spiderid": "link", + } + out = self.scheduler.request_from_feed(feed) + self.assertEquals(out.url, 'http://ex.com') + for key in out.meta: + self.assertEqual(out.meta[key], self.req.meta[key]) + + class TestDistributedSchedulerNextRequest(ThrottleMixin, TestCase): @mock.patch('time.time', return_value=5) @@ -169,12 +185,26 @@ def test_next_request(self, t): except Exception as e: self.assertEqual(str(e), "ip") - # test got item - self.scheduler.find_item = MagicMock( - return_value={"url": "http://ex.com", - "crawlid": "abc123", - "appid": "myapp", - "spiderid": "link"}) + # test request from feed + feed = { + "url": "http://ex.com", + "crawlid": "abc123", + "appid": "myapp", + "spiderid": "link", + } + self.scheduler.find_item = MagicMock(return_value=feed) + out = self.scheduler.next_request() + self.assertEquals(out.url, 'http://ex.com') + for key in out.meta: + self.assertEqual(out.meta[key], self.req.meta[key]) + + # test request from serialized request + exist_req = Request('http://ex.com') + exist_item = request_to_dict(exist_req) + exist_item["meta"]["crawlid"] = "abc123" + exist_item["meta"]["appid"] = "myapp" + exist_item["meta"]["spiderid"] = "link" + self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() self.assertEquals(out.url, 'http://ex.com') for key in out.meta: From 91636fa84e36a97e28f77f73430d2cb49bc5f82e Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 18:53:29 +0800 Subject: [PATCH 02/22] update comment --- crawler/crawling/distributed_scheduler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 706ebb36..16ef7c27 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -487,8 +487,7 @@ def next_request(self): # item is a serialized request req = request_from_dict(item) else: - # item is feeded from outside and is not a serialized request, - # parse it manually + # item is a feed from outside, parse it manually req = self.request_from_feed(item) # extra check to add items to request From b6824bc9e86d18ebfee56be573944d9d76233e5b Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 19:45:46 +0800 Subject: [PATCH 03/22] fix error when converting dict to request in scheduler --- crawler/crawling/distributed_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 16ef7c27..fb4fbb7d 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -485,7 +485,7 @@ def next_request(self): .format(url=item['url'])) if 'meta' in item: # item is a serialized request - req = request_from_dict(item) + req = request_from_dict(item, self.spider) else: # item is a feed from outside, parse it manually req = 
self.request_from_feed(item) From 0deaf741334a538601f9f69f83c7c6b3e73309e5 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Sun, 20 Aug 2017 14:08:30 -0400 Subject: [PATCH 04/22] Updated master requirements from crawler changes --- requirements.txt | 50 ++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a83d46a..a1e1a338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,47 +6,47 @@ Jinja2==2.9.5 # Updated from 2.8 MarkupSafe==1.0 # Updated from 0.23 PyDispatcher==2.0.5 PyYAML==3.12 -Scrapy==1.3.3 # Updated from 1.3.1 -Twisted==17.1.0 # Updated from 16.4.0 +Scrapy==1.4.0 # Updated from 1.3.3 +Twisted==17.5.0 # Updated from 17.1.0 Werkzeug==0.12.1 # Updated from 0.11.11 -attrs==16.3.0 # Updated from 16.1.0 -cffi==1.9.1 # Updated from 1.7.0 +attrs==17.2.0 # Updated from 16.3.0 +cffi==1.10.0 # Updated from 1.9.1 characteristic==14.3.0 click==6.7 # Updated from 6.6 coverage==4.3.4 # Updated from 4.0.3 -cryptography==1.8.1 # Updated from 1.5 +cryptography==2.0.3 # Updated from 1.8.1 cssselect==1.0.1 # Updated from 0.9.2 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 # Updated from 0.15.2 -idna==2.5 # Updated from 2.1 -ipaddress==1.0.18 # Updated from 1.0.16 +future==0.16.0 +idna==2.6 # Updated from 2.5 +ipaddress==1.0.18 itsdangerous==0.24 jsonschema==2.6.0 # Updated from 2.5.1 -kafka-python==1.3.3 # Updated from 1.3.2 -kazoo==2.2.1 -lxml==3.7.3 # Updated from 3.6.4 +kafka-python==1.3.4 # Updated from 1.3.3 +kazoo==2.4.0 # Updated from 2.2.1 +lxml==3.8.0 # Updated from 3.7.3 mock==2.0.0 nose==1.3.7 -parsel==1.1.0 # Updated from 1.0.3 -pbr==2.0.0 # Updated from 1.10.0 -pyOpenSSL==16.2.0 # Updated from 16.1.0 -pyasn1-modules==0.0.8 -pyasn1==0.2.3 # Updated from 0.1.9 -pycparser==2.17 # Updated from 2.14 -python-json-logger==0.1.7 # Updated from 0.1.5 +parsel==1.2.0 # Updated from 1.1.0 +pbr==3.1.1 # Updated from 2.0.0 +pyOpenSSL==17.2.0 # Updated from 16.2.0 +pyasn1==0.3.2 # Updated from 0.2.3 +pyasn1-modules==0.0.11 # Updated from 0.0.8 +pycparser==2.18 # Updated from 2.17 +python-json-logger==0.1.8 # Updated from 0.1.7 python-redis-lock==3.2.0 # Updated from 3.1.0 queuelib==1.4.2 redis==2.10.5 -requests-file==1.4.1 # Updated from 1.4 -requests==2.13.0 # Updated from 2.11.1 +requests==2.18.3 # Updated from 2.13.0 +requests-file==1.4.2 # Updated from 1.4.1 retrying==1.3.3 ./utils # scutils==1.3.0dev0 -service-identity==16.0.0 +service-identity==17.0.0 # Updated from 16.0.0 six==1.10.0 -testfixtures==4.13.5 # Updated from 4.11.0 -tldextract==2.0.2 # Updated from 2.0.1 +testfixtures==5.1.1 # Updated from 4.13.5 +tldextract==2.1.0 # Updated from 2.0.2 ujson==1.35 -w3lib==1.17.0 # Updated from 1.16.0 -zope.interface==4.3.3 # Updated from 4.2.0 +w3lib==1.18.0 # Updated from 1.17.0 +zope.interface==4.4.2 # Updated from 4.3.3 # Generated with piprot 0.9.7 \ No newline at end of file From 00ab692ce272b199990354ed6b19e3e4aca93048 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Sun, 20 Aug 2017 14:25:02 -0400 Subject: [PATCH 05/22] update changelog with new pr --- docs/topics/changelog.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/topics/changelog.rst b/docs/topics/changelog.rst index ed7d7ede..77f58f0d 100644 --- a/docs/topics/changelog.rst +++ b/docs/topics/changelog.rst @@ -12,6 +12,11 @@ Date: ??/??/???? 
- Add Python 3 support +Crawler +^^^^^^^ + +- Improved request to dictionary serialization support + Scrapy Cluster 1.2 ------------------ From 9441bdc3842aaf500116efc7fdaa86c04a8c07e0 Mon Sep 17 00:00:00 2001 From: Alex Teut Date: Fri, 22 Sep 2017 00:43:57 +0300 Subject: [PATCH 06/22] Fix REST-Kafka topic variable name in docker settings.py (#136) --- docker/rest/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/rest/settings.py b/docker/rest/settings.py index 3d295dd8..bcd47c12 100644 --- a/docker/rest/settings.py +++ b/docker/rest/settings.py @@ -27,7 +27,7 @@ def str2bool(v): KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES = 10 * 1024 * 1024 # 10MB KAFKA_CONSUMER_SLEEP_TIME = 1 -KAFKA_INCOMING_TOPIC = os.getenv('KAFKA_INCOMING_TOPIC', 'demo.incoming') +KAFKA_PRODUCER_TOPIC = os.getenv('KAFKA_PRODUCER_TOPIC', 'demo.incoming') KAFKA_PRODUCER_BATCH_LINGER_MS = 25 # 25 ms before flush KAFKA_PRODUCER_BUFFER_BYTES = 4 * 1024 * 1024 # 4MB before blocking From c4ebe9208378c9b750c82c93bde9e834defcb467 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Wed, 1 Nov 2017 15:23:09 +0000 Subject: [PATCH 07/22] Fix assert deprecations --- crawler/tests/online.py | 2 +- crawler/tests/test_distributed_scheduler.py | 10 +- crawler/tests/test_log_retry_middleware.py | 8 +- .../tests/test_meta_passthrough_middleware.py | 10 +- crawler/tests/test_redis_stats_middleware.py | 18 +-- kafka-monitor/tests/test_kafka_monitor.py | 32 ++--- kafka-monitor/tests/test_plugins.py | 10 +- redis-monitor/tests/online.py | 6 +- redis-monitor/tests/test_plugins.py | 44 +++---- redis-monitor/tests/test_redis_monitor.py | 24 ++-- rest/tests/online.py | 4 +- rest/tests/test_rest_service.py | 114 +++++++++--------- utils/tests/online.py | 6 +- utils/tests/test_log_factory.py | 4 +- utils/tests/test_redis_queue.py | 14 +-- utils/tests/test_zookeeper_watcher.py | 6 +- 16 files changed, 156 insertions(+), 156 deletions(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 3296cf02..9cead8e4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -101,7 +101,7 @@ def thread_func(): and the_dict['crawlid'] == 'abc12345': message_count += 1 - self.assertEquals(message_count, 1) + self.assertEqual(message_count, 1) def tearDown(self): keys = self.redis_conn.keys('stats:crawler:*:test-spider:*') diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index aaf3e42a..86c50662 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -60,7 +60,7 @@ def test_enqueue_request(self, t): # test request already seen self.scheduler.dupefilter.request_seen = MagicMock(return_value=True) - self.assertEquals(self.scheduler.enqueue_request(self.req), None) + self.assertEqual(self.scheduler.enqueue_request(self.req), None) # test request not expiring and queue seen self.scheduler.queue_keys = ['link:ex.com:queue'] @@ -150,7 +150,7 @@ def test_request_from_feed(self): "spiderid": "link", } out = self.scheduler.request_from_feed(feed) - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], self.req.meta[key]) @@ -194,7 +194,7 @@ def test_next_request(self, t): } self.scheduler.find_item = MagicMock(return_value=feed) out = self.scheduler.next_request() - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], 
self.req.meta[key]) @@ -206,13 +206,13 @@ def test_next_request(self, t): exist_item["meta"]["spiderid"] = "link" self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], self.req.meta[key]) # test didn't get item self.scheduler.find_item = MagicMock(return_value=None) - self.assertEquals(self.scheduler.next_request(), None) + self.assertEqual(self.scheduler.next_request(), None) class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase): diff --git a/crawler/tests/test_log_retry_middleware.py b/crawler/tests/test_log_retry_middleware.py index 2b368c73..a64fe08e 100644 --- a/crawler/tests/test_log_retry_middleware.py +++ b/crawler/tests/test_log_retry_middleware.py @@ -26,7 +26,7 @@ def test_lrm_stats_setup(self): # test nothing self.lrm._setup_stats_status_codes() - self.assertEquals([str(x) for x in self.lrm.stats_dict.keys()], ['lifetime']) + self.assertEqual([str(x) for x in self.lrm.stats_dict.keys()], ['lifetime']) # test good/bad rolling stats self.lrm.stats_dict = {} @@ -43,7 +43,7 @@ def test_lrm_stats_setup(self): # check that both keys are set up self.lrm._setup_stats_status_codes() - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.lrm.stats_dict.keys()]), sorted(good)) @@ -51,12 +51,12 @@ def test_lrm_stats_setup(self): for time_key in self.lrm.stats_dict: if time_key == 0: - self.assertEquals( + self.assertEqual( self.lrm.stats_dict[0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.lrm.stats_dict[time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) diff --git a/crawler/tests/test_meta_passthrough_middleware.py b/crawler/tests/test_meta_passthrough_middleware.py index 19406f1a..595bbcbe 100644 --- a/crawler/tests/test_meta_passthrough_middleware.py +++ b/crawler/tests/test_meta_passthrough_middleware.py @@ -33,13 +33,13 @@ def test_mpm_middleware(self): for item in self.mpm.process_spider_output(a, test_list, MagicMock()): if isinstance(item, Request): - self.assertEquals(a.meta, item.meta) + self.assertEqual(a.meta, item.meta) yield_count += 1 - self.assertEquals(yield_count, 3) + self.assertEqual(yield_count, 3) # 1 debug for the method, 1 debug for the request - self.assertEquals(self.mpm.logger.debug.call_count, 2) + self.assertEqual(self.mpm.logger.debug.call_count, 2) # test meta unchanged if already exists r = Request('http://aol.com') @@ -47,6 +47,6 @@ def test_mpm_middleware(self): for item in self.mpm.process_spider_output(a, [r], MagicMock()): # key1 value1 did not pass through, since it was already set - self.assertEquals(item.meta['key1'], 'othervalue') + self.assertEqual(item.meta['key1'], 'othervalue') # key2 was not set, therefor it passed through - self.assertEquals(item.meta['key2'], 'value2') + self.assertEqual(item.meta['key2'], 'value2') diff --git a/crawler/tests/test_redis_stats_middleware.py b/crawler/tests/test_redis_stats_middleware.py index 614d68d1..9544f27b 100644 --- a/crawler/tests/test_redis_stats_middleware.py +++ b/crawler/tests/test_redis_stats_middleware.py @@ -25,12 +25,12 @@ def test_load_stats_codes(self): # test nothing spider_name = 'link' self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals(list(self.rsm.stats_dict[spider_name]['status_codes'].keys()), []) + self.assertEqual(list(self.rsm.stats_dict[spider_name]['status_codes'].keys()), []) # test status codes only 
self.rsm.settings['STATS_RESPONSE_CODES'] = [200, 403] self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals( + self.assertEqual( sorted(self.rsm.stats_dict[spider_name]['status_codes'].keys()), sorted([200, 403])) self.assertEqual(list(self.rsm.stats_dict[spider_name]['status_codes'][200].keys()), @@ -53,10 +53,10 @@ def test_load_stats_codes(self): # check that both keys are set up self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.rsm.stats_dict[spider_name]['status_codes'][200].keys()]), sorted(good)) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.rsm.stats_dict[spider_name]['status_codes'][403].keys()]), sorted(good)) @@ -65,24 +65,24 @@ def test_load_stats_codes(self): for time_key in self.rsm.stats_dict[spider_name]['status_codes'][200]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][200][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][200][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) for time_key in self.rsm.stats_dict[spider_name]['status_codes'][403]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][403][0].key, '{k}:lifetime'.format(k=k2) ) else: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][403][time_key].key, '{k}:{t}'.format(k=k2, t=time_key) ) @@ -141,6 +141,6 @@ def test_rsm_input(self): self.rsm.process_spider_input(response, spider) # 4 calls for link, 4 calls for wandering - self.assertEquals(fake_stats.increment.call_count, 8) + self.assertEqual(fake_stats.increment.call_count, 8) diff --git a/kafka-monitor/tests/test_kafka_monitor.py b/kafka-monitor/tests/test_kafka_monitor.py index 38deb72b..6d1e2e8b 100644 --- a/kafka-monitor/tests/test_kafka_monitor.py +++ b/kafka-monitor/tests/test_kafka_monitor.py @@ -62,8 +62,8 @@ def test_load_stats_total(self): self.kafka_monitor.settings['STATS_TIMES'] = [] self.kafka_monitor._setup_stats_total(MagicMock()) - self.assertEquals(list(self.kafka_monitor.stats_dict['total'].keys()), ['lifetime']) - self.assertEquals(list(self.kafka_monitor.stats_dict['fail'].keys()), ['lifetime']) + self.assertEqual(list(self.kafka_monitor.stats_dict['total'].keys()), ['lifetime']) + self.assertEqual(list(self.kafka_monitor.stats_dict['fail'].keys()), ['lifetime']) # test good/bad rolling stats self.kafka_monitor.stats_dict = {} @@ -79,10 +79,10 @@ def test_load_stats_total(self): ] self.kafka_monitor._setup_stats_total(MagicMock()) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['total'].keys()]), sorted(good)) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['fail'].keys()]), sorted(good)) @@ -91,24 +91,24 @@ def test_load_stats_total(self): for time_key in self.kafka_monitor.stats_dict['total']: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['total'][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['total'][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) for time_key in self.kafka_monitor.stats_dict['fail']: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['fail'][0].key, '{k}:lifetime'.format(k=k2) ) else: - self.assertEquals( + self.assertEqual( 
self.kafka_monitor.stats_dict['fail'][time_key].key, '{k}:{t}'.format(k=k2, t=time_key) ) @@ -129,13 +129,13 @@ def test_load_stats_plugins(self): 'ZookeeperHandler' ] - self.assertEquals( + self.assertEqual( sorted(list(self.kafka_monitor.stats_dict['plugins'].keys())), sorted(defaults)) for key in self.kafka_monitor.plugins_dict: plugin_name = self.kafka_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( list(self.kafka_monitor.stats_dict['plugins'][plugin_name].keys()), ['lifetime']) @@ -154,13 +154,13 @@ def test_load_stats_plugins(self): self.kafka_monitor._setup_stats_plugins(MagicMock()) - self.assertEquals( + self.assertEqual( sorted(self.kafka_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.kafka_monitor.plugins_dict: plugin_name = self.kafka_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['plugins'][plugin_name].keys()]), sorted(good)) @@ -168,12 +168,12 @@ def test_load_stats_plugins(self): k1 = 'stats:kafka-monitor:{p}'.format(p=plugin_key) for time_key in self.kafka_monitor.stats_dict['plugins'][plugin_key]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['plugins'][plugin_key][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['plugins'][plugin_key][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) @@ -224,7 +224,7 @@ class a(object): self.kafka_monitor._process_messages() self.fail("Scrape not called") except AssertionError as e: - self.assertEquals("scrape", str(e)) + self.assertEqual("scrape", str(e)) # test that handler function is called for the actions message_string = "{\"uuid\":\"blah\",\"crawlid\":\"1234\"," \ @@ -237,5 +237,5 @@ class a(object): self.kafka_monitor._process_messages() self.fail("Action not called") except AssertionError as e: - self.assertEquals("action", str(e)) + self.assertEqual("action", str(e)) diff --git a/kafka-monitor/tests/test_plugins.py b/kafka-monitor/tests/test_plugins.py index 7cd946a6..96cfa5f2 100644 --- a/kafka-monitor/tests/test_plugins.py +++ b/kafka-monitor/tests/test_plugins.py @@ -56,7 +56,7 @@ def test_scrape_handler(self): handler.handle(valid) self.fail("Action not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) # check timeout is added handler.redis_conn.zadd = MagicMock() @@ -66,7 +66,7 @@ def test_scrape_handler(self): handler.handle(valid) self.fail("Expires not called") except AssertionError as e: - self.assertEquals("expires", str(e)) + self.assertEqual("expires", str(e)) def test_action_handler(self): handler = ActionHandler() @@ -85,7 +85,7 @@ def test_action_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_stats_handler(self): handler = StatsHandler() @@ -102,7 +102,7 @@ def test_stats_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_zookeeper_handler(self): handler = ZookeeperHandler() @@ -123,7 +123,7 @@ def test_zookeeper_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_bad_plugins(self): class 
ForgotSchema(BaseHandler): diff --git a/redis-monitor/tests/online.py b/redis-monitor/tests/online.py index 4db06aef..b3c67668 100644 --- a/redis-monitor/tests/online.py +++ b/redis-monitor/tests/online.py @@ -93,7 +93,7 @@ def test_process_item(self): self.redis_monitor._process_plugin(plugin) # ensure the key is gone - self.assertEquals(self.redis_monitor.redis_conn.get(key), None) + self.assertEqual(self.redis_monitor.redis_conn.get(key), None) self.redis_monitor.close() sleep(10) # now test the message was sent to kafka @@ -109,10 +109,10 @@ def test_process_item(self): pass else: the_dict = json.loads(m.value) - self.assertEquals(success, the_dict) + self.assertEqual(success, the_dict) message_count += 1 - self.assertEquals(message_count, 1) + self.assertEqual(message_count, 1) def tearDown(self): # if for some reason the tests fail, we end up falling behind on diff --git a/redis-monitor/tests/test_plugins.py b/redis-monitor/tests/test_plugins.py index 3242ae40..056f4a50 100644 --- a/redis-monitor/tests/test_plugins.py +++ b/redis-monitor/tests/test_plugins.py @@ -73,9 +73,9 @@ def setUp(self): def test_info_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'info:stuff:stuff'), ['info:stuff:stuff']) - self.assertEquals(re.findall(regex, 'info:stuff:stuff:stuff'), ['info:stuff:stuff:stuff']) - self.assertEquals(re.findall(regex, 'info:stuff'), []) + self.assertEqual(re.findall(regex, 'info:stuff:stuff'), ['info:stuff:stuff']) + self.assertEqual(re.findall(regex, 'info:stuff:stuff:stuff'), ['info:stuff:stuff:stuff']) + self.assertEqual(re.findall(regex, 'info:stuff'), []) def test_info_get_bin(self): v1 = "stuff" @@ -83,7 +83,7 @@ def test_info_get_bin(self): v2 = 200 self.plugin.redis_conn.zscan_iter = MagicMock(return_value=[(v1, v2)]) ret_val = self.plugin._get_bin('key') - self.assertEquals(ret_val, {-200: ['stuff']}) + self.assertEqual(ret_val, {-200: ['stuff']}) def test_info_get_crawlid(self): master = {} @@ -124,7 +124,7 @@ def test_info_get_crawlid(self): 'uuid': 'ABC123' } - self.assertEquals(result, success) + self.assertEqual(result, success) def test_info_get_appid(self): master = {} @@ -165,7 +165,7 @@ def test_info_get_appid(self): 'expires': 10 }}} - self.assertEquals(result, success) + self.assertEqual(result, success) class TestStopPlugin(TestCase, RegexFixer): @@ -176,11 +176,11 @@ def setUp(self): def test_stop_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'stop:spider:app:crawl'), + self.assertEqual(re.findall(regex, 'stop:spider:app:crawl'), ['stop:spider:app:crawl']) - self.assertEquals(re.findall(regex, 'stop:spider:app'), + self.assertEqual(re.findall(regex, 'stop:spider:app'), ['stop:spider:app']) - self.assertEquals(re.findall(regex, 'stop:stuff'), []) + self.assertEqual(re.findall(regex, 'stop:stuff'), []) def test_stop_monitor_mini_purge(self): self.plugin.redis_conn.scan_iter = MagicMock(return_value=['link:istresearch.com:queue']) @@ -189,7 +189,7 @@ def test_stop_monitor_mini_purge(self): ['{"crawlid":"crawl", "appid":"foo"}'], ]) - self.assertEquals(self.plugin._mini_purge("link", "app", "crawl"), 1) + self.assertEqual(self.plugin._mini_purge("link", "app", "crawl"), 1) class TestExpirePlugin(TestCase, RegexFixer): @@ -200,9 +200,9 @@ def setUp(self): def test_stop_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'timeout:blah1:blah2:bla3'), + self.assertEqual(re.findall(regex, 'timeout:blah1:blah2:bla3'), 
['timeout:blah1:blah2:bla3']) - self.assertEquals(re.findall(regex, 'timeout:blah1:blah2'), []) + self.assertEqual(re.findall(regex, 'timeout:blah1:blah2'), []) def test_expire_monitor_time(self): # if the stop monitor passes then this is just testing whether @@ -222,7 +222,7 @@ def test_expire_monitor_time(self): self.plugin.handle("key:stuff:blah:blah", 4) self.fail("Expire not called") except BaseException as e: - self.assertEquals("throw once", str(e)) + self.assertEqual("throw once", str(e)) class TestStatsPlugin(TestCase, RegexFixer): def setUp(self): @@ -233,16 +233,16 @@ def setUp(self): def test_stats_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'statsrequest:crawler:testApp'), + self.assertEqual(re.findall(regex, 'statsrequest:crawler:testApp'), ['statsrequest:crawler:testApp']) - self.assertEquals(re.findall(regex, 'statsrequest:crawler'), []) + self.assertEqual(re.findall(regex, 'statsrequest:crawler'), []) def _assert_thrown(self, key, equals): try: self.plugin.handle(key, 'blah') self.fail(equals + " exception not thrown") except Exception as e: - self.assertEquals(equals, str(e)) + self.assertEqual(equals, str(e)) def test_stats_handle(self): # trying to make sure that everything is called @@ -291,7 +291,7 @@ def side_effect(*args): } } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_machine(self): # tests stats on three different machines, with different spiders @@ -313,7 +313,7 @@ def test_stats_get_machine(self): 'host3': {'200': {'86400': 5}} } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_queue(self): # tests stats on three different machines, with different spiders @@ -355,7 +355,7 @@ def ret_val(*args): } } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_plugin(self): self.plugin.redis_conn.keys = MagicMock(return_value=[ @@ -374,7 +374,7 @@ def test_stats_get_plugin(self): "total": {"lifetime": 5, "3600": 5}, "fail": {"68000": 5, "3600": 5} } - self.assertEquals(result, good) + self.assertEqual(result, good) class TestZookeeperPlugin(TestCase, RegexFixer): def setUp(self): @@ -389,9 +389,9 @@ def setUp(self): def test_zk_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'zk:blah1:blah2:bla3'), + self.assertEqual(re.findall(regex, 'zk:blah1:blah2:bla3'), ['zk:blah1:blah2:bla3']) - self.assertEquals(re.findall(regex, 'zk:blah1:blah2'), []) + self.assertEqual(re.findall(regex, 'zk:blah1:blah2'), []) def test_zk_handle_du(self): # domain update diff --git a/redis-monitor/tests/test_redis_monitor.py b/redis-monitor/tests/test_redis_monitor.py index f4545014..fcea510e 100644 --- a/redis-monitor/tests/test_redis_monitor.py +++ b/redis-monitor/tests/test_redis_monitor.py @@ -64,7 +64,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Info not called") except BaseException as e: - self.assertEquals("info", str(e)) + self.assertEqual("info", str(e)) # action try: @@ -72,7 +72,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Stop not called") except BaseException as e: - self.assertEquals("stop", str(e)) + self.assertEqual("stop", str(e)) # expire try: @@ -80,7 +80,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Expire not called") except BaseException as e: - self.assertEquals("expire", str(e)) + self.assertEqual("expire", str(e)) # test that an 
exception within a handle method is caught self.redis_monitor._process_failures = MagicMock() @@ -158,13 +158,13 @@ def test_load_stats_plugins(self): 'ZookeeperMonitor' ] - self.assertEquals( + self.assertEqual( sorted(self.redis_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.redis_monitor.plugins_dict: plugin_name = self.redis_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( list(self.redis_monitor.stats_dict['plugins'][plugin_name].keys()), ['lifetime']) @@ -183,13 +183,13 @@ def test_load_stats_plugins(self): self.redis_monitor._setup_stats_plugins() - self.assertEquals( + self.assertEqual( sorted(self.redis_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.redis_monitor.plugins_dict: plugin_name = self.redis_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.redis_monitor.stats_dict['plugins'][plugin_name].keys()]), sorted(good)) @@ -197,12 +197,12 @@ def test_load_stats_plugins(self): k1 = 'stats:redis-monitor:{p}'.format(p=plugin_key) for time_key in self.redis_monitor.stats_dict['plugins'][plugin_key]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.redis_monitor.stats_dict['plugins'][plugin_key][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.redis_monitor.stats_dict['plugins'][plugin_key][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) @@ -216,7 +216,7 @@ def test_main_loop(self): self.redis_monitor._main_loop() self.fail("_process_plugin not called") except BaseException as e: - self.assertEquals("normal", str(e)) + self.assertEqual("normal", str(e)) def test_precondition(self): self.redis_monitor.stats_dict = {} @@ -235,12 +235,12 @@ def test_precondition(self): self.redis_monitor._process_key_val(instance, key, value) self.fail('handler not called') except BaseException as e: - self.assertEquals('handler', str(e)) + self.assertEqual('handler', str(e)) def test_get_fail_key(self): key = 'test' result = 'lock:test:failures' - self.assertEquals(self.redis_monitor._get_fail_key(key), result) + self.assertEqual(self.redis_monitor._get_fail_key(key), result) def test_process_failures(self): self.redis_monitor.settings = {'RETRY_FAILURES':True, diff --git a/rest/tests/online.py b/rest/tests/online.py index fb568618..6bdecdb2 100644 --- a/rest/tests/online.py +++ b/rest/tests/online.py @@ -39,10 +39,10 @@ def test_status(self): r = requests.get('http://127.0.0.1:{p}'.format(p=self.port_number)) results = r.json() - self.assertEquals(results['node_health'], 'GREEN') + self.assertEqual(results['node_health'], 'GREEN') def tearDown(self): self.rest_service.close() if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/rest/tests/test_rest_service.py b/rest/tests/test_rest_service.py index 2f2772f7..01ca12df 100644 --- a/rest/tests/test_rest_service.py +++ b/rest/tests/test_rest_service.py @@ -61,7 +61,7 @@ def test_load_schemas_bad(self): @mock.patch('six.moves.builtins.open', mock_open(read_data='{\"stuff\":\"value\"}'), create=True) def test_load_schemas_bad(self): self.rest_service._load_schemas() - self.assertEquals(self.rest_service.schemas, + self.assertEqual(self.rest_service.schemas, {'hey2': {u'stuff': u'value'}}) def test_process_messages(self): @@ -110,7 +110,7 @@ class a(object): m.value = message_string messages = [m] self.rest_service._process_messages() - 
self.assertEquals(self.rest_service.uuids, {'abc123': {u'uuid': u'abc123'}}) + self.assertEqual(self.rest_service.uuids, {'abc123': {u'uuid': u'abc123'}}) def test_send_result_to_redis(self): # test not connected @@ -216,8 +216,8 @@ def test_setup_kafka(self): self.rest_service._setup_kafka() except: pass - self.assertEquals(self.rest_service.consumer, None) - self.assertEquals(self.rest_service.producer, None) + self.assertEqual(self.rest_service.consumer, None) + self.assertEqual(self.rest_service.producer, None) # test if everything flows through self.rest_service._create_consumer = MagicMock() @@ -232,7 +232,7 @@ def test_create_ret_object(self): "data": None, "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE), r) # success r = { @@ -240,7 +240,7 @@ def test_create_ret_object(self): "data": None, "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS), r) # data r = { @@ -248,7 +248,7 @@ def test_create_ret_object(self): "data": 'blah', "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS, data='blah'), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS, data='blah'), r) # error message r = { @@ -258,7 +258,7 @@ def test_create_ret_object(self): "message": 'err' } } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, error=True, error_message='err'), r) @@ -271,7 +271,7 @@ def test_create_ret_object(self): "cause": "the cause" } } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, error=True, error_message='err', error_cause="the cause"), r) @@ -306,7 +306,7 @@ def test_close(self): self.rest_service._close_thread = MagicMock() self.rest_service.close() - self.assertEquals(self.rest_service._close_thread.call_count, 4) + self.assertEqual(self.rest_service._close_thread.call_count, 4) self.assertTrue(self.rest_service.closed) self.assertTrue(self.rest_service.consumer.close.called) self.assertTrue(self.rest_service.producer.close.called) @@ -314,19 +314,19 @@ def test_close(self): def test_calculate_health(self): self.rest_service.redis_connected = False self.rest_service.kafka_connected = False - self.assertEquals(self.rest_service._calculate_health(), "RED") + self.assertEqual(self.rest_service._calculate_health(), "RED") self.rest_service.redis_connected = True self.rest_service.kafka_connected = False - self.assertEquals(self.rest_service._calculate_health(), "YELLOW") + self.assertEqual(self.rest_service._calculate_health(), "YELLOW") self.rest_service.redis_connected = False self.rest_service.kafka_connected = True - self.assertEquals(self.rest_service._calculate_health(), "YELLOW") + self.assertEqual(self.rest_service._calculate_health(), "YELLOW") self.rest_service.redis_connected = True self.rest_service.kafka_connected = True - self.assertEquals(self.rest_service._calculate_health(), "GREEN") + self.assertEqual(self.rest_service._calculate_health(), "GREEN") def test_feed_to_kafka(self): self.rest_service.producer = MagicMock() @@ -352,7 
+352,7 @@ def test_log_call(self): override.test_log_call() self.assertTrue(override.logger.info.called) - self.assertEquals(override.logger.info.call_args[0][0], "test logger") + self.assertEqual(override.logger.info.call_args[0][0], "test logger") def test_error_catch(self): override = Override('settings.py') @@ -363,7 +363,7 @@ def test_error_catch(self): override.logger.error = MagicMock() results = override.test_error1() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], "Uncaught Exception Thrown") d = { u'data': None, @@ -373,8 +373,8 @@ def test_error_catch(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test normal response with self.rest_service.app.test_request_context(): @@ -382,8 +382,8 @@ def test_error_catch(self): results = override.test_error2() self.assertFalse(override.logger.error.called) data = json.loads(results[0].data) - self.assertEquals(data, 'test data') - self.assertEquals(results[1], 200) + self.assertEqual(data, 'test data') + self.assertEqual(results[1], 200) # test normal response with alternate response code with self.rest_service.app.test_request_context(): @@ -391,8 +391,8 @@ def test_error_catch(self): results = override.test_error3() self.assertFalse(override.logger.error.called) data = json.loads(results[0].data) - self.assertEquals(data, 'test data') - self.assertEquals(results[1], 109) + self.assertEqual(data, 'test data') + self.assertEqual(results[1], 109) def test_validate_json(self): override = Override('settings.py') @@ -404,7 +404,7 @@ def test_validate_json(self): content_type='application/json'): results = override.test_json() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], 'The payload must be valid JSON.') d = { @@ -415,8 +415,8 @@ def test_validate_json(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # no json data = '["a list", ashdasd ,\\ !]' @@ -424,7 +424,7 @@ def test_validate_json(self): self.rest_service.logger.error.reset_mock() results = override.test_json() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], 'The payload must be valid JSON.') d = { @@ -435,8 +435,8 @@ def test_validate_json(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # good json data = '["a list", "2", "3"]' @@ -445,7 +445,7 @@ def test_validate_json(self): override.logger.reset_mock() results = override.test_json() self.assertFalse(override.logger.error.called) - self.assertEquals(results, 'data') + self.assertEqual(results, 'data') def test_validate_schema(self): override = Override('settings.py') @@ -473,7 +473,7 @@ def test_validate_schema(self): content_type='application/json'): results = override.test_schema() self.assertFalse(override.logger.error.called) - self.assertEquals(results, 'data') + self.assertEqual(results, 'data') # invalid schema data = u'{"value": "data here", 
"otherkey": "bad data"}' @@ -481,7 +481,7 @@ def test_validate_schema(self): content_type='application/json'): results = override.test_schema() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], "Invalid Schema") if six.PY3: @@ -497,8 +497,8 @@ def test_validate_schema(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # Routes ------------------ @@ -515,7 +515,7 @@ def test_index(self): "node_health": 'RED' } data = json.loads(results[0].data) - self.assertEquals(data, d) + self.assertEqual(data, d) def test_feed(self): # test not connected @@ -534,8 +534,8 @@ def test_feed(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # connected self.rest_service.kafka_connected = True @@ -554,8 +554,8 @@ def test_feed(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test no uuid self.rest_service._feed_to_kafka = MagicMock(return_value=True) @@ -568,8 +568,8 @@ def test_feed(self): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) # test with uuid, got response time_list = [0, 1, 2, 3, 4, 5] @@ -590,8 +590,8 @@ def fancy_get_time(): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) self.assertFalse('key' in self.rest_service.uuids) # test with uuid, no response @@ -609,10 +609,10 @@ def fancy_get_time2(): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) self.assertTrue('key' in self.rest_service.uuids) - self.assertEquals(self.rest_service.uuids['key'], 'poll') + self.assertEqual(self.rest_service.uuids['key'], 'poll') def test_poll(self): orig = self.rest_service.validator @@ -635,8 +635,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test connected found poll key self.rest_service.redis_conn = MagicMock() @@ -651,8 +651,8 @@ def test_poll(self): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) # test connected didnt find poll key self.rest_service.redis_conn.get = MagicMock(return_value=None) @@ -668,8 +668,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 404) + self.assertEqual(data, d) + self.assertEqual(results[1], 404) # test connection error self.rest_service._spawn_redis_connection_thread = MagicMock() @@ -679,7 +679,7 @@ def test_poll(self): self.rest_service.redis_conn.get = MagicMock(side_effect=ConnectionError) results = 
self.rest_service.poll() self.assertTrue(self.rest_service.logger.error.called) - self.assertEquals(self.rest_service.logger.error.call_args[0][0], "Lost connection to Redis") + self.assertEqual(self.rest_service.logger.error.call_args[0][0], "Lost connection to Redis") self.assertTrue(self.rest_service._spawn_redis_connection_thread.called) d = { @@ -690,8 +690,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test value error self.rest_service.logger.warning = MagicMock() @@ -700,7 +700,7 @@ def test_poll(self): self.rest_service.redis_conn.get = MagicMock(side_effect=ValueError) results = self.rest_service.poll() self.assertTrue(self.rest_service.logger.warning.called) - self.assertEquals(self.rest_service.logger.warning.call_args[0][0], "Unparseable JSON Received from redis") + self.assertEqual(self.rest_service.logger.warning.call_args[0][0], "Unparseable JSON Received from redis") d = { u'data': None, @@ -710,8 +710,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) self.rest_service.validator = orig diff --git a/utils/tests/online.py b/utils/tests/online.py index ffa06be7..2ab0930d 100644 --- a/utils/tests/online.py +++ b/utils/tests/online.py @@ -405,10 +405,10 @@ def test_get_file_contents(self): ensure=False, valid_init=True) - self.assertEquals(self.zoo_watcher.get_file_contents(), self.file_data) - self.assertEquals(pointer_zoo_watcher.get_file_contents(), + self.assertEqual(self.zoo_watcher.get_file_contents(), self.file_data) + self.assertEqual(pointer_zoo_watcher.get_file_contents(), self.file_data) - self.assertEquals(pointer_zoo_watcher.get_file_contents(True), + self.assertEqual(pointer_zoo_watcher.get_file_contents(True), self.pointer_data) pointer_zoo_watcher.close() diff --git a/utils/tests/test_log_factory.py b/utils/tests/test_log_factory.py index 608af9d9..204cf7cb 100644 --- a/utils/tests/test_log_factory.py +++ b/utils/tests/test_log_factory.py @@ -313,8 +313,8 @@ def test_preserve_data(self): extras = {"key": "value", 'a': [1, 2, 3]} def cb(log_message=None, log_extra=None): - self.assertEquals(log_message, message) - self.assertEquals(log_extra, extras) + self.assertEqual(log_message, message) + self.assertEqual(log_extra, extras) self.logger.register_callback('>DEBUG', cb) self.logger.log_level = 'INFO' diff --git a/utils/tests/test_redis_queue.py b/utils/tests/test_redis_queue.py index 14c152f2..be2d0108 100644 --- a/utils/tests/test_redis_queue.py +++ b/utils/tests/test_redis_queue.py @@ -30,16 +30,16 @@ def test_encode(self): q = Base(MagicMock(), 'key', pickle) # python pickling is different between versions data = pickle.dumps('cool', protocol=-1).decode('latin1') - self.assertEquals(q._encode_item('cool'), data) + self.assertEqual(q._encode_item('cool'), data) q2 = Base(MagicMock(), 'key', ujson) - self.assertEquals(q2._encode_item('cool2'), '"cool2"') + self.assertEqual(q2._encode_item('cool2'), '"cool2"') def test_decode(self): q = Base(MagicMock(), 'key', pickle) - self.assertEquals(q._decode_item(u"\x80\x02U\x04coolq\x00."), 'cool') + self.assertEqual(q._decode_item(u"\x80\x02U\x04coolq\x00."), 'cool') q2 = Base(MagicMock(), 'key', ujson) - self.assertEquals(q2._decode_item('"cool2"'), 'cool2') + 
self.assertEqual(q2._decode_item('"cool2"'), 'cool2') def test_len(self): with self.assertRaises(NotImplementedError): @@ -79,7 +79,7 @@ def test_push(self): def test_pop(self): self.assertTrue(hasattr(self.queue, 'pop')) self.queue.server.rpop = MagicMock(return_value='"stuff"') - self.assertEquals(self.queue.pop(), "stuff") + self.assertEqual(self.queue.pop(), "stuff") class TestRedisPriorityQueue(QueueMixin, TestCase): @@ -99,7 +99,7 @@ def test_pop(self): m = MagicMock() m.execute = MagicMock(return_value=[{}, 60]) self.queue.server.pipeline = MagicMock(return_value=m) - self.assertEquals(self.queue.pop(), None) + self.assertEqual(self.queue.pop(), None) class TestRedisStack(QueueMixin, TestCase): @@ -117,4 +117,4 @@ def test_push(self): def test_pop(self): self.assertTrue(hasattr(self.queue, 'pop')) self.queue.server.lpop = MagicMock(return_value='"stuff"') - self.assertEquals(self.queue.pop(), "stuff") + self.assertEqual(self.queue.pop(), "stuff") diff --git a/utils/tests/test_zookeeper_watcher.py b/utils/tests/test_zookeeper_watcher.py index fc5b52c8..05449c6e 100644 --- a/utils/tests/test_zookeeper_watcher.py +++ b/utils/tests/test_zookeeper_watcher.py @@ -30,13 +30,13 @@ def test_get_file_contents(self): self.zoo_watcher.old_data = 'old_data' self.zoo_watcher.pointer = False - self.assertEquals(self.zoo_watcher.get_file_contents(), 'old_data') + self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data') self.zoo_watcher.pointer = True - self.assertEquals(self.zoo_watcher.get_file_contents(), 'old_data') + self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data') self.zoo_watcher.pointer = True - self.assertEquals(self.zoo_watcher.get_file_contents(True), 'old_pointed') + self.assertEqual(self.zoo_watcher.get_file_contents(True), 'old_pointed') def test_compare_pointer(self): self.zoo_watcher.old_pointed = '/path1' From 3684a9feb532ff02c3663419bad84066c20aaea3 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Mon, 6 Nov 2017 17:50:45 +0000 Subject: [PATCH 08/22] Update dmoz.org to dmoztools.net since dmoz.org now redirects. 
(#145) (#147) --- crawler/config/example.yml | 4 ++-- crawler/tests/test_distributed_scheduler.py | 2 +- docs/topics/crawler/controlling.rst | 2 +- docs/topics/crawler/extension.rst | 12 ++++++------ docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/api.rst | 12 ++++++------ redis-monitor/tests/test_plugins.py | 12 ++++++------ 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crawler/config/example.yml b/crawler/config/example.yml index f1b21897..f96f282c 100644 --- a/crawler/config/example.yml +++ b/crawler/config/example.yml @@ -1,9 +1,9 @@ domains: - dmoz.org: + dmoztools.net: window: 60 hits: 60 scale: 1.0 wikipedia.org: window: 60 hits: 30 - scale: 0.5 \ No newline at end of file + scale: 0.5 diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index 86c50662..ca22b566 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -220,7 +220,7 @@ class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase): def test_change_config(self): good_string = ""\ "domains:\n"\ - " dmoz.org:\n"\ + " dmoztools.net:\n"\ " window: 60\n"\ " hits: 60\n"\ " scale: 1.0\n"\ diff --git a/docs/topics/crawler/controlling.rst b/docs/topics/crawler/controlling.rst index 4daf1371..1b345c64 100644 --- a/docs/topics/crawler/controlling.rst +++ b/docs/topics/crawler/controlling.rst @@ -141,7 +141,7 @@ To utilize the different throttle mechanisms you can alter the following setting Combining Domain Queues and Throttling -------------------------------------- -At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoz.org:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process. +At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoztools.net:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process. This results in Spiders across the cluster continually polling all available domain queues for new requests, but only receiving requests when the throttle mechanism indicates that the request limit has not gone beyond the max desired configuration. Because the throttle coordination is conducted via Redis, it is not reliant on any one Scrapy process to determine whether the cluster can or can't crawl a particular domain. diff --git a/docs/topics/crawler/extension.rst b/docs/topics/crawler/extension.rst index 85f91380..57c33980 100644 --- a/docs/topics/crawler/extension.rst +++ b/docs/topics/crawler/extension.rst @@ -199,7 +199,7 @@ Then, feed your cluster. :: - python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}' If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kafkadump.py`` script, you will begin to see output like so... 
@@ -208,8 +208,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf { "body": , "crawlid": "test123456", - "response_url": "http://www.dmoz.org/", - "url": "http://www.dmoz.org/", + "response_url": "http://www.dmoztools.net/", + "url": "http://www.dmoztools.net/", "status_code": 200, "status_msg": "OK", "appid": "testapp", @@ -228,8 +228,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf { "body": , "crawlid": "test123456", - "response_url": "http://www.dmoz.org/Computers/Hardware/", - "url": "http://www.dmoz.org/Computers/Hardware/", + "response_url": "http://www.dmoztools.net/Computers/Hardware/", + "url": "http://www.dmoztools.net/Computers/Hardware/", "status_code": 200, "status_msg": "OK", "appid": "testapp", @@ -273,4 +273,4 @@ You can also fire up more than one crawl job at a time, and track the steps that "wandering_spider_count": 4 } -You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities. \ No newline at end of file +You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities. diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index cf63c8ea..e82506f7 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -460,7 +460,7 @@ Crawl Request: :: - python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}' Now send an ``info`` action request to see what is going on with the crawl: @@ -478,7 +478,7 @@ The following things will occur for this action request: :: - {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoz.org': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'} + {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoztools.net': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'} In this case we had 25 urls pending in the queue, so yours may be slightly different. 
diff --git a/docs/topics/kafka-monitor/api.rst b/docs/topics/kafka-monitor/api.rst index 95614f4f..cb417c2c 100644 --- a/docs/topics/kafka-monitor/api.rst +++ b/docs/topics/kafka-monitor/api.rst @@ -74,9 +74,9 @@ Kafka Request: :: - $ python kafka_monitor.py feed '{"url": "http://www.dmoz.org/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}' + $ python kafka_monitor.py feed '{"url": "http://www.dmoztools.net/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}' - - Submits a dmoz.org crawl spidering 2 levels deep with a high priority + - Submits a dmoztools.net crawl spidering 2 levels deep with a high priority :: @@ -899,7 +899,7 @@ Zookeeper Request: :: - $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoz.org", "hits":60, "window":60, "scale":0.9}' + $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoztools.net", "hits":60, "window":60, "scale":0.9}' Response from Kafka: @@ -907,7 +907,7 @@ Response from Kafka: { "action": "domain-update", - "domain": "dmoz.org", + "domain": "dmoztools.net", "server_time": 1464402128, "uuid": "abc123", "appid": "madisonTest" @@ -923,7 +923,7 @@ Zookeeper Request: :: - $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoz.org"}' + $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoztools.net"}' Response from Kafka: @@ -931,7 +931,7 @@ Response from Kafka: { "action": "domain-remove", - "domain": "dmoz.org", + "domain": "dmoztools.net", "server_time": 1464402146, "uuid": "abc123", "appid": "madisonTest" diff --git a/redis-monitor/tests/test_plugins.py b/redis-monitor/tests/test_plugins.py index 056f4a50..324b6d84 100644 --- a/redis-monitor/tests/test_plugins.py +++ b/redis-monitor/tests/test_plugins.py @@ -322,7 +322,7 @@ def test_stats_get_queue(self): 'link:istresearch.com:queue', 'link:yellowpages.com:queue', 'link:cnn.com:queue', - 'wandering:dmoz.org:queue', + 'wandering:dmoztools.net:queue', 'wandering:craigslist.org:queue', ]) results = [5, 10, 11, 1, 3] @@ -349,7 +349,7 @@ def ret_val(*args): 'spider_backlog': 4, 'num_domains': 2, 'domains': [ - {'domain': 'dmoz.org', 'backlog': 1}, + {'domain': 'dmoztools.net', 'backlog': 1}, {'domain': 'craigslist.org', 'backlog': 3}, ] } @@ -395,20 +395,20 @@ def test_zk_regex(self): def test_zk_handle_du(self): # domain update - s = b'blacklist: []\ndomains:\n dmoz.org: {hits: 60, scale: 1.0, window: 60}\n' + s = b'blacklist: []\ndomains:\n dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n' val = '{"uuid":"blah123","hits":15,"scale":0.9,"window":60}' - expected = b'blacklist: []\ndomains:\n cnn.com:\n hits: 15\n scale: 0.9\n window: 60\n dmoz.org:\n hits: 60\n scale: 1.0\n window: 60\n' + expected = b'blacklist: []\ndomains:\n cnn.com:\n hits: 15\n scale: 0.9\n window: 60\n dmoztools.net:\n hits: 60\n scale: 1.0\n window: 60\n' self.plugin.zoo_client.get = MagicMock(return_value=(s,)) self.plugin.handle(key="zk:domain-update:cnn.com:testapp", value=val) self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected) def test_zk_handle_dr(self): # domain remove - s = b'blacklist: []\ndomains:\n dmoz.org: {hits: 60, scale: 1.0, window: 60}\n' + s = b'blacklist: []\ndomains:\n dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n' val = '{"uuid":"blah123"}' expected = b'blacklist: []\ndomains: {}\n' 
self.plugin.zoo_client.get = MagicMock(return_value=(s,)) - self.plugin.handle(key="zk:domain-remove:dmoz.org:testapp", value=val) + self.plugin.handle(key="zk:domain-remove:dmoztools.net:testapp", value=val) self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected) def test_zk_handle_bu(self): From 62be326ac1db80a38a9b268e56869858c4bca4ef Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Tue, 7 Nov 2017 11:11:51 -0500 Subject: [PATCH 09/22] Fix typo Fixes #148 --- crawler/crawling/distributed_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index fb4fbb7d..58edb89c 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -82,7 +82,7 @@ def __init__(self, server, persist, update_int, timeout, retries, logger, self.ip_update_interval = ip_refresh self.add_type = add_type self.add_ip = add_ip - self.item_retires = retries + self.item_retries = retries self.logger = logger self.ip_regex = re.compile(ip_regex) self.backlog_blacklist = backlog_blacklist From 95d524772ab8d91526b80a1ea161be52ff2e8d90 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Tue, 7 Nov 2017 16:14:09 +0000 Subject: [PATCH 10/22] Trivial tense/typos (#146) * Trivial tense/typos * Update dmoz.org to dmoztools.net since dmoz.org now redirects. (#145) (#147) --- docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/design.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index e82506f7..78655b4a 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -360,7 +360,7 @@ At this point you should have a Scrapy Cluster setup that has been tested and ap .. note:: You can append ``&`` to the end of the following commands to run them in the background, but we recommend you open different terminal windows to first get a feel of how the cluster operates. -The following commands outline what you would run in a traditional environment. If using a container based solution these commands are ran when you run the container itself. +The following commands outline what you would run in a traditional environment. If using a container based solution these commands are run when you run the container itself. **Bare Bones:** @@ -425,7 +425,7 @@ Which ever setup you chose, every process within should stay running for the rem .. note:: If you chose to set the Rest service up, this section may also be performed via the :doc:`../rest/index` endpoint. You just need to ensure the JSON identified in the following section is properly fed into the :ref:`feed ` rest endpoint. -*The follwing commands can be ran from the command line, whether that is on the machine itself or inside the Kafka Monitor container depends on the setup chosen above.* +*The following commands can be run from the command line, whether that is on the machine itself or inside the Kafka Monitor container depends on the setup chosen above.* 1) We now need to feed the cluster a crawl request. This is done via the same Kafka Monitor python script, but with different command line arguements. 
diff --git a/docs/topics/kafka-monitor/design.rst b/docs/topics/kafka-monitor/design.rst index 1558d571..827e95c6 100644 --- a/docs/topics/kafka-monitor/design.rst +++ b/docs/topics/kafka-monitor/design.rst @@ -11,9 +11,9 @@ Soon enough those same applications wanted the ability to retrieve information a The Kafka Monitor reads from the desired inbound Kafka topic, and applies the currently loaded Plugin's JSON APIs to the received message. The first Plugin to have a valid `JSON Schema `_ for the received JSON object is then allowed to do its own processing and manipulation of the object. -In Scrapy Cluster's use case, the default Plugins write their requests into Redis keys, but the functionality does not stop there. The Kafka Monitor settings can alter which plugins are loaded, or add new plugins to extend functionality. These modules allow the Kafka Monitor core to have a small footprint but allow extension or different plugins to be ran. +In Scrapy Cluster's use case, the default Plugins write their requests into Redis keys, but the functionality does not stop there. The Kafka Monitor settings can alter which plugins are loaded, or add new plugins to extend functionality. These modules allow the Kafka Monitor core to have a small footprint but allow extension or different plugins to be run. -The Kafka Monitor can be ran as a single process, or part of the same Kafka consumer group spread across multiple machines and processes. This allows distributed and fault tolerant throughput to ensure the crawl requests to the cluster are always read. +The Kafka Monitor can be run as a single process, or part of the same Kafka consumer group spread across multiple machines and processes. This allows distributed and fault tolerant throughput to ensure the crawl requests to the cluster are always read. From our own internal debugging and ensuring other applications were working properly, a utility program called Kafka Dump was also created in order to be able to interact and monitor the kafka messages coming through. This is a small dump utility with no external dependencies, to allow users to get an insight into what is being passed through the Kafka topics within the cluster. From 4775c06bf7249639e3f7ff763fa600a11e57b62b Mon Sep 17 00:00:00 2001 From: innspyros <30524071+innspyros@users.noreply.github.com> Date: Thu, 9 Nov 2017 17:51:45 +0200 Subject: [PATCH 11/22] vm.max_map_count tip (#151) docker eleasticsearch container not running when the vm.max_map_count has the default value which is too low. --- docs/topics/advanced/docker.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/topics/advanced/docker.rst b/docs/topics/advanced/docker.rst index fca01884..8aa77be2 100644 --- a/docs/topics/advanced/docker.rst +++ b/docs/topics/advanced/docker.rst @@ -160,6 +160,18 @@ You can ensure everything started up via: /usr/sbin/sshd ... p, 22/tcp, 2888/tcp, 3888/tcp +TIP + +In the unfortunate case that elasticsearch is not running and the following message shows up into the logs: + +:: + + ERROR: bootstrap checks failed max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144] + +you have to edit the virtual memory settings of the machine you are running docker onto. + +For more info about the needed edits you can follow the link to the `official Elasticsearch documentation `_ . + From here, please continue to the :ref:`Kibana ` portion of the :doc:`ELK ` integration guide. 
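A quick way to check the host before starting the containers is sketched below. It is only an illustration and assumes a Linux Docker host, where the kernel exposes the live value at /proc/sys/vm/max_map_count; the 262144 floor comes from the Elasticsearch bootstrap error quoted above.

::

    # sketch: verify vm.max_map_count on a Linux Docker host
    ES_MINIMUM = 262144  # floor taken from the Elasticsearch bootstrap check above

    def current_max_map_count(path='/proc/sys/vm/max_map_count'):
        # the kernel publishes the live value as a plain integer in procfs
        with open(path) as f:
            return int(f.read().strip())

    value = current_max_map_count()
    if value < ES_MINIMUM:
        # raise it with: sudo sysctl -w vm.max_map_count=262144
        print("vm.max_map_count is %d, needs to be at least %d" % (value, ES_MINIMUM))
    else:
        print("vm.max_map_count is %d, ok" % value)

Raising the value with ``sysctl -w`` only lasts until reboot, so the persistent change belongs in ``/etc/sysctl.conf`` as the Elasticsearch documentation describes.
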
From 61d8809e5bc08596c7f96234998da4173fceb182 Mon Sep 17 00:00:00 2001 From: innspyros <30524071+innspyros@users.noreply.github.com> Date: Thu, 30 Nov 2017 20:02:21 +0200 Subject: [PATCH 12/22] Ansible kafka zookeeper host list. (#152) * vm.max_map_count tip docker eleasticsearch container not running when the vm.max_map_count has the default value which is too low. * Ansible Kafka template consumer.properties.j2 had a hardcoded zookeeper.connect value. Now the value is created from the iteration of the zookeeper_host_list (host and port). --- ansible/roles/kafka/templates/consumer.properties.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kafka/templates/consumer.properties.j2 b/ansible/roles/kafka/templates/consumer.properties.j2 index 10b7bf4d..1eb7e368 100644 --- a/ansible/roles/kafka/templates/consumer.properties.j2 +++ b/ansible/roles/kafka/templates/consumer.properties.j2 @@ -17,7 +17,7 @@ # Zookeeper connection string # comma separated host:port pairs, each corresponding to a zk # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" -zookeeper.connect=127.0.0.1:2181 +zookeeper.connect={% for host in zookeeper_host_list %}{{ host }}:{{ zookeeper_client_port|default(2181) }}{% if not loop.last %},{% endif %}{% endfor %} # timeout in ms for connecting to zookeeper zookeeper.connection.timeout.ms=6000 From bc6977ab6d8805d4c1282d01d28e349fe4ce37cd Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Fri, 5 Jan 2018 10:11:43 +0200 Subject: [PATCH 13/22] Add auto encoding resolution for crawling items. --- crawler/crawling/items.py | 1 + crawler/crawling/pipelines.py | 4 +++- crawler/crawling/spiders/link_spider.py | 1 + crawler/crawling/spiders/wandering_spider.py | 1 + kafka-monitor/kafkadump.py | 4 +++- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/crawler/crawling/items.py b/crawler/crawling/items.py index bec5f279..68f8537e 100644 --- a/crawler/crawling/items.py +++ b/crawler/crawling/items.py @@ -19,3 +19,4 @@ class RawResponseItem(Item): attrs = Field() success = Field() exception = Field() + encoding = Field() diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 5baa710a..653a9071 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,7 +181,9 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: - if self.use_base64: + if self.use_base64 and 'encoding' in datum: + datum['body'] = base64.b64encode(bytes(datum['body'])) + elif self.use_base64: datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8')) message = ujson.dumps(datum, sort_keys=True) except: diff --git a/crawler/crawling/spiders/link_spider.py b/crawler/crawling/spiders/link_spider.py index 32af800b..289c6a63 100644 --- a/crawler/crawling/spiders/link_spider.py +++ b/crawler/crawling/spiders/link_spider.py @@ -40,6 +40,7 @@ def parse(self, response): item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = response.body + item["encoding"] = response.encoding item["links"] = [] # determine whether to continue spidering diff --git a/crawler/crawling/spiders/wandering_spider.py b/crawler/crawling/spiders/wandering_spider.py index 7b856769..66084569 100644 --- a/crawler/crawling/spiders/wandering_spider.py +++ b/crawler/crawling/spiders/wandering_spider.py @@ -46,6 +46,7 @@ def parse(self, response): item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers 
item["body"] = response.body + item["encoding"] = response.encoding item["links"] = [] # we want to know how far our spider gets if item['attrs'] is None: diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index ff77cc29..9b4b9061 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -126,7 +126,9 @@ def main(): val = message.value try: item = json.loads(val) - if args['decode_base64'] and 'body' in item: + if args['decode_base64'] and 'body' in item and 'encoding' in item: + item['body'] = base64.b64decode(item['body']).decode(item['encoding']) + elif args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: From 580de6f067014c3851d3a95f51d9d6df2d991950 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Mon, 15 Jan 2018 16:20:29 +0200 Subject: [PATCH 14/22] Automatic discovery of html encoding --- crawler/crawling/pipelines.py | 11 +++-- crawler/tests/online.py | 2 +- crawler/tests/test_pipelines.py | 76 +++++++++++++++++++++++++++++++-- kafka-monitor/kafkadump.py | 14 ++++-- rest/tests/test_rest_service.py | 34 +++++++-------- 5 files changed, 109 insertions(+), 28 deletions(-) diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 653a9071..99b9a0f2 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,10 +181,13 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: - if self.use_base64 and 'encoding' in datum: - datum['body'] = base64.b64encode(bytes(datum['body'])) - elif self.use_base64: - datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8')) + if self.use_base64: + if isinstance(datum['body'], str): + datum['body'] = bytes(datum['body'], datum['encoding']) + datum['body'] = base64.b64encode(datum['body']) + elif 'encoding' in datum and 'utf-8' != datum['encoding'] and datum['encoding']: + datum['body'] = datum['body'].decode(datum['encoding']) + message = ujson.dumps(datum, sort_keys=True) except: message = 'json failed to parse' diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 9cead8e4..ecf14af4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -38,7 +38,7 @@ class TestLinkSpider(TestCase): "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ - "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" + "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" def setUp(self): self.settings = get_project_settings() diff --git a/crawler/tests/test_pipelines.py b/crawler/tests/test_pipelines.py index e6b54ac2..69a73971 100644 --- a/crawler/tests/test_pipelines.py +++ b/crawler/tests/test_pipelines.py @@ -1,6 +1,9 @@ +# coding=utf-8 from builtins import object from unittest import TestCase import mock +import ujson +import base64 from mock import MagicMock from crawling.pipelines import (LoggingBeforePipeline, KafkaPipeline) from crawling.items import RawResponseItem @@ -24,6 +27,43 @@ def _get_item(self): item["request_headers"] = {} item["body"] = "text" item["links"] = [] + item["encoding"] = "utf-8" + + return item + + def _get_internationalized_utf8_item(self): + item = RawResponseItem() + item['appid'] = 'app' + item['crawlid'] = 'crawlid' + item['attrs'] = {} + item["url"] = "http://dumb.com" + item["response_url"] = "http://dumb.com" + item["status_code"] = 200 + 
item["status_msg"] = "OK" + item["response_headers"] = {} + item["request_headers"] = {} + item["body"] = u"This is a test - Αυτό είναι ένα τεστ - 这是一个测试 - これはテストです" + item["links"] = [] + item["encoding"] = "utf-8" + + return item + + def _get_internationalized_iso_item(self): + item = RawResponseItem() + item['appid'] = 'app' + item['crawlid'] = 'crawlid' + item['attrs'] = {} + item["url"] = "http://dumb.com" + item["response_url"] = "http://dumb.com" + item["status_code"] = 200 + item["status_msg"] = "OK" + item["response_headers"] = {} + item["request_headers"] = {} + # Fill the item["body"] with the string 'αυτό είναι ένα τεστ' that was encoded in iso-8859-7 + # using iconv and further encoded in base64 in order to store it inside this file. + item["body"] = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=') + item["links"] = [] + item["encoding"] = "iso-8859-7" return item @@ -75,7 +115,7 @@ def test_process_item(self, e): # test normal send, no appid topics self.pipe.process_item(item, spider) - expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', expected) self.pipe.producer.send.reset_mock() @@ -93,10 +133,40 @@ def test_process_item(self, e): self.pipe.appid_topics = False self.pipe.use_base64 = True self.pipe.process_item(item, spider) - expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' - self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', + expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', expected) + # test base64 encode/decode with utf-8 encoding + item = self._get_internationalized_utf8_item() + self.pipe.appid_topics = False + self.pipe.use_base64 = True + self.pipe.process_item(item, spider) + expected = '{"appid":"app","attrs":{},"body":"VGhpcyBpcyBhIHRlc3QgLSDOkc+Fz4TPjCDOtc6vzr3Osc65IM6tzr3OsSDPhM61z4PPhCAtIOi\\/meaYr+S4gOS4qua1i+ivlSAtIOOBk+OCjOOBr+ODhuOCueODiOOBp+OBmQ==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', + expected) + # unpack the arguments used for the previous assertion call + call_args, call_kwargs = self.pipe.producer.send.call_args + crawl_args_dict = ujson.loads(call_args[1]) + decoded_string = base64.b64decode(crawl_args_dict['body']).decode(crawl_args_dict['encoding']) + self.assertEquals(decoded_string, item.get('body')) + + # test base64 encode/decode with 
iso encoding + item = self._get_internationalized_iso_item() + self.pipe.appid_topics = False + self.pipe.use_base64 = True + self.pipe.process_item(item, spider) + expected = '{"appid":"app","attrs":{},"body":"4fX0\\/CDl3+3h6SDd7eEg9OXz9Ao=","crawlid":"crawlid","encoding":"iso-8859-7","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', + expected) + # unpack the arguments used for the previous assertion call + call_args, call_kwargs = self.pipe.producer.send.call_args + crawl_args_dict = ujson.loads(call_args[1]) + decoded_string = base64.b64decode(crawl_args_dict['body']).decode(crawl_args_dict['encoding']) + self.assertEquals(decoded_string, item.get('body').decode(item.get('encoding'))) + # Test again against the original (before it was encoded in iso) string + self.assertEquals(decoded_string, u"αυτό είναι ένα τεστ\n") + # test kafka exception item = self._get_item() copy = deepcopy(item) diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index 9b4b9061..1db8d797 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -10,6 +10,7 @@ import time import argparse import base64 +import six from scutils.settings_wrapper import SettingsWrapper from scutils.log_factory import LogFactory @@ -56,6 +57,10 @@ def main(): required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") + dump_parser.add_argument('-jb', '--just-body', action='store_const', + required=False, const=True, default=False, + help="Just print the raw html 'body' key in" + " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") @@ -126,10 +131,8 @@ def main(): val = message.value try: item = json.loads(val) - if args['decode_base64'] and 'body' in item and 'encoding' in item: + if args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']).decode(item['encoding']) - elif args['decode_base64'] and 'body' in item: - item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: del item['body'] @@ -140,6 +143,11 @@ def main(): if args['pretty']: print(json.dumps(item, indent=4)) + elif args['just_body']: + if six.PY2: + print(item['body'].encode('utf-8')) + else: + print(item['body']) else: print(item) num_records = num_records + 1 diff --git a/rest/tests/test_rest_service.py b/rest/tests/test_rest_service.py index 01ca12df..d2f1293a 100644 --- a/rest/tests/test_rest_service.py +++ b/rest/tests/test_rest_service.py @@ -372,7 +372,7 @@ def test_error_catch(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -381,7 +381,7 @@ def test_error_catch(self): override.logger.error.reset_mock() results = override.test_error2() self.assertFalse(override.logger.error.called) - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, 'test data') self.assertEqual(results[1], 200) @@ -390,7 +390,7 @@ def test_error_catch(self): override.logger.error.reset_mock() results = override.test_error3() self.assertFalse(override.logger.error.called) - data = json.loads(results[0].data) + data = 
json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, 'test data') self.assertEqual(results[1], 109) @@ -414,7 +414,7 @@ def test_validate_json(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -434,7 +434,7 @@ def test_validate_json(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -496,7 +496,7 @@ def test_validate_schema(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -514,7 +514,7 @@ def test_index(self): "my_id": 'a908', "node_health": 'RED' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) def test_feed(self): @@ -533,7 +533,7 @@ def test_feed(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -553,7 +553,7 @@ def test_feed(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -567,7 +567,7 @@ def test_feed(self): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) @@ -589,7 +589,7 @@ def fancy_get_time(): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) self.assertFalse('key' in self.rest_service.uuids) @@ -608,7 +608,7 @@ def fancy_get_time2(): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) self.assertTrue('key' in self.rest_service.uuids) @@ -634,7 +634,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -650,7 +650,7 @@ def test_poll(self): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) @@ -667,7 +667,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 404) @@ -689,7 +689,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -709,7 +709,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) From d0725318b767b37a77b6bffe9a456eae27249d65 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Mon, 15 Jan 2018 17:49:44 +0200 Subject: [PATCH 15/22] Cleanup utf-8 logic --- 
crawler/crawling/pipelines.py | 10 ++++++++-- kafka-monitor/kafkadump.py | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 99b9a0f2..b6352917 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,11 +181,17 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: + # Get the encoding. If it's not a key of datum, return utf-8 + encoding = datum.get('encoding', 'utf-8') + if self.use_base64: + # When running in Python 2 datum['body'] is a string if isinstance(datum['body'], str): - datum['body'] = bytes(datum['body'], datum['encoding']) + datum['body'] = bytes(datum['body'], encoding) + # In Python 3 datum['body'] is already in byte form datum['body'] = base64.b64encode(datum['body']) - elif 'encoding' in datum and 'utf-8' != datum['encoding'] and datum['encoding']: + + elif 'utf-8' != encoding: datum['body'] = datum['body'].decode(datum['encoding']) message = ujson.dumps(datum, sort_keys=True) diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index 1db8d797..20e68a95 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -131,8 +131,11 @@ def main(): val = message.value try: item = json.loads(val) + # Get the encoding. If it's not a key of item, return utf-8. + encoding = item.get('encoding', 'utf-8') + if args['decode_base64'] and 'body' in item: - item['body'] = base64.b64decode(item['body']).decode(item['encoding']) + item['body'] = base64.b64decode(item['body']).decode(encoding) if args['no_body'] and 'body' in item: del item['body'] From 19470761051a54f1bd4d6421dd1856bff1be52a1 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Wed, 17 Jan 2018 16:56:18 +0200 Subject: [PATCH 16/22] Fix spider name in crawler's online tests (#161) Because of the wrong spider name, a wrong redis directory ended up containing the queue key. This way the spider was not crawling the provided link. 
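For context on the key mentioned above: spider queues in Redis follow the ``<spiderid>:<domain>:queue`` pattern, so a wrong spider name lands the request under a key no running spider polls. The sketch below seeds such a queue by hand, mirroring the zadd call the online test uses (redis-py 2.x argument order) and the fields of the test's example feed; the connection settings and field values are illustrative only.

::

    import ujson
    import redis

    # illustrative values; the cluster normally derives the domain from the url
    spiderid = 'test-spider'
    domain = 'dmoztools.net'
    key = '{s}:{d}:queue'.format(s=spiderid, d=domain)  # test-spider:dmoztools.net:queue

    feed = {
        'url': 'http://dmoztools.net/',
        'crawlid': 'abc12345',
        'appid': 'test',
        'spiderid': spiderid,
        'priority': 1,
        'maxdepth': 0,
    }

    redis_conn = redis.Redis(host='localhost', port=6379)
    # member first, then score, matching the redis-py 2.x call in the online test
    redis_conn.zadd(key, ujson.dumps(feed), -99)
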
--- crawler/tests/online.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 9cead8e4..ecf14af4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -38,7 +38,7 @@ class TestLinkSpider(TestCase): "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ - "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" + "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" def setUp(self): self.settings = get_project_settings() From 5e20adf616ae915f2f57c341070fe1bae8846d42 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Wed, 17 Jan 2018 10:40:20 -0500 Subject: [PATCH 17/22] fix typo for consumer vs producer Resolves #163 --- rest/rest_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rest/rest_service.py b/rest/rest_service.py index 342a4f7f..a823b5f9 100644 --- a/rest/rest_service.py +++ b/rest/rest_service.py @@ -424,7 +424,7 @@ def _create_consumer(self): @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): - """Tries to establish a Kafka consumer connection""" + """Tries to establish a Kafka producer connection""" if not self.closed: try: self.logger.debug("Creating new kafka producer using brokers: " + From 9bd8c9558868177966c18b4a053404956cf88e03 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Wed, 17 Jan 2018 10:56:29 -0500 Subject: [PATCH 18/22] test at logging in on Travis without email flag --- travis/docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/docker.sh b/travis/docker.sh index 00a33ea7..85f68a73 100755 --- a/travis/docker.sh +++ b/travis/docker.sh @@ -42,7 +42,7 @@ if [ "$TRAVIS_BRANCH" = "dev" ] && [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$T sudo docker rmi istresearch/scrapy-cluster:rest-test # log into docker - sudo docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + sudo docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" # push new containers sudo docker push istresearch/scrapy-cluster From dad16cd0e3c923e7e4bdc4e587f2c9e83270327c Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 10:30:11 -0500 Subject: [PATCH 19/22] Scutils LogFactory update to pass all available extras. This is a sneaky bug seen when you override the LogObject class `add_extras()` method, and then you would like to use the callback functionality with your newly returned values. Before, the callback would only be passed the original extra parameter, now it is passed the updated parameters. 
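A sketch of what this changes for callback users is shown below. It assumes the usual ``LogFactory.get_instance()`` entry point from scutils and the ``register_callback('>DEBUG', ...)`` form used in the unit tests; with this fix the ``log_extra`` argument is the dictionary returned by ``add_extras()``, including any keys appended by an override, rather than the bare dict passed at the call site.

::

    from scutils.log_factory import LogFactory

    # assumed entry point and settings; trimmed to the options relevant here
    logger = LogFactory.get_instance(json=True, stdout=True, level='INFO')

    def audit(log_message=None, log_extra=None):
        # after this patch log_extra is the enriched dict from add_extras(),
        # not just the {'component': 'scheduler'} passed at the call site
        print(log_message, sorted(log_extra.keys()))

    # same registration form as the unit tests
    logger.register_callback('>DEBUG', audit)
    logger.info("queue backlog updated", extra={'component': 'scheduler'})
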
--- utils/scutils/log_factory.py | 10 +++++----- utils/scutils/version.py | 2 +- utils/tests/test_log_factory.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/utils/scutils/log_factory.py b/utils/scutils/log_factory.py index a89b9fab..c0c16abd 100644 --- a/utils/scutils/log_factory.py +++ b/utils/scutils/log_factory.py @@ -208,7 +208,7 @@ def debug(self, message, extra={}): if self.level_dict['DEBUG'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "DEBUG") self._write_message(message, extras) - self.cb_handler.fire_callbacks('DEBUG', message, extra) + self.cb_handler.fire_callbacks('DEBUG', message, extras) def info(self, message, extra={}): ''' @@ -220,7 +220,7 @@ def info(self, message, extra={}): if self.level_dict['INFO'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "INFO") self._write_message(message, extras) - self.cb_handler.fire_callbacks('INFO', message, extra) + self.cb_handler.fire_callbacks('INFO', message, extras) def warn(self, message, extra={}): ''' @@ -241,7 +241,7 @@ def warning(self, message, extra={}): if self.level_dict['WARNING'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "WARNING") self._write_message(message, extras) - self.cb_handler.fire_callbacks('WARNING', message, extra) + self.cb_handler.fire_callbacks('WARNING', message, extras) def error(self, message, extra={}): ''' @@ -253,7 +253,7 @@ def error(self, message, extra={}): if self.level_dict['ERROR'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "ERROR") self._write_message(message, extras) - self.cb_handler.fire_callbacks('ERROR', message, extra) + self.cb_handler.fire_callbacks('ERROR', message, extras) def critical(self, message, extra={}): ''' @@ -265,7 +265,7 @@ def critical(self, message, extra={}): if self.level_dict['CRITICAL'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "CRITICAL") self._write_message(message, extras) - self.cb_handler.fire_callbacks('CRITICAL', message, extra) + self.cb_handler.fire_callbacks('CRITICAL', message, extras) def _write_message(self, message, extra): ''' diff --git a/utils/scutils/version.py b/utils/scutils/version.py index d8b10d6e..d955d061 100644 --- a/utils/scutils/version.py +++ b/utils/scutils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.3.0dev3' +__version__ = '1.3.0dev4' VERSION = tuple(int(x) for x in __version__.split('.')) diff --git a/utils/tests/test_log_factory.py b/utils/tests/test_log_factory.py index 204cf7cb..cff371fd 100644 --- a/utils/tests/test_log_factory.py +++ b/utils/tests/test_log_factory.py @@ -309,12 +309,14 @@ def multiply_5(log_message=None, log_extra=None): self.assertEqual(6, self.logger.x) def test_preserve_data(self): + self.logger._get_time = MagicMock(return_value='2015-11-12T10:11:12.0Z') message = "THIS IS A TEST" extras = {"key": "value", 'a': [1, 2, 3]} + extras_add = self.logger.add_extras(extras, 'INFO') def cb(log_message=None, log_extra=None): self.assertEqual(log_message, message) - self.assertEqual(log_extra, extras) + self.assertEqual(log_extra, extras_add) self.logger.register_callback('>DEBUG', cb) self.logger.log_level = 'INFO' From b35a1644256284b6217fd42ea4ec211621876c31 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 09:58:42 -0500 Subject: [PATCH 20/22] change istresearch.com to something more stable Hopefully this fixes the crawler integration tests which have been failing, tested locally and works --- crawler/tests/online.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index ecf14af4..3ff68ebb 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -35,7 +35,7 @@ class CustomSpider(LinkSpider): class TestLinkSpider(TestCase): example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\ - "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ + "crawlid\":\"abc12345\",\"url\":\"http://dmoztools.net/\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" @@ -77,7 +77,7 @@ def test_crawler_process(self): d = runner.crawl(CustomSpider) d.addBoth(lambda _: reactor.stop()) # add crawl to redis - key = "test-spider:istresearch.com:queue" + key = "test-spider:dmoztools.net:queue" self.redis_conn.zadd(key, self.example_feed, -99) # run the spider, give 20 seconds to see the url, crawl it, From 99505d83e2eb5e810b54306a22eaea6391c40774 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 10:11:52 -0500 Subject: [PATCH 21/22] documentation updates for new url URL changed from istresearch.com to dmoztools.net for better stability --- docs/topics/advanced/rediskeys.rst | 2 +- docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/quickstart.rst | 10 +++++----- docs/topics/rest/api.rst | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/topics/advanced/rediskeys.rst b/docs/topics/advanced/rediskeys.rst index 77f74fa6..e5ca5d0a 100644 --- a/docs/topics/advanced/rediskeys.rst +++ b/docs/topics/advanced/rediskeys.rst @@ -69,6 +69,6 @@ If you run the integration tests, there may be temporary Redis keys created that - **cluster:test** - Used when testing the Kafka Monitor can act and set a key in Redis -- **test-spider:istresearch.com:queue** - Used when testing the crawler installation can interact with Redis and Kafka +- **test-spider:dmoztools.net:queue** - Used when testing the crawler installation can interact with Redis and Kafka - **stats:crawler::test-spider:** - Automatically created and destoryed during crawler testing by the stats collection mechanism settings. diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index 78655b4a..513252de 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -431,7 +431,7 @@ Which ever setup you chose, every process within should stay running for the rem :: - python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"abc123"}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc123"}' You will see the following output on the command line for that successful request: @@ -439,7 +439,7 @@ You will see the following output on the command line for that successful reques 2015-12-22 15:45:37,457 [kafka-monitor] INFO: Feeding JSON into demo.incoming { - "url": "http://istresearch.com", + "url": "http://dmoztools.net", "crawlid": "abc123", "appid": "testapp" } diff --git a/docs/topics/kafka-monitor/quickstart.rst b/docs/topics/kafka-monitor/quickstart.rst index 1392b60c..9708bf83 100644 --- a/docs/topics/kafka-monitor/quickstart.rst +++ b/docs/topics/kafka-monitor/quickstart.rst @@ -33,7 +33,7 @@ JSON Object feeder into your desired Kafka Topic. 
This takes a valid JSON object :: - $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}' + $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}' The command line feed is very slow and should not be used in production. Instead, you should write your own continuously running application to feed Kafka the desired API requests that you require. @@ -89,10 +89,10 @@ Feed an item :: - $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}' + $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}' 2016-01-05 15:14:44,829 [kafka-monitor] INFO: Feeding JSON into demo.incoming { - "url": "http://istresearch.com", + "url": "http://dmoztools.net", "crawlid": "ABC123", "appid": "testapp" } @@ -116,8 +116,8 @@ If you have a :ref:`Crawler ` running, you should see the html come thr "response_headers": { }, - "response_url": "http://istresearch.com", - "url": "http://istresearch.com", + "response_url": "http://dmoztools.net", + "url": "http://dmoztools.net", "status_code": 200, "status_msg": "OK", "appid": "testapp", diff --git a/docs/topics/rest/api.rst b/docs/topics/rest/api.rst index cb2aab4b..524b17a6 100644 --- a/docs/topics/rest/api.rst +++ b/docs/topics/rest/api.rst @@ -156,7 +156,7 @@ Feed a crawl request :: - $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"istresearch.com", "appid":"madisonTest", "crawlid":"abc123"}' + $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"http://dmoztools.net", "appid":"madisonTest", "crawlid":"abc123"}' Feed a Stats request From d90d4e91c5c19073751fa57665b1a96d4b55625f Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Tue, 23 Jan 2018 10:33:30 -0500 Subject: [PATCH 22/22] changelog updates for dev This should bring us up to speed with all the changes made via PR's or other minor issues --- docs/topics/changelog.rst | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/topics/changelog.rst b/docs/topics/changelog.rst index 77f58f0d..055c0b33 100644 --- a/docs/topics/changelog.rst +++ b/docs/topics/changelog.rst @@ -12,11 +12,32 @@ Date: ??/??/???? - Add Python 3 support +- Fixed assert deprecations in unit tests + +- Corrected Ansible host list for zookeeper + +- Minor documentation changes/updates + Crawler ^^^^^^^ - Improved request to dictionary serialization support +- Updated item encoding serialization + +Rest +^^^^ + +- Corrected Kafka environment variable + +Utils +^^^^^ + +- Fixed LogFactory callback with correct extras dictionary + +- Adds quicker shutdown to ThreadedCounter object + + Scrapy Cluster 1.2 ------------------ @@ -130,7 +151,7 @@ Redis Monitor Changes - Added Stats Collection Crawler Changes -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^ - Upgraded Crawler to be compatible with Scrapy 1.0 @@ -151,7 +172,7 @@ Crawler Changes - Added example Wandering Spider Scrapy Cluster 1.0 ---------------------- +------------------ Date: 5/21/2015