From a881af47d897d52acce027d02672ca45b5fcaaa6 Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 18:40:21 +0800 Subject: [PATCH 01/22] make scheduler handle serialized request properly --- crawler/crawling/distributed_scheduler.py | 94 ++++++++------------- crawler/requirements.txt | 52 ++++++------ crawler/tests/test_distributed_scheduler.py | 42 +++++++-- 3 files changed, 97 insertions(+), 91 deletions(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 14cf2818..706ebb36 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -7,6 +7,7 @@ from scrapy.http import Request from scrapy.conf import settings from scrapy.utils.python import to_unicode +from scrapy.utils.reqser import request_to_dict, request_from_dict import redis import random @@ -391,7 +392,7 @@ def enqueue_request(self, request): if not request.dont_filter and self.dupefilter.request_seen(request): self.logger.debug("Request not added back to redis") return - req_dict = self.request_to_dict(request) + req_dict = request_to_dict(request, self.spider) if not self.is_blacklisted(req_dict['meta']['appid'], req_dict['meta']['crawlid']): @@ -436,28 +437,6 @@ def enqueue_request(self, request): .format(appid=req_dict['meta']['appid'], id=req_dict['meta']['crawlid'])) - def request_to_dict(self, request): - ''' - Convert Request object to a dict. - modified from scrapy.utils.reqser - ''' - req_dict = { - # urls should be safe (safe_string_url) - 'url': to_unicode(request.url), - 'method': request.method, - 'headers': dict(request.headers), - 'body': request.body, - 'cookies': request.cookies, - 'meta': request.meta, - '_encoding': request._encoding, - 'priority': request.priority, - 'dont_filter': request.dont_filter, - # callback/errback are assumed to be a bound instance of the spider - 'callback': None if request.callback is None else request.callback.__name__, - 'errback': None if request.errback is None else request.errback.__name__, - } - return req_dict - def find_item(self): ''' Finds an item from the throttled queues @@ -504,50 +483,47 @@ def next_request(self): if item: self.logger.debug(u"Found url to crawl {url}" \ .format(url=item['url'])) - try: - req = Request(item['url']) - except ValueError: - # need absolute url - # need better url validation here - req = Request('http://' + item['url']) - - try: - if 'callback' in item and item['callback'] is not None: - req.callback = getattr(self.spider, item['callback']) - except AttributeError: - self.logger.warn("Unable to find callback method") - - try: - if 'errback' in item and item['errback'] is not None: - req.errback = getattr(self.spider, item['errback']) - except AttributeError: - self.logger.warn("Unable to find errback method") - if 'meta' in item: - item = item['meta'] - - # defaults not in schema - if 'curdepth' not in item: - item['curdepth'] = 0 - if "retry_times" not in item: - item['retry_times'] = 0 - - for key in list(item.keys()): - req.meta[key] = item[key] + # item is a serialized request + req = request_from_dict(item) + else: + # item is feeded from outside and is not a serialized request, + # parse it manually + req = self.request_from_feed(item) # extra check to add items to request - if 'useragent' in item and item['useragent'] is not None: - req.headers['User-Agent'] = item['useragent'] - if 'cookie' in item and item['cookie'] is not None: - if isinstance(item['cookie'], dict): - req.cookies = item['cookie'] - elif 
isinstance(item['cookie'], basestring): - req.cookies = self.parse_cookie(item['cookie']) + if 'useragent' in req.meta and req.meta['useragent'] is not None: + req.headers['User-Agent'] = req.meta['useragent'] return req return None + def request_from_feed(self, item): + try: + req = Request(item['url']) + except ValueError: + # need absolute url + # need better url validation here + req = Request('http://' + item['url']) + + # defaults not in schema + if 'curdepth' not in item: + item['curdepth'] = 0 + if "retry_times" not in item: + item['retry_times'] = 0 + + for key in list(item.keys()): + req.meta[key] = item[key] + + # extra check to add items to request + if 'cookie' in item and item['cookie'] is not None: + if isinstance(item['cookie'], dict): + req.cookies = item['cookie'] + elif isinstance(item['cookie'], basestring): + req.cookies = self.parse_cookie(item['cookie']) + return req + def parse_cookie(self, string): ''' Parses a cookie string like returned in a Set-Cookie header diff --git a/crawler/requirements.txt b/crawler/requirements.txt index 58214803..172a6aae 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -1,40 +1,40 @@ -attrs==16.3.0 # Updated from 16.1.0 -cffi==1.9.1 # Updated from 1.7.0 +attrs==17.2.0 # Updated from 16.3.0 +cffi==1.10.0 # Updated from 1.9.1 ConcurrentLogHandler==0.9.1 -cryptography==1.8.1 # Updated from 1.5 -cssselect==1.0.1 # Updated from 0.9.2 +cryptography==2.0.3 # Updated from 1.8.1 +cssselect==1.0.1 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 # Updated from 0.15.2 -idna==2.5 # Updated from 2.1 -ipaddress==1.0.18 # Updated from 1.0.16 -kafka-python==1.3.3 # Updated from 1.3.2 -kazoo==2.2.1 -lxml==3.7.3 # Updated from 3.6.4 +future==0.16.0 +idna==2.6 # Updated from 2.5 +ipaddress==1.0.18 +kafka-python==1.3.4 # Updated from 1.3.3 +kazoo==2.4.0 # Updated from 2.2.1 +lxml==3.8.0 # Updated from 3.7.3 mock==2.0.0 nose==1.3.7 -parsel==1.1.0 # Updated from 1.0.3 -pbr==2.0.0 # Updated from 1.10.0 -pyasn1==0.2.3 # Updated from 0.1.9 -pyasn1-modules==0.0.8 -pycparser==2.17 # Updated from 2.14 +parsel==1.2.0 # Updated from 1.1.0 +pbr==3.1.1 # Updated from 2.0.0 +pyasn1==0.3.2 # Updated from 0.2.3 +pyasn1-modules==0.0.11 # Updated from 0.0.8 +pycparser==2.18 # Updated from 2.17 PyDispatcher==2.0.5 -pyOpenSSL==16.2.0 # Updated from 16.1.0 -python-json-logger==0.1.7 # Updated from 0.1.5 +pyOpenSSL==17.2.0 # Updated from 16.2.0 +python-json-logger==0.1.8 # Updated from 0.1.7 PyYAML==3.12 queuelib==1.4.2 redis==2.10.5 -requests==2.13.0 # Updated from 2.11.1 -requests-file==1.4.1 # Updated from 1.4 +requests==2.18.3 # Updated from 2.13.0 +requests-file==1.4.2 # Updated from 1.4.1 retrying==1.3.3 -Scrapy==1.3.3 +Scrapy==1.4.0 # Updated from 1.3.3 ../utils # scutils==1.3.0dev0 -service-identity==16.0.0 +service-identity==17.0.0 # Updated from 16.0.0 six==1.10.0 -testfixtures==4.13.5 # Updated from 4.10.0 -tldextract==2.0.2 # Updated from 2.0.1 -Twisted==17.1.0 # Updated from 16.4.0 +testfixtures==5.1.1 # Updated from 4.13.5 +tldextract==2.1.0 # Updated from 2.0.2 +Twisted==17.5.0 # Updated from 17.1.0 ujson==1.35 -w3lib==1.17.0 # Updated from 1.16.0 -zope.interface==4.3.3 # Updated from 4.2.0 +w3lib==1.18.0 # Updated from 1.17.0 +zope.interface==4.4.2 # Updated from 4.3.3 # Generated with piprot 0.9.7 diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index 75db7b18..aaf3e42a 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -7,6 +7,7 
@@ from mock import MagicMock from crawling.distributed_scheduler import DistributedScheduler from scrapy.http import Request +from scrapy.utils.reqser import request_to_dict from scutils.redis_throttled_queue import RedisThrottledQueue @@ -139,6 +140,21 @@ def test_find_item(self): self.assertEqual(self.scheduler.find_item(), None) # should also not raise exception +class TestDistributedSchedulerRequestFromFeed(ThrottleMixin, TestCase): + def test_request_from_feed(self): + self.req = self.get_request() + feed = { + "url": "http://ex.com", + "crawlid": "abc123", + "appid": "myapp", + "spiderid": "link", + } + out = self.scheduler.request_from_feed(feed) + self.assertEquals(out.url, 'http://ex.com') + for key in out.meta: + self.assertEqual(out.meta[key], self.req.meta[key]) + + class TestDistributedSchedulerNextRequest(ThrottleMixin, TestCase): @mock.patch('time.time', return_value=5) @@ -169,12 +185,26 @@ def test_next_request(self, t): except Exception as e: self.assertEqual(str(e), "ip") - # test got item - self.scheduler.find_item = MagicMock( - return_value={"url": "http://ex.com", - "crawlid": "abc123", - "appid": "myapp", - "spiderid": "link"}) + # test request from feed + feed = { + "url": "http://ex.com", + "crawlid": "abc123", + "appid": "myapp", + "spiderid": "link", + } + self.scheduler.find_item = MagicMock(return_value=feed) + out = self.scheduler.next_request() + self.assertEquals(out.url, 'http://ex.com') + for key in out.meta: + self.assertEqual(out.meta[key], self.req.meta[key]) + + # test request from serialized request + exist_req = Request('http://ex.com') + exist_item = request_to_dict(exist_req) + exist_item["meta"]["crawlid"] = "abc123" + exist_item["meta"]["appid"] = "myapp" + exist_item["meta"]["spiderid"] = "link" + self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() self.assertEquals(out.url, 'http://ex.com') for key in out.meta: From 91636fa84e36a97e28f77f73430d2cb49bc5f82e Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 18:53:29 +0800 Subject: [PATCH 02/22] update comment --- crawler/crawling/distributed_scheduler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 706ebb36..16ef7c27 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -487,8 +487,7 @@ def next_request(self): # item is a serialized request req = request_from_dict(item) else: - # item is feeded from outside and is not a serialized request, - # parse it manually + # item is a feed from outside, parse it manually req = self.request_from_feed(item) # extra check to add items to request From b6824bc9e86d18ebfee56be573944d9d76233e5b Mon Sep 17 00:00:00 2001 From: gas1121 Date: Mon, 14 Aug 2017 19:45:46 +0800 Subject: [PATCH 03/22] fix error when converting dict to request in scheduler --- crawler/crawling/distributed_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index 16ef7c27..fb4fbb7d 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -485,7 +485,7 @@ def next_request(self): .format(url=item['url'])) if 'meta' in item: # item is a serialized request - req = request_from_dict(item) + req = request_from_dict(item, self.spider) else: # item is a feed from outside, parse it manually req = 
self.request_from_feed(item) From 0deaf741334a538601f9f69f83c7c6b3e73309e5 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Sun, 20 Aug 2017 14:08:30 -0400 Subject: [PATCH 04/22] Updated master requirements from crawler changes --- requirements.txt | 50 ++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5a83d46a..a1e1a338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,47 +6,47 @@ Jinja2==2.9.5 # Updated from 2.8 MarkupSafe==1.0 # Updated from 0.23 PyDispatcher==2.0.5 PyYAML==3.12 -Scrapy==1.3.3 # Updated from 1.3.1 -Twisted==17.1.0 # Updated from 16.4.0 +Scrapy==1.4.0 # Updated from 1.3.3 +Twisted==17.5.0 # Updated from 17.1.0 Werkzeug==0.12.1 # Updated from 0.11.11 -attrs==16.3.0 # Updated from 16.1.0 -cffi==1.9.1 # Updated from 1.7.0 +attrs==17.2.0 # Updated from 16.3.0 +cffi==1.10.0 # Updated from 1.9.1 characteristic==14.3.0 click==6.7 # Updated from 6.6 coverage==4.3.4 # Updated from 4.0.3 -cryptography==1.8.1 # Updated from 1.5 +cryptography==2.0.3 # Updated from 1.8.1 cssselect==1.0.1 # Updated from 0.9.2 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 # Updated from 0.15.2 -idna==2.5 # Updated from 2.1 -ipaddress==1.0.18 # Updated from 1.0.16 +future==0.16.0 +idna==2.6 # Updated from 2.5 +ipaddress==1.0.18 itsdangerous==0.24 jsonschema==2.6.0 # Updated from 2.5.1 -kafka-python==1.3.3 # Updated from 1.3.2 -kazoo==2.2.1 -lxml==3.7.3 # Updated from 3.6.4 +kafka-python==1.3.4 # Updated from 1.3.3 +kazoo==2.4.0 # Updated from 2.2.1 +lxml==3.8.0 # Updated from 3.7.3 mock==2.0.0 nose==1.3.7 -parsel==1.1.0 # Updated from 1.0.3 -pbr==2.0.0 # Updated from 1.10.0 -pyOpenSSL==16.2.0 # Updated from 16.1.0 -pyasn1-modules==0.0.8 -pyasn1==0.2.3 # Updated from 0.1.9 -pycparser==2.17 # Updated from 2.14 -python-json-logger==0.1.7 # Updated from 0.1.5 +parsel==1.2.0 # Updated from 1.1.0 +pbr==3.1.1 # Updated from 2.0.0 +pyOpenSSL==17.2.0 # Updated from 16.2.0 +pyasn1==0.3.2 # Updated from 0.2.3 +pyasn1-modules==0.0.11 # Updated from 0.0.8 +pycparser==2.18 # Updated from 2.17 +python-json-logger==0.1.8 # Updated from 0.1.7 python-redis-lock==3.2.0 # Updated from 3.1.0 queuelib==1.4.2 redis==2.10.5 -requests-file==1.4.1 # Updated from 1.4 -requests==2.13.0 # Updated from 2.11.1 +requests==2.18.3 # Updated from 2.13.0 +requests-file==1.4.2 # Updated from 1.4.1 retrying==1.3.3 ./utils # scutils==1.3.0dev0 -service-identity==16.0.0 +service-identity==17.0.0 # Updated from 16.0.0 six==1.10.0 -testfixtures==4.13.5 # Updated from 4.11.0 -tldextract==2.0.2 # Updated from 2.0.1 +testfixtures==5.1.1 # Updated from 4.13.5 +tldextract==2.1.0 # Updated from 2.0.2 ujson==1.35 -w3lib==1.17.0 # Updated from 1.16.0 -zope.interface==4.3.3 # Updated from 4.2.0 +w3lib==1.18.0 # Updated from 1.17.0 +zope.interface==4.4.2 # Updated from 4.3.3 # Generated with piprot 0.9.7 \ No newline at end of file From 00ab692ce272b199990354ed6b19e3e4aca93048 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Sun, 20 Aug 2017 14:25:02 -0400 Subject: [PATCH 05/22] update changelog with new pr --- docs/topics/changelog.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/topics/changelog.rst b/docs/topics/changelog.rst index ed7d7ede..77f58f0d 100644 --- a/docs/topics/changelog.rst +++ b/docs/topics/changelog.rst @@ -12,6 +12,11 @@ Date: ??/??/???? 
- Add Python 3 support +Crawler +^^^^^^^ + +- Improved request to dictionary serialization support + Scrapy Cluster 1.2 ------------------ From 9441bdc3842aaf500116efc7fdaa86c04a8c07e0 Mon Sep 17 00:00:00 2001 From: Alex Teut Date: Fri, 22 Sep 2017 00:43:57 +0300 Subject: [PATCH 06/22] Fix REST-Kafka topic variable name in docker settings.py (#136) --- docker/rest/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/rest/settings.py b/docker/rest/settings.py index 3d295dd8..bcd47c12 100644 --- a/docker/rest/settings.py +++ b/docker/rest/settings.py @@ -27,7 +27,7 @@ def str2bool(v): KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES = 10 * 1024 * 1024 # 10MB KAFKA_CONSUMER_SLEEP_TIME = 1 -KAFKA_INCOMING_TOPIC = os.getenv('KAFKA_INCOMING_TOPIC', 'demo.incoming') +KAFKA_PRODUCER_TOPIC = os.getenv('KAFKA_PRODUCER_TOPIC', 'demo.incoming') KAFKA_PRODUCER_BATCH_LINGER_MS = 25 # 25 ms before flush KAFKA_PRODUCER_BUFFER_BYTES = 4 * 1024 * 1024 # 4MB before blocking From c4ebe9208378c9b750c82c93bde9e834defcb467 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Wed, 1 Nov 2017 15:23:09 +0000 Subject: [PATCH 07/22] Fix assert deprecations --- crawler/tests/online.py | 2 +- crawler/tests/test_distributed_scheduler.py | 10 +- crawler/tests/test_log_retry_middleware.py | 8 +- .../tests/test_meta_passthrough_middleware.py | 10 +- crawler/tests/test_redis_stats_middleware.py | 18 +-- kafka-monitor/tests/test_kafka_monitor.py | 32 ++--- kafka-monitor/tests/test_plugins.py | 10 +- redis-monitor/tests/online.py | 6 +- redis-monitor/tests/test_plugins.py | 44 +++---- redis-monitor/tests/test_redis_monitor.py | 24 ++-- rest/tests/online.py | 4 +- rest/tests/test_rest_service.py | 114 +++++++++--------- utils/tests/online.py | 6 +- utils/tests/test_log_factory.py | 4 +- utils/tests/test_redis_queue.py | 14 +-- utils/tests/test_zookeeper_watcher.py | 6 +- 16 files changed, 156 insertions(+), 156 deletions(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 3296cf02..9cead8e4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -101,7 +101,7 @@ def thread_func(): and the_dict['crawlid'] == 'abc12345': message_count += 1 - self.assertEquals(message_count, 1) + self.assertEqual(message_count, 1) def tearDown(self): keys = self.redis_conn.keys('stats:crawler:*:test-spider:*') diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index aaf3e42a..86c50662 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -60,7 +60,7 @@ def test_enqueue_request(self, t): # test request already seen self.scheduler.dupefilter.request_seen = MagicMock(return_value=True) - self.assertEquals(self.scheduler.enqueue_request(self.req), None) + self.assertEqual(self.scheduler.enqueue_request(self.req), None) # test request not expiring and queue seen self.scheduler.queue_keys = ['link:ex.com:queue'] @@ -150,7 +150,7 @@ def test_request_from_feed(self): "spiderid": "link", } out = self.scheduler.request_from_feed(feed) - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], self.req.meta[key]) @@ -194,7 +194,7 @@ def test_next_request(self, t): } self.scheduler.find_item = MagicMock(return_value=feed) out = self.scheduler.next_request() - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], 
self.req.meta[key]) @@ -206,13 +206,13 @@ def test_next_request(self, t): exist_item["meta"]["spiderid"] = "link" self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() - self.assertEquals(out.url, 'http://ex.com') + self.assertEqual(out.url, 'http://ex.com') for key in out.meta: self.assertEqual(out.meta[key], self.req.meta[key]) # test didn't get item self.scheduler.find_item = MagicMock(return_value=None) - self.assertEquals(self.scheduler.next_request(), None) + self.assertEqual(self.scheduler.next_request(), None) class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase): diff --git a/crawler/tests/test_log_retry_middleware.py b/crawler/tests/test_log_retry_middleware.py index 2b368c73..a64fe08e 100644 --- a/crawler/tests/test_log_retry_middleware.py +++ b/crawler/tests/test_log_retry_middleware.py @@ -26,7 +26,7 @@ def test_lrm_stats_setup(self): # test nothing self.lrm._setup_stats_status_codes() - self.assertEquals([str(x) for x in self.lrm.stats_dict.keys()], ['lifetime']) + self.assertEqual([str(x) for x in self.lrm.stats_dict.keys()], ['lifetime']) # test good/bad rolling stats self.lrm.stats_dict = {} @@ -43,7 +43,7 @@ def test_lrm_stats_setup(self): # check that both keys are set up self.lrm._setup_stats_status_codes() - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.lrm.stats_dict.keys()]), sorted(good)) @@ -51,12 +51,12 @@ def test_lrm_stats_setup(self): for time_key in self.lrm.stats_dict: if time_key == 0: - self.assertEquals( + self.assertEqual( self.lrm.stats_dict[0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.lrm.stats_dict[time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) diff --git a/crawler/tests/test_meta_passthrough_middleware.py b/crawler/tests/test_meta_passthrough_middleware.py index 19406f1a..595bbcbe 100644 --- a/crawler/tests/test_meta_passthrough_middleware.py +++ b/crawler/tests/test_meta_passthrough_middleware.py @@ -33,13 +33,13 @@ def test_mpm_middleware(self): for item in self.mpm.process_spider_output(a, test_list, MagicMock()): if isinstance(item, Request): - self.assertEquals(a.meta, item.meta) + self.assertEqual(a.meta, item.meta) yield_count += 1 - self.assertEquals(yield_count, 3) + self.assertEqual(yield_count, 3) # 1 debug for the method, 1 debug for the request - self.assertEquals(self.mpm.logger.debug.call_count, 2) + self.assertEqual(self.mpm.logger.debug.call_count, 2) # test meta unchanged if already exists r = Request('http://aol.com') @@ -47,6 +47,6 @@ def test_mpm_middleware(self): for item in self.mpm.process_spider_output(a, [r], MagicMock()): # key1 value1 did not pass through, since it was already set - self.assertEquals(item.meta['key1'], 'othervalue') + self.assertEqual(item.meta['key1'], 'othervalue') # key2 was not set, therefor it passed through - self.assertEquals(item.meta['key2'], 'value2') + self.assertEqual(item.meta['key2'], 'value2') diff --git a/crawler/tests/test_redis_stats_middleware.py b/crawler/tests/test_redis_stats_middleware.py index 614d68d1..9544f27b 100644 --- a/crawler/tests/test_redis_stats_middleware.py +++ b/crawler/tests/test_redis_stats_middleware.py @@ -25,12 +25,12 @@ def test_load_stats_codes(self): # test nothing spider_name = 'link' self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals(list(self.rsm.stats_dict[spider_name]['status_codes'].keys()), []) + self.assertEqual(list(self.rsm.stats_dict[spider_name]['status_codes'].keys()), []) # test status codes only 
self.rsm.settings['STATS_RESPONSE_CODES'] = [200, 403] self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals( + self.assertEqual( sorted(self.rsm.stats_dict[spider_name]['status_codes'].keys()), sorted([200, 403])) self.assertEqual(list(self.rsm.stats_dict[spider_name]['status_codes'][200].keys()), @@ -53,10 +53,10 @@ def test_load_stats_codes(self): # check that both keys are set up self.rsm._setup_stats_status_codes(spider_name) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.rsm.stats_dict[spider_name]['status_codes'][200].keys()]), sorted(good)) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.rsm.stats_dict[spider_name]['status_codes'][403].keys()]), sorted(good)) @@ -65,24 +65,24 @@ def test_load_stats_codes(self): for time_key in self.rsm.stats_dict[spider_name]['status_codes'][200]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][200][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][200][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) for time_key in self.rsm.stats_dict[spider_name]['status_codes'][403]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][403][0].key, '{k}:lifetime'.format(k=k2) ) else: - self.assertEquals( + self.assertEqual( self.rsm.stats_dict[spider_name]['status_codes'][403][time_key].key, '{k}:{t}'.format(k=k2, t=time_key) ) @@ -141,6 +141,6 @@ def test_rsm_input(self): self.rsm.process_spider_input(response, spider) # 4 calls for link, 4 calls for wandering - self.assertEquals(fake_stats.increment.call_count, 8) + self.assertEqual(fake_stats.increment.call_count, 8) diff --git a/kafka-monitor/tests/test_kafka_monitor.py b/kafka-monitor/tests/test_kafka_monitor.py index 38deb72b..6d1e2e8b 100644 --- a/kafka-monitor/tests/test_kafka_monitor.py +++ b/kafka-monitor/tests/test_kafka_monitor.py @@ -62,8 +62,8 @@ def test_load_stats_total(self): self.kafka_monitor.settings['STATS_TIMES'] = [] self.kafka_monitor._setup_stats_total(MagicMock()) - self.assertEquals(list(self.kafka_monitor.stats_dict['total'].keys()), ['lifetime']) - self.assertEquals(list(self.kafka_monitor.stats_dict['fail'].keys()), ['lifetime']) + self.assertEqual(list(self.kafka_monitor.stats_dict['total'].keys()), ['lifetime']) + self.assertEqual(list(self.kafka_monitor.stats_dict['fail'].keys()), ['lifetime']) # test good/bad rolling stats self.kafka_monitor.stats_dict = {} @@ -79,10 +79,10 @@ def test_load_stats_total(self): ] self.kafka_monitor._setup_stats_total(MagicMock()) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['total'].keys()]), sorted(good)) - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['fail'].keys()]), sorted(good)) @@ -91,24 +91,24 @@ def test_load_stats_total(self): for time_key in self.kafka_monitor.stats_dict['total']: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['total'][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['total'][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) for time_key in self.kafka_monitor.stats_dict['fail']: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['fail'][0].key, '{k}:lifetime'.format(k=k2) ) else: - self.assertEquals( + self.assertEqual( 
self.kafka_monitor.stats_dict['fail'][time_key].key, '{k}:{t}'.format(k=k2, t=time_key) ) @@ -129,13 +129,13 @@ def test_load_stats_plugins(self): 'ZookeeperHandler' ] - self.assertEquals( + self.assertEqual( sorted(list(self.kafka_monitor.stats_dict['plugins'].keys())), sorted(defaults)) for key in self.kafka_monitor.plugins_dict: plugin_name = self.kafka_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( list(self.kafka_monitor.stats_dict['plugins'][plugin_name].keys()), ['lifetime']) @@ -154,13 +154,13 @@ def test_load_stats_plugins(self): self.kafka_monitor._setup_stats_plugins(MagicMock()) - self.assertEquals( + self.assertEqual( sorted(self.kafka_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.kafka_monitor.plugins_dict: plugin_name = self.kafka_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.kafka_monitor.stats_dict['plugins'][plugin_name].keys()]), sorted(good)) @@ -168,12 +168,12 @@ def test_load_stats_plugins(self): k1 = 'stats:kafka-monitor:{p}'.format(p=plugin_key) for time_key in self.kafka_monitor.stats_dict['plugins'][plugin_key]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['plugins'][plugin_key][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.kafka_monitor.stats_dict['plugins'][plugin_key][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) @@ -224,7 +224,7 @@ class a(object): self.kafka_monitor._process_messages() self.fail("Scrape not called") except AssertionError as e: - self.assertEquals("scrape", str(e)) + self.assertEqual("scrape", str(e)) # test that handler function is called for the actions message_string = "{\"uuid\":\"blah\",\"crawlid\":\"1234\"," \ @@ -237,5 +237,5 @@ class a(object): self.kafka_monitor._process_messages() self.fail("Action not called") except AssertionError as e: - self.assertEquals("action", str(e)) + self.assertEqual("action", str(e)) diff --git a/kafka-monitor/tests/test_plugins.py b/kafka-monitor/tests/test_plugins.py index 7cd946a6..96cfa5f2 100644 --- a/kafka-monitor/tests/test_plugins.py +++ b/kafka-monitor/tests/test_plugins.py @@ -56,7 +56,7 @@ def test_scrape_handler(self): handler.handle(valid) self.fail("Action not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) # check timeout is added handler.redis_conn.zadd = MagicMock() @@ -66,7 +66,7 @@ def test_scrape_handler(self): handler.handle(valid) self.fail("Expires not called") except AssertionError as e: - self.assertEquals("expires", str(e)) + self.assertEqual("expires", str(e)) def test_action_handler(self): handler = ActionHandler() @@ -85,7 +85,7 @@ def test_action_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_stats_handler(self): handler = StatsHandler() @@ -102,7 +102,7 @@ def test_stats_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_zookeeper_handler(self): handler = ZookeeperHandler() @@ -123,7 +123,7 @@ def test_zookeeper_handler(self): handler.handle(valid) self.fail("Added not called") except AssertionError as e: - self.assertEquals("added", str(e)) + self.assertEqual("added", str(e)) def test_bad_plugins(self): class 
ForgotSchema(BaseHandler): diff --git a/redis-monitor/tests/online.py b/redis-monitor/tests/online.py index 4db06aef..b3c67668 100644 --- a/redis-monitor/tests/online.py +++ b/redis-monitor/tests/online.py @@ -93,7 +93,7 @@ def test_process_item(self): self.redis_monitor._process_plugin(plugin) # ensure the key is gone - self.assertEquals(self.redis_monitor.redis_conn.get(key), None) + self.assertEqual(self.redis_monitor.redis_conn.get(key), None) self.redis_monitor.close() sleep(10) # now test the message was sent to kafka @@ -109,10 +109,10 @@ def test_process_item(self): pass else: the_dict = json.loads(m.value) - self.assertEquals(success, the_dict) + self.assertEqual(success, the_dict) message_count += 1 - self.assertEquals(message_count, 1) + self.assertEqual(message_count, 1) def tearDown(self): # if for some reason the tests fail, we end up falling behind on diff --git a/redis-monitor/tests/test_plugins.py b/redis-monitor/tests/test_plugins.py index 3242ae40..056f4a50 100644 --- a/redis-monitor/tests/test_plugins.py +++ b/redis-monitor/tests/test_plugins.py @@ -73,9 +73,9 @@ def setUp(self): def test_info_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'info:stuff:stuff'), ['info:stuff:stuff']) - self.assertEquals(re.findall(regex, 'info:stuff:stuff:stuff'), ['info:stuff:stuff:stuff']) - self.assertEquals(re.findall(regex, 'info:stuff'), []) + self.assertEqual(re.findall(regex, 'info:stuff:stuff'), ['info:stuff:stuff']) + self.assertEqual(re.findall(regex, 'info:stuff:stuff:stuff'), ['info:stuff:stuff:stuff']) + self.assertEqual(re.findall(regex, 'info:stuff'), []) def test_info_get_bin(self): v1 = "stuff" @@ -83,7 +83,7 @@ def test_info_get_bin(self): v2 = 200 self.plugin.redis_conn.zscan_iter = MagicMock(return_value=[(v1, v2)]) ret_val = self.plugin._get_bin('key') - self.assertEquals(ret_val, {-200: ['stuff']}) + self.assertEqual(ret_val, {-200: ['stuff']}) def test_info_get_crawlid(self): master = {} @@ -124,7 +124,7 @@ def test_info_get_crawlid(self): 'uuid': 'ABC123' } - self.assertEquals(result, success) + self.assertEqual(result, success) def test_info_get_appid(self): master = {} @@ -165,7 +165,7 @@ def test_info_get_appid(self): 'expires': 10 }}} - self.assertEquals(result, success) + self.assertEqual(result, success) class TestStopPlugin(TestCase, RegexFixer): @@ -176,11 +176,11 @@ def setUp(self): def test_stop_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'stop:spider:app:crawl'), + self.assertEqual(re.findall(regex, 'stop:spider:app:crawl'), ['stop:spider:app:crawl']) - self.assertEquals(re.findall(regex, 'stop:spider:app'), + self.assertEqual(re.findall(regex, 'stop:spider:app'), ['stop:spider:app']) - self.assertEquals(re.findall(regex, 'stop:stuff'), []) + self.assertEqual(re.findall(regex, 'stop:stuff'), []) def test_stop_monitor_mini_purge(self): self.plugin.redis_conn.scan_iter = MagicMock(return_value=['link:istresearch.com:queue']) @@ -189,7 +189,7 @@ def test_stop_monitor_mini_purge(self): ['{"crawlid":"crawl", "appid":"foo"}'], ]) - self.assertEquals(self.plugin._mini_purge("link", "app", "crawl"), 1) + self.assertEqual(self.plugin._mini_purge("link", "app", "crawl"), 1) class TestExpirePlugin(TestCase, RegexFixer): @@ -200,9 +200,9 @@ def setUp(self): def test_stop_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'timeout:blah1:blah2:bla3'), + self.assertEqual(re.findall(regex, 'timeout:blah1:blah2:bla3'), 
['timeout:blah1:blah2:bla3']) - self.assertEquals(re.findall(regex, 'timeout:blah1:blah2'), []) + self.assertEqual(re.findall(regex, 'timeout:blah1:blah2'), []) def test_expire_monitor_time(self): # if the stop monitor passes then this is just testing whether @@ -222,7 +222,7 @@ def test_expire_monitor_time(self): self.plugin.handle("key:stuff:blah:blah", 4) self.fail("Expire not called") except BaseException as e: - self.assertEquals("throw once", str(e)) + self.assertEqual("throw once", str(e)) class TestStatsPlugin(TestCase, RegexFixer): def setUp(self): @@ -233,16 +233,16 @@ def setUp(self): def test_stats_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'statsrequest:crawler:testApp'), + self.assertEqual(re.findall(regex, 'statsrequest:crawler:testApp'), ['statsrequest:crawler:testApp']) - self.assertEquals(re.findall(regex, 'statsrequest:crawler'), []) + self.assertEqual(re.findall(regex, 'statsrequest:crawler'), []) def _assert_thrown(self, key, equals): try: self.plugin.handle(key, 'blah') self.fail(equals + " exception not thrown") except Exception as e: - self.assertEquals(equals, str(e)) + self.assertEqual(equals, str(e)) def test_stats_handle(self): # trying to make sure that everything is called @@ -291,7 +291,7 @@ def side_effect(*args): } } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_machine(self): # tests stats on three different machines, with different spiders @@ -313,7 +313,7 @@ def test_stats_get_machine(self): 'host3': {'200': {'86400': 5}} } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_queue(self): # tests stats on three different machines, with different spiders @@ -355,7 +355,7 @@ def ret_val(*args): } } } - self.assertEquals(result, good) + self.assertEqual(result, good) def test_stats_get_plugin(self): self.plugin.redis_conn.keys = MagicMock(return_value=[ @@ -374,7 +374,7 @@ def test_stats_get_plugin(self): "total": {"lifetime": 5, "3600": 5}, "fail": {"68000": 5, "3600": 5} } - self.assertEquals(result, good) + self.assertEqual(result, good) class TestZookeeperPlugin(TestCase, RegexFixer): def setUp(self): @@ -389,9 +389,9 @@ def setUp(self): def test_zk_regex(self): regex = self.fix_re(self.plugin.regex) - self.assertEquals(re.findall(regex, 'zk:blah1:blah2:bla3'), + self.assertEqual(re.findall(regex, 'zk:blah1:blah2:bla3'), ['zk:blah1:blah2:bla3']) - self.assertEquals(re.findall(regex, 'zk:blah1:blah2'), []) + self.assertEqual(re.findall(regex, 'zk:blah1:blah2'), []) def test_zk_handle_du(self): # domain update diff --git a/redis-monitor/tests/test_redis_monitor.py b/redis-monitor/tests/test_redis_monitor.py index f4545014..fcea510e 100644 --- a/redis-monitor/tests/test_redis_monitor.py +++ b/redis-monitor/tests/test_redis_monitor.py @@ -64,7 +64,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Info not called") except BaseException as e: - self.assertEquals("info", str(e)) + self.assertEqual("info", str(e)) # action try: @@ -72,7 +72,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Stop not called") except BaseException as e: - self.assertEquals("stop", str(e)) + self.assertEqual("stop", str(e)) # expire try: @@ -80,7 +80,7 @@ def test_active_plugins(self): self.redis_monitor._process_plugin(plugin) self.fail("Expire not called") except BaseException as e: - self.assertEquals("expire", str(e)) + self.assertEqual("expire", str(e)) # test that an 
exception within a handle method is caught self.redis_monitor._process_failures = MagicMock() @@ -158,13 +158,13 @@ def test_load_stats_plugins(self): 'ZookeeperMonitor' ] - self.assertEquals( + self.assertEqual( sorted(self.redis_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.redis_monitor.plugins_dict: plugin_name = self.redis_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( list(self.redis_monitor.stats_dict['plugins'][plugin_name].keys()), ['lifetime']) @@ -183,13 +183,13 @@ def test_load_stats_plugins(self): self.redis_monitor._setup_stats_plugins() - self.assertEquals( + self.assertEqual( sorted(self.redis_monitor.stats_dict['plugins'].keys()), sorted(defaults)) for key in self.redis_monitor.plugins_dict: plugin_name = self.redis_monitor.plugins_dict[key]['instance'].__class__.__name__ - self.assertEquals( + self.assertEqual( sorted([str(x) for x in self.redis_monitor.stats_dict['plugins'][plugin_name].keys()]), sorted(good)) @@ -197,12 +197,12 @@ def test_load_stats_plugins(self): k1 = 'stats:redis-monitor:{p}'.format(p=plugin_key) for time_key in self.redis_monitor.stats_dict['plugins'][plugin_key]: if time_key == 0: - self.assertEquals( + self.assertEqual( self.redis_monitor.stats_dict['plugins'][plugin_key][0].key, '{k}:lifetime'.format(k=k1) ) else: - self.assertEquals( + self.assertEqual( self.redis_monitor.stats_dict['plugins'][plugin_key][time_key].key, '{k}:{t}'.format(k=k1, t=time_key) ) @@ -216,7 +216,7 @@ def test_main_loop(self): self.redis_monitor._main_loop() self.fail("_process_plugin not called") except BaseException as e: - self.assertEquals("normal", str(e)) + self.assertEqual("normal", str(e)) def test_precondition(self): self.redis_monitor.stats_dict = {} @@ -235,12 +235,12 @@ def test_precondition(self): self.redis_monitor._process_key_val(instance, key, value) self.fail('handler not called') except BaseException as e: - self.assertEquals('handler', str(e)) + self.assertEqual('handler', str(e)) def test_get_fail_key(self): key = 'test' result = 'lock:test:failures' - self.assertEquals(self.redis_monitor._get_fail_key(key), result) + self.assertEqual(self.redis_monitor._get_fail_key(key), result) def test_process_failures(self): self.redis_monitor.settings = {'RETRY_FAILURES':True, diff --git a/rest/tests/online.py b/rest/tests/online.py index fb568618..6bdecdb2 100644 --- a/rest/tests/online.py +++ b/rest/tests/online.py @@ -39,10 +39,10 @@ def test_status(self): r = requests.get('http://127.0.0.1:{p}'.format(p=self.port_number)) results = r.json() - self.assertEquals(results['node_health'], 'GREEN') + self.assertEqual(results['node_health'], 'GREEN') def tearDown(self): self.rest_service.close() if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/rest/tests/test_rest_service.py b/rest/tests/test_rest_service.py index 2f2772f7..01ca12df 100644 --- a/rest/tests/test_rest_service.py +++ b/rest/tests/test_rest_service.py @@ -61,7 +61,7 @@ def test_load_schemas_bad(self): @mock.patch('six.moves.builtins.open', mock_open(read_data='{\"stuff\":\"value\"}'), create=True) def test_load_schemas_bad(self): self.rest_service._load_schemas() - self.assertEquals(self.rest_service.schemas, + self.assertEqual(self.rest_service.schemas, {'hey2': {u'stuff': u'value'}}) def test_process_messages(self): @@ -110,7 +110,7 @@ class a(object): m.value = message_string messages = [m] self.rest_service._process_messages() - 
self.assertEquals(self.rest_service.uuids, {'abc123': {u'uuid': u'abc123'}}) + self.assertEqual(self.rest_service.uuids, {'abc123': {u'uuid': u'abc123'}}) def test_send_result_to_redis(self): # test not connected @@ -216,8 +216,8 @@ def test_setup_kafka(self): self.rest_service._setup_kafka() except: pass - self.assertEquals(self.rest_service.consumer, None) - self.assertEquals(self.rest_service.producer, None) + self.assertEqual(self.rest_service.consumer, None) + self.assertEqual(self.rest_service.producer, None) # test if everything flows through self.rest_service._create_consumer = MagicMock() @@ -232,7 +232,7 @@ def test_create_ret_object(self): "data": None, "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE), r) # success r = { @@ -240,7 +240,7 @@ def test_create_ret_object(self): "data": None, "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS), r) # data r = { @@ -248,7 +248,7 @@ def test_create_ret_object(self): "data": 'blah', "error": None } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS, data='blah'), r) + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.SUCCESS, data='blah'), r) # error message r = { @@ -258,7 +258,7 @@ def test_create_ret_object(self): "message": 'err' } } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, error=True, error_message='err'), r) @@ -271,7 +271,7 @@ def test_create_ret_object(self): "cause": "the cause" } } - self.assertEquals(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, + self.assertEqual(self.rest_service._create_ret_object(status=self.rest_service.FAILURE, error=True, error_message='err', error_cause="the cause"), r) @@ -306,7 +306,7 @@ def test_close(self): self.rest_service._close_thread = MagicMock() self.rest_service.close() - self.assertEquals(self.rest_service._close_thread.call_count, 4) + self.assertEqual(self.rest_service._close_thread.call_count, 4) self.assertTrue(self.rest_service.closed) self.assertTrue(self.rest_service.consumer.close.called) self.assertTrue(self.rest_service.producer.close.called) @@ -314,19 +314,19 @@ def test_close(self): def test_calculate_health(self): self.rest_service.redis_connected = False self.rest_service.kafka_connected = False - self.assertEquals(self.rest_service._calculate_health(), "RED") + self.assertEqual(self.rest_service._calculate_health(), "RED") self.rest_service.redis_connected = True self.rest_service.kafka_connected = False - self.assertEquals(self.rest_service._calculate_health(), "YELLOW") + self.assertEqual(self.rest_service._calculate_health(), "YELLOW") self.rest_service.redis_connected = False self.rest_service.kafka_connected = True - self.assertEquals(self.rest_service._calculate_health(), "YELLOW") + self.assertEqual(self.rest_service._calculate_health(), "YELLOW") self.rest_service.redis_connected = True self.rest_service.kafka_connected = True - self.assertEquals(self.rest_service._calculate_health(), "GREEN") + self.assertEqual(self.rest_service._calculate_health(), "GREEN") def test_feed_to_kafka(self): self.rest_service.producer = MagicMock() @@ -352,7 
+352,7 @@ def test_log_call(self): override.test_log_call() self.assertTrue(override.logger.info.called) - self.assertEquals(override.logger.info.call_args[0][0], "test logger") + self.assertEqual(override.logger.info.call_args[0][0], "test logger") def test_error_catch(self): override = Override('settings.py') @@ -363,7 +363,7 @@ def test_error_catch(self): override.logger.error = MagicMock() results = override.test_error1() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], "Uncaught Exception Thrown") d = { u'data': None, @@ -373,8 +373,8 @@ def test_error_catch(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test normal response with self.rest_service.app.test_request_context(): @@ -382,8 +382,8 @@ def test_error_catch(self): results = override.test_error2() self.assertFalse(override.logger.error.called) data = json.loads(results[0].data) - self.assertEquals(data, 'test data') - self.assertEquals(results[1], 200) + self.assertEqual(data, 'test data') + self.assertEqual(results[1], 200) # test normal response with alternate response code with self.rest_service.app.test_request_context(): @@ -391,8 +391,8 @@ def test_error_catch(self): results = override.test_error3() self.assertFalse(override.logger.error.called) data = json.loads(results[0].data) - self.assertEquals(data, 'test data') - self.assertEquals(results[1], 109) + self.assertEqual(data, 'test data') + self.assertEqual(results[1], 109) def test_validate_json(self): override = Override('settings.py') @@ -404,7 +404,7 @@ def test_validate_json(self): content_type='application/json'): results = override.test_json() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], 'The payload must be valid JSON.') d = { @@ -415,8 +415,8 @@ def test_validate_json(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # no json data = '["a list", ashdasd ,\\ !]' @@ -424,7 +424,7 @@ def test_validate_json(self): self.rest_service.logger.error.reset_mock() results = override.test_json() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], 'The payload must be valid JSON.') d = { @@ -435,8 +435,8 @@ def test_validate_json(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # good json data = '["a list", "2", "3"]' @@ -445,7 +445,7 @@ def test_validate_json(self): override.logger.reset_mock() results = override.test_json() self.assertFalse(override.logger.error.called) - self.assertEquals(results, 'data') + self.assertEqual(results, 'data') def test_validate_schema(self): override = Override('settings.py') @@ -473,7 +473,7 @@ def test_validate_schema(self): content_type='application/json'): results = override.test_schema() self.assertFalse(override.logger.error.called) - self.assertEquals(results, 'data') + self.assertEqual(results, 'data') # invalid schema data = u'{"value": "data here", 
"otherkey": "bad data"}' @@ -481,7 +481,7 @@ def test_validate_schema(self): content_type='application/json'): results = override.test_schema() self.assertTrue(override.logger.error.called) - self.assertEquals(override.logger.error.call_args[0][0], + self.assertEqual(override.logger.error.call_args[0][0], "Invalid Schema") if six.PY3: @@ -497,8 +497,8 @@ def test_validate_schema(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 400) + self.assertEqual(data, d) + self.assertEqual(results[1], 400) # Routes ------------------ @@ -515,7 +515,7 @@ def test_index(self): "node_health": 'RED' } data = json.loads(results[0].data) - self.assertEquals(data, d) + self.assertEqual(data, d) def test_feed(self): # test not connected @@ -534,8 +534,8 @@ def test_feed(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # connected self.rest_service.kafka_connected = True @@ -554,8 +554,8 @@ def test_feed(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test no uuid self.rest_service._feed_to_kafka = MagicMock(return_value=True) @@ -568,8 +568,8 @@ def test_feed(self): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) # test with uuid, got response time_list = [0, 1, 2, 3, 4, 5] @@ -590,8 +590,8 @@ def fancy_get_time(): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) self.assertFalse('key' in self.rest_service.uuids) # test with uuid, no response @@ -609,10 +609,10 @@ def fancy_get_time2(): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) self.assertTrue('key' in self.rest_service.uuids) - self.assertEquals(self.rest_service.uuids['key'], 'poll') + self.assertEqual(self.rest_service.uuids['key'], 'poll') def test_poll(self): orig = self.rest_service.validator @@ -635,8 +635,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test connected found poll key self.rest_service.redis_conn = MagicMock() @@ -651,8 +651,8 @@ def test_poll(self): u'status': u'SUCCESS' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 200) + self.assertEqual(data, d) + self.assertEqual(results[1], 200) # test connected didnt find poll key self.rest_service.redis_conn.get = MagicMock(return_value=None) @@ -668,8 +668,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 404) + self.assertEqual(data, d) + self.assertEqual(results[1], 404) # test connection error self.rest_service._spawn_redis_connection_thread = MagicMock() @@ -679,7 +679,7 @@ def test_poll(self): self.rest_service.redis_conn.get = MagicMock(side_effect=ConnectionError) results = 
self.rest_service.poll() self.assertTrue(self.rest_service.logger.error.called) - self.assertEquals(self.rest_service.logger.error.call_args[0][0], "Lost connection to Redis") + self.assertEqual(self.rest_service.logger.error.call_args[0][0], "Lost connection to Redis") self.assertTrue(self.rest_service._spawn_redis_connection_thread.called) d = { @@ -690,8 +690,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) # test value error self.rest_service.logger.warning = MagicMock() @@ -700,7 +700,7 @@ def test_poll(self): self.rest_service.redis_conn.get = MagicMock(side_effect=ValueError) results = self.rest_service.poll() self.assertTrue(self.rest_service.logger.warning.called) - self.assertEquals(self.rest_service.logger.warning.call_args[0][0], "Unparseable JSON Received from redis") + self.assertEqual(self.rest_service.logger.warning.call_args[0][0], "Unparseable JSON Received from redis") d = { u'data': None, @@ -710,8 +710,8 @@ def test_poll(self): u'status': u'FAILURE' } data = json.loads(results[0].data) - self.assertEquals(data, d) - self.assertEquals(results[1], 500) + self.assertEqual(data, d) + self.assertEqual(results[1], 500) self.rest_service.validator = orig diff --git a/utils/tests/online.py b/utils/tests/online.py index ffa06be7..2ab0930d 100644 --- a/utils/tests/online.py +++ b/utils/tests/online.py @@ -405,10 +405,10 @@ def test_get_file_contents(self): ensure=False, valid_init=True) - self.assertEquals(self.zoo_watcher.get_file_contents(), self.file_data) - self.assertEquals(pointer_zoo_watcher.get_file_contents(), + self.assertEqual(self.zoo_watcher.get_file_contents(), self.file_data) + self.assertEqual(pointer_zoo_watcher.get_file_contents(), self.file_data) - self.assertEquals(pointer_zoo_watcher.get_file_contents(True), + self.assertEqual(pointer_zoo_watcher.get_file_contents(True), self.pointer_data) pointer_zoo_watcher.close() diff --git a/utils/tests/test_log_factory.py b/utils/tests/test_log_factory.py index 608af9d9..204cf7cb 100644 --- a/utils/tests/test_log_factory.py +++ b/utils/tests/test_log_factory.py @@ -313,8 +313,8 @@ def test_preserve_data(self): extras = {"key": "value", 'a': [1, 2, 3]} def cb(log_message=None, log_extra=None): - self.assertEquals(log_message, message) - self.assertEquals(log_extra, extras) + self.assertEqual(log_message, message) + self.assertEqual(log_extra, extras) self.logger.register_callback('>DEBUG', cb) self.logger.log_level = 'INFO' diff --git a/utils/tests/test_redis_queue.py b/utils/tests/test_redis_queue.py index 14c152f2..be2d0108 100644 --- a/utils/tests/test_redis_queue.py +++ b/utils/tests/test_redis_queue.py @@ -30,16 +30,16 @@ def test_encode(self): q = Base(MagicMock(), 'key', pickle) # python pickling is different between versions data = pickle.dumps('cool', protocol=-1).decode('latin1') - self.assertEquals(q._encode_item('cool'), data) + self.assertEqual(q._encode_item('cool'), data) q2 = Base(MagicMock(), 'key', ujson) - self.assertEquals(q2._encode_item('cool2'), '"cool2"') + self.assertEqual(q2._encode_item('cool2'), '"cool2"') def test_decode(self): q = Base(MagicMock(), 'key', pickle) - self.assertEquals(q._decode_item(u"\x80\x02U\x04coolq\x00."), 'cool') + self.assertEqual(q._decode_item(u"\x80\x02U\x04coolq\x00."), 'cool') q2 = Base(MagicMock(), 'key', ujson) - self.assertEquals(q2._decode_item('"cool2"'), 'cool2') + 
self.assertEqual(q2._decode_item('"cool2"'), 'cool2') def test_len(self): with self.assertRaises(NotImplementedError): @@ -79,7 +79,7 @@ def test_push(self): def test_pop(self): self.assertTrue(hasattr(self.queue, 'pop')) self.queue.server.rpop = MagicMock(return_value='"stuff"') - self.assertEquals(self.queue.pop(), "stuff") + self.assertEqual(self.queue.pop(), "stuff") class TestRedisPriorityQueue(QueueMixin, TestCase): @@ -99,7 +99,7 @@ def test_pop(self): m = MagicMock() m.execute = MagicMock(return_value=[{}, 60]) self.queue.server.pipeline = MagicMock(return_value=m) - self.assertEquals(self.queue.pop(), None) + self.assertEqual(self.queue.pop(), None) class TestRedisStack(QueueMixin, TestCase): @@ -117,4 +117,4 @@ def test_push(self): def test_pop(self): self.assertTrue(hasattr(self.queue, 'pop')) self.queue.server.lpop = MagicMock(return_value='"stuff"') - self.assertEquals(self.queue.pop(), "stuff") + self.assertEqual(self.queue.pop(), "stuff") diff --git a/utils/tests/test_zookeeper_watcher.py b/utils/tests/test_zookeeper_watcher.py index fc5b52c8..05449c6e 100644 --- a/utils/tests/test_zookeeper_watcher.py +++ b/utils/tests/test_zookeeper_watcher.py @@ -30,13 +30,13 @@ def test_get_file_contents(self): self.zoo_watcher.old_data = 'old_data' self.zoo_watcher.pointer = False - self.assertEquals(self.zoo_watcher.get_file_contents(), 'old_data') + self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data') self.zoo_watcher.pointer = True - self.assertEquals(self.zoo_watcher.get_file_contents(), 'old_data') + self.assertEqual(self.zoo_watcher.get_file_contents(), 'old_data') self.zoo_watcher.pointer = True - self.assertEquals(self.zoo_watcher.get_file_contents(True), 'old_pointed') + self.assertEqual(self.zoo_watcher.get_file_contents(True), 'old_pointed') def test_compare_pointer(self): self.zoo_watcher.old_pointed = '/path1' From 3684a9feb532ff02c3663419bad84066c20aaea3 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Mon, 6 Nov 2017 17:50:45 +0000 Subject: [PATCH 08/22] Update dmoz.org to dmoztools.net since dmoz.org now redirects. 
(#145) (#147) --- crawler/config/example.yml | 4 ++-- crawler/tests/test_distributed_scheduler.py | 2 +- docs/topics/crawler/controlling.rst | 2 +- docs/topics/crawler/extension.rst | 12 ++++++------ docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/api.rst | 12 ++++++------ redis-monitor/tests/test_plugins.py | 12 ++++++------ 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crawler/config/example.yml b/crawler/config/example.yml index f1b21897..f96f282c 100644 --- a/crawler/config/example.yml +++ b/crawler/config/example.yml @@ -1,9 +1,9 @@ domains: - dmoz.org: + dmoztools.net: window: 60 hits: 60 scale: 1.0 wikipedia.org: window: 60 hits: 30 - scale: 0.5 \ No newline at end of file + scale: 0.5 diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index 86c50662..ca22b566 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -220,7 +220,7 @@ class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase): def test_change_config(self): good_string = ""\ "domains:\n"\ - " dmoz.org:\n"\ + " dmoztools.net:\n"\ " window: 60\n"\ " hits: 60\n"\ " scale: 1.0\n"\ diff --git a/docs/topics/crawler/controlling.rst b/docs/topics/crawler/controlling.rst index 4daf1371..1b345c64 100644 --- a/docs/topics/crawler/controlling.rst +++ b/docs/topics/crawler/controlling.rst @@ -141,7 +141,7 @@ To utilize the different throttle mechanisms you can alter the following setting Combining Domain Queues and Throttling -------------------------------------- -At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoz.org:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process. +At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoztools.net:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process. This results in Spiders across the cluster continually polling all available domain queues for new requests, but only receiving requests when the throttle mechanism indicates that the request limit has not gone beyond the max desired configuration. Because the throttle coordination is conducted via Redis, it is not reliant on any one Scrapy process to determine whether the cluster can or can't crawl a particular domain. diff --git a/docs/topics/crawler/extension.rst b/docs/topics/crawler/extension.rst index 85f91380..57c33980 100644 --- a/docs/topics/crawler/extension.rst +++ b/docs/topics/crawler/extension.rst @@ -199,7 +199,7 @@ Then, feed your cluster. :: - python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}' If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kafkadump.py`` script, you will begin to see output like so... 
@@ -208,8 +208,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf { "body": , "crawlid": "test123456", - "response_url": "http://www.dmoz.org/", - "url": "http://www.dmoz.org/", + "response_url": "http://www.dmoztools.net/", + "url": "http://www.dmoztools.net/", "status_code": 200, "status_msg": "OK", "appid": "testapp", @@ -228,8 +228,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf { "body": , "crawlid": "test123456", - "response_url": "http://www.dmoz.org/Computers/Hardware/", - "url": "http://www.dmoz.org/Computers/Hardware/", + "response_url": "http://www.dmoztools.net/Computers/Hardware/", + "url": "http://www.dmoztools.net/Computers/Hardware/", "status_code": 200, "status_msg": "OK", "appid": "testapp", @@ -273,4 +273,4 @@ You can also fire up more than one crawl job at a time, and track the steps that "wandering_spider_count": 4 } -You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities. \ No newline at end of file +You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities. diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index cf63c8ea..e82506f7 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -460,7 +460,7 @@ Crawl Request: :: - python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}' Now send an ``info`` action request to see what is going on with the crawl: @@ -478,7 +478,7 @@ The following things will occur for this action request: :: - {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoz.org': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'} + {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoztools.net': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'} In this case we had 25 urls pending in the queue, so yours may be slightly different. 
diff --git a/docs/topics/kafka-monitor/api.rst b/docs/topics/kafka-monitor/api.rst index 95614f4f..cb417c2c 100644 --- a/docs/topics/kafka-monitor/api.rst +++ b/docs/topics/kafka-monitor/api.rst @@ -74,9 +74,9 @@ Kafka Request: :: - $ python kafka_monitor.py feed '{"url": "http://www.dmoz.org/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}' + $ python kafka_monitor.py feed '{"url": "http://www.dmoztools.net/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}' - - Submits a dmoz.org crawl spidering 2 levels deep with a high priority + - Submits a dmoztools.net crawl spidering 2 levels deep with a high priority :: @@ -899,7 +899,7 @@ Zookeeper Request: :: - $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoz.org", "hits":60, "window":60, "scale":0.9}' + $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoztools.net", "hits":60, "window":60, "scale":0.9}' Response from Kafka: @@ -907,7 +907,7 @@ Response from Kafka: { "action": "domain-update", - "domain": "dmoz.org", + "domain": "dmoztools.net", "server_time": 1464402128, "uuid": "abc123", "appid": "madisonTest" @@ -923,7 +923,7 @@ Zookeeper Request: :: - $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoz.org"}' + $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoztools.net"}' Response from Kafka: @@ -931,7 +931,7 @@ Response from Kafka: { "action": "domain-remove", - "domain": "dmoz.org", + "domain": "dmoztools.net", "server_time": 1464402146, "uuid": "abc123", "appid": "madisonTest" diff --git a/redis-monitor/tests/test_plugins.py b/redis-monitor/tests/test_plugins.py index 056f4a50..324b6d84 100644 --- a/redis-monitor/tests/test_plugins.py +++ b/redis-monitor/tests/test_plugins.py @@ -322,7 +322,7 @@ def test_stats_get_queue(self): 'link:istresearch.com:queue', 'link:yellowpages.com:queue', 'link:cnn.com:queue', - 'wandering:dmoz.org:queue', + 'wandering:dmoztools.net:queue', 'wandering:craigslist.org:queue', ]) results = [5, 10, 11, 1, 3] @@ -349,7 +349,7 @@ def ret_val(*args): 'spider_backlog': 4, 'num_domains': 2, 'domains': [ - {'domain': 'dmoz.org', 'backlog': 1}, + {'domain': 'dmoztools.net', 'backlog': 1}, {'domain': 'craigslist.org', 'backlog': 3}, ] } @@ -395,20 +395,20 @@ def test_zk_regex(self): def test_zk_handle_du(self): # domain update - s = b'blacklist: []\ndomains:\n dmoz.org: {hits: 60, scale: 1.0, window: 60}\n' + s = b'blacklist: []\ndomains:\n dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n' val = '{"uuid":"blah123","hits":15,"scale":0.9,"window":60}' - expected = b'blacklist: []\ndomains:\n cnn.com:\n hits: 15\n scale: 0.9\n window: 60\n dmoz.org:\n hits: 60\n scale: 1.0\n window: 60\n' + expected = b'blacklist: []\ndomains:\n cnn.com:\n hits: 15\n scale: 0.9\n window: 60\n dmoztools.net:\n hits: 60\n scale: 1.0\n window: 60\n' self.plugin.zoo_client.get = MagicMock(return_value=(s,)) self.plugin.handle(key="zk:domain-update:cnn.com:testapp", value=val) self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected) def test_zk_handle_dr(self): # domain remove - s = b'blacklist: []\ndomains:\n dmoz.org: {hits: 60, scale: 1.0, window: 60}\n' + s = b'blacklist: []\ndomains:\n dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n' val = '{"uuid":"blah123"}' expected = b'blacklist: []\ndomains: {}\n' 
self.plugin.zoo_client.get = MagicMock(return_value=(s,)) - self.plugin.handle(key="zk:domain-remove:dmoz.org:testapp", value=val) + self.plugin.handle(key="zk:domain-remove:dmoztools.net:testapp", value=val) self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected) def test_zk_handle_bu(self): From 62be326ac1db80a38a9b268e56869858c4bca4ef Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Tue, 7 Nov 2017 11:11:51 -0500 Subject: [PATCH 09/22] Fix typo Fixes #148 --- crawler/crawling/distributed_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index fb4fbb7d..58edb89c 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -82,7 +82,7 @@ def __init__(self, server, persist, update_int, timeout, retries, logger, self.ip_update_interval = ip_refresh self.add_type = add_type self.add_ip = add_ip - self.item_retires = retries + self.item_retries = retries self.logger = logger self.ip_regex = re.compile(ip_regex) self.backlog_blacklist = backlog_blacklist From 95d524772ab8d91526b80a1ea161be52ff2e8d90 Mon Sep 17 00:00:00 2001 From: Russ Ferriday Date: Tue, 7 Nov 2017 16:14:09 +0000 Subject: [PATCH 10/22] Trivial tense/typos (#146) * Trivial tense/typos * Update dmoz.org to dmoztools.net since dmoz.org now redirects. (#145) (#147) --- docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/design.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index e82506f7..78655b4a 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -360,7 +360,7 @@ At this point you should have a Scrapy Cluster setup that has been tested and ap .. note:: You can append ``&`` to the end of the following commands to run them in the background, but we recommend you open different terminal windows to first get a feel of how the cluster operates. -The following commands outline what you would run in a traditional environment. If using a container based solution these commands are ran when you run the container itself. +The following commands outline what you would run in a traditional environment. If using a container based solution these commands are run when you run the container itself. **Bare Bones:** @@ -425,7 +425,7 @@ Which ever setup you chose, every process within should stay running for the rem .. note:: If you chose to set the Rest service up, this section may also be performed via the :doc:`../rest/index` endpoint. You just need to ensure the JSON identified in the following section is properly fed into the :ref:`feed ` rest endpoint. -*The follwing commands can be ran from the command line, whether that is on the machine itself or inside the Kafka Monitor container depends on the setup chosen above.* +*The following commands can be run from the command line, whether that is on the machine itself or inside the Kafka Monitor container depends on the setup chosen above.* 1) We now need to feed the cluster a crawl request. This is done via the same Kafka Monitor python script, but with different command line arguements. 
diff --git a/docs/topics/kafka-monitor/design.rst b/docs/topics/kafka-monitor/design.rst index 1558d571..827e95c6 100644 --- a/docs/topics/kafka-monitor/design.rst +++ b/docs/topics/kafka-monitor/design.rst @@ -11,9 +11,9 @@ Soon enough those same applications wanted the ability to retrieve information a The Kafka Monitor reads from the desired inbound Kafka topic, and applies the currently loaded Plugin's JSON APIs to the received message. The first Plugin to have a valid `JSON Schema `_ for the received JSON object is then allowed to do its own processing and manipulation of the object. -In Scrapy Cluster's use case, the default Plugins write their requests into Redis keys, but the functionality does not stop there. The Kafka Monitor settings can alter which plugins are loaded, or add new plugins to extend functionality. These modules allow the Kafka Monitor core to have a small footprint but allow extension or different plugins to be ran. +In Scrapy Cluster's use case, the default Plugins write their requests into Redis keys, but the functionality does not stop there. The Kafka Monitor settings can alter which plugins are loaded, or add new plugins to extend functionality. These modules allow the Kafka Monitor core to have a small footprint but allow extension or different plugins to be run. -The Kafka Monitor can be ran as a single process, or part of the same Kafka consumer group spread across multiple machines and processes. This allows distributed and fault tolerant throughput to ensure the crawl requests to the cluster are always read. +The Kafka Monitor can be run as a single process, or part of the same Kafka consumer group spread across multiple machines and processes. This allows distributed and fault tolerant throughput to ensure the crawl requests to the cluster are always read. From our own internal debugging and ensuring other applications were working properly, a utility program called Kafka Dump was also created in order to be able to interact and monitor the kafka messages coming through. This is a small dump utility with no external dependencies, to allow users to get an insight into what is being passed through the Kafka topics within the cluster. From 4775c06bf7249639e3f7ff763fa600a11e57b62b Mon Sep 17 00:00:00 2001 From: innspyros <30524071+innspyros@users.noreply.github.com> Date: Thu, 9 Nov 2017 17:51:45 +0200 Subject: [PATCH 11/22] vm.max_map_count tip (#151) docker eleasticsearch container not running when the vm.max_map_count has the default value which is too low. --- docs/topics/advanced/docker.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/topics/advanced/docker.rst b/docs/topics/advanced/docker.rst index fca01884..8aa77be2 100644 --- a/docs/topics/advanced/docker.rst +++ b/docs/topics/advanced/docker.rst @@ -160,6 +160,18 @@ You can ensure everything started up via: /usr/sbin/sshd ... p, 22/tcp, 2888/tcp, 3888/tcp +TIP + +In the unfortunate case that elasticsearch is not running and the following message shows up into the logs: + +:: + + ERROR: bootstrap checks failed max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144] + +you have to edit the virtual memory settings of the machine you are running docker onto. + +For more info about the needed edits you can follow the link to the `official Elasticsearch documentation `_ . + From here, please continue to the :ref:`Kibana ` portion of the :doc:`ELK ` integration guide. 
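A quick way to check the host before starting the containers is sketched below. It is only an illustration and assumes a Linux Docker host, where the kernel exposes the live value at /proc/sys/vm/max_map_count; the 262144 floor comes from the Elasticsearch bootstrap error quoted above.

::

    # sketch: verify vm.max_map_count on a Linux Docker host
    ES_MINIMUM = 262144  # floor taken from the Elasticsearch bootstrap check above

    def current_max_map_count(path='/proc/sys/vm/max_map_count'):
        # the kernel publishes the live value as a plain integer in procfs
        with open(path) as f:
            return int(f.read().strip())

    value = current_max_map_count()
    if value < ES_MINIMUM:
        # raise it with: sudo sysctl -w vm.max_map_count=262144
        print("vm.max_map_count is %d, needs to be at least %d" % (value, ES_MINIMUM))
    else:
        print("vm.max_map_count is %d, ok" % value)

Raising the value with ``sysctl -w`` only lasts until reboot, so the persistent change belongs in ``/etc/sysctl.conf`` as the Elasticsearch documentation describes.
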
From 61d8809e5bc08596c7f96234998da4173fceb182 Mon Sep 17 00:00:00 2001 From: innspyros <30524071+innspyros@users.noreply.github.com> Date: Thu, 30 Nov 2017 20:02:21 +0200 Subject: [PATCH 12/22] Ansible kafka zookeeper host list. (#152) * vm.max_map_count tip docker eleasticsearch container not running when the vm.max_map_count has the default value which is too low. * Ansible Kafka template consumer.properties.j2 had a hardcoded zookeeper.connect value. Now the value is created from the iteration of the zookeeper_host_list (host and port). --- ansible/roles/kafka/templates/consumer.properties.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kafka/templates/consumer.properties.j2 b/ansible/roles/kafka/templates/consumer.properties.j2 index 10b7bf4d..1eb7e368 100644 --- a/ansible/roles/kafka/templates/consumer.properties.j2 +++ b/ansible/roles/kafka/templates/consumer.properties.j2 @@ -17,7 +17,7 @@ # Zookeeper connection string # comma separated host:port pairs, each corresponding to a zk # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" -zookeeper.connect=127.0.0.1:2181 +zookeeper.connect={% for host in zookeeper_host_list %}{{ host }}:{{ zookeeper_client_port|default(2181) }}{% if not loop.last %},{% endif %}{% endfor %} # timeout in ms for connecting to zookeeper zookeeper.connection.timeout.ms=6000 From bc6977ab6d8805d4c1282d01d28e349fe4ce37cd Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Fri, 5 Jan 2018 10:11:43 +0200 Subject: [PATCH 13/22] Add auto encoding resolution for crawling items. --- crawler/crawling/items.py | 1 + crawler/crawling/pipelines.py | 4 +++- crawler/crawling/spiders/link_spider.py | 1 + crawler/crawling/spiders/wandering_spider.py | 1 + kafka-monitor/kafkadump.py | 4 +++- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/crawler/crawling/items.py b/crawler/crawling/items.py index bec5f279..68f8537e 100644 --- a/crawler/crawling/items.py +++ b/crawler/crawling/items.py @@ -19,3 +19,4 @@ class RawResponseItem(Item): attrs = Field() success = Field() exception = Field() + encoding = Field() diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 5baa710a..653a9071 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,7 +181,9 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: - if self.use_base64: + if self.use_base64 and 'encoding' in datum: + datum['body'] = base64.b64encode(bytes(datum['body'])) + elif self.use_base64: datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8')) message = ujson.dumps(datum, sort_keys=True) except: diff --git a/crawler/crawling/spiders/link_spider.py b/crawler/crawling/spiders/link_spider.py index 32af800b..289c6a63 100644 --- a/crawler/crawling/spiders/link_spider.py +++ b/crawler/crawling/spiders/link_spider.py @@ -40,6 +40,7 @@ def parse(self, response): item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers item["body"] = response.body + item["encoding"] = response.encoding item["links"] = [] # determine whether to continue spidering diff --git a/crawler/crawling/spiders/wandering_spider.py b/crawler/crawling/spiders/wandering_spider.py index 7b856769..66084569 100644 --- a/crawler/crawling/spiders/wandering_spider.py +++ b/crawler/crawling/spiders/wandering_spider.py @@ -46,6 +46,7 @@ def parse(self, response): item["response_headers"] = self.reconstruct_headers(response) item["request_headers"] = response.request.headers 
item["body"] = response.body + item["encoding"] = response.encoding item["links"] = [] # we want to know how far our spider gets if item['attrs'] is None: diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index ff77cc29..9b4b9061 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -126,7 +126,9 @@ def main(): val = message.value try: item = json.loads(val) - if args['decode_base64'] and 'body' in item: + if args['decode_base64'] and 'body' in item and 'encoding' in item: + item['body'] = base64.b64decode(item['body']).decode(item['encoding']) + elif args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: From 580de6f067014c3851d3a95f51d9d6df2d991950 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Mon, 15 Jan 2018 16:20:29 +0200 Subject: [PATCH 14/22] Automatic discovery of html encoding --- crawler/crawling/pipelines.py | 11 +++-- crawler/tests/online.py | 2 +- crawler/tests/test_pipelines.py | 76 +++++++++++++++++++++++++++++++-- kafka-monitor/kafkadump.py | 14 ++++-- rest/tests/test_rest_service.py | 34 +++++++-------- 5 files changed, 109 insertions(+), 28 deletions(-) diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 653a9071..99b9a0f2 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,10 +181,13 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: - if self.use_base64 and 'encoding' in datum: - datum['body'] = base64.b64encode(bytes(datum['body'])) - elif self.use_base64: - datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8')) + if self.use_base64: + if isinstance(datum['body'], str): + datum['body'] = bytes(datum['body'], datum['encoding']) + datum['body'] = base64.b64encode(datum['body']) + elif 'encoding' in datum and 'utf-8' != datum['encoding'] and datum['encoding']: + datum['body'] = datum['body'].decode(datum['encoding']) + message = ujson.dumps(datum, sort_keys=True) except: message = 'json failed to parse' diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 9cead8e4..ecf14af4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -38,7 +38,7 @@ class TestLinkSpider(TestCase): "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ - "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" + "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" def setUp(self): self.settings = get_project_settings() diff --git a/crawler/tests/test_pipelines.py b/crawler/tests/test_pipelines.py index e6b54ac2..69a73971 100644 --- a/crawler/tests/test_pipelines.py +++ b/crawler/tests/test_pipelines.py @@ -1,6 +1,9 @@ +# coding=utf-8 from builtins import object from unittest import TestCase import mock +import ujson +import base64 from mock import MagicMock from crawling.pipelines import (LoggingBeforePipeline, KafkaPipeline) from crawling.items import RawResponseItem @@ -24,6 +27,43 @@ def _get_item(self): item["request_headers"] = {} item["body"] = "text" item["links"] = [] + item["encoding"] = "utf-8" + + return item + + def _get_internationalized_utf8_item(self): + item = RawResponseItem() + item['appid'] = 'app' + item['crawlid'] = 'crawlid' + item['attrs'] = {} + item["url"] = "http://dumb.com" + item["response_url"] = "http://dumb.com" + item["status_code"] = 200 + 
item["status_msg"] = "OK" + item["response_headers"] = {} + item["request_headers"] = {} + item["body"] = u"This is a test - Αυτό είναι ένα τεστ - 这是一个测试 - これはテストです" + item["links"] = [] + item["encoding"] = "utf-8" + + return item + + def _get_internationalized_iso_item(self): + item = RawResponseItem() + item['appid'] = 'app' + item['crawlid'] = 'crawlid' + item['attrs'] = {} + item["url"] = "http://dumb.com" + item["response_url"] = "http://dumb.com" + item["status_code"] = 200 + item["status_msg"] = "OK" + item["response_headers"] = {} + item["request_headers"] = {} + # Fill the item["body"] with the string 'αυτό είναι ένα τεστ' that was encoded in iso-8859-7 + # using iconv and further encoded in base64 in order to store it inside this file. + item["body"] = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=') + item["links"] = [] + item["encoding"] = "iso-8859-7" return item @@ -75,7 +115,7 @@ def test_process_item(self, e): # test normal send, no appid topics self.pipe.process_item(item, spider) - expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + expected = '{"appid":"app","attrs":{},"body":"text","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', expected) self.pipe.producer.send.reset_mock() @@ -93,10 +133,40 @@ def test_process_item(self, e): self.pipe.appid_topics = False self.pipe.use_base64 = True self.pipe.process_item(item, spider) - expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' - self.pipe.producer.send.assert_called_once_with('prefix.crawled_firehose', + expected = '{"appid":"app","attrs":{},"body":"dGV4dA==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', expected) + # test base64 encode/decode with utf-8 encoding + item = self._get_internationalized_utf8_item() + self.pipe.appid_topics = False + self.pipe.use_base64 = True + self.pipe.process_item(item, spider) + expected = '{"appid":"app","attrs":{},"body":"VGhpcyBpcyBhIHRlc3QgLSDOkc+Fz4TPjCDOtc6vzr3Osc65IM6tzr3OsSDPhM61z4PPhCAtIOi\\/meaYr+S4gOS4qua1i+ivlSAtIOOBk+OCjOOBr+ODhuOCueODiOOBp+OBmQ==","crawlid":"crawlid","encoding":"utf-8","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', + expected) + # unpack the arguments used for the previous assertion call + call_args, call_kwargs = self.pipe.producer.send.call_args + crawl_args_dict = ujson.loads(call_args[1]) + decoded_string = base64.b64decode(crawl_args_dict['body']).decode(crawl_args_dict['encoding']) + self.assertEquals(decoded_string, item.get('body')) + + # test base64 encode/decode with 
iso encoding + item = self._get_internationalized_iso_item() + self.pipe.appid_topics = False + self.pipe.use_base64 = True + self.pipe.process_item(item, spider) + expected = '{"appid":"app","attrs":{},"body":"4fX0\\/CDl3+3h6SDd7eEg9OXz9Ao=","crawlid":"crawlid","encoding":"iso-8859-7","links":[],"request_headers":{},"response_headers":{},"response_url":"http:\\/\\/dumb.com","status_code":200,"status_msg":"OK","timestamp":"the time","url":"http:\\/\\/dumb.com"}' + self.pipe.producer.send.assert_called_with('prefix.crawled_firehose', + expected) + # unpack the arguments used for the previous assertion call + call_args, call_kwargs = self.pipe.producer.send.call_args + crawl_args_dict = ujson.loads(call_args[1]) + decoded_string = base64.b64decode(crawl_args_dict['body']).decode(crawl_args_dict['encoding']) + self.assertEquals(decoded_string, item.get('body').decode(item.get('encoding'))) + # Test again against the original (before it was encoded in iso) string + self.assertEquals(decoded_string, u"αυτό είναι ένα τεστ\n") + # test kafka exception item = self._get_item() copy = deepcopy(item) diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index 9b4b9061..1db8d797 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -10,6 +10,7 @@ import time import argparse import base64 +import six from scutils.settings_wrapper import SettingsWrapper from scutils.log_factory import LogFactory @@ -56,6 +57,10 @@ def main(): required=False, const=True, default=False, help="Do not include the raw html 'body' key in" " the json dump of the topic") + dump_parser.add_argument('-jb', '--just-body', action='store_const', + required=False, const=True, default=False, + help="Just print the raw html 'body' key in" + " the json dump of the topic") dump_parser.add_argument('-p', '--pretty', action='store_const', required=False, const=True, default=False, help="Pretty print the json objects consumed") @@ -126,10 +131,8 @@ def main(): val = message.value try: item = json.loads(val) - if args['decode_base64'] and 'body' in item and 'encoding' in item: + if args['decode_base64'] and 'body' in item: item['body'] = base64.b64decode(item['body']).decode(item['encoding']) - elif args['decode_base64'] and 'body' in item: - item['body'] = base64.b64decode(item['body']) if args['no_body'] and 'body' in item: del item['body'] @@ -140,6 +143,11 @@ def main(): if args['pretty']: print(json.dumps(item, indent=4)) + elif args['just_body']: + if six.PY2: + print(item['body'].encode('utf-8')) + else: + print(item['body']) else: print(item) num_records = num_records + 1 diff --git a/rest/tests/test_rest_service.py b/rest/tests/test_rest_service.py index 01ca12df..d2f1293a 100644 --- a/rest/tests/test_rest_service.py +++ b/rest/tests/test_rest_service.py @@ -372,7 +372,7 @@ def test_error_catch(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -381,7 +381,7 @@ def test_error_catch(self): override.logger.error.reset_mock() results = override.test_error2() self.assertFalse(override.logger.error.called) - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, 'test data') self.assertEqual(results[1], 200) @@ -390,7 +390,7 @@ def test_error_catch(self): override.logger.error.reset_mock() results = override.test_error3() self.assertFalse(override.logger.error.called) - data = json.loads(results[0].data) + data = 
json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, 'test data') self.assertEqual(results[1], 109) @@ -414,7 +414,7 @@ def test_validate_json(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -434,7 +434,7 @@ def test_validate_json(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -496,7 +496,7 @@ def test_validate_schema(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 400) @@ -514,7 +514,7 @@ def test_index(self): "my_id": 'a908', "node_health": 'RED' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) def test_feed(self): @@ -533,7 +533,7 @@ def test_feed(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -553,7 +553,7 @@ def test_feed(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -567,7 +567,7 @@ def test_feed(self): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) @@ -589,7 +589,7 @@ def fancy_get_time(): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) self.assertFalse('key' in self.rest_service.uuids) @@ -608,7 +608,7 @@ def fancy_get_time2(): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) self.assertTrue('key' in self.rest_service.uuids) @@ -634,7 +634,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -650,7 +650,7 @@ def test_poll(self): u'error': None, u'status': u'SUCCESS' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 200) @@ -667,7 +667,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 404) @@ -689,7 +689,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) @@ -709,7 +709,7 @@ def test_poll(self): }, u'status': u'FAILURE' } - data = json.loads(results[0].data) + data = json.loads(results[0].data.decode('utf-8')) self.assertEqual(data, d) self.assertEqual(results[1], 500) From d0725318b767b37a77b6bffe9a456eae27249d65 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Mon, 15 Jan 2018 17:49:44 +0200 Subject: [PATCH 15/22] Cleanup utf-8 logic --- 
crawler/crawling/pipelines.py | 10 ++++++++-- kafka-monitor/kafkadump.py | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index 99b9a0f2..b6352917 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -181,11 +181,17 @@ def process_item(self, item, spider): prefix = self.topic_prefix try: + # Get the encoding. If it's not a key of datum, return utf-8 + encoding = datum.get('encoding', 'utf-8') + if self.use_base64: + # When running in Python 2 datum['body'] is a string if isinstance(datum['body'], str): - datum['body'] = bytes(datum['body'], datum['encoding']) + datum['body'] = bytes(datum['body'], encoding) + # In Python 3 datum['body'] is already in byte form datum['body'] = base64.b64encode(datum['body']) - elif 'encoding' in datum and 'utf-8' != datum['encoding'] and datum['encoding']: + + elif 'utf-8' != encoding: datum['body'] = datum['body'].decode(datum['encoding']) message = ujson.dumps(datum, sort_keys=True) diff --git a/kafka-monitor/kafkadump.py b/kafka-monitor/kafkadump.py index 1db8d797..20e68a95 100644 --- a/kafka-monitor/kafkadump.py +++ b/kafka-monitor/kafkadump.py @@ -131,8 +131,11 @@ def main(): val = message.value try: item = json.loads(val) + # Get the encoding. If it's not a key of item, return utf-8. + encoding = item.get('encoding', 'utf-8') + if args['decode_base64'] and 'body' in item: - item['body'] = base64.b64decode(item['body']).decode(item['encoding']) + item['body'] = base64.b64decode(item['body']).decode(encoding) if args['no_body'] and 'body' in item: del item['body'] From 19470761051a54f1bd4d6421dd1856bff1be52a1 Mon Sep 17 00:00:00 2001 From: Spyros Vlachos Date: Wed, 17 Jan 2018 16:56:18 +0200 Subject: [PATCH 16/22] Fix spider name in crawler's online tests (#161) Because of the wrong spider name, a wrong redis directory ended up containing the queue key. This way the spider was not crawling the provided link. 
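For context on the key mentioned above: spider queues in Redis follow the ``<spiderid>:<domain>:queue`` pattern, so a wrong spider name lands the request under a key no running spider polls. The sketch below seeds such a queue by hand, mirroring the zadd call the online test uses (redis-py 2.x argument order) and the fields of the test's example feed; the connection settings and field values are illustrative only.

::

    import ujson
    import redis

    # illustrative values; the cluster normally derives the domain from the url
    spiderid = 'test-spider'
    domain = 'dmoztools.net'
    key = '{s}:{d}:queue'.format(s=spiderid, d=domain)  # test-spider:dmoztools.net:queue

    feed = {
        'url': 'http://dmoztools.net/',
        'crawlid': 'abc12345',
        'appid': 'test',
        'spiderid': spiderid,
        'priority': 1,
        'maxdepth': 0,
    }

    redis_conn = redis.Redis(host='localhost', port=6379)
    # member first, then score, matching the redis-py 2.x call in the online test
    redis_conn.zadd(key, ujson.dumps(feed), -99)
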
--- crawler/tests/online.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index 9cead8e4..ecf14af4 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -38,7 +38,7 @@ class TestLinkSpider(TestCase): "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ - "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" + "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" def setUp(self): self.settings = get_project_settings() From 5e20adf616ae915f2f57c341070fe1bae8846d42 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Wed, 17 Jan 2018 10:40:20 -0500 Subject: [PATCH 17/22] fix typo for consumer vs producer Resolves #163 --- rest/rest_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rest/rest_service.py b/rest/rest_service.py index 342a4f7f..a823b5f9 100644 --- a/rest/rest_service.py +++ b/rest/rest_service.py @@ -424,7 +424,7 @@ def _create_consumer(self): @retry(wait_exponential_multiplier=500, wait_exponential_max=10000) def _create_producer(self): - """Tries to establish a Kafka consumer connection""" + """Tries to establish a Kafka producer connection""" if not self.closed: try: self.logger.debug("Creating new kafka producer using brokers: " + From 9bd8c9558868177966c18b4a053404956cf88e03 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Wed, 17 Jan 2018 10:56:29 -0500 Subject: [PATCH 18/22] test at logging in on Travis without email flag --- travis/docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/docker.sh b/travis/docker.sh index 00a33ea7..85f68a73 100755 --- a/travis/docker.sh +++ b/travis/docker.sh @@ -42,7 +42,7 @@ if [ "$TRAVIS_BRANCH" = "dev" ] && [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$T sudo docker rmi istresearch/scrapy-cluster:rest-test # log into docker - sudo docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" + sudo docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" # push new containers sudo docker push istresearch/scrapy-cluster From dad16cd0e3c923e7e4bdc4e587f2c9e83270327c Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 10:30:11 -0500 Subject: [PATCH 19/22] Scutils LogFactory update to pass all available extras. This is a sneaky bug seen when you override the LogObject class `add_extras()` method, and then you would like to use the callback functionality with your newly returned values. Before, the callback would only be passed the original extra parameter, now it is passed the updated parameters. 
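A sketch of what this changes for callback users is shown below. It assumes the usual ``LogFactory.get_instance()`` entry point from scutils and the ``register_callback('>DEBUG', ...)`` form used in the unit tests; with this fix the ``log_extra`` argument is the dictionary returned by ``add_extras()``, including any keys appended by an override, rather than the bare dict passed at the call site.

::

    from scutils.log_factory import LogFactory

    # assumed entry point and settings; trimmed to the options relevant here
    logger = LogFactory.get_instance(json=True, stdout=True, level='INFO')

    def audit(log_message=None, log_extra=None):
        # after this patch log_extra is the enriched dict from add_extras(),
        # not just the {'component': 'scheduler'} passed at the call site
        print(log_message, sorted(log_extra.keys()))

    # same registration form as the unit tests
    logger.register_callback('>DEBUG', audit)
    logger.info("queue backlog updated", extra={'component': 'scheduler'})
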
--- utils/scutils/log_factory.py | 10 +++++----- utils/scutils/version.py | 2 +- utils/tests/test_log_factory.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/utils/scutils/log_factory.py b/utils/scutils/log_factory.py index a89b9fab..c0c16abd 100644 --- a/utils/scutils/log_factory.py +++ b/utils/scutils/log_factory.py @@ -208,7 +208,7 @@ def debug(self, message, extra={}): if self.level_dict['DEBUG'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "DEBUG") self._write_message(message, extras) - self.cb_handler.fire_callbacks('DEBUG', message, extra) + self.cb_handler.fire_callbacks('DEBUG', message, extras) def info(self, message, extra={}): ''' @@ -220,7 +220,7 @@ def info(self, message, extra={}): if self.level_dict['INFO'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "INFO") self._write_message(message, extras) - self.cb_handler.fire_callbacks('INFO', message, extra) + self.cb_handler.fire_callbacks('INFO', message, extras) def warn(self, message, extra={}): ''' @@ -241,7 +241,7 @@ def warning(self, message, extra={}): if self.level_dict['WARNING'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "WARNING") self._write_message(message, extras) - self.cb_handler.fire_callbacks('WARNING', message, extra) + self.cb_handler.fire_callbacks('WARNING', message, extras) def error(self, message, extra={}): ''' @@ -253,7 +253,7 @@ def error(self, message, extra={}): if self.level_dict['ERROR'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "ERROR") self._write_message(message, extras) - self.cb_handler.fire_callbacks('ERROR', message, extra) + self.cb_handler.fire_callbacks('ERROR', message, extras) def critical(self, message, extra={}): ''' @@ -265,7 +265,7 @@ def critical(self, message, extra={}): if self.level_dict['CRITICAL'] >= self.level_dict[self.log_level]: extras = self.add_extras(extra, "CRITICAL") self._write_message(message, extras) - self.cb_handler.fire_callbacks('CRITICAL', message, extra) + self.cb_handler.fire_callbacks('CRITICAL', message, extras) def _write_message(self, message, extra): ''' diff --git a/utils/scutils/version.py b/utils/scutils/version.py index d8b10d6e..d955d061 100644 --- a/utils/scutils/version.py +++ b/utils/scutils/version.py @@ -1,2 +1,2 @@ -__version__ = '1.3.0dev3' +__version__ = '1.3.0dev4' VERSION = tuple(int(x) for x in __version__.split('.')) diff --git a/utils/tests/test_log_factory.py b/utils/tests/test_log_factory.py index 204cf7cb..cff371fd 100644 --- a/utils/tests/test_log_factory.py +++ b/utils/tests/test_log_factory.py @@ -309,12 +309,14 @@ def multiply_5(log_message=None, log_extra=None): self.assertEqual(6, self.logger.x) def test_preserve_data(self): + self.logger._get_time = MagicMock(return_value='2015-11-12T10:11:12.0Z') message = "THIS IS A TEST" extras = {"key": "value", 'a': [1, 2, 3]} + extras_add = self.logger.add_extras(extras, 'INFO') def cb(log_message=None, log_extra=None): self.assertEqual(log_message, message) - self.assertEqual(log_extra, extras) + self.assertEqual(log_extra, extras_add) self.logger.register_callback('>DEBUG', cb) self.logger.log_level = 'INFO' From b35a1644256284b6217fd42ea4ec211621876c31 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 09:58:42 -0500 Subject: [PATCH 20/22] change istresearch.com to something more stable Hopefully this fixes the crawler integration tests which have been failing, tested locally and works --- crawler/tests/online.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/crawler/tests/online.py b/crawler/tests/online.py index ecf14af4..3ff68ebb 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -35,7 +35,7 @@ class CustomSpider(LinkSpider): class TestLinkSpider(TestCase): example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\ - "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\ + "crawlid\":\"abc12345\",\"url\":\"http://dmoztools.net/\",\"expires\":0,\""\ "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\ "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\ "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}" @@ -77,7 +77,7 @@ def test_crawler_process(self): d = runner.crawl(CustomSpider) d.addBoth(lambda _: reactor.stop()) # add crawl to redis - key = "test-spider:istresearch.com:queue" + key = "test-spider:dmoztools.net:queue" self.redis_conn.zadd(key, self.example_feed, -99) # run the spider, give 20 seconds to see the url, crawl it, From 99505d83e2eb5e810b54306a22eaea6391c40774 Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Fri, 19 Jan 2018 10:11:52 -0500 Subject: [PATCH 21/22] documentation updates for new url URL changed from istresearch.com to dmoztools.net for better stability --- docs/topics/advanced/rediskeys.rst | 2 +- docs/topics/introduction/quickstart.rst | 4 ++-- docs/topics/kafka-monitor/quickstart.rst | 10 +++++----- docs/topics/rest/api.rst | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/topics/advanced/rediskeys.rst b/docs/topics/advanced/rediskeys.rst index 77f74fa6..e5ca5d0a 100644 --- a/docs/topics/advanced/rediskeys.rst +++ b/docs/topics/advanced/rediskeys.rst @@ -69,6 +69,6 @@ If you run the integration tests, there may be temporary Redis keys created that - **cluster:test** - Used when testing the Kafka Monitor can act and set a key in Redis -- **test-spider:istresearch.com:queue** - Used when testing the crawler installation can interact with Redis and Kafka +- **test-spider:dmoztools.net:queue** - Used when testing the crawler installation can interact with Redis and Kafka - **stats:crawler::test-spider:** - Automatically created and destoryed during crawler testing by the stats collection mechanism settings. diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst index 78655b4a..513252de 100644 --- a/docs/topics/introduction/quickstart.rst +++ b/docs/topics/introduction/quickstart.rst @@ -431,7 +431,7 @@ Which ever setup you chose, every process within should stay running for the rem :: - python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"abc123"}' + python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc123"}' You will see the following output on the command line for that successful request: @@ -439,7 +439,7 @@ You will see the following output on the command line for that successful reques 2015-12-22 15:45:37,457 [kafka-monitor] INFO: Feeding JSON into demo.incoming { - "url": "http://istresearch.com", + "url": "http://dmoztools.net", "crawlid": "abc123", "appid": "testapp" } diff --git a/docs/topics/kafka-monitor/quickstart.rst b/docs/topics/kafka-monitor/quickstart.rst index 1392b60c..9708bf83 100644 --- a/docs/topics/kafka-monitor/quickstart.rst +++ b/docs/topics/kafka-monitor/quickstart.rst @@ -33,7 +33,7 @@ JSON Object feeder into your desired Kafka Topic. 
This takes a valid JSON object :: - $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}' + $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}' The command line feed is very slow and should not be used in production. Instead, you should write your own continuously running application to feed Kafka the desired API requests that you require. @@ -89,10 +89,10 @@ Feed an item :: - $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}' + $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}' 2016-01-05 15:14:44,829 [kafka-monitor] INFO: Feeding JSON into demo.incoming { - "url": "http://istresearch.com", + "url": "http://dmoztools.net", "crawlid": "ABC123", "appid": "testapp" } @@ -116,8 +116,8 @@ If you have a :ref:`Crawler ` running, you should see the html come thr "response_headers": { }, - "response_url": "http://istresearch.com", - "url": "http://istresearch.com", + "response_url": "http://dmoztools.net", + "url": "http://dmoztools.net", "status_code": 200, "status_msg": "OK", "appid": "testapp", diff --git a/docs/topics/rest/api.rst b/docs/topics/rest/api.rst index cb2aab4b..524b17a6 100644 --- a/docs/topics/rest/api.rst +++ b/docs/topics/rest/api.rst @@ -156,7 +156,7 @@ Feed a crawl request :: - $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"istresearch.com", "appid":"madisonTest", "crawlid":"abc123"}' + $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"http://dmoztools.net", "appid":"madisonTest", "crawlid":"abc123"}' Feed a Stats request From d90d4e91c5c19073751fa57665b1a96d4b55625f Mon Sep 17 00:00:00 2001 From: Madison Bahmer Date: Tue, 23 Jan 2018 10:33:30 -0500 Subject: [PATCH 22/22] changelog updates for dev This should bring us up to speed with all the changes made via PR's or other minor issues --- docs/topics/changelog.rst | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/topics/changelog.rst b/docs/topics/changelog.rst index 77f58f0d..055c0b33 100644 --- a/docs/topics/changelog.rst +++ b/docs/topics/changelog.rst @@ -12,11 +12,32 @@ Date: ??/??/???? - Add Python 3 support +- Fixed assert deprecations in unit tests + +- Corrected Ansible host list for zookeeper + +- Minor documentation changes/updates + Crawler ^^^^^^^ - Improved request to dictionary serialization support +- Updated item encoding serialization + +Rest +^^^^ + +- Corrected Kafka environment variable + +Utils +^^^^^ + +- Fixed LogFactory callback with correct extras dictionary + +- Adds quicker shutdown to ThreadedCounter object + + Scrapy Cluster 1.2 ------------------ @@ -130,7 +151,7 @@ Redis Monitor Changes - Added Stats Collection Crawler Changes -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^ - Upgraded Crawler to be compatible with Scrapy 1.0 @@ -151,7 +172,7 @@ Crawler Changes - Added example Wandering Spider Scrapy Cluster 1.0 ---------------------- +------------------ Date: 5/21/2015