From d598dbc33c7ad23a975bc235fa51578d64f80220 Mon Sep 17 00:00:00 2001
From: Russ Ferriday
Date: Mon, 6 Nov 2017 17:50:45 +0000
Subject: [PATCH] Update dmoz.org to dmoztools.net since dmoz.org now redirects. (#145) (#147)

---
 crawler/config/example.yml                  |  4 ++--
 crawler/tests/test_distributed_scheduler.py |  2 +-
 docs/topics/crawler/controlling.rst         |  2 +-
 docs/topics/crawler/extension.rst           | 12 ++++++------
 docs/topics/introduction/quickstart.rst     |  2 +-
 docs/topics/kafka-monitor/api.rst           | 12 ++++++------
 redis-monitor/tests/test_plugins.py         | 12 ++++++------
 7 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/crawler/config/example.yml b/crawler/config/example.yml
index f1b21897..f96f282c 100644
--- a/crawler/config/example.yml
+++ b/crawler/config/example.yml
@@ -1,9 +1,9 @@
 domains:
-  dmoz.org:
+  dmoztools.net:
     window: 60
     hits: 60
     scale: 1.0
   wikipedia.org:
     window: 60
     hits: 30
-    scale: 0.5
\ No newline at end of file
+    scale: 0.5
diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py
index 86c50662..ca22b566 100644
--- a/crawler/tests/test_distributed_scheduler.py
+++ b/crawler/tests/test_distributed_scheduler.py
@@ -220,7 +220,7 @@ class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase):
     def test_change_config(self):
         good_string = ""\
                 "domains:\n"\
-                "  dmoz.org:\n"\
+                "  dmoztools.net:\n"\
                 "    window: 60\n"\
                 "    hits: 60\n"\
                 "    scale: 1.0\n"\
diff --git a/docs/topics/crawler/controlling.rst b/docs/topics/crawler/controlling.rst
index 4daf1371..1b345c64 100644
--- a/docs/topics/crawler/controlling.rst
+++ b/docs/topics/crawler/controlling.rst
@@ -141,7 +141,7 @@ To utilize the different throttle mechanisms you can alter the following setting
 Combining Domain Queues and Throttling
 --------------------------------------
 
-At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoz.org:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.
+At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoztools.net:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.
 
 This results in Spiders across the cluster continually polling all available domain queues for new requests, but only receiving requests when the throttle mechanism indicates that the request limit has not gone beyond the max desired configuration. Because the throttle coordination is conducted via Redis, it is not reliant on any one Scrapy process to determine whether the cluster can or can't crawl a particular domain.
 
diff --git a/docs/topics/crawler/extension.rst b/docs/topics/crawler/extension.rst
index 85f91380..57c33980 100644
--- a/docs/topics/crawler/extension.rst
+++ b/docs/topics/crawler/extension.rst
@@ -199,7 +199,7 @@ Then, feed your cluster.
 
 ::
 
-    python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'
+    python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'
 
 If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kafkadump.py`` script, you will begin to see output like so...
 
@@ -208,8 +208,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": ,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/",
-        "url": "http://www.dmoz.org/",
+        "response_url": "http://www.dmoztools.net/",
+        "url": "http://www.dmoztools.net/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -228,8 +228,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": ,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/Computers/Hardware/",
-        "url": "http://www.dmoz.org/Computers/Hardware/",
+        "response_url": "http://www.dmoztools.net/Computers/Hardware/",
+        "url": "http://www.dmoztools.net/Computers/Hardware/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -273,4 +273,4 @@ You can also fire up more than one crawl job at a time, and track the steps that
         "wandering_spider_count": 4
     }
 
-You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
\ No newline at end of file
+You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst
index 652e4882..230d47b0 100644
--- a/docs/topics/introduction/quickstart.rst
+++ b/docs/topics/introduction/quickstart.rst
@@ -478,7 +478,7 @@ The following things will occur for this action request:
 
 ::
 
-    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoz.org': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}
+    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoztools.net': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}
 
 In this case we had 25 urls pending in the queue, so yours may be slightly different.
 
diff --git a/docs/topics/kafka-monitor/api.rst b/docs/topics/kafka-monitor/api.rst
index f5aa9ec3..00ac1d52 100644
--- a/docs/topics/kafka-monitor/api.rst
+++ b/docs/topics/kafka-monitor/api.rst
@@ -74,9 +74,9 @@ Kafka Request:
 
 ::
 
-    $ python kafka_monitor.py feed '{"url": "http://www.dmoz.org/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'
+    $ python kafka_monitor.py feed '{"url": "http://www.dmoztools.net/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'
 
-    - Submits a dmoz.org crawl spidering 2 levels deep with a high priority
+    - Submits a dmoztools.net crawl spidering 2 levels deep with a high priority
 
 ::
 
@@ -899,7 +899,7 @@ Zookeeper Request:
 
 ::
 
-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoz.org", "hits":60, "window":60, "scale":0.9}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoztools.net", "hits":60, "window":60, "scale":0.9}'
 
 Response from Kafka:
 
@@ -907,7 +907,7 @@ Response from Kafka:
 
     {
         "action": "domain-update",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402128,
         "uuid": "abc123",
         "appid": "madisonTest"
@@ -923,7 +923,7 @@ Zookeeper Request:
 
 ::
 
-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoz.org"}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoztools.net"}'
 
 Response from Kafka:
 
@@ -931,7 +931,7 @@ Response from Kafka:
 
     {
         "action": "domain-remove",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402146,
         "uuid": "abc123",
         "appid": "madisonTest"
diff --git a/redis-monitor/tests/test_plugins.py b/redis-monitor/tests/test_plugins.py
index 056f4a50..324b6d84 100644
--- a/redis-monitor/tests/test_plugins.py
+++ b/redis-monitor/tests/test_plugins.py
@@ -322,7 +322,7 @@ def test_stats_get_queue(self):
             'link:istresearch.com:queue',
             'link:yellowpages.com:queue',
             'link:cnn.com:queue',
-            'wandering:dmoz.org:queue',
+            'wandering:dmoztools.net:queue',
            'wandering:craigslist.org:queue',
         ])
         results = [5, 10, 11, 1, 3]
@@ -349,7 +349,7 @@ def ret_val(*args):
             'spider_backlog': 4,
             'num_domains': 2,
             'domains': [
-                {'domain': 'dmoz.org', 'backlog': 1},
+                {'domain': 'dmoztools.net', 'backlog': 1},
                 {'domain': 'craigslist.org', 'backlog': 3},
             ]
         }
@@ -395,20 +395,20 @@ def test_zk_regex(self):
 
     def test_zk_handle_du(self):
         # domain update
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123","hits":15,"scale":0.9,"window":60}'
-        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoz.org:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
+        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoztools.net:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
         self.plugin.handle(key="zk:domain-update:cnn.com:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)
 
     def test_zk_handle_dr(self):
         # domain remove
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123"}'
         expected = b'blacklist: []\ndomains: {}\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
-        self.plugin.handle(key="zk:domain-remove:dmoz.org:testapp", value=val)
+        self.plugin.handle(key="zk:domain-remove:dmoztools.net:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)
 
     def test_zk_handle_bu(self):
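
For anyone reviewing this change, the short sketch below double-checks that the updated crawler/config/example.yml still parses and carries the same throttle values the tests above assert. It is a hypothetical helper, not part of the patch; it assumes PyYAML is installed and that it is run from the repository root:

    # check_throttle_config.py -- hypothetical verification sketch, not in this patch.
    # Assumes PyYAML is installed and the working directory is the repo root.
    import yaml

    with open('crawler/config/example.yml') as f:
        config = yaml.safe_load(f)

    domains = config['domains']

    # The patch renames the dmoz.org throttle entry without changing its values.
    assert 'dmoz.org' not in domains, 'old domain key should be gone'
    assert domains['dmoztools.net'] == {'window': 60, 'hits': 60, 'scale': 1.0}
    assert domains['wikipedia.org'] == {'window': 60, 'hits': 30, 'scale': 0.5}

    print('example.yml throttle entries look correct:', sorted(domains))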