Skip to content

Commit

Permalink
Memoize calls to Crawler.crawl() for performance win in find-links based resolution.
Browse files Browse the repository at this point in the history
  • Loading branch information
kwlzn committed Dec 17, 2015
1 parent 1e70fdd commit 36c361c
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions pex/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer

if PY3:
from queue import Empty, Queue
Expand Down Expand Up @@ -64,6 +65,9 @@ def partition(L, pred):
class Crawler(object):
"""A multi-threaded crawler that supports local (disk) and remote (web) crawling."""

# Memoizer for calls to Crawler.crawl().
_CRAWL_CACHE = Memoizer()

@classmethod
def crawl_local(cls, link):
try:
Expand Down Expand Up @@ -99,7 +103,22 @@ def __init__(self, context=None, threads=1):
self._threads = threads
self.context = context or Context.get()

def _make_cache_key(self, links, follow_links):
return hash((follow_links,) + tuple(hash(item) for item in links))

def crawl(self, link_or_links, follow_links=False):
    """Crawl the given link or links, memoizing results across calls.

    :param link_or_links: A single link or iterable of links to crawl.
    :param follow_links: Whether to follow transitive links while crawling.
    :returns: The (possibly cached) result of crawling the given links.

    Results are memoized in the class-level ``Crawler._CRAWL_CACHE`` so
    repeated crawls of the same links with the same ``follow_links`` setting
    skip the network/disk work entirely.
    """
    wrapped = list(Link.wrap_iterable(link_or_links))
    key = self._make_cache_key(wrapped, follow_links)

    # Fast path: hand back a previously computed crawl for this exact request.
    cached = self._CRAWL_CACHE.get(key)
    if cached is not None:
        return cached

    crawled = self._crawl(wrapped, follow_links)
    self._CRAWL_CACHE.store(key, crawled)
    return crawled

def _crawl(self, link_or_links, follow_links):
links, seen = set(), set()
queue = Queue()
converged = threading.Event()
Expand Down Expand Up @@ -127,7 +146,8 @@ def execute():
queue.put(rel)
queue.task_done()

for link in Link.wrap_iterable(link_or_links):
for i, link in enumerate(link_or_links):
TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
queue.put(link)

workers = []
Expand All @@ -140,6 +160,5 @@ def execute():
queue.join()
converged.set()

# We deliberately not join back the worker threads, since they are no longer of
# any use to us.
# We deliberately do not join the worker threads, since they are no longer of any use to us.
return links

0 comments on commit 36c361c

Please sign in to comment.