Skip to content

Commit

Permalink
Merge pull request #69 from TeamHG-Memex/pathlib
Browse files (browse the repository at this point in the history)
Use pathlib instead of codecs
  • Loading branch information
lopuhin authored May 10, 2017
2 parents c9688de + 0c34716 commit ba2c49a
Showing 1 changed file with 10 additions and 13 deletions.
23 changes: 10 additions & 13 deletions undercrawler/spiders.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
from base64 import b64decode
import codecs
import contextlib
import hashlib
import os
from pathlib import Path
import re
from typing import Optional
from urllib.parse import urljoin, urlsplit
Expand Down Expand Up @@ -30,7 +30,7 @@ class BaseSpider(scrapy.Spider):

def __init__(self, url, search_terms=None, *args, **kwargs):
if url.startswith('.') or url.startswith('/'):
with codecs.open(url, 'r', encoding='utf8') as f:
with Path(url).open('rt', encoding='utf8') as f:
urls = [line.strip() for line in f]
else:
urls = [u for u in url.split() if u]
Expand All @@ -43,7 +43,7 @@ def __init__(self, url, search_terms=None, *args, **kwargs):
canonicalize=False)
self.state = {}
self.use_splash = None # set up in start_requests
self._screenshot_dest = None # set up in _take_screenshot
self._screenshot_dest = None # type: Path
# Load headless horseman scripts
self.lua_source = load_directive('headless_horseman.lua')
self.js_source = load_directive('headless_horseman.js')
Expand Down Expand Up @@ -221,7 +221,7 @@ def _pagination_urls(self, response):
def extra_search_terms(self):
st_file = self.settings.get('SEARCH_TERMS_FILE')
if st_file:
with codecs.open(st_file, 'r', encoding='utf8') as f:
with Path(st_file).open('rt', encoding='utf8') as f:
return [line.strip() for line in f]
else:
return []
Expand Down Expand Up @@ -271,19 +271,16 @@ def _take_screenshot(self, response) -> Optional[str]:
if not screenshot:
return None
if self._screenshot_dest is None:
self._screenshot_dest = (
self._screenshot_dest = Path(
self.settings.get('SCREENSHOT_DEST', 'screenshots'))
if not os.path.exists(self._screenshot_dest):
os.mkdir(self._screenshot_dest)
filename = os.path.join(
self._screenshot_dest,
self._screenshot_dest.mkdir(parents=True, exist_ok=True)
path = self._screenshot_dest.joinpath(
'{prefix}{uuid}.png'.format(
prefix=self.settings.get('SCREENSHOT_PREFIX', ''),
uuid=uuid.uuid4()))
with open(filename, 'wb') as f:
f.write(b64decode(screenshot))
self.logger.debug('Saved %s screenshot to %s' % (response, filename))
return filename
path.write_bytes(b64decode(screenshot))
self.logger.debug('Saved %s screenshot to %s' % (response, path))
return str(path)


class ArachnadoSpider(BaseSpider):
Expand Down

0 comments on commit ba2c49a

Please sign in to comment.