diff --git a/requirements-sphinx.txt b/requirements-sphinx.txt index 995d297e..102c690c 100644 --- a/requirements-sphinx.txt +++ b/requirements-sphinx.txt @@ -3,7 +3,6 @@ chardet>=2.0.1,<=2.3 dnspython3==1.12 html5lib>=0.999,<1.0 # lxml>=3.1.0,<=3.5 # except for this because it requires building C libs -namedlist>=1.3,<=1.7 psutil>=2.0,<=4.2 sqlalchemy>=0.9,<=1.0.13 tornado>=3.2.2,<5.0 diff --git a/requirements.txt b/requirements.txt index e9fbfb52..97b43156 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ chardet>=2.0.1,<=2.3 dnspython3==1.12 html5lib>=0.999,<1.0 lxml>=3.1.0,<=3.5 -namedlist>=1.3,<=1.7 psutil>=2.0,<=4.2 sqlalchemy>=0.9,<=1.0.13 tornado>=3.2.2,<5.0 diff --git a/setup.py b/setup.py index 775472a7..a84a7050 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,6 @@ def get_version(): 'chardet', 'dnspython3', 'html5lib', - 'namedlist', 'sqlalchemy', 'tornado', 'yapsy', diff --git a/wpull/application/factory.py b/wpull/application/factory.py index ba2d0b5a..f6802a1f 100644 --- a/wpull/application/factory.py +++ b/wpull/application/factory.py @@ -1,9 +1,9 @@ # encoding=utf-8 '''Instance creation and management.''' -import collections +import collections.abc -class Factory(collections.Mapping, object): +class Factory(collections.abc.Mapping, object): '''Allows selection of classes and keeps track of instances. This class behaves like a mapping. 
Keys are names of classes and values are diff --git a/wpull/application/plugin.py b/wpull/application/plugin.py index 68028f87..92f3dfc3 100644 --- a/wpull/application/plugin.py +++ b/wpull/application/plugin.py @@ -61,7 +61,7 @@ def event(name: Any): return _plugin_attach_decorator(name, category=PluginFunctionCategory.event) -class InterfaceRegistry(collections.Mapping): +class InterfaceRegistry(collections.abc.Mapping): def __init__(self): super().__init__() self._interfaces = {} diff --git a/wpull/application/tasks/download.py b/wpull/application/tasks/download.py index bbbb552e..7e989c9a 100644 --- a/wpull/application/tasks/download.py +++ b/wpull/application/tasks/download.py @@ -33,10 +33,7 @@ def process(self, session: AppSession): @classmethod def _build_html_parser(cls, session: AppSession): - if session.args.html_parser == 'html5lib': - from wpull.document.htmlparse.html5lib_ import HTMLParser - else: - from wpull.document.htmlparse.lxml_ import HTMLParser + from wpull.document.htmlparse.lxml_ import HTMLParser session.factory.class_map['HTMLParser'] = HTMLParser session.factory.new('HTMLParser') diff --git a/wpull/cache.py b/wpull/cache.py index c3f46dac..db305fa4 100644 --- a/wpull/cache.py +++ b/wpull/cache.py @@ -1,5 +1,5 @@ '''Caching.''' -import abc +import abc, collections.abc import collections import sys import time @@ -16,7 +16,7 @@ total_ordering = lambda obj: obj -class BaseCache(collections.Mapping, object): +class BaseCache(collections.abc.Mapping, object): @abc.abstractmethod def __setitem__(self, key, value): pass diff --git a/wpull/collections.py b/wpull/collections.py index 773cf013..67ad78ac 100644 --- a/wpull/collections.py +++ b/wpull/collections.py @@ -1,7 +1,7 @@ # encoding=utf-8 '''Data structures.''' from collections import OrderedDict -import collections +import collections, typing import copy import itertools import functools @@ -14,7 +14,7 @@ class OrderedDefaultDict(OrderedDict): ''' def __init__(self, default_factory=None, *args, **kwargs): if
default_factory is not None and \ - not isinstance(default_factory, collections.Callable): + not callable(default_factory): raise TypeError('First argument must be callable') OrderedDict.__init__(self, *args, **kwargs) self.default_factory = default_factory @@ -237,7 +237,7 @@ def clear(self): self.tail = None -class FrozenDict(collections.Mapping, collections.Hashable): +class FrozenDict(collections.abc.Mapping, collections.abc.Hashable): '''Immutable mapping wrapper.''' __slots__ = ('orig_dict', 'hash_cache',) diff --git a/wpull/driver/phantomjs.py b/wpull/driver/phantomjs.py index 6052e64f..2791f07d 100644 --- a/wpull/driver/phantomjs.py +++ b/wpull/driver/phantomjs.py @@ -3,8 +3,8 @@ import os.path import subprocess import tempfile +from typing import Any -import namedlist import asyncio from wpull.driver.process import Process @@ -13,41 +13,38 @@ _logger = logging.getLogger(__name__) - -PhantomJSDriverParams = namedlist.namedtuple( - 'PhantomJSDriverParamsType', [ - 'url', - ('snapshot_paths', []), - ('wait_time', 1), - ('num_scrolls', 10), - ('smart_scroll', True), - ('snapshot', True), - ('viewport_size', (1200, 1920)), - ('paper_size', (2400, 3840)), - ('event_log_filename', None), - ('action_log_filename', None), - ('custom_headers', {}), - ('page_settings', {}), - ] -) -'''PhantomJS Driver parameters - -Attributes: - url (str): URL of page to fetch. - snapshot_type (list): List of filenames. Accepted extensions are html, - pdf, png, gif. - wait_time (float): Time between page scrolls. - num_scrolls (int): Maximum number of scrolls. - smart_scroll (bool): Whether to stop scrolling if number of - requests & responses do not change. - snapshot (bool): Whether to take snapshot files. - viewport_size (tuple): Width and height of the page viewport. - paper_size (tuple): Width and height of the paper size. - event_log_filename (str): Path to save page events. - action_log_filename (str): Path to save page action manipulation events. 
- custom_headers (dict): Custom HTTP request headers. - page_settings (dict): Page settings. -''' +from typing import NamedTuple +class PhantomJSDriverParams(NamedTuple): + '''PhantomJS Driver parameters + + Attributes: + url (str): URL of page to fetch. + snapshot_type (list): List of filenames. Accepted extensions are html, + pdf, png, gif. + wait_time (float): Time between page scrolls. + num_scrolls (int): Maximum number of scrolls. + smart_scroll (bool): Whether to stop scrolling if number of + requests & responses do not change. + snapshot (bool): Whether to take snapshot files. + viewport_size (tuple): Width and height of the page viewport. + paper_size (tuple): Width and height of the paper size. + event_log_filename (str): Path to save page events. + action_log_filename (str): Path to save page action manipulation events. + custom_headers (dict): Custom HTTP request headers. + page_settings (dict): Page settings. + ''' + url: str + snapshot_paths: Any = [] + wait_time: Any = 1 + num_scrolls: Any = 10 + smart_scroll: Any = True + snapshot: Any = True + viewport_size: Any = (1200, 1920) + paper_size: Any = (2400, 3840) + event_log_filename: Any = None + action_log_filename: Any = None + custom_headers: Any = {} + page_settings: Any = {} class PhantomJSDriver(Process): diff --git a/wpull/driver/process.py b/wpull/driver/process.py index e370538e..b5a15052 100644 --- a/wpull/driver/process.py +++ b/wpull/driver/process.py @@ -53,8 +53,8 @@ def start(self, use_atexit=True): ) self._process = yield from process_future - self._stderr_reader = asyncio.async(self._read_stderr()) - self._stdout_reader = asyncio.async(self._read_stdout()) + self._stderr_reader = asyncio.ensure_future(self._read_stderr()) + self._stdout_reader = asyncio.ensure_future(self._read_stdout()) if use_atexit: atexit.register(self.close) diff --git a/wpull/namevalue.py b/wpull/namevalue.py index dcdb9f9f..aba4c687 100644 --- a/wpull/namevalue.py +++ b/wpull/namevalue.py @@ -4,6 +4,7 @@ import
gettext import io import textwrap +import collections.abc from wpull.collections import OrderedDefaultDict @@ -11,7 +12,7 @@ _ = gettext.gettext -class NameValueRecord(collections.MutableMapping): +class NameValueRecord(collections.abc.MutableMapping): '''An ordered mapping of name-value pairs. Duplicated names are accepted. diff --git a/wpull/network/pool.py b/wpull/network/pool.py index a9aeb6b9..e3d682f8 100644 --- a/wpull/network/pool.py +++ b/wpull/network/pool.py @@ -37,8 +37,7 @@ def empty(self) -> bool: '''Return whether the pool is empty.''' return not self.ready and not self.busy - @asyncio.coroutine - def clean(self, force: bool=False): + async def clean(self, force: bool=False): '''Clean closed connections. Args: @@ -46,7 +45,7 @@ def clean(self, force: bool=False): Coroutine. ''' - with (yield from self._lock): + async with self._lock: for connection in tuple(self.ready): if force or connection.closed(): connection.close() @@ -149,8 +148,7 @@ def __init__(self, max_host_count: int=6, def host_pools(self) -> Mapping[tuple, HostPool]: return self._host_pools - @asyncio.coroutine - def acquire(self, host: str, port: int, use_ssl: bool=False, + async def acquire(self, host: str, port: int, use_ssl: bool=False, host_key: Optional[Any]=None) \ -> Union[Connection, SSLConnection]: '''Return an available connection. @@ -167,7 +165,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False, assert isinstance(port, int), 'Expect int. 
Got {}'.format(type(port)) assert not self._closed - yield from self._process_no_wait_releases() + await self._process_no_wait_releases() if use_ssl: connection_factory = functools.partial( @@ -184,7 +182,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False, key = host_key or (host, port, use_ssl) - with (yield from self._host_pools_lock): + async with self._host_pools_lock: if key not in self._host_pools: host_pool = self._host_pools[key] = HostPool( connection_factory, @@ -197,7 +195,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False, _logger.debug('Check out %s', key) - connection = yield from host_pool.acquire() + connection = await host_pool.acquire() connection.key = key # TODO: Verify this assert is always true @@ -205,7 +203,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False, # assert key in self._host_pools # assert self._host_pools[key] == host_pool - with (yield from self._host_pools_lock): + async with self._host_pools_lock: self._host_pool_waiters[key] -= 1 return connection @@ -271,8 +269,7 @@ def context_wrapper(): return context_wrapper() - @asyncio.coroutine - def clean(self, force: bool=False): + async def clean(self, force: bool=False): '''Clean all closed connections. 
Args: @@ -282,9 +279,9 @@ def clean(self, force: bool=False): ''' assert not self._closed - with (yield from self._host_pools_lock): + async with self._host_pools_lock: for key, pool in tuple(self._host_pools.items()): - yield from pool.clean(force=force) + await pool.clean(force=force) if not self._host_pool_waiters[key] and pool.empty(): del self._host_pools[key] diff --git a/wpull/processor/coprocessor/phantomjs.py b/wpull/processor/coprocessor/phantomjs.py index fb1af12e..2133c745 100644 --- a/wpull/processor/coprocessor/phantomjs.py +++ b/wpull/processor/coprocessor/phantomjs.py @@ -8,10 +8,12 @@ import tempfile import io -import namedlist + import asyncio from typing import Callable +from typing import NamedTuple +from typing import Any from wpull.backport.logging import BraceMessage as __ from wpull.document.html import HTMLReader @@ -24,35 +26,32 @@ import wpull.url -PhantomJSParams = namedlist.namedtuple( - 'PhantomJSParamsType', [ - ('snapshot_types', ('html', 'pdf')), - ('wait_time', 1), - ('num_scrolls', 10), - ('smart_scroll', True), - ('snapshot', True), - ('viewport_size', (1200, 1920)), - ('paper_size', (2400, 3840)), - ('load_time', 900), - ('custom_headers', {}), - ('page_settings', {}), - ] -) -'''PhantomJS parameters - -Attributes: - snapshot_type (list): File types. Accepted are html, pdf, png, gif. - wait_time (float): Time between page scrolls. - num_scrolls (int): Maximum number of scrolls. - smart_scroll (bool): Whether to stop scrolling if number of - requests & responses do not change. - snapshot (bool): Whether to take snapshot files. - viewport_size (tuple): Width and height of the page viewport. - paper_size (tuple): Width and height of the paper size. - load_time (float): Maximum time to wait for page load. - custom_headers (dict): Default HTTP headers. - page_settings (dict): Page settings. -''' +class PhantomJSParams(NamedTuple): + '''PhantomJS parameters + + Attributes: + snapshot_type (list): File types. 
Accepted are html, pdf, png, gif. + wait_time (float): Time between page scrolls. + num_scrolls (int): Maximum number of scrolls. + smart_scroll (bool): Whether to stop scrolling if number of + requests & responses do not change. + snapshot (bool): Whether to take snapshot files. + viewport_size (tuple): Width and height of the page viewport. + paper_size (tuple): Width and height of the paper size. + load_time (float): Maximum time to wait for page load. + custom_headers (dict): Default HTTP headers. + page_settings (dict): Page settings. + ''' + snapshot_types: Any = ('html', 'pdf') + wait_time: Any = 1 + num_scrolls: Any = 10 + smart_scroll: Any = True + snapshot: Any = True + viewport_size: Any = (1200, 1920) + paper_size: Any = (2400, 3840) + load_time: Any = 900 + custom_headers: Any = {} + page_settings: Any = {} _logger = logging.getLogger(__name__) diff --git a/wpull/processor/ftp.py b/wpull/processor/ftp.py index 6a327355..88583e13 100644 --- a/wpull/processor/ftp.py +++ b/wpull/processor/ftp.py @@ -9,8 +9,9 @@ import tempfile import urllib.parse -import namedlist + from typing import cast +from typing import Any from wpull.backport.logging import StyleAdapter from wpull.body import Body @@ -34,24 +35,20 @@ GLOB_CHARS = frozenset('[]*?') +from typing import NamedTuple +class FTPProcessorFetchParams(NamedTuple): + '''FTPProcessorFetchParams -FTPProcessorFetchParams = namedlist.namedtuple( - 'FTPProcessorFetchParamsType', - [ - ('remove_listing', True), - ('glob', True), - ('preserve_permissions', False), - ('retr_symlinks', True), - ] -) -'''FTPProcessorFetchParams - -Args: - remove_listing (bool): Remove `.listing` files after fetching. - glob (bool): Enable URL globbing. - preserve_permissions (bool): Preserve file permissions. - follow_symlinks (bool): Follow symlinks. -''' + Args: + remove_listing (bool): Remove `.listing` files after fetching. + glob (bool): Enable URL globbing. + preserve_permissions (bool): Preserve file permissions. 
+ retr_symlinks (bool): Follow symlinks. + ''' + remove_listing: Any = True + glob: Any = True + preserve_permissions: Any = False + retr_symlinks: Any = True class HookPreResponseBreak(ProtocolError): diff --git a/wpull/processor/web.py b/wpull/processor/web.py index 3ccc1ea2..90590287 100644 --- a/wpull/processor/web.py +++ b/wpull/processor/web.py @@ -4,7 +4,7 @@ import io import logging -import namedlist + import asyncio from typing import cast, Tuple @@ -31,22 +31,21 @@ _logger = StyleAdapter(logging.getLogger(__name__)) _ = gettext.gettext -WebProcessorFetchParams = namedlist.namedtuple( - 'WebProcessorFetchParamsType', - [ - ('post_data', None), - ('strong_redirects', True), - ('content_on_error', False), - ] -) -'''WebProcessorFetchParams - -Args: - post_data (str): If provided, all requests will be POSTed with the - given `post_data`. `post_data` must be in percent-encoded - query format ("application/x-www-form-urlencoded"). - strong_redirects (bool): If True, redirects are allowed to span hosts. -''' +from typing import NamedTuple, Any + + +class WebProcessorFetchParams(NamedTuple): + '''WebProcessorFetchParams + + Args: + post_data (str): If provided, all requests will be POSTed with the + given `post_data`. `post_data` must be in percent-encoded + query format ("application/x-www-form-urlencoded"). + strong_redirects (bool): If True, redirects are allowed to span hosts. 
+ ''' + post_data: Any = None + strong_redirects: Any = True + content_on_error: Any = False class HookPreResponseBreak(ProtocolError): diff --git a/wpull/protocol/ftp/ls/listing.py b/wpull/protocol/ftp/ls/listing.py index a2243e9c..af7d865c 100644 --- a/wpull/protocol/ftp/ls/listing.py +++ b/wpull/protocol/ftp/ls/listing.py @@ -2,33 +2,32 @@ import re import itertools -import namedlist + from wpull.protocol.ftp.ls.date import parse_datetime import wpull.protocol.ftp.ls.date +from typing import NamedTuple +from typing import Any -FileEntry = namedlist.namedtuple( - 'FileEntryType', - [ - 'name', - ('type', None), - ('size', None), - ('date', None), - ('dest', None), - ('perm', None) - ]) -'''A row in a listing. - -Attributes: - name (str): Filename. - type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None`` - size (int, None): Size of file. - date (:class:`datetime.datetime`, None): A datetime object in UTC. - dest (str, None): Destination filename for symlinks. - perm (int, None): Unix permissions expressed as an integer. -''' +class FileEntry(NamedTuple): + '''A row in a listing. + Attributes: + name (str): Filename. + type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None`` + size (int, None): Size of file. + date (:class:`datetime.datetime`, None): A datetime object in UTC. + dest (str, None): Destination filename for symlinks. + perm (int, None): Unix permissions expressed as an integer. 
+ ''' + name: str + type: Any = None + size: Any = None + date: Any = None + dest: Any = None + perm: Any = None + class ListingError(ValueError): '''Error during parsing a listing.''' diff --git a/wpull/scraper/base.py b/wpull/scraper/base.py index 59cadefd..c88b1f5d 100644 --- a/wpull/scraper/base.py +++ b/wpull/scraper/base.py @@ -2,32 +2,30 @@ import abc import collections import io -import namedlist + from wpull.document.base import BaseTextStreamReader, \ BaseHTMLReader, BaseExtractiveReader from wpull.scraper.util import urljoin_safe +from typing import NamedTuple +from typing import Any + +class LinkContext(NamedTuple): + '''A named tuple describing a scraped link. -LinkContext = namedlist.namedtuple( - 'LinkContextType', - [ - 'link', - ('inline', False), - ('linked', False), - ('link_type', None), - ('extra', None) - ] -) -'''A named tuple describing a scraped link. - -Attributes: - link (str): The link that was scraped. - inline (bool): Whether the link is an embeded object. - linked (bool): Whether the link links to another page. - link_type: A value from :class:`.item.LinkType`. - extra: Any extra info. -''' + Attributes: + link (str): The link that was scraped. + inline (bool): Whether the link is an embeded object. + linked (bool): Whether the link links to another page. + link_type: A value from :class:`.item.LinkType`. + extra: Any extra info. 
+ ''' + link: str + inline: Any = False + linked: Any = False + link_type: Any = None + extra: Any = None class ScrapeResult(dict): diff --git a/wpull/warc/recorder.py b/wpull/warc/recorder.py index 3d8c3dc8..941eba00 100644 --- a/wpull/warc/recorder.py +++ b/wpull/warc/recorder.py @@ -10,7 +10,7 @@ import re import shutil -import namedlist + from wpull.backport.logging import StyleAdapter from wpull.namevalue import NameValueRecord @@ -25,48 +25,44 @@ from wpull.protocol.http.request import Response as HTTPResponse import wpull.util import wpull.version - +from typing import NamedTuple, Any _logger = StyleAdapter(logging.getLogger(__name__)) _ = gettext.gettext -WARCRecorderParams = namedlist.namedtuple( - 'WARCRecorderParamsType', - [ - ('compress', True), - ('extra_fields', None), - ('temp_dir', './'), - ('log', True), - ('appending', False), - ('digests', True), - ('cdx', None), - ('max_size', None), - ('move_to', None), - ('url_table', None), - ('software_string', None) - ] -) -''':class:`WARCRecorder` parameters. - -Args: - compress (bool): If True, files will be compressed with gzip - extra_fields (list): A list of key-value pairs containing extra - metadata fields - temp_dir (str): Directory to use for temporary files - log (bool): Include the program logging messages in the WARC file - appending (bool): If True, the file is not overwritten upon opening - digests (bool): If True, the SHA1 hash digests will be written. - cdx (bool): If True, a CDX file will be written. - max_size (int): If provided, output files are named like - ``name-00000.ext`` and the log file will be in ``name-meta.ext``. - move_to (str): If provided, completed WARC files and CDX files will be - moved to the given directory - url_table (:class:`.database.URLTable`): If given, then ``revist`` - records will be written. - software_string (str): The value for the ``software`` field in the - Warcinfo record. -''' +class WARCRecorderParams(NamedTuple): + ''':class:`WARCRecorder` parameters. 
+ + Args: + compress (bool): If True, files will be compressed with gzip + extra_fields (list): A list of key-value pairs containing extra + metadata fields + temp_dir (str): Directory to use for temporary files + log (bool): Include the program logging messages in the WARC file + appending (bool): If True, the file is not overwritten upon opening + digests (bool): If True, the SHA1 hash digests will be written. + cdx (bool): If True, a CDX file will be written. + max_size (int): If provided, output files are named like + ``name-00000.ext`` and the log file will be in ``name-meta.ext``. + move_to (str): If provided, completed WARC files and CDX files will be + moved to the given directory + url_table (:class:`.database.URLTable`): If given, then ``revist`` + records will be written. + software_string (str): The value for the ``software`` field in the + Warcinfo record. + ''' + compress: bool = True + extra_fields: Any = None + temp_dir: Any = './' + log: Any = True + appending: Any = False + digests: Any = True + cdx: Any = None + max_size: Any = None + move_to: Any = None + url_table: Any = None + software_string: Any = None class WARCRecorder(object):