Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove namedtuple; fix bugs #471

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements-sphinx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ chardet>=2.0.1,<=2.3
dnspython3==1.12
html5lib>=0.999,<1.0
# lxml>=3.1.0,<=3.5 # except for this because it requires building C libs
namedlist>=1.3,<=1.7
psutil>=2.0,<=4.2
sqlalchemy>=0.9,<=1.0.13
tornado>=3.2.2,<5.0
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ chardet>=2.0.1,<=2.3
dnspython3==1.12
html5lib>=0.999,<1.0
lxml>=3.1.0,<=3.5
namedlist>=1.3,<=1.7
psutil>=2.0,<=4.2
sqlalchemy>=0.9,<=1.0.13
tornado>=3.2.2,<5.0
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ def get_version():
'chardet',
'dnspython3',
'html5lib',
'namedlist',
'sqlalchemy',
'tornado',
'yapsy',
Expand Down
4 changes: 2 additions & 2 deletions wpull/application/factory.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# encoding=utf-8
'''Instance creation and management.'''
import collections
import collections, typing


class Factory(collections.Mapping, object):
class Factory(typing.Mapping, object):
'''Allows selection of classes and keeps track of instances.

This class behaves like a mapping. Keys are names of classes and values are
Expand Down
2 changes: 1 addition & 1 deletion wpull/application/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def event(name: Any):
return _plugin_attach_decorator(name, category=PluginFunctionCategory.event)


class InterfaceRegistry(collections.Mapping):
class InterfaceRegistry(typing.Mapping):
def __init__(self):
super().__init__()
self._interfaces = {}
Expand Down
5 changes: 1 addition & 4 deletions wpull/application/tasks/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ def process(self, session: AppSession):

@classmethod
def _build_html_parser(cls, session: AppSession):
if session.args.html_parser == 'html5lib':
from wpull.document.htmlparse.html5lib_ import HTMLParser
else:
from wpull.document.htmlparse.lxml_ import HTMLParser
from wpull.document.htmlparse.lxml_ import HTMLParser

session.factory.class_map['HTMLParser'] = HTMLParser
session.factory.new('HTMLParser')
Expand Down
4 changes: 2 additions & 2 deletions wpull/cache.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'''Caching.'''
import abc
import abc, typing
import collections
import sys
import time
Expand All @@ -16,7 +16,7 @@
total_ordering = lambda obj: obj


class BaseCache(collections.Mapping, object):
class BaseCache(typing.Mapping, object):
@abc.abstractmethod
def __setitem__(self, key, value):
pass
Expand Down
6 changes: 3 additions & 3 deletions wpull/collections.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# encoding=utf-8
'''Data structures.'''
from collections import OrderedDict
import collections
import collections, typing
import copy
import itertools
import functools
Expand All @@ -14,7 +14,7 @@ class OrderedDefaultDict(OrderedDict):
'''
def __init__(self, default_factory=None, *args, **kwargs):
if default_factory is not None and \
not isinstance(default_factory, collections.Callable):
not isinstance(default_factory, typing.Callable):
raise TypeError('First argument must be callable')
OrderedDict.__init__(self, *args, **kwargs)
self.default_factory = default_factory
Expand Down Expand Up @@ -237,7 +237,7 @@ def clear(self):
self.tail = None


class FrozenDict(collections.Mapping, collections.Hashable):
class FrozenDict(typing.Mapping, typing.Hashable):
'''Immutable mapping wrapper.'''
__slots__ = ('orig_dict', 'hash_cache',)

Expand Down
69 changes: 33 additions & 36 deletions wpull/driver/phantomjs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import os.path
import subprocess
import tempfile
from typing import Any

import namedlist
import asyncio

from wpull.driver.process import Process
Expand All @@ -13,41 +13,38 @@

_logger = logging.getLogger(__name__)


PhantomJSDriverParams = namedlist.namedtuple(
'PhantomJSDriverParamsType', [
'url',
('snapshot_paths', []),
('wait_time', 1),
('num_scrolls', 10),
('smart_scroll', True),
('snapshot', True),
('viewport_size', (1200, 1920)),
('paper_size', (2400, 3840)),
('event_log_filename', None),
('action_log_filename', None),
('custom_headers', {}),
('page_settings', {}),
]
)
'''PhantomJS Driver parameters

Attributes:
url (str): URL of page to fetch.
snapshot_type (list): List of filenames. Accepted extensions are html,
pdf, png, gif.
wait_time (float): Time between page scrolls.
num_scrolls (int): Maximum number of scrolls.
smart_scroll (bool): Whether to stop scrolling if number of
requests & responses do not change.
snapshot (bool): Whether to take snapshot files.
viewport_size (tuple): Width and height of the page viewport.
paper_size (tuple): Width and height of the paper size.
event_log_filename (str): Path to save page events.
action_log_filename (str): Path to save page action manipulation events.
custom_headers (dict): Custom HTTP request headers.
page_settings (dict): Page settings.
'''
from typing import NamedTuple, Optional


class PhantomJSDriverParams(NamedTuple):
    '''PhantomJS Driver parameters.

    Only ``url`` is required; every other field has a default.

    Attributes:
        url (str): URL of page to fetch.
        snapshot_paths (list): List of filenames. Accepted extensions are
            html, pdf, png, gif.
        wait_time (float): Time between page scrolls.
        num_scrolls (int): Maximum number of scrolls.
        smart_scroll (bool): Whether to stop scrolling if number of
            requests & responses do not change.
        snapshot (bool): Whether to take snapshot files.
        viewport_size (tuple): Width and height of the page viewport.
        paper_size (tuple): Width and height of the paper size.
        event_log_filename (str): Path to save page events.
        action_log_filename (str): Path to save page action manipulation
            events.
        custom_headers (dict): Custom HTTP request headers.
        page_settings (dict): Page settings.
    '''
    # NOTE: the list and dict defaults below are created once, when the class
    # is defined, and are shared by every instance that does not override
    # them. Treat them as read-only; pass a fresh list/dict to customise.
    url: str
    snapshot_paths: list = []
    wait_time: float = 1
    num_scrolls: int = 10
    smart_scroll: bool = True
    snapshot: bool = True
    viewport_size: tuple = (1200, 1920)
    paper_size: tuple = (2400, 3840)
    event_log_filename: Optional[str] = None
    action_log_filename: Optional[str] = None
    custom_headers: dict = {}
    page_settings: dict = {}


class PhantomJSDriver(Process):
Expand Down
4 changes: 2 additions & 2 deletions wpull/driver/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def start(self, use_atexit=True):
)
self._process = yield from process_future

self._stderr_reader = asyncio.async(self._read_stderr())
self._stdout_reader = asyncio.async(self._read_stdout())
self._stderr_reader = asyncio.create_task(self._read_stderr())
self._stdout_reader = asyncio.create_task(self._read_stdout())

if use_atexit:
atexit.register(self.close)
Expand Down
3 changes: 2 additions & 1 deletion wpull/namevalue.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import gettext
import io
import textwrap
import typing

from wpull.collections import OrderedDefaultDict


_ = gettext.gettext


class NameValueRecord(collections.MutableMapping):
class NameValueRecord(typing.MutableMapping):
'''An ordered mapping of name-value pairs.

Duplicated names are accepted.
Expand Down
23 changes: 10 additions & 13 deletions wpull/network/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,15 @@ def empty(self) -> bool:
'''Return whether the pool is empty.'''
return not self.ready and not self.busy

@asyncio.coroutine
def clean(self, force: bool=False):
async def clean(self, force: bool=False):
'''Clean closed connections.

Args:
force: Clean connected and idle connections too.

Coroutine.
'''
with (yield from self._lock):
async with self._lock:
for connection in tuple(self.ready):
if force or connection.closed():
connection.close()
Expand Down Expand Up @@ -149,8 +148,7 @@ def __init__(self, max_host_count: int=6,
def host_pools(self) -> Mapping[tuple, HostPool]:
return self._host_pools

@asyncio.coroutine
def acquire(self, host: str, port: int, use_ssl: bool=False,
async def acquire(self, host: str, port: int, use_ssl: bool=False,
host_key: Optional[Any]=None) \
-> Union[Connection, SSLConnection]:
'''Return an available connection.
Expand All @@ -167,7 +165,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False,
assert isinstance(port, int), 'Expect int. Got {}'.format(type(port))
assert not self._closed

yield from self._process_no_wait_releases()
await self._process_no_wait_releases()

if use_ssl:
connection_factory = functools.partial(
Expand All @@ -184,7 +182,7 @@ def acquire(self, host: str, port: int, use_ssl: bool=False,

key = host_key or (host, port, use_ssl)

with (yield from self._host_pools_lock):
async with self._host_pools_lock:
if key not in self._host_pools:
host_pool = self._host_pools[key] = HostPool(
connection_factory,
Expand All @@ -197,15 +195,15 @@ def acquire(self, host: str, port: int, use_ssl: bool=False,

_logger.debug('Check out %s', key)

connection = yield from host_pool.acquire()
connection = await host_pool.acquire()
connection.key = key

# TODO: Verify this assert is always true
# assert host_pool.count() <= host_pool.max_connections
# assert key in self._host_pools
# assert self._host_pools[key] == host_pool

with (yield from self._host_pools_lock):
async with self._host_pools_lock:
self._host_pool_waiters[key] -= 1

return connection
Expand Down Expand Up @@ -271,8 +269,7 @@ def context_wrapper():

return context_wrapper()

@asyncio.coroutine
def clean(self, force: bool=False):
async def clean(self, force: bool=False):
'''Clean all closed connections.

Args:
Expand All @@ -282,9 +279,9 @@ def clean(self, force: bool=False):
'''
assert not self._closed

with (yield from self._host_pools_lock):
async with self._host_pools_lock:
for key, pool in tuple(self._host_pools.items()):
yield from pool.clean(force=force)
await pool.clean(force=force)

if not self._host_pool_waiters[key] and pool.empty():
del self._host_pools[key]
Expand Down
59 changes: 29 additions & 30 deletions wpull/processor/coprocessor/phantomjs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
import tempfile
import io

import namedlist

import asyncio

from typing import Callable
from typing import NamedTuple
from typing import Any

from wpull.backport.logging import BraceMessage as __
from wpull.document.html import HTMLReader
Expand All @@ -24,35 +26,32 @@
import wpull.url


PhantomJSParams = namedlist.namedtuple(
'PhantomJSParamsType', [
('snapshot_types', ('html', 'pdf')),
('wait_time', 1),
('num_scrolls', 10),
('smart_scroll', True),
('snapshot', True),
('viewport_size', (1200, 1920)),
('paper_size', (2400, 3840)),
('load_time', 900),
('custom_headers', {}),
('page_settings', {}),
]
)
'''PhantomJS parameters

Attributes:
snapshot_type (list): File types. Accepted are html, pdf, png, gif.
wait_time (float): Time between page scrolls.
num_scrolls (int): Maximum number of scrolls.
smart_scroll (bool): Whether to stop scrolling if number of
requests & responses do not change.
snapshot (bool): Whether to take snapshot files.
viewport_size (tuple): Width and height of the page viewport.
paper_size (tuple): Width and height of the paper size.
load_time (float): Maximum time to wait for page load.
custom_headers (dict): Default HTTP headers.
page_settings (dict): Page settings.
'''
class PhantomJSParams(NamedTuple):
    '''PhantomJS parameters.

    Every field has a default, so ``PhantomJSParams()`` is valid.

    Attributes:
        snapshot_types (tuple or list): File types. Accepted are html, pdf,
            png, gif.
        wait_time (float): Time between page scrolls.
        num_scrolls (int): Maximum number of scrolls.
        smart_scroll (bool): Whether to stop scrolling if number of
            requests & responses do not change.
        snapshot (bool): Whether to take snapshot files.
        viewport_size (tuple): Width and height of the page viewport.
        paper_size (tuple): Width and height of the paper size.
        load_time (float): Maximum time to wait for page load.
        custom_headers (dict): Default HTTP headers.
        page_settings (dict): Page settings.
    '''
    # NOTE: the dict defaults below are created once, when the class is
    # defined, and are shared by every instance that does not override them.
    # Treat them as read-only; pass a fresh dict to customise.

    # Left as Any: the default is a tuple, but the documented type was a list.
    snapshot_types: Any = ('html', 'pdf')
    wait_time: float = 1
    num_scrolls: int = 10
    smart_scroll: bool = True
    snapshot: bool = True
    viewport_size: tuple = (1200, 1920)
    paper_size: tuple = (2400, 3840)
    load_time: float = 900
    custom_headers: dict = {}
    page_settings: dict = {}


_logger = logging.getLogger(__name__)
Expand Down
Loading