Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear lookup for Python 2 #35

Merged
merged 17 commits into from
Jan 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ language: python

python:
- 2.7
- 3.5
- &latest_py3 3.7
- 3.6
- &latest_py3 3.8

jobs:
fast_finish: true
Expand Down
9 changes: 9 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
v1.1.0
======

#32: For read-only zip files, complexity of ``.exists`` and
``joinpath`` is now constant time instead of ``O(n)``, preventing
the quadratic time in common use-cases that rendered large
zip files unusable for Path. Big thanks to Benjy Weinberger
for the bug report and contributed fix (#33).

v1.0.0
======

Expand Down
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ packages = find:
include_package_data = true
python_requires = >=2.7
install_requires =
more_itertools
contextlib2; python_version < "3.4"
setup_requires = setuptools_scm >= 1.15.0

[options.extras_require]
Expand All @@ -31,8 +31,8 @@ testing =

# local
pathlib2
contextlib2
unittest2
jaraco.itertools

docs =
# upstream
Expand Down
44 changes: 42 additions & 2 deletions test_zipp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import zipfile
import posixpath
import contextlib
import tempfile
import shutil
Expand All @@ -25,6 +24,8 @@
except AttributeError:
import unittest2 as unittest

import jaraco.itertools

import zipp

__metaclass__ = type
Expand All @@ -36,7 +37,7 @@ def add_dirs(zf):
Given a writable zip file zf, inject directory entries for
any directories implied by the presence of children.
"""
for name in zipp.Path._implied_dirs(zf.namelist()):
for name in zipp.CompleteDirs._implied_dirs(zf.namelist()):
zf.writestr(name, b"")
return zf

Expand Down Expand Up @@ -196,3 +197,42 @@ def test_missing_dir_parent(self):
for alpharep in self.zipfile_alpharep():
root = zipp.Path(alpharep)
assert (root / 'missing dir/').parent.at == ''

def test_mutability(self):
    """
    If the underlying zipfile is changed, the Path object should
    reflect that change.
    """
    for alpharep in self.zipfile_alpharep():
        root = zipp.Path(alpharep)
        # Unpacking doubles as a check that the root starts out with
        # exactly three entries; the values themselves are unused.
        a, b, g = root.iterdir()
        # Mutate the archive after the Path has been created.
        alpharep.writestr('foo.txt', b'foo')
        alpharep.writestr('bar/baz.txt', b'baz')
        # The new top-level file must be listed and readable.
        assert any(
            child.name == 'foo.txt'
            for child in root.iterdir())
        assert (root / 'foo.txt').read_text() == 'foo'
        # The new nested file must be the sole child of its
        # (implied) parent directory.
        baz, = (root / 'bar').iterdir()
        assert baz.read_text() == 'baz'

HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13

def huge_zipfile(self):
    """Create a read-only zipfile with a huge number of entries."""
    strm = io.BytesIO()
    zf = zipfile.ZipFile(strm, "w")
    # Entry names and contents are both the decimal index as text.
    for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)):
        zf.writestr(entry, entry)
    # The archive has to be opened for writing to populate it; flip
    # the mode flag afterwards so that consumers inspecting ``mode``
    # treat it as read-only.
    zf.mode = 'r'
    return zf

def test_joinpath_constant_time(self):
    """
    Ensure joinpath on each item in a huge zipfile is constant time,
    so that a pass over every item stays linear overall.
    """
    root = zipp.Path(self.huge_zipfile())
    # Wrap the iterator so the number of items yielded can be
    # checked after the loop.
    entries = jaraco.itertools.Counter(root.iterdir())
    for entry in entries:
        entry.joinpath('suffix')
    # NOTE(review): no timing is asserted here; a quadratic
    # implementation would presumably only show up as a slow test.
    # Check the file iterated all items
    assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES
134 changes: 94 additions & 40 deletions zipp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
import zipfile
import functools
import itertools
from collections import OrderedDict

import more_itertools
try:
from contextlib import suppress
except ImportError:
from contextlib2 import suppress

__metaclass__ = type

Expand Down Expand Up @@ -55,6 +59,90 @@ def _ancestry(path):
path, tail = posixpath.split(path)


class CompleteDirs(zipfile.ZipFile):
    """
    A ZipFile subclass that ensures that implied directories
    are always included in the namelist.
    """

    @staticmethod
    def _implied_dirs(names):
        """
        Given a list of entry names, return the directory names
        (each ending in the path separator) that are implied by
        those entries but not present among them.

        The result is an ``OrderedDict`` keyed by directory name,
        deduplicated and in first-seen order.
        """
        # Build the lookup set exactly once. Constructing it inside
        # the generator's condition would rebuild it for every
        # candidate, making the whole computation quadratic.
        existing = set(names)
        parents = itertools.chain.from_iterable(map(_parents, names))
        as_dirs = (p + posixpath.sep for p in parents)
        # OrderedDict.fromkeys deduplicates entries in original order.
        return OrderedDict.fromkeys(
            dirname for dirname in as_dirs
            if dirname not in existing
        )

    def namelist(self):
        """
        Return the archive's names followed by any implied
        directories that have no entry of their own.
        """
        names = super(CompleteDirs, self).namelist()
        return names + list(self._implied_dirs(names))

    def _name_set(self):
        """Return the complete namelist as a set for fast lookups."""
        return set(self.namelist())

    def resolve_dir(self, name):
        """
        If the name represents a directory, return that name
        as a directory (with the trailing slash).
        """
        names = self._name_set()
        dirname = name + '/'
        # Only add the slash when the bare name is absent but the
        # directory form is present.
        dir_match = name not in names and dirname in names
        return dirname if dir_match else name

    @classmethod
    def make(cls, source):
        """
        Given a source (filename or zipfile), return an
        appropriate CompleteDirs subclass.
        """
        if isinstance(source, CompleteDirs):
            return source

        if not isinstance(source, zipfile.ZipFile):
            return cls(_pathlib_compat(source))

        # Only allow a caching subclass such as FastLookup when the
        # supplied zipfile is read-only; otherwise fall back to the
        # uncached base class.
        if 'r' not in source.mode:
            cls = CompleteDirs

        # Adopt the state of the existing ZipFile rather than
        # re-opening it: create a bare instance and copy attributes.
        res = cls.__new__(cls)
        vars(res).update(vars(source))
        return res


class FastLookup(CompleteDirs):
    """
    ZipFile subclass to ensure implicit
    dirs exist and are resolved rapidly.
    """

    def namelist(self):
        """
        Return the complete namelist, computing it on first use and
        reusing the stored list on every later call.
        """
        # The attribute does not exist until the first call, so an
        # AttributeError here simply means "not cached yet".
        with suppress(AttributeError):
            return self.__names
        self.__names = super(FastLookup, self).namelist()
        return self.__names

    def _name_set(self):
        """
        Return the set of names, computing it on first use and
        reusing the stored set on every later call.
        """
        with suppress(AttributeError):
            return self.__lookup
        self.__lookup = super(FastLookup, self)._name_set()
        return self.__lookup


def _pathlib_compat(path):
    """
    For path-like objects, convert to a filename for compatibility
    on Python 3.6.1 and earlier.

    Returns the result of ``path.__fspath__()`` when that call
    succeeds, otherwise ``str(path)``.
    """
    try:
        return path.__fspath__()
    except AttributeError:
        # No __fspath__ available; fall back to the string form.
        return str(path)


class Path:
"""
A pathlib-compatible interface for zip files.
Expand Down Expand Up @@ -123,24 +211,9 @@ class Path:
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"

def __init__(self, root, at=""):
self.root = (
root
if isinstance(root, zipfile.ZipFile)
else zipfile.ZipFile(self._pathlib_compat(root))
)
self.root = FastLookup.make(root)
self.at = at

@staticmethod
def _pathlib_compat(path):
"""
For path-like objects, convert to a filename for compatibility
on Python 3.6.1 and earlier.
"""
try:
return path.__fspath__()
except AttributeError:
return str(path)

@property
def open(self):
return functools.partial(self.root.open, self.at)
Expand Down Expand Up @@ -170,12 +243,12 @@ def is_file(self):
return not self.is_dir()

def exists(self):
return self.at in self._names()
return self.at in self.root._name_set()

def iterdir(self):
if not self.is_dir():
raise ValueError("Can't listdir a file")
subs = map(self._next, self._names())
subs = map(self._next, self.root.namelist())
return filter(self._is_child, subs)

def __str__(self):
Expand All @@ -185,36 +258,17 @@ def __repr__(self):
return self.__repr.format(self=self)

def joinpath(self, add):
add = self._pathlib_compat(add)
next = posixpath.join(self.at, add)
next_dir = posixpath.join(self.at, add, "")
names = self._names()
return self._next(next_dir if next not in names and next_dir in names else next)
next = posixpath.join(self.at, _pathlib_compat(add))
return self._next(self.root.resolve_dir(next))

__truediv__ = joinpath

@staticmethod
def _implied_dirs(names):
return more_itertools.unique_everseen(
parent + "/"
for name in names
for parent in _parents(name)
if parent + "/" not in names
)

@classmethod
def _add_implied_dirs(cls, names):
return names + list(cls._implied_dirs(names))

@property
def parent(self):
parent_at = posixpath.dirname(self.at.rstrip('/'))
if parent_at:
parent_at += '/'
return self._next(parent_at)

def _names(self):
return self._add_implied_dirs(self.root.namelist())

if sys.version_info < (3,):
__div__ = __truediv__