diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml
index 2fee4d7ff..2bb3b043b 100644
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -101,10 +101,10 @@ jobs:
 
     - name: Download ML Data
       run: |
-        lkbuild fetch-data -d ml-100k
-        lkbuild fetch-data -d ml-1m
-        lkbuild fetch-data -d ml-10m
-        lkbuild fetch-data -d ml-20m
+        python -m lenskit.datasets.fetch ml-100k
+        python -m lenskit.datasets.fetch ml-1m
+        python -m lenskit.datasets.fetch ml-10m
+        python -m lenskit.datasets.fetch ml-20m
 
     - name: Install for Testing
       run: |
diff --git a/lenskit/datasets/__init__.py b/lenskit/datasets/__init__.py
new file mode 100644
index 000000000..262a14652
--- /dev/null
+++ b/lenskit/datasets/__init__.py
@@ -0,0 +1 @@
+from .movielens import *  # noqa: F403
diff --git a/lenskit/datasets/fetch.py b/lenskit/datasets/fetch.py
new file mode 100644
index 000000000..0be178a2e
--- /dev/null
+++ b/lenskit/datasets/fetch.py
@@ -0,0 +1,82 @@
+import sys
+from zipfile import ZipFile
+from urllib.request import urlopen
+import argparse
+from pathlib import Path
+import logging
+
+_log = logging.getLogger('lenskit.datasets.fetch')
+
+ML_LOC = "http://files.grouplens.org/datasets/movielens/"
+ML_DATASETS = {
+    'ml-100k': 'ml-100k/u.data',
+    'ml-1m': 'ml-1m/ratings.dat',
+    'ml-10m': 'ml-10M100K/ratings.dat',
+    'ml-20m': 'ml-20m/ratings.csv',
+    'ml-25m': 'ml-25m/ratings.csv',
+    'ml-latest': 'ml-latest/ratings.csv',
+    'ml-latest-small': 'ml-latest-small/ratings.csv',
+}
+
+
+def fetch_ml(name: str, base_dir: Path):
+    """
+    Fetch a MovieLens dataset. The following names are recognized:
+
+    . ml-100k
+    . ml-1m
+    . ml-10m
+    . ml-20m
+    . ml-25m
+    . ml-latest
+    . ml-latest-small
+
+    Args:
+        name:
+            The name of the dataset.
+        base_dir:
+            The base directory into which data should be extracted.
+ """ + zipname = f'{name}.zip' + zipfile = base_dir / zipname + zipurl = ML_LOC + zipname + + test_file = base_dir / ML_DATASETS[name] + if test_file.exists(): + _log.info(test_file, 'already exists') + return + + _log.info('downloading data set %s', name) + with zipfile.open('wb') as zf: + res = urlopen(zipurl) + block = res.read(8 * 1024 * 1024) + while len(block): + _log.debug('received %d bytes', len(block)) + zf.write(block) + block = res.read(8 * 1024 * 1024) + + _log.info('unpacking data set') + with ZipFile(zipfile, 'r') as zf: + zf.extractall(base_dir) + + +def _fetch_main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + parser = argparse.ArgumentParser() + parser.add_argument('name', help='the name of the dataset to fetch') + parser.add_argument('--data-dir', metavar='DIR', help='save extracted data to DIR', default='data') + args = parser.parse_args() + + name = args.name + _log.info('fetching data set %s', name) + dir = Path(args.data_dir) + _log.info('extracting data to %s', dir) + if name.startswith('ml-'): + fetch_ml(name, dir) + else: + _log.error('unknown data set %s', name) + raise ValueError('invalid data set') + + +if __name__ == '__main__': + _fetch_main() diff --git a/lenskit/datasets.py b/lenskit/datasets/movielens.py similarity index 97% rename from lenskit/datasets.py rename to lenskit/datasets/movielens.py index 82d606fa0..01bca6ba0 100644 --- a/lenskit/datasets.py +++ b/lenskit/datasets/movielens.py @@ -8,6 +8,7 @@ import pandas as pd import numpy as np +from lenskit.util import cached _log = logging.getLogger(__name__) @@ -22,20 +23,7 @@ __doctest_skip__.append('ML10M.*') __doctest_skip__.append('MLM.*') - -def cached(prop): - cache = '_cached_' + prop.__name__ - - def getter(self): - val = getattr(self, cache, None) - if val is None: - val = prop(self) - setattr(self, cache, val) - return val - - getter.__doc__ = prop.__doc__ - - return property(getter) +__all__ = ['MovieLens', 'ML100K', 'ML1M', 'ML10M'] class MovieLens: diff --git a/lenskit/util/__init__.py b/lenskit/util/__init__.py index 479b2ab80..8740287f6 100644 --- a/lenskit/util/__init__.py +++ b/lenskit/util/__init__.py @@ -86,6 +86,24 @@ def last_memo(func=None, check_type='identity'): return LastMemo(func, check_type) +def cached(prop): + """ + Decorator for property getters to cache the property value. + """ + cache = '_cached_' + prop.__name__ + + def getter(self): + val = getattr(self, cache, None) + if val is None: + val = prop(self) + setattr(self, cache, val) + return val + + getter.__doc__ = prop.__doc__ + + return property(getter) + + def no_progress(obj, **kwargs): return obj