lenskit · mdekstrand · Nov 11, 2023 · Nov 11, 2023 · Nov 11, 2023 · Nov 11, 2023
diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml
@@ -101,10 +101,10 @@ jobs:
 
       - name: Download ML Data
         run: |
-          lkbuild fetch-data -d ml-100k
-          lkbuild fetch-data -d ml-1m
-          lkbuild fetch-data -d ml-10m
-          lkbuild fetch-data -d ml-20m
+          python -m lenskit.datasets.fetch ml-100k
+          python -m lenskit.datasets.fetch ml-1m
+          python -m lenskit.datasets.fetch ml-10m
+          python -m lenskit.datasets.fetch ml-20m
 
       - name: Install for Testing
         run: |

diff --git a/lenskit/datasets/__init__.py b/lenskit/datasets/__init__.py
@@ -0,0 +1 @@
+# Re-export the public MovieLens API (as listed in movielens.__all__).
+from .movielens import *  # noqa: F403
diff --git a/lenskit/datasets/fetch.py b/lenskit/datasets/fetch.py
@@ -0,0 +1,82 @@
+import sys
+from zipfile import ZipFile
+from urllib.request import urlopen
+import argparse
+from pathlib import Path
+import logging
+
+# Logger used by every function in this fetch script.
+_log = logging.getLogger('lenskit.datasets.fetch')
+
+# Base URL; fetch_ml appends '<name>.zip' to it to build the download URL.
+ML_LOC = "http://files.grouplens.org/datasets/movielens/"
+# Maps each recognized data set name to a file path relative to the data
+# directory.  fetch_ml treats the existence of that file as "already fetched"
+# and skips the download.
+ML_DATASETS = {
+    'ml-100k': 'ml-100k/u.data',
+    'ml-1m': 'ml-1m/ratings.dat',
+    # NOTE: the directory in this path differs from the data set name.
+    'ml-10m': 'ml-10M100K/ratings.dat',
+    'ml-20m': 'ml-20m/ratings.csv',
+    'ml-25m': 'ml-25m/ratings.csv',
+    'ml-latest': 'ml-latest/ratings.csv',
+    'ml-latest-small': 'ml-latest-small/ratings.csv',
+}
+
+
+def fetch_ml(name: str, base_dir: Path):
+    """
+    Fetch a MovieLens dataset.  The following names are recognized:
+
+    * ml-100k
+    * ml-1m
+    * ml-10m
+    * ml-20m
+    * ml-25m
+    * ml-latest
+    * ml-latest-small
+
+    The archive ``<name>.zip`` is downloaded from ``ML_LOC`` into ``base_dir``
+    and then extracted there.  If the file listed for ``name`` in
+    ``ML_DATASETS`` already exists under ``base_dir``, nothing is downloaded.
+
+    Args:
+        name:
+            The name of the dataset.
+        base_dir:
+            The base directory into which data should be extracted.  It is
+            created if it does not exist yet.
+
+    Raises:
+        KeyError: if ``name`` is not a key of ``ML_DATASETS``.
+    """
+    zipname = f'{name}.zip'
+    zipfile = base_dir / zipname
+    zipurl = ML_LOC + zipname
+
+    test_file = base_dir / ML_DATASETS[name]
+    if test_file.exists():
+        # The path must be a %-argument: passing it as the message makes
+        # logging fail to format the record.
+        _log.info('%s already exists', test_file)
+        return
+
+    # The archive is written into base_dir, so it has to exist first.
+    base_dir.mkdir(parents=True, exist_ok=True)
+
+    _log.info('downloading data set %s', name)
+    # Open the connection before the file so a failed request does not leave
+    # an empty archive behind; both are closed when the block exits.
+    with urlopen(zipurl) as res, zipfile.open('wb') as zf:
+        block = res.read(8 * 1024 * 1024)
+        while block:
+            _log.debug('received %d bytes', len(block))
+            zf.write(block)
+            block = res.read(8 * 1024 * 1024)
+
+    _log.info('unpacking data set')
+    with ZipFile(zipfile, 'r') as zf:
+        zf.extractall(base_dir)
+
+
+def _fetch_main():
+    """
+    Command-line entry point: parse the arguments and fetch the data set.
+
+    Accepts a positional data set name and an optional ``--data-dir``
+    (default ``data``).  Logging is set up to write INFO-level messages to
+    standard error.
+
+    Raises:
+        ValueError: if the requested name does not start with ``ml-``.
+    """
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument('name', help='the name of the dataset to fetch')
+    cli.add_argument(
+        '--data-dir',
+        metavar='DIR',
+        default='data',
+        help='save extracted data to DIR',
+    )
+    opts = cli.parse_args()
+
+    dataset = opts.name
+    target = Path(opts.data_dir)
+    _log.info('fetching data set %s', dataset)
+    _log.info('extracting data to %s', target)
+
+    # Only MovieLens ('ml-' prefixed) data sets are handled here.
+    if not dataset.startswith('ml-'):
+        _log.error('unknown data set %s', dataset)
+        raise ValueError('invalid data set')
+
+    fetch_ml(dataset, target)
+
+
+# Run the fetcher when this module is executed as a script
+# (e.g. ``python -m lenskit.datasets.fetch ml-100k``).
+if __name__ == '__main__':
+    _fetch_main()
diff --git a/lenskit/datasets.py → lenskit/datasets/movielens.py b/lenskit/datasets.py → lenskit/datasets/movielens.py
@@ -8,6 +8,7 @@
 
 import pandas as pd
 import numpy as np
+from lenskit.util import cached
 
 _log = logging.getLogger(__name__)
 
@@ -22,20 +23,7 @@
     __doctest_skip__.append('ML10M.*')
     __doctest_skip__.append('MLM.*')
 
-
-def cached(prop):
-    cache = '_cached_' + prop.__name__
-
-    def getter(self):
-        val = getattr(self, cache, None)
-        if val is None:
-            val = prop(self)
-            setattr(self, cache, val)
-        return val
-
-    getter.__doc__ = prop.__doc__
-
-    return property(getter)
+__all__ = ['MovieLens', 'ML100K', 'ML1M', 'ML10M']
 
 
 class MovieLens:

diff --git a/lenskit/util/__init__.py b/lenskit/util/__init__.py
@@ -86,6 +86,24 @@ def last_memo(func=None, check_type='identity'):
         return LastMemo(func, check_type)
 
 
+def cached(prop):
+    """
+    Decorator that turns a getter method into a read-only property whose
+    value is computed on first access and then reused.
+
+    The value is stored on the instance under ``_cached_<getter name>``.  A
+    stored value of ``None`` counts as "not computed yet", so a getter that
+    returns ``None`` is called again on every access.
+    """
+    slot = '_cached_' + prop.__name__
+
+    def getter(self):
+        stored = getattr(self, slot, None)
+        if stored is not None:
+            return stored
+        # Nothing cached yet: compute the value and remember it.
+        result = prop(self)
+        setattr(self, slot, result)
+        return result
+
+    # Carry the getter's docstring over so the property is documented.
+    getter.__doc__ = prop.__doc__
+    return property(getter)
+
+
 def no_progress(obj, **kwargs):
     return obj
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .movielens import * # noqa: F403
Copy link codeclimate bot Nov 11, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. At least two spaces before inline comment