Skip to content

Commit

Permalink
Add dump-iknn script
Browse files Browse the repository at this point in the history
  • Loading branch information
mdekstrand committed May 4, 2024
1 parent 0e9fa57 commit a71e3e4
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 0 deletions.
1 change: 1 addition & 0 deletions envs/lenskit-py3.10-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
- pyarrow>=15
- pyproject2conda~=0.11
- pytest-cov>=2.12
- pytest-doctestplus==1.*
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.10-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies:
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
- pyarrow>=15
- pyproject2conda~=0.11
- pytest-cov>=2.12
- pytest-doctestplus==1.*
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.10-doc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- binpickle>=0.3.2
- cffi>=1.15.0
- csr>=0.5
- just
- myst-nb>=0.13
- numba<0.59,>=0.56
- numpy>=1.23
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.10-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- coverage>=5
- csr>=0.5
- hypothesis>=6
- just
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.11-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
- pyarrow>=15
- pyproject2conda~=0.11
- pytest-cov>=2.12
- pytest-doctestplus==1.*
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.11-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies:
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
- pyarrow>=15
- pyproject2conda~=0.11
- pytest-cov>=2.12
- pytest-doctestplus==1.*
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.11-doc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- binpickle>=0.3.2
- cffi>=1.15.0
- csr>=0.5
- just
- myst-nb>=0.13
- numba<0.59,>=0.56
- numpy>=1.23
Expand Down
1 change: 1 addition & 0 deletions envs/lenskit-py3.11-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- coverage>=5
- csr>=0.5
- hypothesis>=6
- just
- numba<0.59,>=0.56
- numpy>=1.23
- pandas<3,>=1.5
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ dev = [
"copier ==9.*",
"unbeheader ~= 1.3", # p2c: -p
"ipython >= 7",
"pyarrow>=15",
"pyproject2conda ~=0.11",
"sphinx-autobuild >= 2021",
"docopt >= 0.6",
Expand Down
55 changes: 55 additions & 0 deletions utils/dump-iknn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Train and save the item-item similarity matrix.
Usage:
dump-iknn.py [-d DATA] [-n NBRS] [-m NBRS] [-s SIM] -o FILE
Options:
-d DATA, --dataset=DATA
Learn k-NN matrix on DATA [default: ml-latest-small].
-o FILE, --output=FILE
Write output to FILE.
"""

import logging
import sys

import pandas as pd
from docopt import docopt

from lenskit.algorithms.item_knn import ItemItem
from lenskit.datasets import MovieLens

# Logger for this script; main() sets up the logging output with basicConfig.
_log = logging.getLogger("dump-iknn")


def main(args):
    """
    Fit an ItemItem model on a MovieLens data set and dump its similarities.

    Args:
        args: the parsed command-line options.  The keys read here are
            ``--dataset``, ``-n``, ``-m``, ``-s`` and ``--output``.

    The data set is loaded from ``data/<dataset>``, the model is trained on
    its ratings, and the similarity matrix is written to the output path as
    a Parquet file with columns ``i1``, ``i2`` and ``sim``, sorted by
    ``i1`` and then ``i2``.
    """
    logging.basicConfig(level=logging.INFO, stream=sys.stderr)

    dataset = args["--dataset"]
    _log.info("loading data %s", dataset)
    movielens = MovieLens(f"data/{dataset}")

    # (command-line flag, ItemItem keyword, converter); a setting is only
    # forwarded when the flag has a truthy value.
    option_specs = (
        ("-n", "save_nbrs", int),
        ("-m", "min_nbrs", int),
        ("-s", "min_sim", float),
    )
    knn_options = {}
    for flag, keyword, convert in option_specs:
        if args[flag]:
            knn_options[keyword] = convert(args[flag])

    model = ItemItem(20, **knn_options)
    _log.info("training algorithm")
    model.fit(movielens.ratings)

    out_path = args["--output"]
    _log.info("saving neighbors to %s", out_path)

    # Convert the similarity matrix to COO form so each stored entry gives a
    # (row, column, value) triple; item_index_ maps positions to item labels.
    item_ids = model.item_index_
    coo = model.sim_matrix_.to_scipy().tocoo()
    pairs = pd.DataFrame(
        {
            "i1": item_ids[coo.row],
            "i2": item_ids[coo.col],
            "sim": coo.data,
        }
    )
    pairs = pairs.sort_values(["i1", "i2"])
    pairs.to_parquet(out_path, index=False)


if __name__ == "__main__":
    # Parse the command line against the module docstring, then run the dump.
    args = docopt(__doc__)
    main(args)

0 comments on commit a71e3e4

Please sign in to comment.