diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f7a6d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,111 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..63b4b68 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
+copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..8437202 --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +active-semi-supervised-clustering = {editable = true, path = "."} + +[dev-packages] +jupyter = "*" +scikit-learn = "*" +setuptools = "*" +wheel = "*" +twine = "*" + +[requires] +python_version = "3" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..d6268ad --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,593 @@ +{ + "_meta": { + "hash": { + "sha256": "2973e7ae8a939a5aaed6fafcf3846461ac5263fdc33fb726af4a815ea3019e91" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "active-semi-supervised-clustering": { + "editable": true, + "path": "." 
+ }, + "metric-learn": { + "hashes": [ + "sha256:3f3ccd61b6fd09ef780becab1f56a31c434d1d4ae9fc8b6386540ed91a0ba917", + "sha256:697fa55bc11f97a36835cf70a7833b93bb5481a3468f503fb4da22bf0137b400" + ], + "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==0.4.0" + }, + "numpy": { + "hashes": [ + "sha256:1c362ad12dd09a43b348bb28dd2295dd9cdf77f41f0f45965e04ba97f525b864", + "sha256:2156a06bd407918df4ac0122df6497a9c137432118f585e5b17d543e593d1587", + "sha256:24e4149c38489b51fc774b1e1faa9103e82f73344d7a00ba66f6845ab4769f3f", + "sha256:340ec1697d9bb3a9c464028af7a54245298502e91178bddb4c37626d36e197b7", + "sha256:35db8d419345caa4eeaa65cd63f34a15208acd87530a30f0bc25fc84f55c8c80", + "sha256:361370e9b7f5e44c41eee29f2bb5cb3b755abb4b038bce6d6cbe08db7ff9cb74", + "sha256:36e8dcd1813ca92ce7e4299120cee6c03adad33d89b54862c1b1a100443ac399", + "sha256:378378973546ecc1dfaf9e24c160d683dd04df871ecd2dcc86ce658ca20f92c0", + "sha256:419e6faee16097124ee627ed31572c7e80a1070efa25260b78097cca240e219a", + "sha256:4287104c24e6a09b9b418761a1e7b1bbde65105f110690ca46a23600a3c606b8", + "sha256:549f3e9778b148a47f4fb4682955ed88057eb627c9fe5467f33507c536deda9d", + "sha256:5e359e9c531075220785603e5966eef20ccae9b3b6b8a06fdfb66c084361ce92", + "sha256:5ee7f3dbbdba0da75dec7e94bd7a2b10fe57a83e1b38e678200a6ad8e7b14fdc", + "sha256:62d55e96ec7b117d3d5e618c15efcf769e70a6effaee5842857b64fb4883887a", + "sha256:719b6789acb2bc86ea9b33a701d7c43dc2fc56d95107fd3c5b0a8230164d4dfb", + "sha256:7a70f2b60d48828cba94a54a8776b61a9c2657a803d47f5785f8062e3a9c7c55", + "sha256:7b9e37f194f8bcdca8e9e6af92e2cbad79e360542effc2dd6b98d63955d8d8a3", + "sha256:83b8fc18261b70f45bece2d392537c93dc81eb6c539a16c9ac994c47fc79f09a", + "sha256:9473ad28375710ab18378e72b59422399b27e957e9339c413bf00793b4b12df0", + "sha256:95b085b253080e5d09f7826f5e27dce067bae813a132023a77b739614a29de6e", + 
"sha256:98b86c62c08c2e5dc98a9c856d4a95329d11b1c6058cb9b5191d5ea6891acd09", + "sha256:a3bd01d6d3ed3d7c06d7f9979ba5d68281f15383fafd53b81aa44b9191047cf8", + "sha256:c81a6afc1d2531a9ada50b58f8c36197f8418ef3d0611d4c1d7af93fdcda764f", + "sha256:ce75ed495a746e3e78cfa22a77096b3bff2eda995616cb7a542047f233091268", + "sha256:dae8618c0bcbfcf6cf91350f8abcdd84158323711566a8c5892b5c7f832af76f", + "sha256:df0b02c6705c5d1c25cc35c7b5d6b6f9b3b30833f9d178843397ae55ecc2eebb", + "sha256:e3660744cda0d94b90141cdd0db9308b958a372cfeee8d7188fdf5ad9108ea82", + "sha256:f2362d0ca3e16c37782c1054d7972b8ad2729169567e3f0f4e5dd3cdf85f188e" + ], + "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==1.15.1" + }, + "scikit-learn": { + "hashes": [ + "sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260", + "sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169", + "sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98", + "sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc", + "sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff", + "sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e", + "sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b", + "sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370", + "sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97", + "sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab", + "sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5", + "sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9", + "sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2", + "sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369", + "sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5", + 
"sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888", + "sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563", + "sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2", + "sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8", + "sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7", + "sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db", + "sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610", + "sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3", + "sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9", + "sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b", + "sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872", + "sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9", + "sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938" + ], + "version": "==0.19.2" + }, + "scipy": { + "hashes": [ + "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", + "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", + "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", + "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", + "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", + "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", + "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", + "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", + "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", + "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", + "sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef", + "sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3", + 
"sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", + "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", + "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", + "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", + "sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76", + "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", + "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", + "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", + "sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac", + "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", + "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", + "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", + "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", + "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", + "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40", + "sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694" + ], + "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==1.1.0" + }, + "six": { + "hashes": [ + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + ], + "version": "==1.11.0" + } + }, + "develop": { + "appnope": { + "hashes": [ + "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", + "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" + ], + "markers": "sys_platform == 'darwin'", + "version": "==0.1.0" + }, + "backcall": { + "hashes": [ + 
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", + "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" + ], + "version": "==0.1.0" + }, + "bleach": { + "hashes": [ + "sha256:0ee95f6167129859c5dce9b1ca291ebdb5d8cd7e382ca0e237dfd0dad63f63d8", + "sha256:24754b9a7d530bf30ce7cbc805bc6cce785660b4a10ff3a43633728438c105ab" + ], + "version": "==2.1.4" + }, + "certifi": { + "hashes": [ + "sha256:376690d6f16d32f9d1fe8932551d80b23e9d393a8578c5633a2ed39a64861638", + "sha256:456048c7e371c089d0a77a5212fb37a2c2dce1e24146e3b7e0261736aaeaa22a" + ], + "version": "==2018.8.24" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "decorator": { + "hashes": [ + "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", + "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + ], + "version": "==4.3.0" + }, + "defusedxml": { + "hashes": [ + "sha256:24d7f2f94f7f3cb6061acb215685e5125fbcdc40a857eff9de22518820b0a4f4", + "sha256:702a91ade2968a82beb0db1e0766a6a273f33d4616a6ce8cde475d8e09853b20" + ], + "version": "==0.5.0" + }, + "entrypoints": { + "hashes": [ + "sha256:10ad569bb245e7e2ba425285b9fa3e8178a0dc92fc53b1e1c553805e15a8825b", + "sha256:d2d587dde06f99545fb13a383d2cd336a8ff1f359c5839ce3a64c917d10c029f" + ], + "markers": "python_version >= '2.7'", + "version": "==0.2.3" + }, + "html5lib": { + "hashes": [ + "sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3", + "sha256:66cb0dcfdbbc4f9c3ba1a63fdb511ffdbd4f513b2b6d81b80cd26ce6b3fb3736" + ], + "version": "==1.0.1" + }, + "idna": { + "hashes": [ + "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", + "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" + ], + "version": "==2.7" + }, + "ipykernel": { + "hashes": [ + 
"sha256:00d88b7e628e4e893359119b894451611214bce09776a3bf8248fe42cb48ada6", + "sha256:a706b975376efef98b70e10cd167ab9506cf08a689d689a3c7daf344c15040f6", + "sha256:c5a498c70f7765c34f3397cf943b069057f5bef4e0218e4cfbb733e9f38fa5fa" + ], + "version": "==4.9.0" + }, + "ipython": { + "hashes": [ + "sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62", + "sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4" + ], + "markers": "python_version >= '3.3'", + "version": "==6.5.0" + }, + "ipython-genutils": { + "hashes": [ + "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", + "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + ], + "version": "==0.2.0" + }, + "ipywidgets": { + "hashes": [ + "sha256:0f2b5cde9f272cb49d52f3f0889fdd1a7ae1e74f37b48dac35a83152780d2b7b", + "sha256:a3e224f430163f767047ab9a042fc55adbcab0c24bbe6cf9f306c4f89fdf0ba3" + ], + "version": "==7.4.2" + }, + "jedi": { + "hashes": [ + "sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1", + "sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f" + ], + "version": "==0.12.1" + }, + "jinja2": { + "hashes": [ + "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", + "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + ], + "version": "==2.10" + }, + "jsonschema": { + "hashes": [ + "sha256:000e68abd33c972a5248544925a0cae7d1125f9bf6c58280d37546b946769a08", + "sha256:6ff5f3180870836cae40f06fa10419f557208175f13ad7bc26caa77beb1f6e02" + ], + "version": "==2.6.0" + }, + "jupyter": { + "hashes": [ + "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7", + "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78", + "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "jupyter-client": { + "hashes": [ + 
"sha256:27befcf0446b01e29853014d6a902dd101ad7d7f94e2252b1adca17c3466b761", + "sha256:59e6d791e22a8002ad0e80b78c6fd6deecab4f9e1b1aa1a22f4213de271b29ea" + ], + "version": "==5.2.3" + }, + "jupyter-console": { + "hashes": [ + "sha256:3f928b817fc82cda95e431eb4c2b5eb21be5c483c2b43f424761a966bb808094", + "sha256:545dedd3aaaa355148093c5609f0229aeb121b4852995c2accfa64fe3e0e55cd" + ], + "version": "==5.2.0" + }, + "jupyter-core": { + "hashes": [ + "sha256:927d713ffa616ea11972534411544589976b2493fc7e09ad946e010aa7eb9970", + "sha256:ba70754aa680300306c699790128f6fbd8c306ee5927976cbe48adacf240c0b7" + ], + "version": "==4.4.0" + }, + "markupsafe": { + "hashes": [ + "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" + ], + "version": "==1.0" + }, + "mistune": { + "hashes": [ + "sha256:b4c512ce2fc99e5a62eb95a4aba4b73e5f90264115c40b70a21e1f7d4e0eac91", + "sha256:bc10c33bfdcaa4e749b779f62f60d6e12f8215c46a292d05e486b869ae306619" + ], + "version": "==0.8.3" + }, + "nbconvert": { + "hashes": [ + "sha256:08d21cf4203fabafd0d09bbd63f06131b411db8ebeede34b0fd4be4548351779", + "sha256:a8a2749f972592aa9250db975304af6b7337f32337e523a2c995cc9e12c07807" + ], + "version": "==5.4.0" + }, + "nbformat": { + "hashes": [ + "sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b", + "sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402" + ], + "version": "==4.4.0" + }, + "notebook": { + "hashes": [ + "sha256:66dd59e76e755584ae9450eb015c39f55d4bb1d8ec68f2c694d2b3cba7bf5c7e", + "sha256:e2c8e931cc19db4f8c63e6a396efbc13a228b2cb5b2919df011b946f28239a08" + ], + "version": "==5.6.0" + }, + "pandocfilters": { + "hashes": [ + "sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9" + ], + "version": "==1.4.2" + }, + "parso": { + "hashes": [ + "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", + "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + ], + "version": "==0.3.1" + }, + "pexpect": 
{ + "hashes": [ + "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba", + "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.6.0" + }, + "pickleshare": { + "hashes": [ + "sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b", + "sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5" + ], + "version": "==0.7.4" + }, + "pkginfo": { + "hashes": [ + "sha256:5878d542a4b3f237e359926384f1dde4e099c9f5525d236b1840cf704fa8d474", + "sha256:a39076cb3eb34c333a0dd390b568e9e1e881c7bf2cc0aee12120636816f55aee" + ], + "version": "==1.4.2" + }, + "prometheus-client": { + "hashes": [ + "sha256:17bc24c09431644f7c65d7bce9f4237252308070b6395d6d8e87767afe867e24" + ], + "version": "==0.3.1" + }, + "prompt-toolkit": { + "hashes": [ + "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381", + "sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4", + "sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917" + ], + "version": "==1.0.15" + }, + "ptyprocess": { + "hashes": [ + "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", + "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" + ], + "markers": "os_name != 'nt'", + "version": "==0.6.0" + }, + "pygments": { + "hashes": [ + "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", + "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" + ], + "version": "==2.2.0" + }, + "python-dateutil": { + "hashes": [ + "sha256:1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0", + "sha256:e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8" + ], + "version": "==2.7.3" + }, + "pyzmq": { + "hashes": [ + "sha256:25a0715c8f69cf72f67cfe5a68a3f3ed391c67c063d2257bec0fe7fc2c7f08f8", + "sha256:2bab63759632c6b9e0d5bf19cc63c3b01df267d660e0abcf230cf0afaa966349", + 
"sha256:30ab49d99b24bf0908ebe1cdfa421720bfab6f93174e4883075b7ff38cc555ba", + "sha256:32c7ca9fc547a91e3c26fc6080b6982e46e79819e706eb414dd78f635a65d946", + "sha256:41219ae72b3cc86d97557fe5b1ef5d1adc1057292ec597b50050874a970a39cf", + "sha256:4b8c48a9a13cea8f1f16622f9bd46127108af14cd26150461e3eab71e0de3e46", + "sha256:55724997b4a929c0d01b43c95051318e26ddbae23565018e138ae2dc60187e59", + "sha256:65f0a4afae59d4fc0aad54a917ab599162613a761b760ba167d66cc646ac3786", + "sha256:6f88591a8b246f5c285ee6ce5c1bf4f6bd8464b7f090b1333a446b6240a68d40", + "sha256:75022a4c60dcd8765bb9ca32f6de75a0ec83b0d96e0309dc479f4c7b21f26cb7", + "sha256:76ea493bfab18dcb090d825f3662b5612e2def73dffc196d51a5194b0294a81d", + "sha256:7b60c045b80709e4e3c085bab9b691e71761b44c2b42dbb047b8b498e7bc16b3", + "sha256:8e6af2f736734aef8ed6f278f9f552ec7f37b1a6b98e59b887484a840757f67d", + "sha256:9ac2298e486524331e26390eac14e4627effd3f8e001d4266ed9d8f1d2d31cce", + "sha256:9ba650f493a9bc1f24feca1d90fce0e5dd41088a252ac9840131dfbdbf3815ca", + "sha256:a02a4a385e394e46012dc83d2e8fd6523f039bb52997c1c34a2e0dd49ed839c1", + "sha256:a3ceee84114d9f5711fa0f4db9c652af0e4636c89eabc9b7f03a3882569dd1ed", + "sha256:a72b82ac1910f2cf61a49139f4974f994984475f771b0faa730839607eeedddf", + "sha256:ab136ac51027e7c484c53138a0fab4a8a51e80d05162eb7b1585583bcfdbad27", + "sha256:c095b224300bcac61e6c445e27f9046981b1ac20d891b2f1714da89d34c637c8", + "sha256:c5cc52d16c06dc2521340d69adda78a8e1031705924e103c0eb8fc8af861d810", + "sha256:d612e9833a89e8177f8c1dc68d7b4ff98d3186cd331acd616b01bbdab67d3a7b", + "sha256:e828376a23c66c6fe90dcea24b4b72cd774f555a6ee94081670872918df87a19", + "sha256:e9767c7ab2eb552796440168d5c6e23a99ecaade08dda16266d43ad461730192", + "sha256:ebf8b800d42d217e4710d1582b0c8bff20cdcb4faad7c7213e52644034300924" + ], + "markers": "python_version >= '2.7' and python_version != '3.2*' and python_version != '3.0*' and python_version != '3.1*'", + "version": "==17.1.2" + }, + "qtconsole": { + "hashes": [ + 
"sha256:298431d376d71a02eb1a04fe6e72dd4beb82b83423d58b17d532e0af838e62fa", + "sha256:7870b19e6a6b0ab3acc09ee65463c0ca7568b3a01a6902d7c4e1ed2c4fc4e176" + ], + "version": "==4.4.1" + }, + "requests": { + "hashes": [ + "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", + "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" + ], + "version": "==2.19.1" + }, + "requests-toolbelt": { + "hashes": [ + "sha256:42c9c170abc2cacb78b8ab23ac957945c7716249206f90874651971a4acff237", + "sha256:f6a531936c6fa4c6cfce1b9c10d5c4f498d16528d2a54a22ca00011205a187b5" + ], + "version": "==0.8.0" + }, + "scikit-learn": { + "hashes": [ + "sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260", + "sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169", + "sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98", + "sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc", + "sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff", + "sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e", + "sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b", + "sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370", + "sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97", + "sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab", + "sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5", + "sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9", + "sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2", + "sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369", + "sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5", + "sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888", + "sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563", + 
"sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2", + "sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8", + "sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7", + "sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db", + "sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610", + "sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3", + "sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9", + "sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b", + "sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872", + "sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9", + "sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938" + ], + "version": "==0.19.2" + }, + "send2trash": { + "hashes": [ + "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2", + "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b" + ], + "version": "==1.5.0" + }, + "simplegeneric": { + "hashes": [ + "sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173" + ], + "version": "==0.8.1" + }, + "six": { + "hashes": [ + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + ], + "version": "==1.11.0" + }, + "terminado": { + "hashes": [ + "sha256:55abf9ade563b8f9be1f34e4233c7b7bde726059947a593322e8a553cc4c067a", + "sha256:65011551baff97f5414c67018e908110693143cfbaeb16831b743fe7cad8b927" + ], + "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==0.8.1" + }, + "testpath": { + "hashes": [ + "sha256:039fa6a6c9fd3488f8336d23aebbfead5fa602c4a47d49d83845f55a595ec1b4", + 
"sha256:0d5337839c788da5900df70f8e01015aec141aa3fe7936cb0d0a2953f7ac7609" + ], + "version": "==0.3.1" + }, + "tornado": { + "hashes": [ + "sha256:0662d28b1ca9f67108c7e3b77afabfb9c7e87bde174fbda78186ecedc2499a9d", + "sha256:4e5158d97583502a7e2739951553cbd88a72076f152b4b11b64b9a10c4c49409", + "sha256:732e836008c708de2e89a31cb2fa6c0e5a70cb60492bee6f1ea1047500feaf7f", + "sha256:8154ec22c450df4e06b35f131adc4f2f3a12ec85981a203301d310abf580500f", + "sha256:8e9d728c4579682e837c92fdd98036bd5cdefa1da2aaf6acf26947e6dd0c01c5", + "sha256:d4b3e5329f572f055b587efc57d29bd051589fb5a43ec8898c77a47ec2fa2bbb", + "sha256:e5f2585afccbff22390cddac29849df463b252b711aa2ce7c5f3f342a5b3b444" + ], + "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==5.1.1" + }, + "tqdm": { + "hashes": [ + "sha256:18f1818ce951aeb9ea162ae1098b43f583f7d057b34d706f66939353d1208889", + "sha256:df02c0650160986bac0218bb07952245fc6960d23654648b5d5526ad5a4128c9" + ], + "markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version >= '2.6'", + "version": "==4.26.0" + }, + "traitlets": { + "hashes": [ + "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", + "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" + ], + "version": "==4.3.2" + }, + "twine": { + "hashes": [ + "sha256:08eb132bbaec40c6d25b358f546ec1dc96ebd2638a86eea68769d9e67fe2b129", + "sha256:2fd9a4d9ff0bcacf41fdc40c8cb0cfaef1f1859457c9653fd1b92237cc4e9f25" + ], + "index": "pypi", + "version": "==1.11.0" + }, + "urllib3": { + "hashes": [ + "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", + "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" + ], + "markers": "python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*' and python_version >= 
'2.6'", + "version": "==1.23" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" + }, + "wheel": { + "hashes": [ + "sha256:0a2e54558a0628f2145d2fc822137e322412115173e8a2ddbe1c9024338ae83c", + "sha256:80044e51ec5bbf6c894ba0bc48d26a8c20a9ba629f4ca19ea26ecfcf87685f5f" + ], + "index": "pypi", + "version": "==0.31.1" + }, + "widgetsnbextension": { + "hashes": [ + "sha256:14b2c65f9940c9a7d3b70adbe713dbd38b5ec69724eebaba034d1036cf3d4740", + "sha256:fa618be8435447a017fd1bf2c7ae922d0428056cfc7449f7a8641edf76b48265" + ], + "version": "==3.4.2" + } + } +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..c71ec96 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# active-semi-supervised-clustering + +Active semi-supervised clustering algorithms for scikit-learn. + +## Algorithms + +### Semi-supervised clustering + +* Seeded-KMeans +* Constrained-KMeans +* COP-KMeans +* Pairwise constrained K-Means (PCK-Means) +* Metric K-Means (MK-Means) +* Metric pairwise constrained K-Means (MPCK-Means) + +### Active learning of pairwise clustering + +* Explore & Consolidate +* Min-max +* Normalized point-based uncertainty (NPU) method + +## Installation + +``` +pip install active-semi-supervised-clustering +``` + +## Usage + +```python +from sklearn import datasets, metrics +from active_semi_clustering.semi_supervised.pairwise_constraints import PCKMeans +from active_semi_clustering.active.pairwise_constraints import ExampleOracle, ExploreConsolidate, MinMax +``` + +```python +X, y = datasets.load_iris(return_X_y=True) +``` + +First, obtain some pairwise constraints from an oracle. 
+ +```python +# TODO implement your own oracle that will, for example, query a domain expert via GUI or CLI +oracle = ExampleOracle(y, max_queries_cnt=10) + +active_learner = MinMax(n_clusters=3) +active_learner.fit(X, oracle=oracle) +pairwise_constraints = active_learner.pairwise_constraints_ +``` + +Then, use the constraints to do the clustering. + +```python +clusterer = PCKMeans(n_clusters=3) +clusterer.fit(X, ml=pairwise_constraints[0], cl=pairwise_constraints[1]) +``` + +Evaluate the clustering using Adjusted Rand Score. + +```python +metrics.adjusted_rand_score(y, clusterer.labels_) +``` diff --git a/active_semi_clustering/__init__.py b/active_semi_clustering/__init__.py new file mode 100644 index 0000000..071109f --- /dev/null +++ b/active_semi_clustering/__init__.py @@ -0,0 +1,2 @@ +from .semi_supervised.labeled_data import KMeans, SeededKMeans, ConstrainedKMeans +from .semi_supervised.pairwise_constraints import COPKMeans, PCKMeans, MPCKMeans, MPCKMeansMF, MKMeans, RCAKMeans \ No newline at end of file diff --git a/active_semi_clustering/active/__init__.py b/active_semi_clustering/active/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/active_semi_clustering/active/pairwise_constraints/__init__.py b/active_semi_clustering/active/pairwise_constraints/__init__.py new file mode 100644 index 0000000..5b86755 --- /dev/null +++ b/active_semi_clustering/active/pairwise_constraints/__init__.py @@ -0,0 +1,4 @@ +from .explore_consolidate import ExploreConsolidate +from .min_max import MinMax +from .npu import NPU +from .example_oracle import ExampleOracle \ No newline at end of file diff --git a/active_semi_clustering/active/pairwise_constraints/example_oracle.py b/active_semi_clustering/active/pairwise_constraints/example_oracle.py new file mode 100644 index 0000000..982e016 --- /dev/null +++ b/active_semi_clustering/active/pairwise_constraints/example_oracle.py @@ -0,0 +1,17 @@ +class MaximumQueriesExceeded(Exception): + pass + + +class 
ExampleOracle: + def __init__(self, labels, max_queries_cnt=20): + self.labels = labels + self.queries_cnt = 0 + self.max_queries_cnt = max_queries_cnt + + def query(self, i, j): + "Query the oracle to find out whether i and j should be must-linked" + if self.queries_cnt < self.max_queries_cnt: + self.queries_cnt += 1 + return self.labels[i] == self.labels[j] + else: + raise MaximumQueriesExceeded diff --git a/active_semi_clustering/active/pairwise_constraints/explore_consolidate.py b/active_semi_clustering/active/pairwise_constraints/explore_consolidate.py new file mode 100644 index 0000000..50d065b --- /dev/null +++ b/active_semi_clustering/active/pairwise_constraints/explore_consolidate.py @@ -0,0 +1,97 @@ +import numpy as np + +from .helpers import get_constraints_from_neighborhoods +from .example_oracle import MaximumQueriesExceeded + + +class ExploreConsolidate: + def __init__(self, n_clusters=3, **kwargs): + self.n_clusters = n_clusters + + def fit(self, X, oracle=None): + if oracle.max_queries_cnt <= 0: + return [], [] + + neighborhoods = self._explore(X, self.n_clusters, oracle) + neighborhoods = self._consolidate(neighborhoods, X, oracle) + + self.pairwise_constraints_ = get_constraints_from_neighborhoods(neighborhoods) + + return self + + def _explore(self, X, k, oracle): + neighborhoods = [] + traversed = [] + n = X.shape[0] + + x = np.random.choice(n) + neighborhoods.append([x]) + traversed.append(x) + + try: + while len(neighborhoods) < k: + + max_distance = 0 + farthest = None + + for i in range(n): + if i not in traversed: + distance = dist(i, traversed, X) + if distance > max_distance: + max_distance = distance + farthest = i + + new_neighborhood = True + for neighborhood in neighborhoods: + if oracle.query(farthest, neighborhood[0]): + neighborhood.append(farthest) + new_neighborhood = False + break + + if new_neighborhood: + neighborhoods.append([farthest]) + + traversed.append(farthest) + + except MaximumQueriesExceeded: + pass + + return 
def get_constraints_from_neighborhoods(neighborhoods):
    """Expand neighborhood sets into explicit pairwise constraints.

    Every ordered pair of distinct points inside one neighborhood becomes a
    must-link; every ordered pair of points drawn from two different
    neighborhoods becomes a cannot-link.

    :param neighborhoods: list of lists of sample indices
    :return: tuple ``(ml, cl)`` of lists of ``(i, j)`` index pairs
    """
    # Ordered pairs within one neighborhood -> must-link.
    ml = [
        (i, j)
        for hood in neighborhoods
        for i in hood
        for j in hood
        if i != j
    ]

    # Ordered pairs across two different neighborhoods -> cannot-link.
    # Neighborhoods are compared by value, matching the original behavior.
    cl = [
        (i, j)
        for hood in neighborhoods
        for other in neighborhoods
        if hood != other
        for i in hood
        for j in other
    ]

    return ml, cl
def similarity(x, y, kernel_width):
    """Gaussian (RBF) similarity between two points.

    Returns ``exp(-||x - y||^2 / (2 * kernel_width^2))``: 1.0 for identical
    points, decaying towards 0 as the squared Euclidean distance grows.
    """
    squared_distance = ((x - y) ** 2).sum()
    bandwidth = 2 * (kernel_width ** 2)
    return np.exp(-squared_distance / bandwidth)
EmptyClustersException: + continue + break + + x_i, p_i = self._most_informative(X, self.clusterer, neighborhoods) + + sorted_neighborhoods = list(zip(*reversed(sorted(zip(p_i, neighborhoods)))))[1] + # print(x_i, neighborhoods, p_i, sorted_neighborhoods) + + must_link_found = False + + for neighborhood in sorted_neighborhoods: + + must_linked = oracle.query(x_i, neighborhood[0]) + if must_linked: + # TODO is it necessary? this preprocessing is part of the clustering algorithms + for x_j in neighborhood: + ml.append([x_i, x_j]) + + for other_neighborhood in neighborhoods: + if neighborhood != other_neighborhood: + for x_j in other_neighborhood: + cl.append([x_i, x_j]) + + neighborhood.append(x_i) + must_link_found = True + break + + # TODO should we add the cannot-link in case the algorithm stops before it queries all neighborhoods? + + if not must_link_found: + for neighborhood in neighborhoods: + for x_j in neighborhood: + cl.append([x_i, x_j]) + + neighborhoods.append([x_i]) + + except MaximumQueriesExceeded: + break + + self.pairwise_constraints_ = ml, cl + + return self + + def _most_informative(self, X, clusterer, neighborhoods): + n = X.shape[0] + l = len(neighborhoods) + + neighborhoods_union = set() + for neighborhood in neighborhoods: + for i in neighborhood: + neighborhoods_union.add(i) + + unqueried_indices = set(range(n)) - neighborhoods_union + + # TODO if there is only one neighborhood then choose the point randomly? 
+ if l <= 1: + return np.random.choice(list(unqueried_indices)), [1] + + # Learn a random forest classifier + n_estimators = 50 + rf = RandomForestClassifier(n_estimators=n_estimators) + rf.fit(X, clusterer.labels_) + + # Compute the similarity matrix + leaf_indices = rf.apply(X) + S = np.zeros((n, n)) + for i in range(n): + for j in range(n): + S[i, j] = (leaf_indices[i,] == leaf_indices[j,]).sum() + S = S / n_estimators + + p = np.empty((n, l)) + uncertainties = np.zeros(n) + expected_costs = np.ones(n) + + # For each point that is not in any neighborhood... + # TODO iterate only unqueried indices + for x_i in range(n): + if not x_i in neighborhoods_union: + for n_i in range(l): + p[x_i, n_i] = (S[x_i, neighborhoods[n_i]].sum() / len(neighborhoods[n_i])) + + # If the point is not similar to any neighborhood set equal probabilities of belonging to each neighborhood + if np.all(p[x_i,] == 0): + p[x_i,] = np.ones(l) + + p[x_i,] = p[x_i,] / p[x_i,].sum() + + if not np.any(p[x_i,] == 1): + positive_p_i = p[x_i, p[x_i,] > 0] + uncertainties[x_i] = -(positive_p_i * np.log2(positive_p_i)).sum() + expected_costs[x_i] = (positive_p_i * range(1, len(positive_p_i) + 1)).sum() + else: + uncertainties[x_i] = 0 + expected_costs[x_i] = 1 # ? 
class Random:
    """Baseline active learner that queries uniformly random pairs.

    Draws ``oracle.max_queries_cnt`` random pairs of distinct sample indices,
    asks the oracle about each one, and sorts the answers into must-link and
    cannot-link constraint lists stored in ``pairwise_constraints_``.
    """

    def __init__(self, n_clusters=3, **kwargs):
        self.n_clusters = n_clusters

    def fit(self, X, oracle=None):
        """Query random pairs; store ``(ml, cl)`` in ``pairwise_constraints_``."""
        n_samples = X.shape[0]

        # Draw all candidate pairs up front, then query each one.
        pairs = [
            np.random.choice(range(n_samples), size=2, replace=False).tolist()
            for _ in range(oracle.max_queries_cnt)
        ]

        ml, cl = [], []
        for i, j in pairs:
            # The oracle's verdict decides which constraint list the pair joins.
            (ml if oracle.query(i, j) else cl).append((i, j))

        self.pairwise_constraints_ = ml, cl
        return self
def weighted_farthest_first_traversal(points, weights, k):
    """Farthest-first traversal scoring candidates by weight * distance.

    The first point is drawn at random with probability proportional to
    ``weights``; each subsequent point maximizes ``weights[i]`` times its
    Euclidean distance to the nearest already-selected point.

    :param points: array of shape (n_samples, n_features)
    :param weights: probability vector over the samples (sums to 1)
    :param k: number of points to select
    :return: list of ``k`` selected indices
    """
    n = len(points)

    # Weighted random choice of the starting point.
    selected = [np.random.choice(n, size=1, p=weights)[0]]

    for _ in range(k - 1):
        best_score, best_index = 0, None

        for candidate in range(n):
            if candidate in selected:
                continue
            # Distance to the closest already-selected point (inlined `dist`).
            nearest = min(
                np.sqrt(((points[candidate] - points[j]) ** 2).sum())
                for j in selected
            )
            score = weights[candidate] * nearest
            if score > best_score:
                best_score, best_index = score, candidate

        selected.append(best_index)

    return selected
class KMeans:
    """Plain Lloyd's k-means, structured so subclasses can hook each stage.

    Subclasses customize behavior by overriding ``_init_cluster_centers``
    (seeding) and ``_assign_clusters`` (label assignment).

    After ``fit``: ``cluster_centers_`` holds the centroids and ``labels_``
    the per-sample cluster assignments.
    """

    def __init__(self, n_clusters=3, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, X, y=None, **kwargs):
        """Alternate assignment/update steps until the centroids stop moving.

        :param X: array of shape (n_samples, n_features)
        :param y: optional per-sample labels (-1 = unlabeled); unused here but
            consumed by subclasses
        :raises EmptyClustersException: if an assignment leaves a cluster empty
        """
        centers = self._init_cluster_centers(X, y)

        for _ in range(self.max_iter):
            previous_centers = centers.copy()

            labels = self._assign_clusters(X, y, centers, self._dist)
            centers = self._get_cluster_centers(X, labels)

            # Converged once no centroid moved (within absolute tolerance).
            if np.allclose(previous_centers - centers, np.zeros(centers.shape), atol=1e-6, rtol=0):
                break

        self.cluster_centers_, self.labels_ = centers, labels
        return self

    def _init_cluster_centers(self, X, y=None):
        """Seed centroids with a random sample of distinct data points."""
        chosen = np.random.choice(X.shape[0], self.n_clusters, replace=False)
        return X[chosen, :]

    def _dist(self, x, y):
        """Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x - y) ** 2))

    def _assign_clusters(self, X, y, cluster_centers, dist):
        """Label each sample with the index of its nearest centroid."""
        labels = np.full(X.shape[0], fill_value=-1)

        for index, sample in enumerate(X):
            labels[index] = np.argmin([dist(sample, center) for center in cluster_centers])

        # Guard against degenerate solutions, mirroring scikit-learn's check:
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.1/sklearn/cluster/_k_means.pyx#L309
        if np.any(np.bincount(labels, minlength=self.n_clusters) == 0):
            raise EmptyClustersException

        return labels

    def _get_cluster_centers(self, X, labels):
        """Mean of the samples assigned to each cluster."""
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
def preprocess_constraints(ml, cl, n):
    """Build transitive-closure constraint graphs from pairwise constraints.

    Adapted from
    https://github.com/Behrouz-Babaki/COP-Kmeans/blob/master/copkmeans/cop_kmeans.py

    :param ml: iterable of must-link index pairs
    :param cl: iterable of cannot-link index pairs
    :param n: total number of samples
    :return: ``(ml_graph, cl_graph, neighborhoods)`` — adjacency maps from each
        index to the set of its (transitively implied) constrained partners,
        plus the list of connected must-link components
    :raises InconsistentConstraintsException: if some pair ends up both
        must-linked and cannot-linked
    """
    # Adjacency lists: one (initially empty) neighbor set per sample.
    ml_graph = {i: set() for i in range(n)}
    cl_graph = {i: set() for i in range(n)}

    def link(graph, a, b):
        """Add the undirected edge (a, b) to the given adjacency map."""
        graph[a].add(b)
        graph[b].add(a)

    for (a, b) in ml:
        link(ml_graph, a, b)

    for (a, b) in cl:
        link(cl_graph, a, b)

    def collect_component(node, graph, seen, component):
        """Depth-first search appending visited nodes in post-order."""
        seen[node] = True
        for neighbor in graph[node]:
            if not seen[neighbor]:
                collect_component(neighbor, graph, seen, component)
        component.append(node)

    # Transitive closure of the must-link relation: every pair of nodes in a
    # connected component becomes directly must-linked.
    # See http://www.techiedelight.com/transitive-closure-graph/
    seen = [False] * n
    neighborhoods = []
    for start in range(n):
        if seen[start] or not ml_graph[start]:
            continue
        component = []
        collect_component(start, ml_graph, seen, component)
        for a in component:
            for b in component:
                if a != b:
                    ml_graph[a].add(b)
        neighborhoods.append(component)

    # Propagate each cannot-link across both endpoints' must-link closures.
    for (a, b) in cl:
        for x in ml_graph[a]:
            link(cl_graph, x, b)

        for y in ml_graph[b]:
            link(cl_graph, a, y)

        for x in ml_graph[a]:
            for y in ml_graph[b]:
                link(cl_graph, x, y)

    # A pair that is simultaneously must- and cannot-linked is contradictory.
    for a in ml_graph:
        for b in ml_graph[a]:
            if b != a and b in cl_graph[a]:
                raise InconsistentConstraintsException('Inconsistent constraints between {} and {}'.format(a, b))

    return ml_graph, cl_graph, neighborhoods
class COPKMeans:
    """COP-KMeans: k-means that strictly enforces pairwise constraints.

    During assignment every sample is placed in the nearest cluster that does
    not violate any must-link or cannot-link constraint with respect to the
    samples already assigned; if no feasible cluster exists, the whole
    assignment pass is retried with a new random sample order.

    After ``fit``: ``cluster_centers_`` holds the centroids and ``labels_``
    the per-sample cluster assignments.
    """

    def __init__(self, n_clusters=3, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, X, y=None, ml=[], cl=[]):
        """Cluster X subject to must-link (ml) and cannot-link (cl) pairs.

        :param X: array of shape (n_samples, n_features)
        :param y: ignored (kept for a scikit-learn-like signature)
        :param ml: list of must-link index pairs
        :param cl: list of cannot-link index pairs
        :raises ClusteringNotFoundException: if no constraint-satisfying
            assignment is found after many randomized attempts
        :raises EmptyClustersException: if an assignment leaves a cluster empty
        """
        ml_graph, cl_graph, neighborhoods = preprocess_constraints(ml, cl, X.shape[0])

        # Initialize cluster centers
        cluster_centers = self._init_cluster_centers(X)

        # Repeat until convergence
        for iteration in range(self.max_iter):
            prev_cluster_centers = cluster_centers.copy()

            # Assign clusters under the constraints, then recompute the means.
            labels = self._assign_clusters(X, cluster_centers, self._dist, ml_graph, cl_graph)
            cluster_centers = self._get_cluster_centers(X, labels)

            # Converged once no centroid moved (within absolute tolerance).
            cluster_centers_shift = (prev_cluster_centers - cluster_centers)
            converged = np.allclose(cluster_centers_shift, np.zeros(cluster_centers.shape), atol=1e-6, rtol=0)

            if converged: break

        self.cluster_centers_, self.labels_ = cluster_centers, labels

        return self

    def _init_cluster_centers(self, X):
        """Seed centroids with a random sample of distinct data points."""
        return X[np.random.choice(X.shape[0], self.n_clusters, replace=False), :]

    def _dist(self, x, y):
        """Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x - y) ** 2))

    def _assign_clusters(self, *args):
        """Retry randomized constrained assignment until one pass succeeds."""
        max_retries_cnt = 1000

        for retries_cnt in range(max_retries_cnt):
            try:
                return self._try_assign_clusters(*args)

            except ClusteringNotFoundException:
                continue

        raise ClusteringNotFoundException

    def _try_assign_clusters(self, X, cluster_centers, dist, ml_graph, cl_graph):
        """One randomized pass: nearest feasible cluster for each sample."""
        labels = np.full(X.shape[0], fill_value=-1)

        data_indices = list(range(X.shape[0]))
        np.random.shuffle(data_indices)

        for i in data_indices:
            distances = np.array([dist(X[i], c) for c in cluster_centers])

            # Try clusters from nearest to farthest; keep the first one that
            # does not violate a constraint w.r.t. already-assigned samples.
            for cluster_index in distances.argsort():
                if not self._violates_constraints(i, cluster_index, labels, ml_graph, cl_graph):
                    labels[i] = cluster_index
                    break

            if labels[i] < 0:
                raise ClusteringNotFoundException

        # Handle empty clusters
        # See https://github.com/scikit-learn/scikit-learn/blob/0.19.1/sklearn/cluster/_k_means.pyx#L309
        n_samples_in_cluster = np.bincount(labels, minlength=self.n_clusters)
        empty_clusters = np.where(n_samples_in_cluster == 0)[0]

        if len(empty_clusters) > 0:
            raise EmptyClustersException

        return labels

    def _violates_constraints(self, i, cluster_index, labels, ml_graph, cl_graph):
        """True if putting sample ``i`` into ``cluster_index`` breaks a constraint.

        Unassigned samples carry the sentinel label -1 and impose no
        restriction yet.
        """
        for j in ml_graph[i]:
            # BUGFIX: was `labels[j] > 0`, which treated cluster 0 as
            # "unassigned" and silently ignored must-link partners placed in
            # cluster 0. Cluster indices start at 0; only -1 means unassigned.
            if labels[j] >= 0 and cluster_index != labels[j]:
                return True

        for j in cl_graph[i]:
            if cluster_index == labels[j]:
                return True

        return False

    def _get_cluster_centers(self, X, labels):
        """Mean of the samples assigned to each cluster."""
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
class MPCKMeans:
    "MPCK-Means-S-D that learns only a single (S) diagonal (D) matrix"

    # Metric Pairwise Constrained K-Means: alternates constraint-aware cluster
    # assignment, centroid re-estimation and diagonal-metric updates.
    # Constraint violations are penalized in the objective, not enforced.
    # After fit: `cluster_centers_` holds centroids, `labels_` the assignments.

    def __init__(self, n_clusters=3, max_iter=10, w=1):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.w = w  # weight of the constraint-violation penalty terms

    def fit(self, X, y=None, ml=[], cl=[]):
        """Cluster X, softly honoring must-link (ml) and cannot-link (cl) pairs.

        :param X: array of shape (n_samples, n_features)
        :param y: ignored (kept for a scikit-learn-like signature)
        :param ml: list of must-link index pairs
        :param cl: list of cannot-link index pairs
        :raises EmptyClustersException: if an assignment leaves a cluster empty
        """
        # Preprocess constraints
        ml_graph, cl_graph, neighborhoods = preprocess_constraints(ml, cl, X.shape[0])

        # Initialize cluster centers
        cluster_centers = self._initialize_cluster_centers(X, neighborhoods)

        # Initialize the (diagonal) metric as the identity.
        A = np.identity(X.shape[1])

        # Repeat until convergence
        for iteration in range(self.max_iter):
            prev_cluster_centers = cluster_centers.copy()

            # Farthest pair of points under the current metric; scales the
            # cannot-link penalty in the objective.
            farthest = self._find_farthest_pairs_of_points(X, A)

            # Assign clusters
            labels = self._assign_clusters(X, y, cluster_centers, A, farthest, ml_graph, cl_graph, self.w)

            # Estimate means
            cluster_centers = self._get_cluster_centers(X, labels)

            # Update metrics
            A = self._update_metrics(X, labels, cluster_centers, farthest, ml_graph, cl_graph, self.w)

            # Check for convergence
            cluster_centers_shift = (prev_cluster_centers - cluster_centers)
            converged = np.allclose(cluster_centers_shift, np.zeros(cluster_centers.shape), atol=1e-6, rtol=0)

            if converged:
                break

        self.cluster_centers_, self.labels_ = cluster_centers, labels

        return self

    def _find_farthest_pairs_of_points(self, X, A):
        """Return ``(i, j, distance)`` for the most distant pair under metric A."""
        farthest = None
        n = X.shape[0]
        max_distance = 0

        for i in range(n):
            for j in range(n):
                if j < i:
                    distance = self._dist(X[i], X[j], A)
                    if distance > max_distance:
                        max_distance = distance
                        farthest = (i, j, distance)

        assert farthest is not None

        return farthest

    def _initialize_cluster_centers(self, X, neighborhoods):
        """Seed centroids from must-link neighborhoods, topping up randomly."""
        neighborhood_centers = np.array([X[neighborhood].mean(axis=0) for neighborhood in neighborhoods])
        neighborhood_sizes = np.array([len(neighborhood) for neighborhood in neighborhoods])

        if len(neighborhoods) > self.n_clusters:
            # BUGFIX: the weight normalization is computed only here, where it
            # is used; computing it unconditionally divided by zero (0/0) for
            # empty `neighborhoods`, which raises under np.seterr('raise').
            neighborhood_weights = neighborhood_sizes / neighborhood_sizes.sum()
            cluster_centers = neighborhood_centers[weighted_farthest_first_traversal(neighborhood_centers, neighborhood_weights, self.n_clusters)]
        else:
            if len(neighborhoods) > 0:
                cluster_centers = neighborhood_centers
            else:
                cluster_centers = np.empty((0, X.shape[1]))

        if len(neighborhoods) < self.n_clusters:
            remaining_cluster_centers = X[np.random.choice(X.shape[0], self.n_clusters - len(neighborhoods), replace=False), :]
            cluster_centers = np.concatenate([cluster_centers, remaining_cluster_centers])

        return cluster_centers

    def _dist(self, x, y, A):
        "(x - y)^T A (x - y)"
        return scipy.spatial.distance.mahalanobis(x, y, A) ** 2

    def _objective_fn(self, X, i, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w):
        """Cost of putting sample ``i`` into ``cluster_id`` (lower is better)."""
        # Distance term plus the metric's log-determinant normalizer (base 2).
        # BUGFIX: guard the determinant as MPCKMeansMF does — with
        # np.seterr('raise') active at module level, log(0) raises
        # FloatingPointError once the learned diagonal metric underflows.
        term_d = self._dist(X[i], cluster_centers[cluster_id], A) - np.log(max(np.linalg.det(A), 1e-9)) / np.log(2)

        def f_m(i, j, A):
            return self._dist(X[i], X[j], A)

        def f_c(i, j, A, farthest):
            return farthest[2] - self._dist(X[i], X[j], A)

        # Penalty for must-link partners currently assigned elsewhere.
        term_m = 0
        for j in ml_graph[i]:
            if labels[j] >= 0 and labels[j] != cluster_id:
                term_m += 2 * w * f_m(i, j, A)

        # Penalty for cannot-link partners currently assigned to this cluster,
        # scaled by the farthest-pair distance so closer violators cost more.
        term_c = 0
        for j in cl_graph[i]:
            if labels[j] == cluster_id:
                term_c += 2 * w * f_c(i, j, A, farthest)

        return term_d + term_m + term_c

    def _assign_clusters(self, X, y, cluster_centers, A, farthest, ml_graph, cl_graph, w):
        """Greedily assign each sample (in random order) to its cheapest cluster."""
        labels = np.full(X.shape[0], fill_value=-1)

        index = list(range(X.shape[0]))
        np.random.shuffle(index)
        for i in index:
            labels[i] = np.argmin([self._objective_fn(X, i, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w) for cluster_id, cluster_center in enumerate(cluster_centers)])

        # Handle empty clusters
        # See https://github.com/scikit-learn/scikit-learn/blob/0.19.1/sklearn/cluster/_k_means.pyx#L309
        n_samples_in_cluster = np.bincount(labels, minlength=self.n_clusters)
        empty_clusters = np.where(n_samples_in_cluster == 0)[0]

        if len(empty_clusters) > 0:
            raise EmptyClustersException

        return labels

    def _update_metrics(self, X, labels, cluster_centers, farthest, ml_graph, cl_graph, w):
        """Re-estimate the diagonal metric from within-cluster spread and penalties."""
        N, D = X.shape
        A = np.zeros((D, D))

        for d in range(D):
            # Within-cluster squared deviations along dimension d.
            term_x = np.sum([(x[d] - cluster_centers[labels[i], d]) ** 2 for i, x in enumerate(X)])

            # Violated must-links: halved because each pair is counted twice
            # (once from each endpoint's adjacency list).
            term_m = 0
            for i in range(N):
                for j in ml_graph[i]:
                    if labels[i] != labels[j]:
                        term_m += 1 / 2 * w * (X[i, d] - X[j, d]) ** 2

            # Violated cannot-links, clipped at zero.
            term_c = 0
            for i in range(N):
                for j in cl_graph[i]:
                    if labels[i] == labels[j]:
                        tmp = ((X[farthest[0], d] - X[farthest[1], d]) ** 2 - (X[i, d] - X[j, d]) ** 2)
                        term_c += w * max(tmp, 0)

            # Guard the denominator so a zero-spread dimension cannot blow up.
            A[d, d] = N * 1 / max(term_x + term_m + term_c, 1e-9)

        return A

    def _get_cluster_centers(self, X, labels):
        """Mean of the samples assigned to each cluster."""
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
break + + # print('\t', iteration, converged) + + self.cluster_centers_, self.labels_ = cluster_centers, labels + self.As_ = As + + return self + + def _find_farthest_pairs_of_points(self, X, As): + farthest = [None] * self.n_clusters + + n = X.shape[0] + for cluster_id in range(self.n_clusters): + max_distance = 0 + + for i in range(n): + for j in range(n): + if j < i: + distance = self._dist(X[i], X[j], As[cluster_id]) + if distance > max_distance: + max_distance = distance + farthest[cluster_id] = (i, j, distance) + + return farthest + + def _initialize_cluster_centers(self, X, neighborhoods): + neighborhood_centers = np.array([X[neighborhood].mean(axis=0) for neighborhood in neighborhoods]) + neighborhood_sizes = np.array([len(neighborhood) for neighborhood in neighborhoods]) + neighborhood_weights = neighborhood_sizes / neighborhood_sizes.sum() + + # print('\t', len(neighborhoods), neighborhood_sizes) + + if len(neighborhoods) > self.n_clusters: + cluster_centers = neighborhood_centers[weighted_farthest_first_traversal(neighborhood_centers, neighborhood_weights, self.n_clusters)] + else: + if len(neighborhoods) > 0: + cluster_centers = neighborhood_centers + else: + cluster_centers = np.empty((0, X.shape[1])) + + if len(neighborhoods) < self.n_clusters: + remaining_cluster_centers = X[np.random.choice(X.shape[0], self.n_clusters - len(neighborhoods), replace=False), :] + cluster_centers = np.concatenate([cluster_centers, remaining_cluster_centers]) + + return cluster_centers + + def _dist(self, x, y, A): + "(x - y)^T A (x - y)" + return scipy.spatial.distance.mahalanobis(x, y, A) ** 2 + + def _objective_function(self, X, i, labels, cluster_centers, cluster_id, As, farthest, ml_graph, cl_graph, w): + term_d = self._dist(X[i], cluster_centers[cluster_id], As[cluster_id]) - np.log(max(np.linalg.det(As[cluster_id]), 1e-9)) + + def f_m(i, c_i, j, c_j, As): + return 1 / 2 * self._dist(X[i], X[j], As[c_i]) + 1 / 2 * self._dist(X[i], X[j], As[c_j]) + + def f_c(i, c_i, 
j, c_j, As, farthest): + return farthest[c_i][2] - self._dist(X[i], X[j], As[c_i]) + + term_m = 0 + for j in ml_graph[i]: + if labels[j] >= 0 and labels[j] != cluster_id: + term_m += 2 * w * f_m(i, cluster_id, j, labels[j], As) + + term_c = 0 + for j in cl_graph[i]: + if labels[j] == cluster_id: + term_c += 2 * w * f_c(i, cluster_id, j, labels[j], As, farthest) + + return term_d + term_m + term_c + + def _assign_clusters(self, X, y, cluster_centers, As, farthest, ml_graph, cl_graph, w): + labels = np.full(X.shape[0], fill_value=-1) + + index = list(range(X.shape[0])) + np.random.shuffle(index) + for i in index: + labels[i] = np.argmin( + [self._objective_function(X, i, labels, cluster_centers, cluster_id, As, farthest, ml_graph, cl_graph, w) for cluster_id, cluster_center in enumerate(cluster_centers)]) + + # Handle empty clusters + # See https://github.com/scikit-learn/scikit-learn/blob/0.19.1/sklearn/cluster/_k_means.pyx#L309 + n_samples_in_cluster = np.bincount(labels, minlength=self.n_clusters) + empty_clusters = np.where(n_samples_in_cluster == 0)[0] + + if len(empty_clusters) > 0: + # print("Empty clusters") + raise EmptyClustersException + + return labels + + def _update_metrics(self, X, labels, cluster_centers, farthest, ml_graph, cl_graph, w): + As = [] + + for cluster_id in range(self.n_clusters): + X_i = X[labels == cluster_id] + n = X_i.shape[0] + + if n == 1: + As.append(np.identity(X_i.shape[1])) + continue + + A_inv = (X_i - cluster_centers[cluster_id]).T @ (X_i - cluster_centers[cluster_id]) + + for i in range(X.shape[0]): + for j in ml_graph[i]: + if labels[i] == cluster_id or labels[j] == cluster_id: + if labels[i] != labels[j]: + A_inv += 1 / 2 * w * ((X[i][:, None] - X[j][:, None]) @ (X[i][:, None] - X[j][:, None]).T) + + for i in range(X.shape[0]): + for j in cl_graph[i]: + if labels[i] == cluster_id or labels[j] == cluster_id: + if labels[i] == labels[j]: + A_inv += w * ( + ((X[farthest[cluster_id][0]][:, None] - 
X[farthest[cluster_id][1]][:, None]) @ (X[farthest[cluster_id][0]][:, None] - X[farthest[cluster_id][1]][:, None]).T) - ( + (X[i][:, None] - X[j][:, None]) @ (X[i][:, None] - X[j][:, None]).T)) + + # Handle the case when the matrix is not invertible + if not self._is_invertible(A_inv): + # print("Not invertible") + A_inv += 1e-9 * np.trace(A_inv) * np.identity(A_inv.shape[0]) + + A = n * np.linalg.inv(A_inv) + + # Is A positive semidefinite? + if not np.all(np.linalg.eigvals(A) >= 0): + # print("Negative definite") + eigenvalues, eigenvectors = np.linalg.eigh(A) + A = eigenvectors @ np.diag(np.maximum(0, eigenvalues)) @ np.linalg.inv(eigenvectors) + + As.append(A) + + return As + + def _get_cluster_centers(self, X, labels): + return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)]) + + def _is_invertible(self, A): + return A.shape[0] == A.shape[1] and np.linalg.matrix_rank(A) == A.shape[0] diff --git a/active_semi_clustering/semi_supervised/pairwise_constraints/pckmeans.py b/active_semi_clustering/semi_supervised/pairwise_constraints/pckmeans.py new file mode 100644 index 0000000..4fbce2f --- /dev/null +++ b/active_semi_clustering/semi_supervised/pairwise_constraints/pckmeans.py @@ -0,0 +1,95 @@ +import numpy as np + +from active_semi_clustering.exceptions import EmptyClustersException +from .constraints import preprocess_constraints + + +class PCKMeans: + def __init__(self, n_clusters=3, max_iter=100, w=1): + self.n_clusters = n_clusters + self.max_iter = max_iter + self.w = w + + def fit(self, X, y=None, ml=[], cl=[]): + # Preprocess constraints + ml_graph, cl_graph, neighborhoods = preprocess_constraints(ml, cl, X.shape[0]) + + # Initialize centroids + cluster_centers = self._initialize_cluster_centers(X, neighborhoods) + + # Repeat until convergence + for iteration in range(self.max_iter): + # Assign clusters + labels = self._assign_clusters(X, cluster_centers, ml_graph, cl_graph, self.w) + + # Estimate means + prev_cluster_centers = 
class PCKMeans:
    """Pairwise Constrained K-Means (PCK-Means).

    K-Means variant whose per-point assignment cost adds a penalty ``w``
    for every violated must-link or cannot-link constraint.

    Parameters
    ----------
    n_clusters : int, number of clusters K.
    max_iter : int, maximum number of assignment/update iterations.
    w : numeric, penalty weight of one violated constraint.
    """

    def __init__(self, n_clusters=3, max_iter=100, w=1):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.w = w

    def fit(self, X, y=None, ml=None, cl=None):
        """Cluster X subject to must-link (ml) and cannot-link (cl) pairs.

        y is ignored (kept for scikit-learn API compatibility).  Sets
        ``cluster_centers_`` and ``labels_``; raises EmptyClustersException
        when an iteration leaves a cluster with no points.
        """
        # No mutable default arguments: None stands for "no constraints".
        ml = [] if ml is None else ml
        cl = [] if cl is None else cl

        # Preprocess constraints: constraint graphs and neighborhoods.
        ml_graph, cl_graph, neighborhoods = preprocess_constraints(ml, cl, X.shape[0])

        # Initialize centroids from the constraint neighborhoods.
        cluster_centers = self._initialize_cluster_centers(X, neighborhoods)

        # Alternate assignment and mean-estimation until convergence.
        for _ in range(self.max_iter):
            labels = self._assign_clusters(X, cluster_centers, ml_graph, cl_graph, self.w)

            prev_cluster_centers = cluster_centers
            cluster_centers = self._get_cluster_centers(X, labels)

            # Converged once no centroid moved more than the tolerance.
            if np.allclose(prev_cluster_centers, cluster_centers, atol=1e-6, rtol=0):
                break

        self.cluster_centers_, self.labels_ = cluster_centers, labels

        return self

    def _initialize_cluster_centers(self, X, neighborhoods):
        """Seed centroids from constraint neighborhoods, topping up with
        random data points when there are fewer neighborhoods than clusters."""
        neighborhood_centers = np.array([X[neighborhood].mean(axis=0) for neighborhood in neighborhoods])
        neighborhood_sizes = np.array([len(neighborhood) for neighborhood in neighborhoods])

        if len(neighborhoods) > self.n_clusters:
            # Select K largest neighborhoods' centroids
            cluster_centers = neighborhood_centers[np.argsort(neighborhood_sizes)[-self.n_clusters:]]
        elif len(neighborhoods) > 0:
            cluster_centers = neighborhood_centers
        else:
            cluster_centers = np.empty((0, X.shape[1]))

        # FIXME look for a point that is connected by cannot-links to every neighborhood set

        if len(neighborhoods) < self.n_clusters:
            remaining_cluster_centers = X[np.random.choice(X.shape[0], self.n_clusters - len(neighborhoods), replace=False), :]
            cluster_centers = np.concatenate([cluster_centers, remaining_cluster_centers])

        return cluster_centers

    def _objective_function(self, X, x_i, centroids, c_i, labels, ml_graph, cl_graph, w):
        """Cost of assigning x_i to cluster c_i: half squared Euclidean
        distance to the centroid plus w per violated constraint among
        already-assigned neighbours (label -1 means unassigned)."""
        distance = 1 / 2 * np.sum((X[x_i] - centroids[c_i]) ** 2)

        # Must-link neighbours already placed in a *different* cluster.
        ml_penalty = w * sum(1 for y_i in ml_graph[x_i] if labels[y_i] != -1 and labels[y_i] != c_i)
        # Cannot-link neighbours placed in *this* cluster.
        cl_penalty = w * sum(1 for y_i in cl_graph[x_i] if labels[y_i] == c_i)

        return distance + ml_penalty + cl_penalty

    def _assign_clusters(self, X, cluster_centers, ml_graph, cl_graph, w):
        """Assign each point, in random order, to its cheapest cluster.

        Raises EmptyClustersException if any cluster receives no points.
        """
        labels = np.full(X.shape[0], fill_value=-1)

        # Random order matters: earlier assignments feed the constraint
        # penalties of later ones.
        index = list(range(X.shape[0]))
        np.random.shuffle(index)
        for x_i in index:
            labels[x_i] = np.argmin([self._objective_function(X, x_i, cluster_centers, c_i, labels, ml_graph, cl_graph, w) for c_i in range(self.n_clusters)])

        # Handle empty clusters
        # See https://github.com/scikit-learn/scikit-learn/blob/0.19.1/sklearn/cluster/_k_means.pyx#L309
        n_samples_in_cluster = np.bincount(labels, minlength=self.n_clusters)
        if np.any(n_samples_in_cluster == 0):
            raise EmptyClustersException

        return labels

    def _get_cluster_centers(self, X, labels):
        """Mean of the points assigned to each cluster (empty clusters are
        ruled out by _assign_clusters)."""
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
class RCAKMeans:
    """Relevant Components Analysis (RCA) metric learning followed by KMeans.

    Must-link constraints are turned into RCA "chunklets"; KMeans then runs
    in the learned transformed space.  Cannot-link pairs only influence the
    neighborhood construction, not RCA itself.
    """

    def __init__(self, n_clusters=3, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, X, y=None, ml=[], cl=[]):
        """Fit on X; y is ignored.  Stores the labels in ``labels_``."""
        X_transformed = X

        if ml:
            # Each constraint neighborhood becomes one RCA chunklet;
            # unconstrained points keep the "no chunk" marker -1.
            chunks = np.full(X.shape[0], -1)
            _, _, neighborhoods = preprocess_constraints(ml, cl, X.shape[0])
            for chunk_id, neighborhood in enumerate(neighborhoods):
                chunks[neighborhood] = chunk_id

            rca = RCA()
            rca.fit(X, chunks=chunks)
            X_transformed = rca.transform(X)

        kmeans = KMeans(n_clusters=self.n_clusters, max_iter=self.max_iter)
        kmeans.fit(X_transformed)

        self.labels_ = kmeans.labels_

        return self
"markdown", + "metadata": {}, + "source": [ + "# Active Semi-Supervised Clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import datasets, metrics\n", + "from active_semi_clustering.semi_supervised.pairwise_constraints import PCKMeans\n", + "from active_semi_clustering.active.pairwise_constraints import ExampleOracle, ExploreConsolidate, MinMax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = datasets.load_iris(return_X_y=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, obtain some pairwise constraints from the oracle." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO implement your own oracle that will, for example, query a domain expert via GUI or CLI\n", + "oracle = ExampleOracle(y, max_queries_cnt=10)\n", + "\n", + "active_learner = MinMax(n_clusters=3)\n", + "active_learner.fit(X, oracle=oracle)\n", + "pairwise_constraints = active_learner.pairwise_constraints_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, use the constraints to do the clustering." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusterer = PCKMeans(n_clusters=3)\n", + "clusterer.fit(X, ml=pairwise_constraints[0], cl=pairwise_constraints[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate the clustering using Adjusted Rand Score." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics.adjusted_rand_score(y, clusterer.labels_)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..aef1dba --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="active-semi-supervised-clustering", + version="0.0.1", + author="Jakub Svehla", + author_email="jakub.svehla@datamole.cz", + description="Active semi-supervised clustering algorithms for scikit-learn", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/datamole-ai/active-semi-supervised-clustering", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + install_requires=[ + 'numpy', + 'scipy', + 'scikit-learn', + 'metric-learn>=0.4', + ] +)