@wrap_kwargs
def run_workflow(
    model_name: str,
    dry_run: bool = False,
    repo_create: bool = False,
) -> None:
    """Run the workflow for a specific model.

    Parameters
    ----------
    model_name
        Name of the workflow module under ``scvi_hub_models.models``. One of
        ``"heart_cell_atlas"``, ``"human_lung_cell_atlas"`` or ``"tabula_sapiens"``.
    dry_run
        Forwarded unchanged to the selected module's ``model_workflow``.
    repo_create
        Forwarded unchanged to the selected module's ``model_workflow``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported workflows.
    """
    import logging
    from importlib import import_module

    logger = logging.getLogger(__name__)

    supported_models = ("heart_cell_atlas", "human_lung_cell_atlas", "tabula_sapiens")
    if model_name not in supported_models:
        # Fail early with an actionable message; without this check an unknown
        # name falls through every branch and the call below hits an unbound
        # ``model_workflow``.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of: {', '.join(supported_models)}."
        )

    # Import lazily so that only the selected workflow's dependencies are loaded.
    model_workflow = import_module(f"scvi_hub_models.models.{model_name}").model_workflow

    logger.info(f"Started running {model_name} workflow with `dry_run={dry_run}` and " f"`repo_create={repo_create}`.")
    model_workflow(dry_run=dry_run, repo_create=repo_create)
"scvi-tools/heart-cell-atlas-scvi", + "metadata": { + "training_data_url": "https://www.heartcellatlas.org/#DataSources", + "tissues": ["heart"], + "data_modalities": ["rna"], + "data_is_annotated": True, + "license_info": "cc-by-4.0", + "description": "Combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.", + "references": "Kazumasa Kanemaru, James Cranley, Daniele Muraro, Antonio M. A. Miranda, Siew Yen Ho, Anna Wilbrey-Clark, Jan Patrick Pett, Krzysztof Polanski, Laura Richardson, Monika Litvinukova, Natsuhiko Kumasaka, Yue Qin, Zuzanna Jablonska, Claudia I. Semprich, Lukas Mach, Monika Dabrowska, Nathan Richoz, Liam Bolt, Lira Mamanova, Rakeshlal Kapuge, Sam N. Barnett, Shani Perera, Carlos Talavera-López, Ilaria Mulas, Krishnaa T. Mahbubani, Liz Tuck, Lu Wang, Margaret M. Huang, Martin Prete, Sophie Pritchard, John Dark, Kourosh Saeb-Parsy, Minal Patel, Menna R. Clatworthy, Norbert Hübner, Rasheda A. Chowdhury, Michela Noseda & Sarah A. Teichmann. Spatially resolved multiomics of human cardiac niches. Nature, July 2023. 
doi:10.1038/s41586-023-06311-1.", + }, +} diff --git a/src/scvi_hub_models/config/_human_lung_cell_atlas.py b/src/scvi_hub_models/config/_human_lung_cell_atlas.py new file mode 100644 index 0000000..2ddb515 --- /dev/null +++ b/src/scvi_hub_models/config/_human_lung_cell_atlas.py @@ -0,0 +1,23 @@ +_CONFIG = { + "legacy_model_url": "https://zenodo.org/records/7599104/files/HLCA_reference_model.zip", + "legacy_model_hash": "a7cd60f4342292b3cba54545bcd8a34decdc8e6b82163f009273d543e7e3910e", + "legacy_model_dir": "hlca_scanvi_reference_legacy", + "model_dir": "hlca_scanvi_reference", + "reference_adata_cxg_id": "066943a2-fdac-4b29-b348-40cede398e4e", + "reference_adata_fname": "hlca_core.h5ad", + "embedding_adata_url": "https://zenodo.org/records/7599104/files/HLCA_full_v1.1_emb.h5ad", + "embedding_adata_hash": "3e2c4da281b6883464b2a70bcc1562d1c4246de32093e7b08090673fbad56a97", + "embedding_adata_fname": "hlca_all_emb.h5ad", + "mini_model_dir": "hlca_scanvi_reference_mini", + "repo_name": "scvi-tools/human-lung-cell-atlas", + "metadata": { + "training_data_url": "https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293", + "training_code_url": "https://github.com/LungCellAtlas/HLCA_reproducibility", + "tissues": ["nose", "respiratory airway", "lung parenchyma"], + "data_modalities": ["rna"], + "data_is_annotated": True, + "license_info": "cc-by-4.0", + "description": "The integrated Human Lung Cell Atlas (HLCA) represents the first large-scale, integrated single-cell reference atlas of the human lung.", + "references": "Lisa Sikkema, Ciro Ramírez-Suástegui, Daniel C. Strobl, Tessa E. Gillett, Luke Zappia, Elo Madissoon, Nikolay S. Markov, Laure-Emmanuelle Zaragosi, Yuge Ji, Meshal Ansari, Marie-Jeanne Arguel, Leonie Apperloo, Martin Banchero, Christophe Bécavin, Marijn Berg, Evgeny Chichelnitskiy, Mei-i Chung, Antoine Collin, Aurore C. A. Gay, Janine Gote-Schniering, Baharak Hooshiar Kashani, Kemal Inecik, Manu Jain, Theodore S. 
Kapellos, Tessa M. Kole, Sylvie Leroy, Christoph H. Mayr, Amanda J. Oliver, Michael von Papen, Lance Peter, Chase J. Taylor, Thomas Walzthoeni, Chuan Xu, Linh T. Bui, Carlo De Donno, Leander Dony, Alen Faiz, Minzhe Guo, Austin J. Gutierrez, Lukas Heumos, Ni Huang, Ignacio L. Ibarra, Nathan D. Jackson, Preetish Kadur Lakshminarasimha Murthy, Mohammad Lotfollahi, Tracy Tabib, Carlos Talavera-López, Kyle J. Travaglini, Anna Wilbrey-Clark, Kaylee B. Worlock, Masahiro Yoshida, Lung Biological Network Consortium, Maarten van den Berge, Yohan Bossé, Tushar J. Desai, Oliver Eickelberg, Naftali Kaminski, Mark A. Krasnow, Robert Lafyatis, Marko Z. Nikolic, Joseph E. Powell, Jayaraj Rajagopal, Mauricio Rojas, Orit Rozenblatt-Rosen, Max A. Seibold, Dean Sheppard, Douglas P. Shepherd, Don D. Sin, Wim Timens, Alexander M. Tsankov, Jeffrey Whitsett, Yan Xu, Nicholas E. Banovich, Pascal Barbry, Thu Elizabeth Duong, Christine S. Falk, Kerstin B. Meyer, Jonathan A. Kropski, Dana Pe’er, Herbert B. Schiller, Purushothama Rao Tata, Joachim L. Schultze, Sara A. Teichmann, Alexander V. Misharin, Martijn C. Nawijn, Malte D. Luecken, and Fabian J. Theis. An integrated cell atlas of the lung in health and disease. Nature Medicine, June 2023. 
doi:10.1038/s41591-023-02327-2.", + }, +} diff --git a/src/scvi_hub_models/config/_tabula_sapiens.py b/src/scvi_hub_models/config/_tabula_sapiens.py new file mode 100644 index 0000000..ef0da92 --- /dev/null +++ b/src/scvi_hub_models/config/_tabula_sapiens.py @@ -0,0 +1,88 @@ +_CONFIG = { + "tissues": [ + "Bladder", + "Blood", + "Bone_Marrow", + "Eye", + "Fat", + "Heart", + "Large_Intestine", + "Liver", + "Lung", + "Lymph_Node", + "Mammary", + "Muscle", + "Pancreas", + "Prostate", + "Salivary_Gland", + "Skin", + "Small_Intestine", + "Spleen", + "Thymus", + "Tongue", + "Trachea", + "Uterus", + "Vasculature", + ], + "model_hashes": { + "Bladder": "3cbdba7afe4f18e13e19228c38ffd57f0606f5524393e6c108ab15eba0da4042", + "Blood": "a7dc61b6604842a157b2a16180eb53df805fe7f3a8096985c4775fb893a7d2a3", + "Bone_Marrow": "e46f20f64f404dcbb2b5ae814061849c22e9b0b59d03a2dff65bbb6e6f792888", + "Eye": "82578a5d0867c72a6415fc77c96231617ae1799d21e33cac96f40ab1293819e1", + "Fat": "ee1befdd03313d65c6d5089b503277c347ea495df55b7282c7149eb11cc66df1", + "Heart": "3cedbb01b451ce6b1668f057ebb7fed493d1bd49dec4cbda4f8d4363944dee47", + "Large_Intestine": "ae78f8166d9aeddb9abb8fee64a9d72581db6bd002f009d3f0a69e3de2f3218e", + "Liver": "c9b55e28d589a27b97a8205746d51388eecbe02b043862b89d4511c48ca78af2", + "Lung": "8f5425470e624ce15db7bbc92cc9ede56e3893e8367812b4c991bb4dedc14f19", + "Lymph_Node": "925a1fccf1d20e97cf84d1f5b78ea07b59bfe38a60920c6db4265158654b573c", + "Mammary": "0ad9dd4e91c2b5d88f9a9c8804c379af47207aa0823a2bcb2c93c7bf9d5e9b18", + "Muscle": "2710c00173f9dd247f89e595867151ab799370797f19c3dc9f7dabc4e9d68ab4", + "Pancreas": "e2a9487863b6b218dde014f38b1e6477f041614726d5448fa23041c6d6d7916a", + "Prostate": "34c4a8362cba4000ebaf0fec2af151f8f93eeed1f0001308a732190357a5f437", + "Salivary_Gland": "d444cb54b99f1e13e920d2dba7ba2aece33d20917d19d87668c78c42ec482293", + "Skin": "9103e61226183ac4ced7fc49be081538335be2eded0ee7b9ba9dea35a5a1acbe", + "Small_Intestine": 
"b469b6d27bb25d2d04d8e55109a72fac08b71260969d22b3d1ccd306e6c34666", + "Spleen": "7933f10778d237e15fffaae70ea872f29f56a40e227c6e1c4f8c4d6603c0211b", + "Thymus": "d1238909376beb6ca1bf3e99a75023a5522d395f6887b0a710aad08ffc735517", + "Tongue": "c2ad649e5d9856964fe2b0472fe00cb3a85b8c429bef88e17f7ae2a95e9e28cd", + "Trachea": "b0525106022ff29c5777ed65e643f5089d44b846fde76fb1d3b7206a1d9799dd", + "Uterus": "85a878008bf2c2d91613ed5086ec6c2a5e95628be31e5a049192916f7e327191", + "Vasculature": "79e2a4a93a024ded96c8c294c328236ca8c83839c171d7adc49f95c781cf9503", + }, + "adata_hashes": { + "Bladder": "2767fd7400b1a0b24f1eef7b29f942108707756644a514e1859c97e2c85cfd7e", + "Blood": "786b052eaac01debdec526da18f8517e636698c1e08fa4015f1524ec861eb5e6", + "Bone_Marrow": "ebdeb204ed2e25b67b3fcb0d088b91efa893e72de0b94e3edd5e5eae2d637ccc", + "Eye": "6f9339aac973be7cec467dbc8b18d5cfaac45f495d14f6b1c0a814008331813e", + "Fat": "428286a39c52793c421773cb0f5e1855d459eb718f570c798b583ab2a78e2ca3", + "Heart": "57c269b0e7fbd33049bcf6f672f94390ccf47ce911d80cd7c05c8e655132ba09", + "Large_Intestine": "9360da72338356be60c185c6affe1bbd0714a6bb7a007dcafc34648534c43f77", + "Liver": "849386b699bd891f1ce9e40f6ba746ee3ac77c226b919a63a8388d0027937735", + "Lung": "65b52ee9495612e78fa7d587e3f1e3533a8a9342bc4f0742fbeca82d3d652186", + "Lymph_Node": "d1191951df41de146b52d407c2db95d02dbc09104c0b4be9d67603a28dc23d9b", + "Mammary": "ccc4804016ce554a1e934cd72ebf9a7f4609228f036b24493df0bf2f4c9029e8", + "Muscle": "89d802a4f6f8118813adcb4444e013f349012e32931b1771259d644b3a9015cb", + "Pancreas": "f1408ea16b82da49764296d5c12901ea5e9474fc53920f2838d7d85e07cb7eea", + "Prostate": "5cec0b4f30b649ab1c9455ce48ae99c0f4d424d7f03d3c71e5fa5072bfecb0f9", + "Salivary_Gland": "649b0735b14f6d1a817427ade96307e69e07f1736584c867a305beccce927748", + "Skin": "8a6ec2ef56963642a84cc94605789e16977b645cfd4824e89d5d395783b3f233", + "Small_Intestine": "03536949c92d9f80097d8b8065384ca00658a5f3ac1a8a0b8cfffeeaabc8c845", + "Spleen": 
"9acfb41142d1c28e1138fcefbcb8685094472dab63c0cb229691c0995447db8d", + "Thymus": "ce51cda4944adbc5e3c506bfed811bd795d7df45849e5844e26101637883f910", + "Tongue": "9a6d83487c0339b26dc0c2ad04c0b0eec6cfbd4e2ba08e5fe966843797c150be", + "Trachea": "88d43f668b6701b9eb7756c2d504b0110bf11d7d7d3970c74018b72f82f18b0b", + "Uterus": "ad6f10fd24203fa60beda94fa0175eebaf65b0f61df89b278532c531d19193fc", + "Vasculature": "bf4ce2a88de4fdab1e2f07128dc25aa055064c853e767e1cc015ce89d49ddf4c", + }, + "base_url": "https://zenodo.org/records/7608635/files/", + "models_suffix": "_pretrained_models.tar.gz", + "adata_suffix": "_training_data.h5ad", + "base_repo_name": "scvi-tools/tabula-sapiens", + "metadata": { + "data_modalities": ["rna"], + "data_is_annotated": True, + "license_info": "cc-by-4.0", + "description": "Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects.", + "references": "The Tabula Sapiens Consortium. The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans. Science, May 2022. doi:10.1126/science.abl4896", + }, +} diff --git a/src/scvi_hub_models/config/heart_cell_atlas.json b/src/scvi_hub_models/config/heart_cell_atlas.json new file mode 100644 index 0000000..39922a2 --- /dev/null +++ b/src/scvi_hub_models/config/heart_cell_atlas.json @@ -0,0 +1,13 @@ +{ + "model_dir": "heart_cell_atlas_scvi", + "repo_name": "scvi-tools/heart-cell-atlas-scvi", + "metadata": { + "training_data_url": "https://www.heartcellatlas.org/#DataSources", + "tissues": ["heart"], + "data_modalities": ["rna"], + "data_is_annotated": true, + "license_info": "cc-by-4.0", + "description": "Combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.", + "references": "Kazumasa Kanemaru, James Cranley, Daniele Muraro, Antonio M. A. 
Miranda, Siew Yen Ho, Anna Wilbrey-Clark, Jan Patrick Pett, Krzysztof Polanski, Laura Richardson, Monika Litvinukova, Natsuhiko Kumasaka, Yue Qin, Zuzanna Jablonska, Claudia I. Semprich, Lukas Mach, Monika Dabrowska, Nathan Richoz, Liam Bolt, Lira Mamanova, Rakeshlal Kapuge, Sam N. Barnett, Shani Perera, Carlos Talavera-López, Ilaria Mulas, Krishnaa T. Mahbubani, Liz Tuck, Lu Wang, Margaret M. Huang, Martin Prete, Sophie Pritchard, John Dark, Kourosh Saeb-Parsy, Minal Patel, Menna R. Clatworthy, Norbert Hübner, Rasheda A. Chowdhury, Michela Noseda & Sarah A. Teichmann. Spatially resolved multiomics of human cardiac niches. Nature, July 2023. doi:10.1038/s41586-023-06311-1." + } +} diff --git a/src/scvi_hub_models/config/human_lung_cell_atlas.json b/src/scvi_hub_models/config/human_lung_cell_atlas.json new file mode 100644 index 0000000..b961b9b --- /dev/null +++ b/src/scvi_hub_models/config/human_lung_cell_atlas.json @@ -0,0 +1,23 @@ +{ + "legacy_model_url": "https://zenodo.org/records/7599104/files/HLCA_reference_model.zip", + "legacy_model_hash": "a7cd60f4342292b3cba54545bcd8a34decdc8e6b82163f009273d543e7e3910e", + "legacy_model_dir": "hlca_scanvi_reference_legacy", + "model_dir": "hlca_scanvi_reference", + "reference_adata_cxg_id": "066943a2-fdac-4b29-b348-40cede398e4e", + "reference_adata_fname": "hlca_core.h5ad", + "embedding_adata_url": "https://zenodo.org/records/7599104/files/HLCA_full_v1.1_emb.h5ad", + "embedding_adata_hash": "3e2c4da281b6883464b2a70bcc1562d1c4246de32093e7b08090673fbad56a97", + "embedding_adata_fname": "hlca_all_emb.h5ad", + "mini_model_dir": "hlca_scanvi_reference_mini", + "repo_name": "scvi-tools/human-lung-cell-atlas", + "metadata": { + "training_data_url": "https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293", + "training_code_url": "https://github.com/LungCellAtlas/HLCA_reproducibility", + "tissues": ["nose", "respiratory airway", "lung parenchyma"], + "data_modalities": ["rna"], + "data_is_annotated": 
true, + "license_info": "cc-by-4.0", + "description": "The integrated Human Lung Cell Atlas (HLCA) represents the first large-scale, integrated single-cell reference atlas of the human lung.", + "references": "Lisa Sikkema, Ciro Ramírez-Suástegui, Daniel C. Strobl, Tessa E. Gillett, Luke Zappia, Elo Madissoon, Nikolay S. Markov, Laure-Emmanuelle Zaragosi, Yuge Ji, Meshal Ansari, Marie-Jeanne Arguel, Leonie Apperloo, Martin Banchero, Christophe Bécavin, Marijn Berg, Evgeny Chichelnitskiy, Mei-i Chung, Antoine Collin, Aurore C. A. Gay, Janine Gote-Schniering, Baharak Hooshiar Kashani, Kemal Inecik, Manu Jain, Theodore S. Kapellos, Tessa M. Kole, Sylvie Leroy, Christoph H. Mayr, Amanda J. Oliver, Michael von Papen, Lance Peter, Chase J. Taylor, Thomas Walzthoeni, Chuan Xu, Linh T. Bui, Carlo De Donno, Leander Dony, Alen Faiz, Minzhe Guo, Austin J. Gutierrez, Lukas Heumos, Ni Huang, Ignacio L. Ibarra, Nathan D. Jackson, Preetish Kadur Lakshminarasimha Murthy, Mohammad Lotfollahi, Tracy Tabib, Carlos Talavera-López, Kyle J. Travaglini, Anna Wilbrey-Clark, Kaylee B. Worlock, Masahiro Yoshida, Lung Biological Network Consortium, Maarten van den Berge, Yohan Bossé, Tushar J. Desai, Oliver Eickelberg, Naftali Kaminski, Mark A. Krasnow, Robert Lafyatis, Marko Z. Nikolic, Joseph E. Powell, Jayaraj Rajagopal, Mauricio Rojas, Orit Rozenblatt-Rosen, Max A. Seibold, Dean Sheppard, Douglas P. Shepherd, Don D. Sin, Wim Timens, Alexander M. Tsankov, Jeffrey Whitsett, Yan Xu, Nicholas E. Banovich, Pascal Barbry, Thu Elizabeth Duong, Christine S. Falk, Kerstin B. Meyer, Jonathan A. Kropski, Dana Pe’er, Herbert B. Schiller, Purushothama Rao Tata, Joachim L. Schultze, Sara A. Teichmann, Alexander V. Misharin, Martijn C. Nawijn, Malte D. Luecken, and Fabian J. Theis. An integrated cell atlas of the lung in health and disease. Nature Medicine, June 2023. doi:10.1038/s41591-023-02327-2." 
+ } +} diff --git a/src/scvi_hub_models/config/tabula_sapiens.json b/src/scvi_hub_models/config/tabula_sapiens.json new file mode 100644 index 0000000..632d8ab --- /dev/null +++ b/src/scvi_hub_models/config/tabula_sapiens.json @@ -0,0 +1,88 @@ +{ + "tissues": [ + "Bladder", + "Blood", + "Bone_Marrow", + "Eye", + "Fat", + "Heart", + "Large_Intestine", + "Liver", + "Lung", + "Lymph_Node", + "Mammary", + "Muscle", + "Pancreas", + "Prostate", + "Salivary_Gland", + "Skin", + "Small_Intestine", + "Spleen", + "Thymus", + "Tongue", + "Trachea", + "Uterus", + "Vasculature" + ], + "model_hashes": { + "Bladder": "3cbdba7afe4f18e13e19228c38ffd57f0606f5524393e6c108ab15eba0da4042", + "Blood": "a7dc61b6604842a157b2a16180eb53df805fe7f3a8096985c4775fb893a7d2a3", + "Bone_Marrow": "e46f20f64f404dcbb2b5ae814061849c22e9b0b59d03a2dff65bbb6e6f792888", + "Eye": "82578a5d0867c72a6415fc77c96231617ae1799d21e33cac96f40ab1293819e1", + "Fat": "ee1befdd03313d65c6d5089b503277c347ea495df55b7282c7149eb11cc66df1", + "Heart": "3cedbb01b451ce6b1668f057ebb7fed493d1bd49dec4cbda4f8d4363944dee47", + "Large_Intestine": "ae78f8166d9aeddb9abb8fee64a9d72581db6bd002f009d3f0a69e3de2f3218e", + "Liver": "c9b55e28d589a27b97a8205746d51388eecbe02b043862b89d4511c48ca78af2", + "Lung": "8f5425470e624ce15db7bbc92cc9ede56e3893e8367812b4c991bb4dedc14f19", + "Lymph_Node": "925a1fccf1d20e97cf84d1f5b78ea07b59bfe38a60920c6db4265158654b573c", + "Mammary": "0ad9dd4e91c2b5d88f9a9c8804c379af47207aa0823a2bcb2c93c7bf9d5e9b18", + "Muscle": "2710c00173f9dd247f89e595867151ab799370797f19c3dc9f7dabc4e9d68ab4", + "Pancreas": "e2a9487863b6b218dde014f38b1e6477f041614726d5448fa23041c6d6d7916a", + "Prostate": "34c4a8362cba4000ebaf0fec2af151f8f93eeed1f0001308a732190357a5f437", + "Salivary_Gland": "d444cb54b99f1e13e920d2dba7ba2aece33d20917d19d87668c78c42ec482293", + "Skin": "9103e61226183ac4ced7fc49be081538335be2eded0ee7b9ba9dea35a5a1acbe", + "Small_Intestine": "b469b6d27bb25d2d04d8e55109a72fac08b71260969d22b3d1ccd306e6c34666", + "Spleen": 
"7933f10778d237e15fffaae70ea872f29f56a40e227c6e1c4f8c4d6603c0211b", + "Thymus": "d1238909376beb6ca1bf3e99a75023a5522d395f6887b0a710aad08ffc735517", + "Tongue": "c2ad649e5d9856964fe2b0472fe00cb3a85b8c429bef88e17f7ae2a95e9e28cd", + "Trachea": "b0525106022ff29c5777ed65e643f5089d44b846fde76fb1d3b7206a1d9799dd", + "Uterus": "85a878008bf2c2d91613ed5086ec6c2a5e95628be31e5a049192916f7e327191", + "Vasculature": "79e2a4a93a024ded96c8c294c328236ca8c83839c171d7adc49f95c781cf9503" + }, + "adata_hashes": { + "Bladder": "2767fd7400b1a0b24f1eef7b29f942108707756644a514e1859c97e2c85cfd7e", + "Blood": "786b052eaac01debdec526da18f8517e636698c1e08fa4015f1524ec861eb5e6", + "Bone_Marrow": "ebdeb204ed2e25b67b3fcb0d088b91efa893e72de0b94e3edd5e5eae2d637ccc", + "Eye": "6f9339aac973be7cec467dbc8b18d5cfaac45f495d14f6b1c0a814008331813e", + "Fat": "428286a39c52793c421773cb0f5e1855d459eb718f570c798b583ab2a78e2ca3", + "Heart": "57c269b0e7fbd33049bcf6f672f94390ccf47ce911d80cd7c05c8e655132ba09", + "Large_Intestine": "9360da72338356be60c185c6affe1bbd0714a6bb7a007dcafc34648534c43f77", + "Liver": "849386b699bd891f1ce9e40f6ba746ee3ac77c226b919a63a8388d0027937735", + "Lung": "65b52ee9495612e78fa7d587e3f1e3533a8a9342bc4f0742fbeca82d3d652186", + "Lymph_Node": "d1191951df41de146b52d407c2db95d02dbc09104c0b4be9d67603a28dc23d9b", + "Mammary": "ccc4804016ce554a1e934cd72ebf9a7f4609228f036b24493df0bf2f4c9029e8", + "Muscle": "89d802a4f6f8118813adcb4444e013f349012e32931b1771259d644b3a9015cb", + "Pancreas": "f1408ea16b82da49764296d5c12901ea5e9474fc53920f2838d7d85e07cb7eea", + "Prostate": "5cec0b4f30b649ab1c9455ce48ae99c0f4d424d7f03d3c71e5fa5072bfecb0f9", + "Salivary_Gland": "649b0735b14f6d1a817427ade96307e69e07f1736584c867a305beccce927748", + "Skin": "8a6ec2ef56963642a84cc94605789e16977b645cfd4824e89d5d395783b3f233", + "Small_Intestine": "03536949c92d9f80097d8b8065384ca00658a5f3ac1a8a0b8cfffeeaabc8c845", + "Spleen": "9acfb41142d1c28e1138fcefbcb8685094472dab63c0cb229691c0995447db8d", + "Thymus": 
"ce51cda4944adbc5e3c506bfed811bd795d7df45849e5844e26101637883f910", + "Tongue": "9a6d83487c0339b26dc0c2ad04c0b0eec6cfbd4e2ba08e5fe966843797c150be", + "Trachea": "88d43f668b6701b9eb7756c2d504b0110bf11d7d7d3970c74018b72f82f18b0b", + "Uterus": "ad6f10fd24203fa60beda94fa0175eebaf65b0f61df89b278532c531d19193fc", + "Vasculature": "bf4ce2a88de4fdab1e2f07128dc25aa055064c853e767e1cc015ce89d49ddf4c" + }, + "base_url": "https://zenodo.org/records/7608635/files/", + "models_suffix": "_pretrained_models.tar.gz", + "adata_suffix": "_training_data.h5ad", + "base_repo_name": "scvi-tools/tabula-sapiens", + "metadata": { + "data_modalities": ["rna"], + "data_is_annotated": true, + "license_info": "cc-by-4.0", + "description": "Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects.", + "references": "The Tabula Sapiens Consortium. The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans. Science, May 2022. doi:10.1126/science.abl4896" + } +} diff --git a/src/scvi_hub_models/models/heart_cell_atlas.py b/src/scvi_hub_models/models/heart_cell_atlas.py new file mode 100644 index 0000000..7497e1c --- /dev/null +++ b/src/scvi_hub_models/models/heart_cell_atlas.py @@ -0,0 +1,119 @@ +from anndata import AnnData +from scvi.hub import HubModel +from scvi.model import SCVI + + +def load_adata(save_dir: str) -> AnnData: + """Download and load the dataset.""" + from scvi.data import heart_cell_atlas_subsampled + + return heart_cell_atlas_subsampled(save_path=save_dir) + + +def preprocess_adata(adata: AnnData) -> AnnData: + """Preprocess the AnnData object.""" + import scanpy as sc + + sc.pp.filter_genes(adata, min_counts=3) + adata.layers["counts"] = adata.X.copy() + sc.pp.highly_variable_genes( + adata, + n_top_genes=1200, + subset=True, + layer="counts", + flavor="seurat_v3", + batch_key="cell_source", + ) + + return adata + + +def initialize_model(adata: AnnData) -> SCVI: + """Initialize the scVI 
def model_workflow(dry_run: bool, repo_create: bool):
    """Run the heart cell atlas workflow: load data, train scVI, build the hub model, upload.

    Parameters
    ----------
    dry_run
        If truthy, every step up to and including ``create_hub_model`` runs,
        but nothing is uploaded.
    repo_create
        Forwarded to ``upload_hub_model`` as the ``repo_create`` keyword.
    """
    from tempfile import TemporaryDirectory

    from scvi_hub_models.config import HEART_CELL_ATLAS_CONFIG
    from scvi_hub_models.utils import upload_hub_model

    config = HEART_CELL_ATLAS_CONFIG

    # Keep the TemporaryDirectory object alive for the whole run. Taking only
    # ``.name`` from a throwaway instance drops the object that owns the
    # directory, so nothing written there afterwards is ever cleaned up.
    with TemporaryDirectory() as save_dir:
        # load and preprocess the data
        adata = load_adata(save_dir)
        adata = preprocess_adata(adata)

        # train the model; ``train_model`` takes the model only
        model = initialize_model(adata)
        model = train_model(model)
        model_path = save_model(model, config, save_dir)

        # create and upload to hub
        hub_model = create_hub_model(model_path, config)
        if not dry_run:
            upload_hub_model(hub_model, config["repo_name"], repo_create=repo_create)
def postprocess_reference_adata(adata: "AnnData") -> "AnnData":
    """Fill in ``feature_name`` for the four padded genes of the reference dataset.

    The new names are first registered as categories of the categorical
    ``adata.var["feature_name"]`` column, then written row by row. ``adata`` is
    modified in place and returned.
    """
    # Ensembl gene id -> feature name to assign to that row of ``adata.var``.
    padded_features = {
        "ENSG00000253701": "AL928768.3",
        "ENSG00000269936": "RP11-394O4.5",
        "ENSG00000274961": "RP3-492J12.2",
        "ENSG00000279576": "AP000769.1",
    }

    # A categorical column rejects values that are not registered categories.
    extended = adata.var["feature_name"].cat.add_categories(list(padded_features.values()))
    adata.var["feature_name"] = extended

    for gene_id, feature_name in padded_features.items():
        adata.var.loc[gene_id, "feature_name"] = feature_name

    return adata
def minify_model(model: SCANVI, ref_adata: AnnData, emb_adata: AnnData) -> SCANVI:
    """Minify the model and its dataset.

    The latent mean is taken from the embedding dataset (``emb_adata.X`` for the
    observations in ``ref_adata.obs.index``). The second value returned by
    ``model.get_latent_representation(give_mean=False, return_dist=True)`` is
    stored under the qzv key. Both are written to ``ref_adata.obsm`` before
    ``model.minify_adata`` is called. Returns the same ``model`` object.
    """
    mean_key, var_key = "SCANVI_latent_qzm", "SCANVI_latent_qzv"

    # Rows of the embedding dataset selected by the reference observation index.
    precomputed_mean = emb_adata[ref_adata.obs.index].copy().X
    _model_mean, model_var = model.get_latent_representation(give_mean=False, return_dist=True)

    ref_adata.obsm[mean_key] = precomputed_mean
    ref_adata.obsm[var_key] = model_var
    model.minify_adata(use_latent_qzm_key=mean_key, use_latent_qzv_key=var_key)

    return model
def model_workflow(dry_run: bool, repo_create: bool) -> None:
    """Run the Human Lung Cell Atlas workflow.

    Downloads and converts the legacy scANVI model, prepares the reference and
    embedding datasets, minifies the model, builds the hub model and uploads it.

    Parameters
    ----------
    dry_run
        If truthy, every step up to and including ``create_hub_model`` runs,
        but nothing is uploaded.
    repo_create
        Forwarded to ``upload_hub_model`` as the ``repo_create`` keyword.
    """
    from anndata import read_h5ad

    # Use the packaged config constant rather than a JSON path resolved against
    # the current working directory, which only works from one specific folder.
    from scvi_hub_models.config import HUMAN_LUNG_CELL_ATLAS_CONFIG
    from scvi_hub_models.utils import upload_hub_model

    config = HUMAN_LUNG_CELL_ATLAS_CONFIG
    save_dir = "./data"

    # download and convert legacy model
    legacy_model_path = download_legacy_model(config, save_dir)
    model_path = convert_legacy_model(legacy_model_path, config, save_dir)

    # download and process reference dataset
    ref_adata_path = download_reference_adata(config, save_dir)
    ref_adata = read_h5ad(ref_adata_path)
    ref_adata = preprocess_reference_adata(ref_adata, model_path)
    model = load_model(model_path, ref_adata)
    ref_adata = postprocess_reference_adata(ref_adata)

    # download and process embedding dataset
    emb_adata_path = download_embedding_adata(config, save_dir)
    emb_adata = read_h5ad(emb_adata_path)
    emb_adata = preprocess_embedding_adata(emb_adata)

    # minify model and save
    model = minify_model(model, ref_adata, emb_adata)
    mini_model_path = save_minified_model(model, config, save_dir)

    # create and upload hub model; the repo name comes from the config
    hub_model = create_hub_model(mini_model_path, config)
    if not dry_run:
        upload_hub_model(hub_model, config["repo_name"], repo_create=repo_create)
def download_models(config: dict, save_dir: str):
    """Download the models for every tissue in ``config["tissues"]`` from Zenodo.

    Returns a dict mapping each tissue name to the value returned by
    ``download_models_for_tissue`` for that tissue.
    """
    return {
        tissue: download_models_for_tissue(tissue, config, save_dir)
        for tissue in config["tissues"]
    }
mini_model_path + + +def create_hub_model(model_path: str, tissue: str, config: dict, minified: bool = False): + """Create a HubModel from the model.""" + import anndata + from scvi.hub import HubMetadata, HubModel, HubModelCardHelper + + metadata = config["metadata"] + + hub_metadata = HubMetadata.from_dir( + model_path, + anndata_version=anndata.__version__, + ) + model_card = HubModelCardHelper.from_dir( + model_path, + training_data_url=f"{config['base_url']}{tissue}{config['adata_suffix']}", + tissues=[tissue], + data_modalities=metadata["data_modalities"], + description=metadata["description"], + references=metadata["references"], + license_info=metadata["license_info"], + data_is_annotated=True, + anndata_version=anndata.__version__, + data_is_minified=minified, + ) + return HubModel( + model_path, + metadata=hub_metadata, + model_card=model_card, + ) + + +def load_and_upload_models_for_tissue(base_model_dir: str, adata: AnnData, tissue: str, config: str): + """Load and upload the models for a given tissue.""" + import os + + from scvi_hub_models.utils import upload_hub_model + + scvi_path = os.path.join(base_model_dir, "scvi") + if os.path.isdir(scvi_path): + from scvi.model import SCVI + + scvi_model = SCVI.load(scvi_path, adata=adata) + mini_scvi_path = minify_and_save_model(scvi_model, adata, base_model_dir) + + scvi_hub_model = create_hub_model(mini_scvi_path, tissue, config, minified=True) + upload_hub_model(scvi_hub_model, tissue, "scvi") + + scanvi_path = os.path.join(base_model_dir, "scanvi") + if os.path.isdir(scanvi_path): + from scvi.model import SCANVI + + scanvi_model = SCANVI.load(scanvi_path, adata=adata) + mini_scanvi_path = minify_and_save_model(scanvi_model, adata, base_model_dir) + + scanvi_hub_model = create_hub_model(mini_scanvi_path, tissue, config, minified=True) + upload_hub_model(scanvi_hub_model, tissue, "scanvi") + + condscvi_path = os.path.join(base_model_dir, "condscvi") + if os.path.isdir(condscvi_path): + from scvi.model 
import CondSCVI + + condscvi_model = CondSCVI.load(condscvi_path, adata=adata) + condscvi_model.save(condscvi_path, overwrite=True, save_anndata=True) + + condscvi_hub_model = create_hub_model(condscvi_path, tissue, config, minified=False) + upload_hub_model(condscvi_hub_model, tissue, "condscvi") + + stereoscope_path = os.path.join(base_model_dir, "stereoscope") + if os.path.isdir(stereoscope_path): + from scvi.external import RNAStereoscope + + stereoscope_model = RNAStereoscope.load(stereoscope_path, adata=adata) + stereoscope_model.save(stereoscope_path, overwrite=True, save_anndata=True) + + stereoscope_hub_model = create_hub_model(stereoscope_path, tissue, config, minified=False) + upload_hub_model(stereoscope_hub_model, tissue, "stereoscope") + + del scvi_hub_model + del scanvi_hub_model + del condscvi_hub_model + del stereoscope_hub_model + + +def load_and_upload_models(base_model_dirs: dict, adata_paths: dict, config: dict): + """Docstring""" + from anndata import read_h5ad + + for tissue in config["tissues"]: + adata = read_h5ad(adata_paths[tissue]) + load_and_upload_models_for_tissue(base_model_dirs[tissue], adata, tissue, config) + del adata + + +def model_workflow(): + """Run the model workflow.""" + import json + + with open("../../config/tabula_sapiens.json") as f: + config = json.load(f) + # save_dir = TemporaryDirectory().name + save_dir = "./data" + + # download models and datasets + base_model_dirs = download_models(config, save_dir) + adata_paths = download_adatas(config, save_dir) + + # # load and upload models + load_and_upload_models(base_model_dirs, adata_paths, config) diff --git a/src/scvi_hub_models/utils/__init__.py b/src/scvi_hub_models/utils/__init__.py new file mode 100644 index 0000000..b10583a --- /dev/null +++ b/src/scvi_hub_models/utils/__init__.py @@ -0,0 +1,9 @@ +from ._file import make_parents +from ._kwargs import wrap_kwargs +from ._upload import upload_hub_model + +__all__ = [ + "make_parents", + "wrap_kwargs", + 
"upload_hub_model", +] diff --git a/src/scvi_hub_models/utils/_file.py b/src/scvi_hub_models/utils/_file.py new file mode 100644 index 0000000..deebc05 --- /dev/null +++ b/src/scvi_hub_models/utils/_file.py @@ -0,0 +1,9 @@ +from __future__ import annotations + + +def make_parents(*paths) -> None: + """Make parent directories of a file path if they do not exist.""" + from pathlib import Path + + for p in paths: + Path(p).parent.mkdir(parents=True, exist_ok=True) diff --git a/src/scvi_hub_models/utils/_kwargs.py b/src/scvi_hub_models/utils/_kwargs.py new file mode 100644 index 0000000..d8e2944 --- /dev/null +++ b/src/scvi_hub_models/utils/_kwargs.py @@ -0,0 +1,12 @@ +from __future__ import annotations + + +def wrap_kwargs(fn: callable) -> callable: + """Wrap a function to accept keyword arguments from the command line.""" + from inspect import signature + + import click + + for param in signature(fn).parameters: + fn = click.option("--" + param, type=str)(fn) + return click.command()(fn) diff --git a/src/scvi_hub_models/utils/_upload.py b/src/scvi_hub_models/utils/_upload.py new file mode 100644 index 0000000..80d2159 --- /dev/null +++ b/src/scvi_hub_models/utils/_upload.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from scvi.hub import HubModel + + +def upload_hub_model(hub_model: HubModel, repo_name: str, **kwargs) -> HubModel: + """Upload the HubModel to HuggingFace Hub.""" + import os + + hub_model.push_to_huggingface_hub(repo_name=repo_name, repo_token=os.environ["HF_API_TOKEN"], **kwargs) + return hub_model