Skip to content

Commit

Permalink
add tutorial notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
bkmartinjr authored and ryan-williams committed Sep 25, 2024
1 parent ccf373d commit a8c16c6
Show file tree
Hide file tree
Showing 4 changed files with 1,102 additions and 2 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/python-tiledbsoma-ml.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@ name: python-tiledbsoma-ml CI
on:
pull_request:
branches: ["**"]
paths-ignore: ['scripts/**']
paths-ignore:
- "scripts/**"
- "notebooks/**"
push:
branches: [main]
paths-ignore: ['scripts/**']
paths-ignore:
- "scripts/**"
- "notebooks/**"
workflow_dispatch:

jobs:
Expand Down
242 changes: 242 additions & 0 deletions notebooks/tutorial_lightning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training a model with PyTorch Lightning\n",
"\n",
"This tutorial provides a quick overview of training a toy model with Lightning, using the `tiledbsoma_ml.ExperimentAxisQueryIterableDataset` class, on data from the [CZI CELLxGENE Census](https://chanzuckerberg.github.io/cellxgene-census/) dataset. This is intended only to demonstrate the use of the `ExperimentAxisQueryIterableDataset`, and not as an example of how to train a biologically useful model.\n",
"\n",
"For more information on these APIs, please refer to the [`tutorial_pytorch` notebook](tutorial_pytorch.ipynb).\n",
"\n",
"**Prerequisites**\n",
"\n",
"Install `tiledbsoma_ml`, `scikit-learn`, and `pytorch-lightning`, for example:\n",
"\n",
"> pip install tiledbsoma_ml scikit-learn pytorch-lightning\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize SOMA Experiment query as training data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
"################################################################################\n",
"WARNING!\n",
"The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
"future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
"to learn more and leave feedback.\n",
"################################################################################\n",
"\n",
" deprecation_warning()\n"
]
}
],
"source": [
"import pytorch_lightning as pl\n",
"import tiledbsoma as soma\n",
"import torch\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"import tiledbsoma_ml as soma_ml\n",
"\n",
"# Homo sapiens SOMA Experiment in the public CZI CELLxGENE Census bucket (2024-07-01 path)\n",
"CZI_Census_Homo_Sapiens_URL = \"s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/\"\n",
"\n",
"# Pin the S3 region used when reading the bucket\n",
"s3_context = soma.SOMATileDBContext(tiledb_config={\"vfs.s3.region\": \"us-west-2\"})\n",
"experiment = soma.open(CZI_Census_Homo_Sapiens_URL, context=s3_context)\n",
"\n",
"# Restrict the training data to primary-data cells from tongue tissue\n",
"obs_value_filter = \"tissue_general == 'tongue' and is_primary_data == True\"\n",
"obs_query = soma.AxisQuery(value_filter=obs_value_filter)\n",
"\n",
"with experiment.axis_query(measurement_name=\"RNA\", obs_query=obs_query) as query:\n",
"    # Fit the label encoder on the distinct cell_type values selected by the query\n",
"    obs_df = query.obs(column_names=[\"cell_type\"]).concat().to_pandas()\n",
"    distinct_cell_types = obs_df[\"cell_type\"].unique()\n",
"    cell_type_encoder = LabelEncoder().fit(distinct_cell_types)\n",
"\n",
"    # Shuffled dataset over the `raw` X layer that also carries the cell_type obs column\n",
"    dataset_options = dict(\n",
"        X_name=\"raw\",\n",
"        obs_column_names=[\"cell_type\"],\n",
"        batch_size=128,\n",
"        shuffle=True,\n",
"    )\n",
"    experiment_dataset = soma_ml.ExperimentAxisQueryIterableDataset(\n",
"        query, **dataset_options\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Lightning module"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class LogisticRegressionLightning(pl.LightningModule):\n",
"    \"\"\"Multinomial logistic regression (one linear layer) as a Lightning module.\n",
"\n",
"    Args:\n",
"        input_dim: Number of input features per row.\n",
"        output_dim: Number of target classes.\n",
"        cell_type_encoder: Fitted encoder whose ``transform`` maps the batch's\n",
"            ``cell_type`` values to integer class indices.\n",
"        learning_rate: Learning rate handed to the Adam optimizer.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, input_dim, output_dim, cell_type_encoder, learning_rate=1e-5):\n",
"        super().__init__()\n",
"        self.linear = torch.nn.Linear(input_dim, output_dim)\n",
"        self.cell_type_encoder = cell_type_encoder\n",
"        self.learning_rate = learning_rate\n",
"        self.loss_fn = torch.nn.CrossEntropyLoss()\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the raw (unnormalized) class logits for ``x``.\"\"\"\n",
"        # CrossEntropyLoss applies log-softmax itself and expects unbounded logits.\n",
"        # Squashing the scores with a sigmoid first confines them to (0, 1), which\n",
"        # flattens the softmax and puts a floor under the achievable loss.\n",
"        return self.linear(x)\n",
"\n",
"    def training_step(self, batch, batch_idx):\n",
"        \"\"\"Compute the loss for one batch and log the loss and accuracy.\n",
"\n",
"        ``batch`` is unpacked as ``(X_batch, y_batch)``: ``X_batch`` is converted\n",
"        with ``torch.from_numpy`` and ``y_batch`` is indexed by ``\"cell_type\"``.\n",
"        Returns the loss tensor for Lightning to optimize.\n",
"        \"\"\"\n",
"        X_batch, y_batch = batch\n",
"        X_batch = torch.from_numpy(X_batch).float().to(self.device)\n",
"\n",
"        # Encode the cell_type labels as integer class indices\n",
"        y_batch = torch.from_numpy(\n",
"            self.cell_type_encoder.transform(y_batch[\"cell_type\"])\n",
"        ).to(self.device)\n",
"\n",
"        # Perform prediction and compute the loss on the raw logits\n",
"        logits = self(X_batch)\n",
"        loss = self.loss_fn(logits, y_batch.long())\n",
"\n",
"        # Softmax is monotonic, so the argmax of the logits is the predicted label\n",
"        predictions = torch.argmax(logits, dim=1)\n",
"\n",
"        # Compute accuracy\n",
"        train_correct = (predictions == y_batch).sum().item()\n",
"        train_accuracy = train_correct / len(predictions)\n",
"\n",
"        # Log loss and accuracy\n",
"        self.log(\"train_loss\", loss, prog_bar=True)\n",
"        self.log(\"train_accuracy\", train_accuracy, prog_bar=True)\n",
"\n",
"        return loss\n",
"\n",
"    def configure_optimizers(self):\n",
"        \"\"\"Return an Adam optimizer over all parameters at ``self.learning_rate``.\"\"\"\n",
"        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: True (cuda), used: True\n",
"TPU available: False, using: 0 TPU cores\n",
"HPU available: False, using: 0 HPUs\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
"\n",
" | Name | Type | Params | Mode \n",
"-----------------------------------------------------\n",
"0 | linear | Linear | 726 K | train\n",
"1 | loss_fn | CrossEntropyLoss | 0 | train\n",
"-----------------------------------------------------\n",
"726 K Trainable params\n",
"0 Non-trainable params\n",
"726 K Total params\n",
"2.905 Total estimated model params size (MB)\n",
"2 Modules in train mode\n",
"0 Modules in eval mode\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.31it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`Trainer.fit` stopped: `max_epochs=20` reached.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.28it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]\n"
]
}
],
"source": [
"# Build the data loader that feeds the dataset to the trainer\n",
"dataloader = soma_ml.experiment_dataloader(experiment_dataset)\n",
"\n",
"# Model dimensions: one output per distinct cell_type value, one input per gene\n",
"output_dim = len(cell_type_encoder.classes_)\n",
"input_dim = experiment_dataset.shape[1]\n",
"\n",
"# Instantiate the Lightning module defined above\n",
"model = LogisticRegressionLightning(\n",
"    input_dim=input_dim,\n",
"    output_dim=output_dim,\n",
"    cell_type_encoder=cell_type_encoder,\n",
")\n",
"\n",
"# Trainer that stops after 20 epochs\n",
"trainer = pl.Trainer(max_epochs=20)\n",
"\n",
"# Select the `high` float32 matmul precision mode before fitting\n",
"torch.set_float32_matmul_precision(\"high\")\n",
"\n",
"# Run the training loop\n",
"trainer.fit(model, train_dataloaders=dataloader)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "toymodel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit a8c16c6

Please sign in to comment.