v0.3.1 (#303)
* Add WDL input to set number of retries. (#247)

* Move hash computation so that it is recomputed on retry, and now-invalid checkpoint is not loaded. (#258)

* Bug fix for WDL using MTX input (#246)

* Memory-efficient posterior generation (#263)

* Fix posterior and estimator integer overflow bugs on Windows (#259)

* Move from setup.py to pyproject.toml (#240)

* Fix bugs with report generation across platforms (#302)

---------

Co-authored-by: kshakir <[email protected]>
Co-authored-by: alecw <[email protected]>
3 people authored Oct 31, 2023
1 parent 4990df7 commit 3a4dc8d
Showing 23 changed files with 254 additions and 207 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
# Run cellbender's tests
# Run cellbender test suite

name: 'pytest'

@@ -7,11 +7,13 @@ on: pull_request
jobs:
build:

runs-on: 'ubuntu-latest'
strategy:
matrix:
os: ['ubuntu-latest', 'windows-latest']
python-version: ['3.7']

runs-on: ${{ matrix.os }}

steps:
- name: 'Checkout repo'
uses: actions/checkout@v3
5 changes: 4 additions & 1 deletion MANIFEST.in
@@ -1,4 +1,7 @@
include README.rst
include LICENSE
include requirements.txt
include requirements-rtd.txt
include requirements-rtd.txt
include requirements-dev.txt
include cellbender/VERSION.txt
include cellbender/remove_background/report.ipynb
2 changes: 1 addition & 1 deletion build_docker_release.sh
@@ -1,6 +1,6 @@
#!/bin/bash

tag=$(cat cellbender/__init__.py | sed -e 's?__version__ = ??' | sed "s/^'\(.*\)'$/\1/")
tag=$(cat cellbender/VERSION.txt)
release=v${tag}

docker build \
1 change: 1 addition & 0 deletions cellbender/VERSION.txt
@@ -0,0 +1 @@
0.3.0
4 changes: 3 additions & 1 deletion cellbender/__init__.py
@@ -1 +1,3 @@
__version__ = '0.3.0'
from .base_cli import get_version

__version__ = get_version()
7 changes: 1 addition & 6 deletions cellbender/base_cli.py
@@ -23,12 +23,7 @@ def read(rel_path):


def get_version() -> str:
for line in read('__init__.py').splitlines():
if line.startswith('__version__'):
delim = '"' if '"' in line else "'"
return line.split(delim)[1]
else:
raise RuntimeError("Unable to find version string.")
return read('VERSION.txt').splitlines()[0]


class AbstractCLI(ABC):
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
for file in files:
# without arcname, unpacking results in unpredictable file locations!
tar.add(file, arcname=os.path.basename(file))
os.rename(tarball_name + '.tmp', tarball_name)
os.replace(tarball_name + '.tmp', tarball_name)
return True


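The one-line change above swaps os.rename for os.replace when moving the finished tarball into place, most likely for Windows compatibility: os.rename raises FileExistsError on Windows if the destination already exists, while os.replace overwrites it on every platform. A minimal sketch of the write-then-swap pattern, with illustrative file names:

import os

tmp_name = 'ckpt.tar.gz.tmp'      # illustrative names, not the ones CellBender uses
final_name = 'ckpt.tar.gz'

with open(tmp_name, 'w') as f:
    f.write('placeholder contents\n')

# os.rename would fail here on Windows if final_name already exists;
# os.replace atomically overwrites the destination on POSIX and Windows.
os.replace(tmp_name, final_name)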
11 changes: 0 additions & 11 deletions cellbender/remove_background/cli.py
@@ -207,17 +207,6 @@ def setup_and_logging(args):
+ ' '.join(['cellbender', 'remove-background'] + sys.argv[2:]))
logger.info("CellBender " + get_version())

# Set up checkpointing by creating a unique workflow hash.
hashcode = create_workflow_hashcode(
module_path=os.path.dirname(cellbender.__file__),
args_to_remove=(['output_file', 'fpr', 'input_checkpoint_tarball', 'debug',
'posterior_batch_size', 'checkpoint_min', 'truth_file',
'posterior_regularization', 'cdf_threshold_q', 'prq_alpha',
'estimator', 'use_multiprocessing_estimation', 'cpu_threads']
+ (['epochs'] if args.constant_learning_rate else [])),
args=args)[:10]
args.checkpoint_filename = hashcode # store this in args
logger.info(f'(Workflow hash {hashcode})')
return args, file_handler


55 changes: 15 additions & 40 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
data: np.ndarray,
m: np.ndarray,
noise_offsets: Optional[Dict[int, int]],
dtype=np.int64) -> sp.csr_matrix:
dtype=np.int) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
a flat format, indexed by 'm'.
m: Array of the same length as data, where each entry is an m-index.
noise_offsets: Noise count offset values keyed by 'm'.
dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
dtype: Data type for values of sparse matrix
Results:
noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
row, col = index_converter.get_ng_indices(m_inds=m)
if noise_offsets is not None:
data = data + np.array([noise_offsets.get(i, 0) for i in m])
coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint64))),
shape=index_converter.matrix_shape, dtype=dtype)
coo.sum_duplicates()
return coo.tocsr()
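For context, a toy sketch (sizes and values invented) of the COO-to-CSR pattern this function uses: duplicate (row, col) pairs are summed by sum_duplicates(), and indices are cast to 64-bit types because the platform default integer is only 32-bit on Windows.

import numpy as np
import scipy.sparse as sp

n_cells, n_genes = 4, 3                        # toy count-matrix shape
m = np.array([0, 5, 5, 11], dtype=np.uint64)   # flat 'm' indices; two entries share m = 5
data = np.array([2, 1, 3, 7], dtype=np.int64)  # point estimates for those entries

row = (m // n_genes).astype(np.uint64)         # recover cell indices
col = (m % n_genes).astype(np.uint64)          # recover gene indices

coo = sp.coo_matrix((data, (row, col)), shape=(n_cells, n_genes), dtype=np.int64)
coo.sum_duplicates()                           # duplicate (row, col) entries are summed
print(coo.tocsr().toarray())                   # entry (1, 2) holds 1 + 3 = 4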
@@ -463,11 +463,9 @@ def estimate_noise(self,
if use_multiple_processes:

logger.info('Dividing dataset into chunks of genes')
chunk_logic_list = list(
self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)
chunk_logic_list = self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)

logger.info('Computing the output in asynchronous chunks in parallel...')
@@ -538,10 +536,9 @@ def estimate_noise(self,
def _gene_chunk_iterator(self,
noise_log_prob_coo: sp.coo_matrix,
n_chunks: int) \
-> Generator[np.ndarray, None, None]:
"""Yields chunks of the posterior that can be treated as independent,
from the standpoint of MCKP count estimation. That is, they contain all
matrix entries for any genes they include.
-> List[np.ndarray]:
"""Return a list of logical (size m) arrays used to select gene chunks
on which to compute the MCKP estimate. These chunks are independent.
Args:
noise_log_prob_coo: Full noise log prob posterior COO
@@ -551,36 +548,14 @@ def _gene_chunk_iterator(self,
Logical array which indexes elements of coo posterior for the chunk
"""

# TODO this generator is way too slow

# approximate number of entries in a chunk
# approx_chunk_entries = (noise_log_prob_coo.data.size - 1) // n_chunks

# get gene annotations
_, genes = self.index_converter.get_ng_indices(m_inds=noise_log_prob_coo.row)
genes_series = pd.Series(genes)

# things we need to keep track of for each chunk
# current_chunk_genes = []
# entry_logic = np.zeros(noise_log_prob_coo.data.size, dtype=bool)

# TODO eliminate for loop to speed this up
# take the list of genes from the coo, sort it, and divide it evenly
# somehow break ties for genes overlapping boundaries of divisions
sorted_genes = np.sort(genes)
gene_arrays = np.array_split(sorted_genes, n_chunks)
last_gene_set = {}
for gene_array in gene_arrays:
gene_set = set(gene_array)
gene_set = gene_set.difference(last_gene_set) # only the new stuff
# if there is a second chunk, make sure there is a gene unique to it
if (n_chunks > 1) and (len(gene_set) == len(set(genes))): # all genes in first set
# this mainly exists for tests
gene_set = gene_set - {gene_arrays[-1][-1]}
last_gene_set = gene_set
entry_logic = genes_series.isin(gene_set).values
if sum(entry_logic) > 0:
yield entry_logic
gene_chunk_arrays = np.array_split(np.arange(self.index_converter.total_n_genes), n_chunks)

gene_logic_arrays = [genes_series.isin(x).values for x in gene_chunk_arrays]
return gene_logic_arrays
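A small illustration of the new chunking logic (toy numbers, not taken from the codebase): gene indices are split into contiguous blocks, and each block yields one boolean mask over the posterior COO entries.

import numpy as np
import pandas as pd

total_n_genes = 5
n_chunks = 2
genes = np.array([0, 3, 1, 4, 2, 0, 4])   # gene index of each posterior COO entry

gene_chunk_arrays = np.array_split(np.arange(total_n_genes), n_chunks)
genes_series = pd.Series(genes)
gene_logic_arrays = [genes_series.isin(chunk).values for chunk in gene_chunk_arrays]

for chunk, logic in zip(gene_chunk_arrays, gene_logic_arrays):
    print(chunk, logic)   # e.g. [0 1 2] -> [ True False  True False  True  True False]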

def _chunk_estimate_noise(self,
noise_log_prob_coo: sp.coo_matrix,
@@ -810,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
"""
array_length = len(np.unique(noise_log_prob_coo.row))

m = np.zeros(array_length)
m = np.zeros(array_length, dtype=np.uint64)
out = np.zeros(array_length)
a = 0

@@ -829,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
out[a:(a + len_s)] = s.detach().cpu().numpy()
a = a + len_s

return {'m': m.astype(int), 'result': out}
return {'m': m, 'result': out}


def pandas_grouped_apply(coo: sp.coo_matrix,
74 changes: 41 additions & 33 deletions cellbender/remove_background/posterior.py
@@ -451,7 +451,7 @@ def _get_cell_noise_count_posterior_coo(
f'accurate for your dataset.')
raise RuntimeError('Zero cells found!')

dataloader_index_to_analyzed_bc_index = np.where(cell_logic)[0]
dataloader_index_to_analyzed_bc_index = torch.where(torch.tensor(cell_logic))[0]
cell_data_loader = DataLoader(
count_matrix[cell_logic],
empty_drop_dataset=None,
@@ -468,6 +468,12 @@ def _get_cell_noise_count_posterior_coo(
log_probs = []
ind = 0
n_minibatches = len(cell_data_loader)
analyzed_gene_inds = torch.tensor(self.analyzed_gene_inds.copy())
if analyzed_bcs_only:
barcode_inds = torch.tensor(self.dataset_obj.analyzed_barcode_inds.copy())
else:
barcode_inds = torch.tensor(self.barcode_inds.copy())
nonzero_noise_offset_dict = {}

logger.info('Computing posterior noise count probabilities in mini-batches.')

@@ -505,57 +511,52 @@ def _get_cell_noise_count_posterior_coo(
)

# Get the original gene index from gene index in the trimmed dataset.
genes_i = self.analyzed_gene_inds[genes_i_analyzed]
genes_i = analyzed_gene_inds[genes_i_analyzed.cpu()]

# Barcode index in the dataloader.
bcs_i = bcs_i_chunk + ind
bcs_i = (bcs_i_chunk + ind).cpu()

# Obtain the real barcode index since we only use cells.
bcs_i = dataloader_index_to_analyzed_bc_index[bcs_i]

# Translate chunk barcode inds to overall inds.
if analyzed_bcs_only:
bcs_i = self.dataset_obj.analyzed_barcode_inds[bcs_i]
else:
bcs_i = self.barcode_inds[bcs_i]
bcs_i = barcode_inds[bcs_i]

# Add sparse matrix values to lists.
try:
bcs.extend(bcs_i.tolist())
genes.extend(genes_i.tolist())
c.extend(c_i.tolist())
log_probs.extend(log_prob_i.tolist())
c_offset.extend(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
except TypeError as e:
# edge case of a single value
bcs.append(bcs_i)
genes.append(genes_i)
c.append(c_i)
log_probs.append(log_prob_i)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
bcs.append(bcs_i.detach())
genes.append(genes_i.detach())
c.append(c_i.detach().cpu())
log_probs.append(log_prob_i.detach().cpu())

# Update offset dict with any nonzeros.
nonzero_offset_inds, nonzero_noise_count_offsets = dense_to_sparse_op_torch(
noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().flatten(),
)
m_i = self.index_converter.get_m_indices(cell_inds=bcs_i, gene_inds=genes_i)

nonzero_noise_offset_dict.update(
dict(zip(m_i[nonzero_offset_inds.detach().cpu()].tolist(),
nonzero_noise_count_offsets.detach().cpu().tolist()))
)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().cpu())

# Increment barcode index counter.
ind += data.shape[0] # Same as data_loader.batch_size

# Convert the lists to numpy arrays.
log_probs = np.array(log_probs, dtype=float)
c = np.array(c, dtype=np.uint32)
barcodes = np.array(bcs, dtype=np.uint64) # uint32 is too small!
genes = np.array(genes, dtype=np.uint64) # use same as above for IndexConverter
noise_count_offsets = np.array(c_offset, dtype=np.uint32)
# Concatenate lists.
log_probs = torch.cat(log_probs)
c = torch.cat(c)
barcodes = torch.cat(bcs)
genes = torch.cat(genes)

# Translate (barcode, gene) inds to 'm' format index.
m = self.index_converter.get_m_indices(cell_inds=barcodes, gene_inds=genes)

# Put the counts into a sparse csr_matrix.
self._noise_count_posterior_coo = sp.coo_matrix(
(log_probs, (m, c)),
shape=[np.prod(self.count_matrix_shape), n_counts_max],
shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
)
noise_offset_dict = dict(zip(m, noise_count_offsets))
nonzero_noise_offset_dict = {k: v for k, v in noise_offset_dict.items() if (v > 0)}
self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
return self._noise_count_posterior_coo
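To make the final assembly step concrete, a toy sketch (shapes and values invented) of how per-entry log-probabilities, keyed by flat 'm' index and noise count c, are packed into one COO matrix of shape (n_cells * n_genes, n_counts_max):

import numpy as np
import scipy.sparse as sp

count_matrix_shape = (3, 4)                      # (n_cells, n_genes), toy values
n_counts_max = 3
m = np.array([0, 0, 5, 11], dtype=np.uint64)     # flat (cell, gene) index per entry
c = np.array([0, 1, 0, 2], dtype=np.uint32)      # candidate noise count per entry
log_probs = np.array([-0.1, -2.4, -0.05, -1.3])  # log prob of each (m, c) pair

posterior_coo = sp.coo_matrix(
    (log_probs, (m, c)),
    shape=[np.prod(count_matrix_shape, dtype=np.uint64), n_counts_max],
)
print(posterior_coo.shape)                       # (12, 3)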

@@ -1517,7 +1518,9 @@ def __repr__(self):
f'\n\ttotal_n_genes: {self.total_n_genes}'
f'\n\tmatrix_shape: {self.matrix_shape}')

def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndarray:
def get_m_indices(self,
cell_inds: Union[np.ndarray, torch.Tensor],
gene_inds: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
"""Given arrays of cell indices and gene indices, suitable for a sparse matrix,
convert them to 'm' index values.
"""
@@ -1527,7 +1530,12 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndar
if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
raise ValueError(f'Requested gene_inds out of range: '
f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
return cell_inds * self.total_n_genes + gene_inds
if type(cell_inds) == np.ndarray:
return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
elif type(cell_inds) == torch.Tensor:
return cell_inds.type(torch.int64) * self.total_n_genes + gene_inds.type(torch.int64)
else:
raise ValueError('IndexConverter.get_m_indices received cell_inds of unknown object type')

def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Given a list of 'm' index values, return two arrays: cell index values
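A short demonstration (not part of the diff) of why the 64-bit casts in get_m_indices matter: NumPy's default integer is 32-bit on Windows, and with roughly 30,000 genes the flat index cell * n_genes + gene leaves the int32 range once cell indices pass about 71,000.

import numpy as np

total_n_genes = 30_000
cell_inds = np.array([100_000], dtype=np.int32)   # plausible for a large experiment
gene_inds = np.array([250], dtype=np.int32)

m_wrapped = cell_inds * np.int32(total_n_genes) + gene_inds   # 32-bit arithmetic wraps silently
m_safe = cell_inds.astype(np.uint64) * total_n_genes + gene_inds.astype(np.uint64)

print(m_wrapped[0], m_safe[0])   # a negative wrapped value vs 3000000250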
25 changes: 15 additions & 10 deletions cellbender/remove_background/report.py
@@ -16,6 +16,7 @@
import subprocess
import datetime
import os
import shutil
import logging
from typing import Dict, Optional

@@ -42,26 +43,30 @@


def _run_notebook(file):
subprocess.run(f'cp {file} tmp.report.ipynb', shell=True)
shutil.copy(file, 'tmp.report.ipynb')
subprocess.run(run_notebook_str(file='tmp.report.ipynb'), shell=True)
subprocess.run(f'rm tmp.report.ipynb', shell=True)
os.remove('tmp.report.ipynb')
return 'tmp.report.nbconvert.ipynb'


def _to_html(file, output) -> str:
subprocess.run(to_html_str(file=file, output=output), shell=True)
subprocess.run(f'mv {file.replace(".ipynb", ".html")} {output}', shell=True)
subprocess.run(f'rm {file}', shell=True)
os.replace(file.replace(".ipynb", ".html"), output)
os.remove(file)
return output


def _postprocess_html(file: str, title: str):
with open(file, mode='r') as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w') as f:
f.write(html)
try:
with open(file, mode='r', encoding="utf8", errors="surrogateescape") as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w', encoding="utf8", errors="surrogateescape") as f:
f.write(html)
except:
logger.warning('Failed to overwrite default HTML report title. '
'This is purely aesthetic and does not affect output.')


def run_notebook_make_html(file, output) -> str:
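The encoding arguments added in _postprocess_html above make the title rewrite tolerant of stray non-UTF-8 bytes in the generated HTML. A minimal round-trip sketch (file name and title are illustrative): surrogateescape carries undecodable bytes through a read-modify-write cycle instead of raising UnicodeDecodeError.

raw = b'<title>tmp.report.nbconvert</title>\n\xff\xfe bytes from an embedded image\n'
with open('report.html', 'wb') as f:
    f.write(raw)

with open('report.html', 'r', encoding='utf8', errors='surrogateescape') as f:
    html = f.read()   # the \xff\xfe bytes survive as surrogate code points
html = html.replace('<title>tmp.report.nbconvert</title>', '<title>CellBender report</title>')
with open('report.html', 'w', encoding='utf8', errors='surrogateescape') as f:
    f.write(html)     # the original bytes are written back unchanged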