v0.3.1 (#303)
* Add WDL input to set number of retries. (#247)

* Move hash computation so that it is recomputed on retry, and now-invalid checkpoint is not loaded. (#258)

* Bug fix for WDL using MTX input (#246)

* Memory-efficient posterior generation (#263)

* Fix posterior and estimator integer overflow bugs on Windows (#259)

* Move from setup.py to pyproject.toml (#240)

* Fix bugs with report generation across platforms (#302)

---------

Co-authored-by: kshakir <[email protected]>
Co-authored-by: alecw <[email protected]>
3 people authored Oct 31, 2023
1 parent 4990df7 commit 3a4dc8d
Showing 23 changed files with 254 additions and 207 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/run_pytest.yml
@@ -1,4 +1,4 @@
# Run cellbender's tests
# Run cellbender test suite

name: 'pytest'

@@ -7,11 +7,13 @@ on: pull_request
jobs:
build:

runs-on: 'ubuntu-latest'
strategy:
matrix:
os: ['ubuntu-latest', 'windows-latest']
python-version: ['3.7']

runs-on: ${{ matrix.os }}

steps:
- name: 'Checkout repo'
uses: actions/checkout@v3
5 changes: 4 additions & 1 deletion MANIFEST.in
@@ -1,4 +1,7 @@
include README.rst
include LICENSE
include requirements.txt
include requirements-rtd.txt
include requirements-rtd.txt
include requirements-dev.txt
include cellbender/VERSION.txt
include cellbender/remove_background/report.ipynb
2 changes: 1 addition & 1 deletion build_docker_release.sh
@@ -1,6 +1,6 @@
#!/bin/bash

tag=$(cat cellbender/__init__.py | sed -e 's?__version__ = ??' | sed "s/^'\(.*\)'$/\1/")
tag=$(cat cellbender/VERSION.txt)
release=v${tag}

docker build \
1 change: 1 addition & 0 deletions cellbender/VERSION.txt
@@ -0,0 +1 @@
0.3.0
4 changes: 3 additions & 1 deletion cellbender/__init__.py
@@ -1 +1,3 @@
__version__ = '0.3.0'
from .base_cli import get_version

__version__ = get_version()
7 changes: 1 addition & 6 deletions cellbender/base_cli.py
@@ -23,12 +23,7 @@ def read(rel_path):


def get_version() -> str:
for line in read('__init__.py').splitlines():
if line.startswith('__version__'):
delim = '"' if '"' in line else "'"
return line.split(delim)[1]
else:
raise RuntimeError("Unable to find version string.")
return read('VERSION.txt').splitlines()[0]


class AbstractCLI(ABC):
2 changes: 1 addition & 1 deletion cellbender/remove_background/checkpoint.py
@@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool:
for file in files:
# without arcname, unpacking results in unpredictable file locations!
tar.add(file, arcname=os.path.basename(file))
os.rename(tarball_name + '.tmp', tarball_name)
os.replace(tarball_name + '.tmp', tarball_name)
return True


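The one-line change above swaps os.rename for os.replace when moving the finished tarball into place, most likely for Windows compatibility: os.rename raises FileExistsError on Windows if the destination already exists, while os.replace overwrites it on every platform. A minimal sketch of the write-then-swap pattern, with illustrative file names:

import os

tmp_name = 'ckpt.tar.gz.tmp'      # illustrative names, not the ones CellBender uses
final_name = 'ckpt.tar.gz'

with open(tmp_name, 'w') as f:
    f.write('placeholder contents\n')

# os.rename would fail here on Windows if final_name already exists;
# os.replace atomically overwrites the destination on POSIX and Windows.
os.replace(tmp_name, final_name)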
11 changes: 0 additions & 11 deletions cellbender/remove_background/cli.py
@@ -207,17 +207,6 @@ def setup_and_logging(args):
+ ' '.join(['cellbender', 'remove-background'] + sys.argv[2:]))
logger.info("CellBender " + get_version())

# Set up checkpointing by creating a unique workflow hash.
hashcode = create_workflow_hashcode(
module_path=os.path.dirname(cellbender.__file__),
args_to_remove=(['output_file', 'fpr', 'input_checkpoint_tarball', 'debug',
'posterior_batch_size', 'checkpoint_min', 'truth_file',
'posterior_regularization', 'cdf_threshold_q', 'prq_alpha',
'estimator', 'use_multiprocessing_estimation', 'cpu_threads']
+ (['epochs'] if args.constant_learning_rate else [])),
args=args)[:10]
args.checkpoint_filename = hashcode # store this in args
logger.info(f'(Workflow hash {hashcode})')
return args, file_handler


55 changes: 15 additions & 40 deletions cellbender/remove_background/estimation.py
@@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter,
data: np.ndarray,
m: np.ndarray,
noise_offsets: Optional[Dict[int, int]],
dtype=np.int64) -> sp.csr_matrix:
dtype=np.int) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter,
a flat format, indexed by 'm'.
m: Array of the same length as data, where each entry is an m-index.
noise_offsets: Noise count offset values keyed by 'm'.
dtype: Data type for sparse matrix. Int32 is too small for 'm' indices.
dtype: Data type for values of sparse matrix
Results:
noise_csr: Noise point estimate, as a CSR sparse matrix.
@@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter,
row, col = index_converter.get_ng_indices(m_inds=m)
if noise_offsets is not None:
data = data + np.array([noise_offsets.get(i, 0) for i in m])
coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))),
coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint64))),
shape=index_converter.matrix_shape, dtype=dtype)
coo.sum_duplicates()
return coo.tocsr()
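For context, a toy sketch (sizes and values invented) of the COO-to-CSR pattern this function uses: duplicate (row, col) pairs are summed by sum_duplicates(), and indices are cast to 64-bit types because the platform default integer is only 32-bit on Windows.

import numpy as np
import scipy.sparse as sp

n_cells, n_genes = 4, 3                        # toy count-matrix shape
m = np.array([0, 5, 5, 11], dtype=np.uint64)   # flat 'm' indices; two entries share m = 5
data = np.array([2, 1, 3, 7], dtype=np.int64)  # point estimates for those entries

row = (m // n_genes).astype(np.uint64)         # recover cell indices
col = (m % n_genes).astype(np.uint64)          # recover gene indices

coo = sp.coo_matrix((data, (row, col)), shape=(n_cells, n_genes), dtype=np.int64)
coo.sum_duplicates()                           # duplicate (row, col) entries are summed
print(coo.tocsr().toarray())                   # entry (1, 2) holds 1 + 3 = 4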
@@ -463,11 +463,9 @@ def estimate_noise(self,
if use_multiple_processes:

logger.info('Dividing dataset into chunks of genes')
chunk_logic_list = list(
self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)
chunk_logic_list = self._gene_chunk_iterator(
noise_log_prob_coo=noise_log_prob_coo,
n_chunks=n_chunks,
)

logger.info('Computing the output in asynchronous chunks in parallel...')
@@ -538,10 +536,9 @@ def estimate_noise(self,
def _gene_chunk_iterator(self,
noise_log_prob_coo: sp.coo_matrix,
n_chunks: int) \
-> Generator[np.ndarray, None, None]:
"""Yields chunks of the posterior that can be treated as independent,
from the standpoint of MCKP count estimation. That is, they contain all
matrix entries for any genes they include.
-> List[np.ndarray]:
"""Return a list of logical (size m) arrays used to select gene chunks
on which to compute the MCKP estimate. These chunks are independent.
Args:
noise_log_prob_coo: Full noise log prob posterior COO
@@ -551,36 +548,14 @@ def _gene_chunk_iterator(self,
Logical array which indexes elements of coo posterior for the chunk
"""

# TODO this generator is way too slow

# approximate number of entries in a chunk
# approx_chunk_entries = (noise_log_prob_coo.data.size - 1) // n_chunks

# get gene annotations
_, genes = self.index_converter.get_ng_indices(m_inds=noise_log_prob_coo.row)
genes_series = pd.Series(genes)

# things we need to keep track of for each chunk
# current_chunk_genes = []
# entry_logic = np.zeros(noise_log_prob_coo.data.size, dtype=bool)

# TODO eliminate for loop to speed this up
# take the list of genes from the coo, sort it, and divide it evenly
# somehow break ties for genes overlapping boundaries of divisions
sorted_genes = np.sort(genes)
gene_arrays = np.array_split(sorted_genes, n_chunks)
last_gene_set = {}
for gene_array in gene_arrays:
gene_set = set(gene_array)
gene_set = gene_set.difference(last_gene_set) # only the new stuff
# if there is a second chunk, make sure there is a gene unique to it
if (n_chunks > 1) and (len(gene_set) == len(set(genes))): # all genes in first set
# this mainly exists for tests
gene_set = gene_set - {gene_arrays[-1][-1]}
last_gene_set = gene_set
entry_logic = genes_series.isin(gene_set).values
if sum(entry_logic) > 0:
yield entry_logic
gene_chunk_arrays = np.array_split(np.arange(self.index_converter.total_n_genes), n_chunks)

gene_logic_arrays = [genes_series.isin(x).values for x in gene_chunk_arrays]
return gene_logic_arrays
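A small illustration of the new chunking logic (toy numbers, not taken from the codebase): gene indices are split into contiguous blocks, and each block yields one boolean mask over the posterior COO entries.

import numpy as np
import pandas as pd

total_n_genes = 5
n_chunks = 2
genes = np.array([0, 3, 1, 4, 2, 0, 4])   # gene index of each posterior COO entry

gene_chunk_arrays = np.array_split(np.arange(total_n_genes), n_chunks)
genes_series = pd.Series(genes)
gene_logic_arrays = [genes_series.isin(chunk).values for chunk in gene_chunk_arrays]

for chunk, logic in zip(gene_chunk_arrays, gene_logic_arrays):
    print(chunk, logic)   # e.g. [0 1 2] -> [ True False  True False  True  True False]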

def _chunk_estimate_noise(self,
noise_log_prob_coo: sp.coo_matrix,
@@ -810,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
"""
array_length = len(np.unique(noise_log_prob_coo.row))

m = np.zeros(array_length)
m = np.zeros(array_length, dtype=np.uint64)
out = np.zeros(array_length)
a = 0

@@ -829,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix,
out[a:(a + len_s)] = s.detach().cpu().numpy()
a = a + len_s

return {'m': m.astype(int), 'result': out}
return {'m': m, 'result': out}


def pandas_grouped_apply(coo: sp.coo_matrix,
74 changes: 41 additions & 33 deletions cellbender/remove_background/posterior.py
@@ -451,7 +451,7 @@ def _get_cell_noise_count_posterior_coo(
f'accurate for your dataset.')
raise RuntimeError('Zero cells found!')

dataloader_index_to_analyzed_bc_index = np.where(cell_logic)[0]
dataloader_index_to_analyzed_bc_index = torch.where(torch.tensor(cell_logic))[0]
cell_data_loader = DataLoader(
count_matrix[cell_logic],
empty_drop_dataset=None,
@@ -468,6 +468,12 @@ def _get_cell_noise_count_posterior_coo(
log_probs = []
ind = 0
n_minibatches = len(cell_data_loader)
analyzed_gene_inds = torch.tensor(self.analyzed_gene_inds.copy())
if analyzed_bcs_only:
barcode_inds = torch.tensor(self.dataset_obj.analyzed_barcode_inds.copy())
else:
barcode_inds = torch.tensor(self.barcode_inds.copy())
nonzero_noise_offset_dict = {}

logger.info('Computing posterior noise count probabilities in mini-batches.')

@@ -505,57 +511,52 @@ def _get_cell_noise_count_posterior_coo(
)

# Get the original gene index from gene index in the trimmed dataset.
genes_i = self.analyzed_gene_inds[genes_i_analyzed]
genes_i = analyzed_gene_inds[genes_i_analyzed.cpu()]

# Barcode index in the dataloader.
bcs_i = bcs_i_chunk + ind
bcs_i = (bcs_i_chunk + ind).cpu()

# Obtain the real barcode index since we only use cells.
bcs_i = dataloader_index_to_analyzed_bc_index[bcs_i]

# Translate chunk barcode inds to overall inds.
if analyzed_bcs_only:
bcs_i = self.dataset_obj.analyzed_barcode_inds[bcs_i]
else:
bcs_i = self.barcode_inds[bcs_i]
bcs_i = barcode_inds[bcs_i]

# Add sparse matrix values to lists.
try:
bcs.extend(bcs_i.tolist())
genes.extend(genes_i.tolist())
c.extend(c_i.tolist())
log_probs.extend(log_prob_i.tolist())
c_offset.extend(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
except TypeError as e:
# edge case of a single value
bcs.append(bcs_i)
genes.append(genes_i)
c.append(c_i)
log_probs.append(log_prob_i)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed]
.detach().cpu().numpy())
bcs.append(bcs_i.detach())
genes.append(genes_i.detach())
c.append(c_i.detach().cpu())
log_probs.append(log_prob_i.detach().cpu())

# Update offset dict with any nonzeros.
nonzero_offset_inds, nonzero_noise_count_offsets = dense_to_sparse_op_torch(
noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().flatten(),
)
m_i = self.index_converter.get_m_indices(cell_inds=bcs_i, gene_inds=genes_i)

nonzero_noise_offset_dict.update(
dict(zip(m_i[nonzero_offset_inds.detach().cpu()].tolist(),
nonzero_noise_count_offsets.detach().cpu().tolist()))
)
c_offset.append(noise_count_offset_NG[bcs_i_chunk, genes_i_analyzed].detach().cpu())

# Increment barcode index counter.
ind += data.shape[0] # Same as data_loader.batch_size

# Convert the lists to numpy arrays.
log_probs = np.array(log_probs, dtype=float)
c = np.array(c, dtype=np.uint32)
barcodes = np.array(bcs, dtype=np.uint64) # uint32 is too small!
genes = np.array(genes, dtype=np.uint64) # use same as above for IndexConverter
noise_count_offsets = np.array(c_offset, dtype=np.uint32)
# Concatenate lists.
log_probs = torch.cat(log_probs)
c = torch.cat(c)
barcodes = torch.cat(bcs)
genes = torch.cat(genes)

# Translate (barcode, gene) inds to 'm' format index.
m = self.index_converter.get_m_indices(cell_inds=barcodes, gene_inds=genes)

# Put the counts into a sparse csr_matrix.
self._noise_count_posterior_coo = sp.coo_matrix(
(log_probs, (m, c)),
shape=[np.prod(self.count_matrix_shape), n_counts_max],
shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max],
)
noise_offset_dict = dict(zip(m, noise_count_offsets))
nonzero_noise_offset_dict = {k: v for k, v in noise_offset_dict.items() if (v > 0)}
self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict
return self._noise_count_posterior_coo
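To make the final assembly step concrete, a toy sketch (shapes and values invented) of how per-entry log-probabilities, keyed by flat 'm' index and noise count c, are packed into one COO matrix of shape (n_cells * n_genes, n_counts_max):

import numpy as np
import scipy.sparse as sp

count_matrix_shape = (3, 4)                      # (n_cells, n_genes), toy values
n_counts_max = 3
m = np.array([0, 0, 5, 11], dtype=np.uint64)     # flat (cell, gene) index per entry
c = np.array([0, 1, 0, 2], dtype=np.uint32)      # candidate noise count per entry
log_probs = np.array([-0.1, -2.4, -0.05, -1.3])  # log prob of each (m, c) pair

posterior_coo = sp.coo_matrix(
    (log_probs, (m, c)),
    shape=[np.prod(count_matrix_shape, dtype=np.uint64), n_counts_max],
)
print(posterior_coo.shape)                       # (12, 3)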

@@ -1517,7 +1518,9 @@ def __repr__(self):
f'\n\ttotal_n_genes: {self.total_n_genes}'
f'\n\tmatrix_shape: {self.matrix_shape}')

def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndarray:
def get_m_indices(self,
cell_inds: Union[np.ndarray, torch.Tensor],
gene_inds: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
"""Given arrays of cell indices and gene indices, suitable for a sparse matrix,
convert them to 'm' index values.
"""
@@ -1527,7 +1530,12 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndar
if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all():
raise ValueError(f'Requested gene_inds out of range: '
f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}')
return cell_inds * self.total_n_genes + gene_inds
if type(cell_inds) == np.ndarray:
return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64)
elif type(cell_inds) == torch.Tensor:
return cell_inds.type(torch.int64) * self.total_n_genes + gene_inds.type(torch.int64)
else:
raise ValueError('IndexConverter.get_m_indices received cell_inds of unknown object type')

def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Given a list of 'm' index values, return two arrays: cell index values
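A short demonstration (not part of the diff) of why the 64-bit casts in get_m_indices matter: NumPy's default integer is 32-bit on Windows, and with roughly 30,000 genes the flat index cell * n_genes + gene leaves the int32 range once cell indices pass about 71,000.

import numpy as np

total_n_genes = 30_000
cell_inds = np.array([100_000], dtype=np.int32)   # plausible for a large experiment
gene_inds = np.array([250], dtype=np.int32)

m_wrapped = cell_inds * np.int32(total_n_genes) + gene_inds   # 32-bit arithmetic wraps silently
m_safe = cell_inds.astype(np.uint64) * total_n_genes + gene_inds.astype(np.uint64)

print(m_wrapped[0], m_safe[0])   # a negative wrapped value vs 3000000250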
25 changes: 15 additions & 10 deletions cellbender/remove_background/report.py
@@ -16,6 +16,7 @@
import subprocess
import datetime
import os
import shutil
import logging
from typing import Dict, Optional

@@ -42,26 +43,30 @@


def _run_notebook(file):
subprocess.run(f'cp {file} tmp.report.ipynb', shell=True)
shutil.copy(file, 'tmp.report.ipynb')
subprocess.run(run_notebook_str(file='tmp.report.ipynb'), shell=True)
subprocess.run(f'rm tmp.report.ipynb', shell=True)
os.remove('tmp.report.ipynb')
return 'tmp.report.nbconvert.ipynb'


def _to_html(file, output) -> str:
subprocess.run(to_html_str(file=file, output=output), shell=True)
subprocess.run(f'mv {file.replace(".ipynb", ".html")} {output}', shell=True)
subprocess.run(f'rm {file}', shell=True)
os.replace(file.replace(".ipynb", ".html"), output)
os.remove(file)
return output


def _postprocess_html(file: str, title: str):
with open(file, mode='r') as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w') as f:
f.write(html)
try:
with open(file, mode='r', encoding="utf8", errors="surrogateescape") as f:
html = f.read()
html = html.replace('<title>tmp.report.nbconvert</title>',
f'<title>{title}</title>')
with open(file, mode='w', encoding="utf8", errors="surrogateescape") as f:
f.write(html)
except:
logger.warning('Failed to overwrite default HTML report title. '
'This is purely aesthetic and does not affect output.')


def run_notebook_make_html(file, output) -> str:
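The encoding arguments added in _postprocess_html above make the title rewrite tolerant of stray non-UTF-8 bytes in the generated HTML. A minimal round-trip sketch (file name and title are illustrative): surrogateescape carries undecodable bytes through a read-modify-write cycle instead of raising UnicodeDecodeError.

raw = b'<title>tmp.report.nbconvert</title>\n\xff\xfe bytes from an embedded image\n'
with open('report.html', 'wb') as f:
    f.write(raw)

with open('report.html', 'r', encoding='utf8', errors='surrogateescape') as f:
    html = f.read()   # the \xff\xfe bytes survive as surrogate code points
html = html.replace('<title>tmp.report.nbconvert</title>', '<title>CellBender report</title>')
with open('report.html', 'w', encoding='utf8', errors='surrogateescape') as f:
    f.write(html)     # the original bytes are written back unchanged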