Merge branch 'release/1.4.2'
Hoohm committed Mar 11, 2019
2 parents 7aa0f71 + a558ed9 commit 66734b0
Showing 13 changed files with 427 additions and 98 deletions.
28 changes: 27 additions & 1 deletion CHANGELOG.md
@@ -5,11 +5,37 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).


## [1.4.2] - 11.03.2019
### Added
- Cell barcode correction based on a whitelist instead of top cells. This triggers automatically when a whitelist is provided.
- UMI correction for TAGs with more than 20'000 UMIs is too slow right now, so cells containing a TAG with more than 20'000 UMIs will be discarded. A count that high means more than 20'000 antibody tags with different UMIs have been captured for one cell, which is odd; it could mean the cells have been aggregating, and they can't be used for downstream analysis.
  The number of such cells is reported under `Bad cells` and the dense matrix of those cells is provided in `uncorrected_cells`; a sketch of this guard follows this list. Fixes issue #32.
- The option to skip UMI correction with `--no_umi_correction`.
- Now prints how many reads will be processed.
- Checks that tags are indeed made of ATGC bases.
- Added a check of the cell barcode collapsing threshold against the given whitelist.
- New option `--sliding-window` to allow for a sliding window when aligning TAGs. Use it when you have a variable sequence before your tags.
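
A minimal sketch of the `Bad cells` guard described above, assuming `final_results` maps each cell barcode to `{TAG: Counter(UMI -> read count)}` as in `__main__.py` below; the function name is hypothetical, not the package's API:

```python
MAX_UMIS = 20000  # cutoff from the entry above

def find_aberrant_cells(final_results):
    """Return cell barcodes where any TAG carries more than MAX_UMIS UMIs."""
    aberrant = set()
    for cell_barcode, tags in final_results.items():
        for umi_counts in tags.values():
            if len(umi_counts) > MAX_UMIS:
                # Too many distinct UMIs for one TAG: UMI correction would be
                # too slow, and counts this high suggest cell aggregation.
                aberrant.add(cell_barcode)
                break
    return aberrant
```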

### Changed
- Sparse matrix is now based on int32 instead of int16; see the overflow illustration after this list. Fixes issue #40.
- Cell barcode correction without a whitelist no longer outputs an error. It now falls back to a hard threshold based on the `-cells` option if it can't find one. Fixes issues #29 and #36.
- Upgraded umi_tools to `1.0.0`.
- Fixed the error in `getCellWhitelist` linked to the umi_tools version update. #45
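
For context on the int32 change above: int16 tops out at 32'767, which deep sequencing can exceed for a single cell/TAG entry, while int32 goes up to about 2.1 billion. A small scipy illustration (not the package's code):

```python
import numpy as np
from scipy import sparse

counts = np.array([[40000], [12]])  # 40000 exceeds int16's maximum of 32767

m16 = sparse.coo_matrix(counts, dtype=np.int16)  # silently wraps around
m32 = sparse.coo_matrix(counts, dtype=np.int32)  # stores the value correctly

print(m16.toarray()[0, 0])  # -25536 (int16 overflow)
print(m32.toarray()[0, 0])  # 40000
```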


## [1.4.0] - 24.01.2019
### Added
- Enabled parallelization using multiprocessing. You can choose how many
cores/threads you want to use with the `-T` `--threads` option. Takes max
cpu by default. It will run on only one core if you run only 1'000'000 reads.
cpu by default. It will run on only one core if you run 1'000'000 reads or less.
- The output is now given in a gzipped mtx format. This is to ensure smooth usage for
new/larger datasets such as data from novaseq sequencers. For those who still want
to use the dense format, there is an option `--dense` which will add the dense output
3 changes: 2 additions & 1 deletion README.md
@@ -1,3 +1,4 @@
[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772)
# CITE-seq-Count
A python package that allows you to count antibody TAGS from a [CITE-seq](https://www.nature.com/articles/nmeth.4380) and/or [cell hashing](https://www.biorxiv.org/content/early/2017/12/21/237693) experiment.

@@ -14,7 +15,7 @@ Installation
-------------------------------------------

```
pip install CITE-seq-Count==1.4.1.1
pip install CITE-seq-Count==1.4.2
```


141 changes: 106 additions & 35 deletions cite_seq_count/__main__.py
@@ -7,6 +7,7 @@
import os
import datetime
import pkg_resources
import logging

from argparse import ArgumentParser
from argparse import RawTextHelpFormatter
@@ -75,6 +76,8 @@ def get_args():
barcodes.add_argument('--umi_collapsing_dist', dest='umi_threshold',
required=False, type=int, default=2,
help="threshold for umi collapsing.")
barcodes.add_argument('--no_umi_correction', required=False, action='store_true', default=False,
dest='no_umi_correction', help="Deactivate UMI collapsing")
barcodes.add_argument('--bc_collapsing_dist', dest='bc_threshold',
required=False, type=int, default=1,
help="threshold for cellular barcode collapsing.")
@@ -114,6 +117,12 @@ def get_args():
help=("Number of bases to discard from read2.")
)

filters.add_argument(
'--sliding-window', dest='sliding_window',
required=False, default=False, action='store_true',
help=("Allow for a sliding window when aligning.")
)
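
# A hedged sketch of what '--sliding-window' could do during alignment:
# instead of comparing a TAG against a fixed position in read2, slide over the
# read and keep the best-scoring window. Names are illustrative; the actual
# logic lives in the package's processing module.
def best_window_match(read2_seq, tag, maximum_distance):
    from Levenshtein import hamming  # assumed dependency for this sketch
    best = None
    for start in range(len(read2_seq) - len(tag) + 1):
        distance = hamming(tag, read2_seq[start:start + len(tag)])
        if best is None or distance < best:
            best = distance
    return best is not None and best <= maximum_distance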


# Remaining arguments.
parser.add_argument('-T', '--threads', required=False, type=int,
@@ -136,12 +145,11 @@ def get_args():
help="Print extra information for debugging.")
parser.add_argument('--version', action='version', version='CITE-seq-Count v{}'.format(version),
help="Print version number.")

# Finally! Too many options XD
return parser


def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, args):
def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, bad_cells, args):
"""
Creates a report with details about the run in a yaml format.
@@ -154,8 +162,8 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
args (arg_parse): Arguments provided by the user.
"""
total_mapped = sum(reads_per_cell.values())
total_unmapped = sum(no_match.values())
total_mapped = sum(reads_per_cell.values()) - total_unmapped
mapped_perc = round((total_mapped/n_reads)*100)
unmapped_perc = round((total_unmapped/n_reads)*100)

@@ -167,6 +175,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
Reads processed: {}
Percentage mapped: {}
Percentage unmapped: {}
Uncorrected cells: {}
Correction:
\tCell barcodes collapsing threshold: {}
\tCell barcodes corrected: {}
@@ -191,6 +200,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
n_reads,
mapped_perc,
unmapped_perc,
len(bad_cells),
args.bc_threshold,
bcs_corrected,
args.umi_threshold,
@@ -206,6 +216,15 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
args.start_trim))

def main():
#Create logger and stream handler
logger = logging.getLogger('cite_seq_count')
logger.setLevel(logging.CRITICAL)
ch = logging.StreamHandler()
ch.setLevel(logging.CRITICAL)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

start_time = time.time()
parser = get_args()
if not sys.argv[1:]:
@@ -215,10 +234,12 @@ def main():
# Parse arguments.
args = parser.parse_args()
if args.whitelist:
whitelist = preprocessing.parse_whitelist_csv(args.whitelist,
args.cb_last - args.cb_first + 1)
(whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
filename=args.whitelist,
barcode_length=args.cb_last - args.cb_first + 1,
collapsing_threshold=args.bc_threshold)
else:
whitelist = None
whitelist = False
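
# Sketch of the collapsing-threshold check assumed to happen inside
# parse_whitelist_csv (illustration only): if two whitelist barcodes are
# closer to each other than the requested threshold, collapsing could merge
# real cells, so the returned threshold is lowered.
def sketch_check_collapsing_threshold(whitelist, collapsing_threshold):
    from itertools import combinations
    from Levenshtein import hamming  # assumed dependency for this sketch
    if len(whitelist) < 2:
        return collapsing_threshold
    # O(n^2) pairwise scan; fine for illustration, slow for huge whitelists.
    min_distance = min(hamming(a, b) for a, b in combinations(sorted(whitelist), 2))
    return min(collapsing_threshold, min_distance - 1)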

# Load TAGs/ABs.
ab_map = preprocessing.parse_tags_csv(args.tags)
@@ -243,8 +264,8 @@ def main():
n_lines = preprocessing.get_n_lines(args.read1_path)
n_reads = int(n_lines/4)
n_threads = args.n_threads

print('Started mapping')
print('Processing {:,} reads'.format(n_reads))
#Run with one process
if n_threads <= 1 or n_reads < 1000001:
print('CITE-seq-Count is running with one core.')
@@ -260,13 +281,14 @@ def main():
whitelist=whitelist,
debug=args.debug,
start_trim=args.start_trim,
maximum_distance=args.max_error)
maximum_distance=args.max_error,
sliding_window=args.sliding_window)
print('Mapping done')
umis_per_cell = Counter()
reads_per_cell = Counter()
for cell_barcode,counts in final_results.items():
umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts if UMI != 'unmapped'])
reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts if UMI != 'unmapped'])
umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts])
reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts])
else:
# Run with multiple processes
print('CITE-seq-Count is running with {} cores.'.format(n_threads))
@@ -286,13 +308,15 @@ def main():
whitelist,
args.debug,
args.start_trim,
args.max_error),
args.max_error,
args.sliding_window),
callback=parallel_results.append,
error_callback=sys.stderr)
p.close()
p.join()
print('Mapping done')
print('Merging results')

(
final_results,
umis_per_cell,
@@ -301,35 +325,47 @@ def main():
) = processing.merge_results(parallel_results=parallel_results)
del(parallel_results)

# Correct cell barcodes
(
final_results,
umis_per_cell,
bcs_corrected
) = processing.correct_cells(
final_results=final_results,
umis_per_cell=umis_per_cell,
expected_cells=args.expected_cells,
collapsing_threshold=args.bc_threshold)

# Correct umi barcodes
(
final_results,
umis_corrected
) = processing.correct_umis(
final_results=final_results,
collapsing_threshold=args.umi_threshold)

ordered_tags_map = OrderedDict()
for i,tag in enumerate(ab_map.values()):
ordered_tags_map[tag] = i
ordered_tags_map['unmapped'] = i + 1


# Correct cell barcodes
if(len(umis_per_cell) <= args.expected_cells):
print("Number of expected cells, {}, is higher " \
"than number of cells found {}.\nNot performing" \
"cell barcode correction" \
"".format(args.expected_cells, len(umis_per_cell)))
bcs_corrected = 0
else:
print('Correcting cell barcodes')
if not whitelist:
(
final_results,
umis_per_cell,
bcs_corrected
) = processing.correct_cells(
final_results=final_results,
umis_per_cell=umis_per_cell,
expected_cells=args.expected_cells,
collapsing_threshold=args.bc_threshold)
else:
(
final_results,
umis_per_cell,
bcs_corrected) = processing.correct_cells_whitelist(
final_results=final_results,
umis_per_cell=umis_per_cell,
whitelist=whitelist,
collapsing_threshold=args.bc_threshold)
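
# A rough, assumed sketch of what processing.correct_cells_whitelist does with
# the arguments above (illustration only, not the actual implementation):
# barcodes missing from the whitelist are collapsed into a whitelist entry
# within the collapsing threshold, pooling their UMIs and reads. Assumes
# `from collections import Counter` as in this module, plus the Levenshtein package.
def sketch_correct_cells_whitelist(final_results, umis_per_cell, whitelist, collapsing_threshold):
    from Levenshtein import hamming
    corrected = 0
    for barcode in [b for b in final_results if b not in whitelist]:
        for candidate in whitelist:
            if len(candidate) == len(barcode) and hamming(barcode, candidate) <= collapsing_threshold:
                # Pool the mismatched barcode's counts into the whitelist entry.
                for tag, umi_counts in final_results[barcode].items():
                    final_results.setdefault(candidate, {}).setdefault(tag, Counter()).update(umi_counts)
                umis_per_cell[candidate] += umis_per_cell.pop(barcode, 0)
                del final_results[barcode]
                corrected += 1
                break
    return final_results, umis_per_cell, corrected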

# Correct umi barcodes
if not whitelist:
# Sort cells by number of mapped umis
top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
top_cells = set([pair[0] for pair in top_cells_tuple])

else:
top_cells = whitelist
# Add potential missing cell barcodes.
@@ -339,24 +375,56 @@ def main():
else:
final_results[missing_cell] = dict()
for TAG in ordered_tags_map:
final_results[missing_cell][TAG] = 0
final_results[missing_cell][TAG] = Counter()
top_cells.add(missing_cell)
#If we want umi correction
if not args.no_umi_correction:
(
final_results,
umis_corrected,
aberrant_cells
) = processing.correct_umis(
final_results=final_results,
collapsing_threshold=args.umi_threshold,
top_cells=top_cells,
max_umis=20000)
else:
umis_corrected = 0
aberrant_cells = set()
for cell_barcode in aberrant_cells:
top_cells.remove(cell_barcode)
#Create sparse aberrant cells matrix
(
umi_aberrant_matrix,
read_aberrant_matrix
) = processing.generate_sparse_matrices(
final_results=final_results,
ordered_tags_map=ordered_tags_map,
top_cells=aberrant_cells)

#Write uncorrected cells to dense output
io.write_dense(
sparse_matrix=umi_aberrant_matrix,
index=list(ordered_tags_map.keys()),
columns=aberrant_cells,
outfolder=os.path.join(args.outfolder,'uncorrected_cells'),
filename='dense_umis.tsv')



(
umi_results_matrix,
read_results_matrix
) = processing.generate_sparse_matrices(
final_results=final_results,
ordered_tags_map=ordered_tags_map,
top_cells=top_cells)
# Write umis to file
io.write_to_files(
sparse_matrix=umi_results_matrix,
top_cells=top_cells,
ordered_tags_map=ordered_tags_map,
data_type='umi',
outfolder=args.outfolder)
# Write reads to file
io.write_to_files(
sparse_matrix=read_results_matrix,
top_cells=top_cells,
@@ -365,6 +433,7 @@ def main():
outfolder=args.outfolder)

top_unmapped = merged_no_match.most_common(args.unknowns_top)

with open(os.path.join(args.outfolder, args.unmapped_file),'w') as unknown_file:
unknown_file.write('tag,count\n')
for element in top_unmapped:
@@ -378,14 +447,16 @@ def main():
ordered_tags_map=ordered_tags_map,
umis_corrected=umis_corrected,
bcs_corrected=bcs_corrected,
bad_cells=aberrant_cells,
args=args)
if args.dense:
print('Writing dense format output')
io.write_dense(
sparse_matrix=umi_results_matrix,
index=list(ordered_tags_map.keys()),
columns=top_cells,
file_path=os.path.join(args.outfolder, 'dense_umis.tsv'))
outfolder=args.outfolder,
filename='dense_umis.tsv')

if __name__ == '__main__':
main()
6 changes: 4 additions & 2 deletions cite_seq_count/io.py
@@ -31,6 +31,8 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
shutil.copyfileobj(mtx_in, mtx_gz)
os.remove(os.path.join(prefix,'matrix.mtx'))

def write_dense(sparse_matrix, index, columns, file_path):
def write_dense(sparse_matrix, index, columns, outfolder, filename):
prefix = os.path.join(outfolder)
os.makedirs(prefix, exist_ok=True)
pandas_dense = pd.DataFrame(sparse_matrix.todense(), columns=columns, index=index)
pandas_dense.to_csv(file_path, sep='\t')
pandas_dense.to_csv(os.path.join(outfolder,filename), sep='\t')
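
# Example call with the new signature, mirroring its use in __main__.py above
# ('results' is a hypothetical output folder):
write_dense(
    sparse_matrix=umi_results_matrix,
    index=list(ordered_tags_map.keys()),
    columns=top_cells,
    outfolder='results',
    filename='dense_umis.tsv')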