From 57977091d1c18e80914316b91a8acfecb3e7b83d Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Tue, 5 Feb 2019 20:12:33 +0100
Subject: [PATCH 01/14] added no_umi_correction, changed which barcodes should
 be corrected

---
 cite_seq_count/__main__.py   | 36 +++++++++++++++++++++---------------
 cite_seq_count/processing.py |  4 ++--
 tests/test_processing.py     |  2 +-
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 67c1696..2ee8242 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -75,6 +75,8 @@ def get_args():
     barcodes.add_argument('--umi_collapsing_dist', dest='umi_threshold',
                           required=False, type=int, default=2,
                           help="threshold for umi collapsing.")
+    barcodes.add_argument('--no_umi_correction', required=False, action='store_true', default=False,
+                        dest='no_umi_correction', help="Deactivate UMI collapsing")
     barcodes.add_argument('--bc_collapsing_dist', dest='bc_threshold',
                           required=False, type=int, default=1,
                           help="threshold for cellular barcode collapsing.")
@@ -300,6 +302,10 @@ def main():
             merged_no_match
         ) = processing.merge_results(parallel_results=parallel_results)
         del(parallel_results)
+    ordered_tags_map = OrderedDict()
+    for i,tag in enumerate(ab_map.values()):
+        ordered_tags_map[tag] = i
+    ordered_tags_map['unmapped'] = i + 1
 
     # Correct cell barcodes
     (
@@ -313,23 +319,10 @@ def main():
             collapsing_threshold=args.bc_threshold)
     
     # Correct umi barcodes
-    (
-        final_results,
-        umis_corrected
-    ) = processing.correct_umis(
-        final_results=final_results,
-        collapsing_threshold=args.umi_threshold)
-
-    ordered_tags_map = OrderedDict()
-    for i,tag in enumerate(ab_map.values()):
-        ordered_tags_map[tag] = i
-    ordered_tags_map['unmapped'] = i + 1
-
-
-    # Sort cells by number of mapped umis
     if not whitelist:
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
         top_cells = set([pair[0] for pair in top_cells_tuple])
+    # Sort cells by number of mapped umis
     else:
         top_cells = whitelist
         # Add potential missing cell barcodes.
@@ -339,8 +332,21 @@ def main():
             else:
                 final_results[missing_cell] = dict()
                 for TAG in ordered_tags_map:
-                    final_results[missing_cell][TAG] = 0
+                    final_results[missing_cell][TAG] = Counter()
                 top_cells.add(missing_cell)
+    if not args.no_umi_correction:
+        (
+            final_results,
+            umis_corrected
+        ) = processing.correct_umis(
+            final_results=final_results,
+            collapsing_threshold=args.umi_threshold,
+            top_cells=top_cells)
+    else:
+        umis_corrected = 0
+    
+
+    
     
 
 
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index e2280e3..8489696 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -164,7 +164,7 @@ def merge_results(parallel_results):
     return(merged_results, umis_per_cell, reads_per_cell, merged_no_match)
 
 
-def correct_umis(final_results, collapsing_threshold):
+def correct_umis(final_results, collapsing_threshold, top_cells):
     """
     Corrects umi barcodes within same cell/tag groups.
     
@@ -178,7 +178,7 @@ def correct_umis(final_results, collapsing_threshold):
     """
     print('Correcting umis')
     corrected_umis = 0
-    for cell_barcode in final_results:
+    for cell_barcode in top_cells:
         for TAG in final_results[cell_barcode]:
             if len(final_results[cell_barcode][TAG]) > 1:
                 umi_clusters = network.UMIClusterer()
diff --git a/tests/test_processing.py b/tests/test_processing.py
index 8801b89..7317539 100644
--- a/tests/test_processing.py
+++ b/tests/test_processing.py
@@ -156,7 +156,7 @@ def test_classify_reads_multi_process(data):
 
 @pytest.mark.dependency(depends=['test_classify_reads_multi_process'])
 def test_correct_umis(data):
-    temp = processing.correct_umis(pytest.results, 2)
+    temp = processing.correct_umis(pytest.results, 2, pytest.corrected_results.keys())
     results = temp[0]
     n_corrected = temp[1]
     for cell_barcode in results.keys():

From 1195695f094949b6d3fc0ba2212d0ba8aa73017f Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Sat, 16 Feb 2019 14:08:18 +0100
Subject: [PATCH 02/14] adding logger to prevent unintended submodule prints

---
 cite_seq_count/__main__.py   | 9 ++++-----
 cite_seq_count/processing.py | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 2ee8242..6be0692 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -7,6 +7,7 @@
 import os
 import datetime
 import pkg_resources
+import logging
 
 from argparse import ArgumentParser
 from argparse import RawTextHelpFormatter
@@ -208,6 +209,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
             args.start_trim))
 
 def main():
+    logger = logging.getLogger('cite_seq_count')
     start_time = time.time()
     parser = get_args()
     if not sys.argv[1:]:
@@ -334,6 +336,7 @@ def main():
                 for TAG in ordered_tags_map:
                     final_results[missing_cell][TAG] = Counter()
                 top_cells.add(missing_cell)
+    #If we want umi correction
     if not args.no_umi_correction:
         (
             final_results,
@@ -344,11 +347,6 @@ def main():
             top_cells=top_cells)
     else:
         umis_corrected = 0
-    
-
-    
-    
-
 
     (
         umi_results_matrix,
@@ -371,6 +369,7 @@ def main():
         outfolder=args.outfolder)
       
     top_unmapped = merged_no_match.most_common(args.unknowns_top)
+
     with open(os.path.join(args.outfolder, args.unmapped_file),'w') as unknown_file:
         unknown_file.write('tag,count\n')
         for element in top_unmapped:
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index 8489696..e1d5cea 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -228,7 +228,7 @@ def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_c
                         final_results[real_barcode][TAG].update(temp[TAG])
                     temp_umi_counts = umis_per_cell.pop(fake_barcode)
                     umis_per_cell[real_barcode] += temp_umi_counts
-    except Exception as e:
+    except:
         print('Could not find a good local minima for correction.\nNo cell barcode correction was done.')
     return(final_results, umis_per_cell, corrected_barcodes)
 

From bc6dea01ea25f2468c0e5e0c6d7044ffa0c9fe1b Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Sat, 16 Feb 2019 14:11:43 +0100
Subject: [PATCH 03/14] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 594ef24..9629d6d 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="CITE-seq-Count",
-    version="1.4.1",
+    version="1.4.2",
     author="Roelli Patrick",
     author_email="patrick.roelli@gmail.com",
     description="A python package that counts CITE seq or hashing data for single cell experiments",

From 88aa6647b4c770c8a75ea76f7a9b37160efc0f5a Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 18 Feb 2019 14:19:20 +0100
Subject: [PATCH 04/14] skipping TAGS with more than 20000 umis

---
 cite_seq_count/__main__.py   | 20 ++++++++++-----
 cite_seq_count/processing.py | 49 ++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 6be0692..4a99d08 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -76,7 +76,7 @@ def get_args():
     barcodes.add_argument('--umi_collapsing_dist', dest='umi_threshold',
                           required=False, type=int, default=2,
                           help="threshold for umi collapsing.")
-    barcodes.add_argument('--no_umi_correction', required=False, action='store_true', default=False,
+    barcodes.add_argument('--no_umi_correctio', required=False, action='store_true', default=False,
                         dest='no_umi_correction', help="Deactivate UMI collapsing")
     barcodes.add_argument('--bc_collapsing_dist', dest='bc_threshold',
                           required=False, type=int, default=1,
@@ -139,12 +139,11 @@ def get_args():
                         help="Print extra information for debugging.")
     parser.add_argument('--version', action='version', version='CITE-seq-Count v{}'.format(version),
                         help="Print version number.")
-    
     # Finally! Too many options XD
     return parser
 
 
-def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, args):
+def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordered_tags_map, umis_corrected, bcs_corrected, bad_cells, args):
     """
     Creates a report with details about the run in a yaml format.
 
@@ -170,6 +169,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
 Reads processed: {}
 Percentage mapped: {}
 Percentage unmapped: {}
+Bad cells: {}
 Correction:
 \tCell barcodes collapsing threshold: {}
 \tCell barcodes corrected: {}
@@ -194,6 +194,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
             n_reads,
             mapped_perc,
             unmapped_perc,
+            len(bad_cells),
             args.bc_threshold,
             bcs_corrected,
             args.umi_threshold,
@@ -340,14 +341,18 @@ def main():
     if not args.no_umi_correction:
         (
             final_results,
-            umis_corrected
+            umis_corrected,
+            aberrant_cells
         ) = processing.correct_umis(
             final_results=final_results,
             collapsing_threshold=args.umi_threshold,
-            top_cells=top_cells)
+            top_cells=top_cells,
+            max_umis=20000)
     else:
         umis_corrected = 0
-
+        aberrant_cells = set()
+    for cell_barcode in aberrant_cells:
+        top_cells.remove(cell_barcode)
     (
         umi_results_matrix,
         read_results_matrix
@@ -355,12 +360,14 @@ def main():
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=top_cells)
+    # Write umis to file
     io.write_to_files(
         sparse_matrix=umi_results_matrix,
         top_cells=top_cells,
         ordered_tags_map=ordered_tags_map,
         data_type='umi',
         outfolder=args.outfolder)
+    # Write reads to file
     io.write_to_files(
         sparse_matrix=read_results_matrix,
         top_cells=top_cells,
@@ -383,6 +390,7 @@ def main():
         ordered_tags_map=ordered_tags_map,
         umis_corrected=umis_corrected,
         bcs_corrected=bcs_corrected,
+        bad_cells=aberrant_cells,
         args=args)
     if args.dense:
         print('Writing dense format output')
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index e1d5cea..01c0174 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -9,7 +9,7 @@
 from collections import defaultdict
 
 from itertools import islice
-from numpy import int16
+from numpy import int32
 from scipy import sparse
 from umi_tools import network
 from umi_tools import umi_methods
@@ -164,7 +164,7 @@ def merge_results(parallel_results):
     return(merged_results, umis_per_cell, reads_per_cell, merged_no_match)
 
 
-def correct_umis(final_results, collapsing_threshold, top_cells):
+def correct_umis(final_results, collapsing_threshold, top_cells, max_umis):
     """
     Corrects umi barcodes within same cell/tag groups.
     
@@ -178,22 +178,45 @@ def correct_umis(final_results, collapsing_threshold, top_cells):
     """
     print('Correcting umis')
     corrected_umis = 0
+    aberrant_umi_count_cells = set()
     for cell_barcode in top_cells:
         for TAG in final_results[cell_barcode]:
-            if len(final_results[cell_barcode][TAG]) > 1:
+            n_umis = len(final_results[cell_barcode][TAG])
+            if n_umis > 1 and n_umis <= max_umis:
                 umi_clusters = network.UMIClusterer()
                 UMIclusters = umi_clusters(
                     final_results[cell_barcode][TAG].keys(),
                     final_results[cell_barcode][TAG],
                     collapsing_threshold)
-                for umi_cluster in UMIclusters:  # This is a list with the first element the dominant barcode
-                    if(len(umi_cluster) > 1):  # This means we got a correction
-                        major_umi = umi_cluster[0]
-                        for minor_umi in umi_cluster[1:]:
-                            corrected_umis += 1
-                            temp = final_results[cell_barcode][TAG].pop(minor_umi)
-                            final_results[cell_barcode][TAG][major_umi] += temp
-    return(final_results, corrected_umis)
+                (new_res, temp_corrected_umis) = update_umi_counts(UMIclusters, final_results[cell_barcode].pop(TAG))
+                final_results[cell_barcode][TAG] = new_res
+                corrected_umis += temp_corrected_umis
+            elif n_umis > max_umis:
+                aberrant_umi_count_cells.add(cell_barcode)
+    return(final_results, corrected_umis, aberrant_umi_count_cells)
+
+
+def update_umi_counts(UMIclusters, cell_tag_counts):
+    """
+    Update a dict object with umis corrected.
+
+    Args:
+        UMIclusters (list): List of lists with corrected umis
+        cell_tag_counts (Counter): Counter of umis
+
+    Returns:
+        cell_tag_counts (Counter): Updated Counter of umis
+        temp_corrected_umis (int): Number of corrected umis
+    """
+    temp_corrected_umis = 0
+    for umi_cluster in UMIclusters:  # This is a list with the first element the dominant barcode
+        if(len(umi_cluster) > 1):  # This means we got a correction
+            major_umi = umi_cluster[0]
+            for minor_umi in umi_cluster[1:]:
+                temp_corrected_umis += 1
+                temp = cell_tag_counts.pop(minor_umi)
+                cell_tag_counts[major_umi] += temp
+    return(cell_tag_counts, temp_corrected_umis)
 
 
 def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_cells):
@@ -246,8 +269,8 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells):
         read_results_matrix (scipy.sparse.dok_matrix): Read counts
 
     """
-    umi_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int16)
-    read_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int16)
+    umi_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int32)
+    read_results_matrix = sparse.dok_matrix((len(ordered_tags_map) ,len(top_cells)), dtype=int32)
     for i,cell_barcode in enumerate(top_cells):
         for j,TAG in enumerate(final_results[cell_barcode]):
             if final_results[cell_barcode][TAG]:

From baaf0d5b9584194d7646aed9caae0db2d9963e48 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Tue, 19 Feb 2019 10:55:32 +0100
Subject: [PATCH 05/14] changed cell barcode correction

---
 cite_seq_count/__main__.py   | 8 ++++++++
 cite_seq_count/processing.py | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 4a99d08..232af4d 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -210,7 +210,15 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
             args.start_trim))
 
 def main():
+    #Create logger and stream handler
     logger = logging.getLogger('cite_seq_count')
+    logger.setLevel(logging.CRITICAL)
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.CRITICAL)
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
     start_time = time.time()
     parser = get_args()
     if not sys.argv[1:]:
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index 01c0174..e4bca1a 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -239,7 +239,7 @@ def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_c
         cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
             cell_barcode_counts=umis_per_cell,
             expect_cells=expected_cells,
-            cell_number=False,
+            cell_number=expected_cells,
             error_correct_threshold=collapsing_threshold,
             plotfile_prefix=False)
         if true_to_false_map:

From 644330e308a7a46569452250a4d2fc6c213eddd1 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Tue, 19 Feb 2019 15:38:08 +0100
Subject: [PATCH 06/14] added changelog

---
 CHANGELOG.md                    |  21 +++++-
 cite_seq_count/__main__.py      |  43 ++++++++----
 cite_seq_count/preprocessing.py |   6 +-
 cite_seq_count/processing.py    | 119 ++++++++++++++++++++++++++------
 setup.py                        |   3 +-
 5 files changed, 151 insertions(+), 41 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba6cd6c..fe44ad9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,11 +5,30 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 
+## [1.4.2] - 20.02.2019
+### Added
+- Cell barcode correction based on whitelist instead of top cells. This will trigger automatically
+  when a whitelist is provided.
+- UMI correction for tags with more than than 20'000 umis is too slow right now.
+  Cells containing a TAG with more than 20'000 umis will be discarded. This means that more than 20'000
+  antibody tags with different umis have been captured which is odd. Such high numbers could mean
+  that cells have been aggregating and can't be used for downstream analysis. 
+  The number of cells are reported under `Bad cells`. Fixing issues #32
+- The option to not correct for umis with `--no_umi_correction`.
+- Now prints how many reads will be processed.
+- Check that tags are indeed made of ATGC bases.
+
+### Changed
+- Sparse matrix is now a 32bit int structure instead of 16bit. Fixing issue #40
+- Cell barcode correction without a whitelist is not outputing any error anymore.
+  This is due to the fact that it uses a hard threshold based on the `-cells` option
+  instead of trying to discover one. Fixing issues #29, #36
+
 ## [1.4.0] - 24.01.2019
 ### Added
 - Enabled parallelization using multiprocessing. You can choose how many
   cores/threads you want to use with the `-T` `--threads` option. Takes max
-  cpu by default. It will run on only one core if you run only 1'000'000 reads.
+  cpu by default. It will run on only one core if you run 1'000'000 reads or less.
 - The output is now given in a gzipped mtx format. This is to ensure smooth usage for 
   new/larger datasets such as data from novaseq sequencers. For those who still want
   to use the dense format, there is an option `--dense` which will add the dense output
diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 232af4d..b553557 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -156,8 +156,8 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
         args (arg_parse): Arguments provided by the user.
 
     """
-    total_mapped = sum(reads_per_cell.values())
     total_unmapped = sum(no_match.values())
+    total_mapped = sum(reads_per_cell.values()) - total_unmapped
     mapped_perc = round((total_mapped/n_reads)*100)
     unmapped_perc = round((total_unmapped/n_reads)*100)
     
@@ -231,7 +231,7 @@ def main():
         whitelist = preprocessing.parse_whitelist_csv(args.whitelist,
                                         args.cb_last - args.cb_first + 1)
     else:
-        whitelist = None
+        whitelist = False
 
     # Load TAGs/ABs.
     ab_map = preprocessing.parse_tags_csv(args.tags)
@@ -256,8 +256,8 @@ def main():
         n_lines = preprocessing.get_n_lines(args.read1_path)
     n_reads = int(n_lines/4)
     n_threads = args.n_threads
-    
     print('Started mapping')
+    print('Processing {:,} reads'.format(n_reads))
     #Run with one process
     if n_threads <= 1 or n_reads < 1000001:
         print('CITE-seq-Count is running with one core.')
@@ -278,8 +278,8 @@ def main():
         umis_per_cell = Counter()
         reads_per_cell = Counter()
         for cell_barcode,counts in final_results.items():
-            umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts if UMI != 'unmapped'])
-            reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts if UMI != 'unmapped'])
+            umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts])
+            reads_per_cell[cell_barcode] = sum([sum(counts[UMI].values()) for UMI in counts])
     else:
         # Run with multiple processes
         print('CITE-seq-Count is running with {} cores.'.format(n_threads))
@@ -306,6 +306,7 @@ def main():
         p.join()
         print('Mapping done')
         print('Merging results')
+
         (
             final_results,
             umis_per_cell,
@@ -313,22 +314,34 @@ def main():
             merged_no_match
         ) = processing.merge_results(parallel_results=parallel_results)
         del(parallel_results)
+
     ordered_tags_map = OrderedDict()
     for i,tag in enumerate(ab_map.values()):
         ordered_tags_map[tag] = i
     ordered_tags_map['unmapped'] = i + 1
 
     # Correct cell barcodes
-    (
-        final_results,
-        umis_per_cell,
-        bcs_corrected
-    ) = processing.correct_cells(
-            final_results=final_results,
-            umis_per_cell=umis_per_cell,
-            expected_cells=args.expected_cells,
-            collapsing_threshold=args.bc_threshold)
-    
+    print('Correcting cell barcodes')
+    if not whitelist:
+        (
+            final_results,
+            umis_per_cell,
+            bcs_corrected
+        ) = processing.correct_cells(
+                final_results=final_results,
+                umis_per_cell=umis_per_cell,
+                expected_cells=args.expected_cells,
+                collapsing_threshold=args.bc_threshold)
+    else:
+        (
+            final_results,
+            umis_per_cell,
+            bcs_corrected) = processing.correct_cells_whitelist(
+                final_results=final_results,
+                umis_per_cell=umis_per_cell,
+                whitelist=whitelist,
+                collapsing_threshold=args.bc_threshold)
+
     # Correct umi barcodes
     if not whitelist:
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
diff --git a/cite_seq_count/preprocessing.py b/cite_seq_count/preprocessing.py
index 7ccc589..464a6d5 100644
--- a/cite_seq_count/preprocessing.py
+++ b/cite_seq_count/preprocessing.py
@@ -131,7 +131,11 @@ def check_tags(tags, maximum_distance):
         distance = Levenshtein.distance(a, b)
         if (distance <= (maximum_distance - 1)):
             offending_pairs.append([a, b, distance])
-    
+    DNA_pattern = regex.compile('^[ATGC]*$')
+    for tag in tags:
+        if not DNA_pattern.match(tag):
+            print('This tag {} is not only composed of ATGC bases.\nPlease check your tags file'.format(tag))
+            sys.exit('Exiting the application.\n')
     # If offending pairs are found, print them all.
     if offending_pairs:
         print(
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index e4bca1a..3b2c23a 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -4,6 +4,7 @@
 import os
 import Levenshtein
 import regex
+import pybktree
 
 from collections import Counter
 from collections import defaultdict
@@ -157,9 +158,8 @@ def merge_results(parallel_results):
                 if mapped[cell_barcode][TAG]:
                     for UMI in mapped[cell_barcode][TAG]:
                         merged_results[cell_barcode][TAG][UMI] += mapped[cell_barcode][TAG][UMI]
-                        if TAG != 'unmapped':
-                            umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG])
-                            reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI]
+                        umis_per_cell[cell_barcode] += len(mapped[cell_barcode][TAG])
+                        reads_per_cell[cell_barcode] += mapped[cell_barcode][TAG][UMI]
         merged_no_match.update(unmapped)
     return(merged_results, umis_per_cell, reads_per_cell, merged_no_match)
 
@@ -218,6 +218,32 @@ def update_umi_counts(UMIclusters, cell_tag_counts):
                 cell_tag_counts[major_umi] += temp
     return(cell_tag_counts, temp_corrected_umis)
 
+def collapse_cells(true_to_false, umis_per_cell, final_results):
+    """
+    Collapses cell barcodes based on the mapping true_to_false
+
+    Args:
+        true_to_false (dict): Mapping between the reference and the "mutated" barcodes.
+        umis_per_cell (Counter): Counter of number of umis per cell.
+        final_results (dict): Dict of dict of Counters with mapping results.
+
+    Returns:
+        umis_per_cell (Counter): Counter of number of umis per cell.
+        final_results (dict): Same as input but with corrected cell barcodes.
+        corrected_barcodes (int): How many cell barcodes have been corrected.
+    """
+    print('Collapsing cell barcodes')
+    corrected_barcodes = 0
+    for real_barcode in true_to_false:
+        for fake_barcode in true_to_false[real_barcode]:
+            if real_barcode in final_results:
+                temp = final_results.pop(fake_barcode)
+                corrected_barcodes += 1
+                for TAG in temp.keys():
+                    final_results[real_barcode][TAG].update(temp[TAG])
+                temp_umi_counts = umis_per_cell.pop(fake_barcode)
+                umis_per_cell[real_barcode] += temp_umi_counts    
+    return(umis_per_cell, final_results, corrected_barcodes)
 
 def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_cells):
     """
@@ -231,28 +257,75 @@ def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_c
     
     Returns:
         final_results (dict): Same as input but with corrected umis.
+        umis_per_cell (Counter): Counter of umis per cell after cell barcode correction
         corrected_umis (int): How many umis have been corrected.
     """
-    print('Correcting cell barcodes')
-    corrected_barcodes = 0
-    try:
-        cell_whitelist, true_to_false_map = umi_methods.getCellWhitelist(
-            cell_barcode_counts=umis_per_cell,
-            expect_cells=expected_cells,
-            cell_number=expected_cells,
-            error_correct_threshold=collapsing_threshold,
-            plotfile_prefix=False)
-        if true_to_false_map:
-            for real_barcode in true_to_false_map:
-                for fake_barcode in true_to_false_map[real_barcode]:
-                    temp = final_results.pop(fake_barcode)
-                    corrected_barcodes += 1
-                    for TAG in temp.keys():
-                        final_results[real_barcode][TAG].update(temp[TAG])
-                    temp_umi_counts = umis_per_cell.pop(fake_barcode)
-                    umis_per_cell[real_barcode] += temp_umi_counts
-    except:
-        print('Could not find a good local minima for correction.\nNo cell barcode correction was done.')
+    print('Finding a whitelist')
+    cell_whitelist, true_to_false = umi_methods.getCellWhitelist(
+        cell_barcode_counts=umis_per_cell,
+        expect_cells=expected_cells,
+        cell_number=expected_cells,
+        error_correct_threshold=collapsing_threshold,
+        plotfile_prefix=False)
+    
+    (
+        umis_per_cell,
+        final_results,
+        corrected_barcodes
+        ) = collapse_cells(
+            true_to_false=true_to_false,
+            umis_per_cell=umis_per_cell,
+            final_results=final_results)
+    return(final_results, umis_per_cell, corrected_barcodes)
+
+
+def correct_cells_whitelist(final_results, umis_per_cell, whitelist, collapsing_threshold):
+    """
+    Corrects cell barcodes.
+    
+    Args:
+        final_results (dict): Dict of dict of Counters with mapping results.
+        umis_per_cell (Counter): Counter of number of umis per cell.
+        whitelist (set): The whitelist reference given by the user.
+        collapsing_threshold (int): Max distance between umis.
+    
+    Returns:
+        final_results (dict): Same as input but with corrected umis.
+        umis_per_cell (Counter): Counter of umis per cell after cell barcode correction
+        corrected_umis (int): How many umis have been corrected.
+    """
+    true_to_false = defaultdict(set)
+    barcode_tree = pybktree.BKTree(Levenshtein.hamming, whitelist)
+    print('Generated barcode tree from whitelist')
+    cell_barcodes = list(final_results.keys())
+    print('Finding reference candidates')
+    for i, cell_barcode in enumerate(cell_barcodes):
+        if cell_barcode in whitelist:
+            # if the barcode is already whitelisted, no need to add
+            continue
+        # get all members of whitelist that are at distance of collapsing_threshold
+        candidates = [white_cell for d, white_cell in barcode_tree.find(cell_barcode, collapsing_threshold) if d > 0]
+
+        if len(candidates) == 0:
+            # the cell doesnt match to any whitelisted barcode,
+            # hence we have to drop it
+            # (as it cannot be asscociated with any frequent barcode)
+            continue
+        elif len(candidates) == 1:
+            white_cell_str = candidates[0]
+            true_to_false[white_cell_str].add(cell_barcode)
+        else:
+            # more than on whitelisted candidate:
+            # we drop it as its not uniquely assignable
+            continue
+    (
+        umis_per_cell,
+        final_results,
+        corrected_barcodes
+        ) = collapse_cells(
+            true_to_false=true_to_false,
+            umis_per_cell=umis_per_cell,
+            final_results=final_results)
     return(final_results, umis_per_cell, corrected_barcodes)
 
 
diff --git a/setup.py b/setup.py
index 9629d6d..6901884 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,8 @@
           'umi_tools>=0.5.5',
           'pytest==4.1.0',
           'pytest-dependency==0.4.0',
-          'pandas>=0.23.4'
+          'pandas>=0.23.4',
+          'pybktree==1.1'
       ],
       python_requires='>=3.6'
 )
\ No newline at end of file

From aae24942450b4fbd03ec0364d884bcbde498cb23 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Wed, 6 Mar 2019 10:59:53 +0100
Subject: [PATCH 07/14] added DOI in both readme and documentation

---
 README.md          |  1 +
 docs/docs/index.md | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a764f00..630eab0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772)
 # CITE-seq-Count
 A python package that allows to count antibody TAGS from a [CITE-seq](https://www.nature.com/articles/nmeth.4380) and/or [cell hashing](https://www.biorxiv.org/content/early/2017/12/21/237693) experiment.
 
diff --git a/docs/docs/index.md b/docs/docs/index.md
index b8bb589..b2ba21d 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -6,4 +6,12 @@ IMPORTANT NEWS
 
 For users who have processed data using CITE-seq-Count version `1.3.4` please rerun any data using version 1.4.0.
 
-There was a bug and the counts are not umi counts!
\ No newline at end of file
+There was a bug and the counts are not umi counts!
+
+
+How to cite CITE-seq-Count
+-------------------------------
+
+You can use the DOI provided bellow.
+
+[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772)
\ No newline at end of file

From 4a5906acbeaa7261df34d4d47a55e37719927b7c Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Sun, 10 Mar 2019 15:06:27 +0100
Subject: [PATCH 08/14] lot of changes

---
 CHANGELOG.md                    | 15 ++++--
 cite_seq_count/__main__.py      | 87 +++++++++++++++++++++++----------
 cite_seq_count/io.py            |  6 ++-
 cite_seq_count/preprocessing.py | 39 +++++++++++++--
 cite_seq_count/processing.py    | 52 ++++++++++++++++++--
 docs/docs/Running-the-script.md | 21 +++++++-
 setup.py                        |  2 +-
 tests/test_preprocessing.py     |  2 +-
 tests/test_processing.py        | 11 +++--
 9 files changed, 186 insertions(+), 49 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe44ad9..af8817f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,16 +13,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
   Cells containing a TAG with more than 20'000 umis will be discarded. This means that more than 20'000
   antibody tags with different umis have been captured which is odd. Such high numbers could mean
   that cells have been aggregating and can't be used for downstream analysis. 
-  The number of cells are reported under `Bad cells`. Fixing issues #32
+  The number of cells are reported under `Bad cells` and the dense matrix of those cells is provided in `uncorrected_cells`.
+  Fixing issues #32
 - The option to not correct for umis with `--no_umi_correction`.
 - Now prints how many reads will be processed.
-- Check that tags are indeed made of ATGC bases.
+- Checks that tags are indeed made of ATGC bases.
+- Added a check for cell barcode correction and collapsing threshold.
+- New option to allow for a sliding window when aligning TAGS. `--sliding-window`
+  Use when you have a variable sequence before your tags.
 
 ### Changed
-- Sparse matrix is now a 32bit int structure instead of 16bit. Fixing issue #40
+- Sparse matrix is now based on int32 instead of int16. Fixing issue #40
 - Cell barcode correction without a whitelist is not outputing any error anymore.
   This is due to the fact that it uses a hard threshold based on the `-cells` option
-  instead of trying to discover one. Fixing issues #29, #36
+  if it kind find one. Fixing issues #29, #36
+- Fixed umi_tools version to `1.0.0`
+- fixed the error linked to umi_tools version update `getCellWhitelist` #45.
+
 
 ## [1.4.0] - 24.01.2019
 ### Added
diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index b553557..1f88301 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -117,6 +117,12 @@ def get_args():
         help=("Number of bases to discard from read2.")
     )
     
+    filters.add_argument(
+        '--sliding-window', dest='sliding_window',
+        required=False, default=False, action='store_true',
+        help=("Allow for a sliding window when aligning.")
+    )
+        
 
     # Remaining arguments.
     parser.add_argument('-T', '--threads', required=False, type=int,
@@ -169,7 +175,7 @@ def create_report(n_reads, reads_per_cell, no_match, version, start_time, ordere
 Reads processed: {}
 Percentage mapped: {}
 Percentage unmapped: {}
-Bad cells: {}
+Uncorrected cells: {}
 Correction:
 \tCell barcodes collapsing threshold: {}
 \tCell barcodes corrected: {}
@@ -228,8 +234,10 @@ def main():
     # Parse arguments.
     args = parser.parse_args()
     if args.whitelist:
-        whitelist = preprocessing.parse_whitelist_csv(args.whitelist,
-                                        args.cb_last - args.cb_first + 1)
+        (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
+            filename=args.whitelist,
+            barcode_length=args.cb_last - args.cb_first + 1,
+            collapsing_threshold=args.bc_threshold)
     else:
         whitelist = False
 
@@ -273,7 +281,8 @@ def main():
                 whitelist=whitelist,
                 debug=args.debug,
                 start_trim=args.start_trim,
-                maximum_distance=args.max_error)
+                maximum_distance=args.max_error,
+                sliding_window=args.sliding_window)
         print('Mapping done')
         umis_per_cell = Counter()
         reads_per_cell = Counter()
@@ -299,7 +308,8 @@ def main():
                     whitelist,
                     args.debug,
                     args.start_trim,
-                    args.max_error),
+                    args.max_error,
+                    args.sliding_window),
                 callback=parallel_results.append,
                 error_callback=sys.stderr)
         p.close()
@@ -320,32 +330,41 @@ def main():
         ordered_tags_map[tag] = i
     ordered_tags_map['unmapped'] = i + 1
 
+    
     # Correct cell barcodes
-    print('Correcting cell barcodes')
-    if not whitelist:
-        (
-            final_results,
-            umis_per_cell,
-            bcs_corrected
-        ) = processing.correct_cells(
-                final_results=final_results,
-                umis_per_cell=umis_per_cell,
-                expected_cells=args.expected_cells,
-                collapsing_threshold=args.bc_threshold)
+    if(len(umis_per_cell) <= args.expected_cells):
+        print("Number of expected cells, {}, is higher " \
+            "than number of cells found {}.\nNot performing" \
+            "cell barcode correction" \
+            "".format(args.expected_cells, len(umis_per_cell)))
+        bcs_corrected = 0
     else:
-        (
-            final_results,
-            umis_per_cell,
-            bcs_corrected) = processing.correct_cells_whitelist(
-                final_results=final_results,
-                umis_per_cell=umis_per_cell,
-                whitelist=whitelist,
-                collapsing_threshold=args.bc_threshold)
+        print('Correcting cell barcodes')
+        if not whitelist:
+            (
+                final_results,
+                umis_per_cell,
+                bcs_corrected
+            ) = processing.correct_cells(
+                    final_results=final_results,
+                    umis_per_cell=umis_per_cell,
+                    expected_cells=args.expected_cells,
+                    collapsing_threshold=args.bc_threshold)
+        else:
+            (
+                final_results,
+                umis_per_cell,
+                bcs_corrected) = processing.correct_cells_whitelist(
+                    final_results=final_results,
+                    umis_per_cell=umis_per_cell,
+                    whitelist=whitelist,
+                    collapsing_threshold=args.bc_threshold)
 
     # Correct umi barcodes
     if not whitelist:
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
         top_cells = set([pair[0] for pair in top_cells_tuple])
+
     # Sort cells by number of mapped umis
     else:
         top_cells = whitelist
@@ -374,6 +393,23 @@ def main():
         aberrant_cells = set()
     for cell_barcode in aberrant_cells:
         top_cells.remove(cell_barcode)
+    #Create sparse aberrant cells matrix
+    (
+    umi_aberrant_matrix,
+    read_aberrant_matrix
+    ) = processing.generate_sparse_matrices(
+        final_results=final_results,
+        ordered_tags_map=ordered_tags_map,
+        top_cells=aberrant_cells)
+    
+    #Write uncorrected cells to dense output
+    io.write_dense(
+            sparse_matrix=umi_aberrant_matrix,
+            index=list(ordered_tags_map.keys()),
+            columns=aberrant_cells,
+            outfolder=os.path.join(args.outfolder,'uncorrected_cells'),
+            filename='dense_umis.tsv')
+    
     (
         umi_results_matrix,
         read_results_matrix
@@ -419,7 +455,8 @@ def main():
             sparse_matrix=umi_results_matrix,
             index=list(ordered_tags_map.keys()),
             columns=top_cells,
-            file_path=os.path.join(args.outfolder, 'dense_umis.tsv'))
+            outfolder=args.outfolder,
+            filename='dense_umis.tsv')
 
 if __name__ == '__main__':
     main()
diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py
index 344c3ec..5c46ee0 100644
--- a/cite_seq_count/io.py
+++ b/cite_seq_count/io.py
@@ -31,6 +31,8 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
             shutil.copyfileobj(mtx_in, mtx_gz)
     os.remove(os.path.join(prefix,'matrix.mtx'))
 
-def write_dense(sparse_matrix, index, columns, file_path):
+def write_dense(sparse_matrix, index, columns, outfolder, filename):
+    prefix = os.path.join(outfolder)
+    os.makedirs(prefix, exist_ok=True)
     pandas_dense = pd.DataFrame(sparse_matrix.todense(), columns=columns, index=index)
-    pandas_dense.to_csv(file_path, sep='\t')
+    pandas_dense.to_csv(os.path.join(outfolder,filename), sep='\t')
diff --git a/cite_seq_count/preprocessing.py b/cite_seq_count/preprocessing.py
index 464a6d5..f8638c1 100644
--- a/cite_seq_count/preprocessing.py
+++ b/cite_seq_count/preprocessing.py
@@ -49,7 +49,7 @@ def chunk_reads(n_reads, n):
     return(indexes)
     
 
-def parse_whitelist_csv(filename, barcode_length):
+def parse_whitelist_csv(filename, barcode_length, collapsing_threshold):
     """Reads white-listed barcodes from a CSV file.
 
     The function accepts plain barcodes or even 10X style barcodes with the
@@ -69,10 +69,38 @@ def parse_whitelist_csv(filename, barcode_length):
         cell_pattern = regex.compile(r'[ATGC]{{{}}}'.format(barcode_length))
         whitelist = [row[0].strip(STRIP_CHARS) for row in csv_reader
                      if (len(row[0].strip(STRIP_CHARS)) == barcode_length)]
-        for cell_barcode in whitelist:
-            if not cell_pattern.match(cell_barcode):
-                sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode))
-    return set(whitelist)
+    for cell_barcode in whitelist:
+        if not cell_pattern.match(cell_barcode):
+            sys.exit('This barcode {} is not only composed of ATGC bases.'.format(cell_barcode))
+    collapsing_threshold=test_cell_distances(whitelist, collapsing_threshold)
+    return(set(whitelist), collapsing_threshold)
+
+
+def test_cell_distances(whitelist, collapsing_threshold):
+    """Tests cell barcode distances to validate provided cell barcode collapsing threshold
+    
+    Function needs the given whitelist as well as the threshold.
+    If the value is too high, it will rerun until an acceptable value is found.
+    
+    Args:
+        whitelist (set): Whitelist barcode set
+        collapsing_threshold (int): Value of threshold
+
+    Returns:
+        collapsing_threshold (int): Valid threshold
+    """
+    ok = False
+    while not ok:
+        print('Testing cell barcode collapsing threshold of {}'.format(collapsing_threshold))
+        for comb in combinations(whitelist, 2):
+            if Levenshtein.hamming(comb[0], comb[1]) <= collapsing_threshold:
+                collapsing_threshold -= 1
+                print('Value is too high, reducing it by 1')
+                break
+        else:
+            ok = True
+    print('Using {} for cell barcode collapsing threshold'.format(collapsing_threshold))
+    return(collapsing_threshold)
 
 
 def parse_tags_csv(filename):
@@ -216,6 +244,7 @@ def check_barcodes_lengths(read1_length, cb_first, cb_last, umi_first, umi_last)
     
     return(barcode_slice, umi_slice, barcode_umi_length)
 
+
 def blocks(files, size=65536):
     """
     A fast way of counting the lines of a large file.
diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index 7106a0d..0b478bf 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -14,6 +14,7 @@
 from scipy import sparse
 from umi_tools import network
 from umi_tools import umi_methods
+import umi_tools.whitelist_methods as whitelist_methods
 
 
 from cite_seq_count import secondsToText
@@ -38,17 +39,53 @@ def find_best_match(TAG_seq, tags, maximum_distance):
     best_match = 'unmapped'
     best_score = maximum_distance
     for tag, name in tags.items():
-        score = Levenshtein.distance(tag, TAG_seq[:len(tag)])
-        if score <= best_score:
+        score = Levenshtein.hamming(tag, TAG_seq[:len(tag)])
+        if score == 0:
+            #Best possible match
+            return(name)
+        elif score <= best_score:
             best_score = score
             best_match = name
             return(best_match)
     return(best_match)
 
 
+def find_best_match_shift(TAG_seq, tags, maximum_distance):
+    """
+    Find the best match from the list of tags with sliding window.
+
+    Compares the Levenshtein distance between tags and the trimmed sequences.
+    The tag and the sequence must have the same length.
+    If no matches found returns 'unmapped'.
+    We add 1
+    Args:
+        TAG_seq (string): Sequence from R1 already start trimmed
+        tags (dict): A dictionary with the TAGs as keys and TAG Names as values.
+        maximum_distance (int): Maximum distance given by the user.
+
+    Returns:
+        best_match (string): The TAG name that will be used for counting.
+    """
+    best_match = 'unmapped'
+    best_score = maximum_distance
+    shifts = range(0,len(TAG_seq) - len(max(tags,key=len)))
+
+    for shift in shifts:
+        for tag, name in tags.items():
+            score = Levenshtein.hamming(tag, TAG_seq[shift:len(tag)+shift])
+            if score == 0:
+                #Best possible match
+                return(name)
+            elif score <= best_score:
+                best_score = score
+                best_match = name
+                return(best_match)
+    return(best_match)
+
+
 def map_reads(read1_path, read2_path, tags, barcode_slice,
                 umi_slice, indexes, whitelist, debug,
-                start_trim, maximum_distance):
+                start_trim, maximum_distance, sliding_window):
     """Read through R1/R2 files and generate a islice starting at a specific index.
 
     It reads both Read1 and Read2 files, creating a dict based on cell barcode.
@@ -107,7 +144,10 @@ def map_reads(read1_path, read2_path, tags, barcode_slice,
             if cell_barcode not in results:
                 results[cell_barcode] = defaultdict(Counter)
             
-            best_match = find_best_match(TAG_seq, tags, maximum_distance)
+            if(sliding_window):
+                best_match = find_best_match_shift(TAG_seq, tags, maximum_distance)
+            else:
+                best_match = find_best_match(TAG_seq, tags, maximum_distance)
             
             results[cell_barcode][best_match][UMI] += 1
             
@@ -217,6 +257,7 @@ def update_umi_counts(UMIclusters, cell_tag_counts):
                 cell_tag_counts[major_umi] += temp
     return(cell_tag_counts, temp_corrected_umis)
 
+
 def collapse_cells(true_to_false, umis_per_cell, final_results):
     """
     Collapses cell barcodes based on the mapping true_to_false
@@ -244,6 +285,7 @@ def collapse_cells(true_to_false, umis_per_cell, final_results):
                 umis_per_cell[real_barcode] += temp_umi_counts    
     return(umis_per_cell, final_results, corrected_barcodes)
 
+
 def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_cells):
     """
     Corrects cell barcodes.
@@ -260,7 +302,7 @@ def correct_cells(final_results, umis_per_cell, collapsing_threshold, expected_c
         corrected_umis (int): How many umis have been corrected.
     """
     print('Finding a whitelist')
-    cell_whitelist, true_to_false = umi_methods.getCellWhitelist(
+    cell_whitelist, true_to_false = whitelist_methods.getCellWhitelist(
         cell_barcode_counts=umis_per_cell,
         expect_cells=expected_cells,
         cell_number=expected_cells,
diff --git a/docs/docs/Running-the-script.md b/docs/docs/Running-the-script.md
index 6aa64e3..8652154 100644
--- a/docs/docs/Running-the-script.md
+++ b/docs/docs/Running-the-script.md
@@ -128,7 +128,7 @@ Barcodes from 1 to 16 and UMI from 17 to 26, then this is the input you need:
 
 Filtering for structure of the antibody barcode as well as maximum errors.
 
-* [OPTIONAL] Maximum Levenshtein distance allowed. This allows to catch antibody barcodes that might have `--max-error` errors compared to the real barcodes.
+* [OPTIONAL] Maximum Levenshtein distance allowed. This allows to catch antibody barcodes that might have `--max-error` errors compared to the real barcodes. (was `-hd` in previous versions)
 
 `--max-error MAX_ERROR`, default `3`
 
@@ -148,6 +148,23 @@ There is a sanity check when for the `MAX_ERROR` value chosen to be sure you are
 
 `-trim N_BASES, --start-trim N_BASES`, default `0`
 
+* [OPTIONAL] Activate sliding window alignement. Use this when you have a protocol that has a variable sequence before the inserted TAG.
+
+`--sliding-window`, default `False`
+
+Example:
+
+The TAG: `ATGCTAGCT` with a variable prefix: `TTCAATTTCA`
+R2 reads:
+```
+TTCA ATGCTAGCTAAAAAAAAAAAAAAAAA
+TTCAA ATGCTAGCTAAAAAAAAAAAAAAAA
+TTCAAT ATGCTAGCTAAAAAAAAAAAAAAA
+TTCAATT ATGCTAGCTAAAAAAAAAAAAAA
+TTCAATTT ATGCTAGCTAAAAAAAAAAAAA
+TTCAATTTC ATGCTAGCTAAAAAAAAAAAA
+```
+
 
 ### OUTPUT
 You have to choose either the number of cells you expect or give it a list of cell barcodes to retrieve.
@@ -156,7 +173,7 @@ You have to choose either the number of cells you expect or give it a list of ce
 
 `-cells CELLS, --expected_cells CELLS`
 
-* [Optional] If you have a whitelist of barcodes produced by the cDNA data, you are using a well-plate based protocol or any known whitelist, you can use this list to only extract TAGS matching those barcodes. Typically using the cell list from the mRNA data of the same experiment.
+* [Optional] If you have a whitelist of barcodes produced by the cDNA data, you are using a well-plate based protocol or a platform reference, you can use this list to only extract TAGS matching those barcodes. Typically using the cell list from the mRNA data of the same experiment. This is highly recommended as knowing the true cells helps for cell barcode correction.
 
 `-wl WHITELIST, --whitelist WHITELIST`
 
diff --git a/setup.py b/setup.py
index f32fca8..79a17c3 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
           'python-levenshtein>=0.12.0',
           'scipy>=1.1.0',
           'multiprocess>=0.70.6.1',
-          'umi_tools>=1.0.0',
+          'umi_tools==1.0.0',
           'pytest==4.1.0',
           'pytest-dependency==0.4.0',
           'pandas>=0.23.4',
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 16b4682..f702caa 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -44,7 +44,7 @@ def data():
 
 @pytest.mark.dependency()
 def test_parse_whitelist_csv(data):
-    assert preprocessing.parse_whitelist_csv(pytest.correct_whitelist_path, 16) == pytest.correct_whitelist
+    assert preprocessing.parse_whitelist_csv(pytest.correct_whitelist_path, 16, 1) == (pytest.correct_whitelist,1)
 
 @pytest.mark.dependency()
 def test_parse_tags_csv(data):
diff --git a/tests/test_processing.py b/tests/test_processing.py
index 7317539..d1b105e 100644
--- a/tests/test_processing.py
+++ b/tests/test_processing.py
@@ -85,13 +85,15 @@ def data():
         {'test2-CGTACGTAGCCTAGC':
         Counter({b'TAGCTTAGTA': 5, b'GCGATGCATA': 1})}
     }
-    pytest.umis_per_cell = {
+    pytest.umis_per_cell = Counter({
         'ACTGTTTTATTGGCCT': 1,
         'TTCATAAGGTAGGGAT': 2
-    }
+    })
     pytest.expected_cells = 2
     pytest.no_match = Counter()
     pytest.collapsing_threshold = 1
+    pytest.sliding_window = False
+    pytest.max_umis = 20000
 
     pytest.sequence_pool = []
     pytest.tags_complete = preprocessing.check_tags(preprocessing.parse_tags_csv('tests/test_data/tags/correct.csv'), 5)
@@ -151,12 +153,13 @@ def test_classify_reads_multi_process(data):
         pytest.correct_whitelist,
         pytest.debug,
         pytest.start_trim,
-        pytest.maximum_distance)
+        pytest.maximum_distance,
+        pytest.sliding_window)
     assert len(results) == 2
 
 @pytest.mark.dependency(depends=['test_classify_reads_multi_process'])
 def test_correct_umis(data):
-    temp = processing.correct_umis(pytest.results, 2, pytest.corrected_results.keys())
+    temp = processing.correct_umis(pytest.results, 2, pytest.corrected_results.keys(), pytest.max_umis)
     results = temp[0]
     n_corrected = temp[1]
     for cell_barcode in results.keys():

From c0719191a4c95713fd57cf6e41c4c8f934a68e51 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 10:48:21 +0100
Subject: [PATCH 09/14] added a line in changelog

---
 CHANGELOG.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af8817f..0438822 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 
-## [1.4.2] - 20.02.2019
+## [1.4.2] - 11.03.2019
 ### Added
 - Cell barcode correction based on whitelist instead of top cells. This will trigger automatically
   when a whitelist is provided.
@@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - The option to not correct for umis with `--no_umi_correction`.
 - Now prints how many reads will be processed.
 - Checks that tags are indeed made of ATGC bases.
-- Added a check for cell barcode correction and collapsing threshold.
+- Added a check for cell barcode correction and collapsing threshold for given whitelist.
 - New option to allow for a sliding window when aligning TAGS. `--sliding-window`
   Use when you have a variable sequence before your tags.
 
@@ -27,7 +27,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 - Cell barcode correction without a whitelist is not outputing any error anymore.
   This is due to the fact that it uses a hard threshold based on the `-cells` option
   if it kind find one. Fixing issues #29, #36
-- Fixed umi_tools version to `1.0.0`
+- Upgraded umi_tools to `1.0.0`
 - fixed the error linked to umi_tools version update `getCellWhitelist` #45.
 
 

From 4ca5aa190a36c5731dcec089548492f9231256e1 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 10:53:25 +0100
Subject: [PATCH 10/14] added spaces

---
 tests/test_processing.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_processing.py b/tests/test_processing.py
index d1b105e..624fa03 100644
--- a/tests/test_processing.py
+++ b/tests/test_processing.py
@@ -157,6 +157,7 @@ def test_classify_reads_multi_process(data):
         pytest.sliding_window)
     assert len(results) == 2
 
+
 @pytest.mark.dependency(depends=['test_classify_reads_multi_process'])
 def test_correct_umis(data):
     temp = processing.correct_umis(pytest.results, 2, pytest.corrected_results.keys(), pytest.max_umis)
@@ -168,10 +169,12 @@ def test_correct_umis(data):
             assert sum(results[cell_barcode][TAG].values()) == sum(pytest.corrected_results[cell_barcode][TAG].values())
     assert n_corrected == 3
 
+
 @pytest.mark.dependency(depends=['test_correct_umis'])
 def test_correct_cells(data):
     processing.correct_cells(pytest.corrected_results, pytest.umis_per_cell, pytest.expected_cells, pytest.collapsing_threshold)
 
+
 @pytest.mark.dependency(depends=['test_correct_umis'])
 def test_generate_sparse_matrices(data):
     (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices(

From ee0884b185558d3924079f72e35b7333ba706737 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 11:07:04 +0100
Subject: [PATCH 11/14] added some documentation

---
 docs/docs/index.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/docs/index.md b/docs/docs/index.md
index b2ba21d..79133a5 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -12,6 +12,13 @@ There was a bug and the counts are not umi counts!
 How to cite CITE-seq-Count
 -------------------------------
 
+If you are using `CITE-seq-count` for you experiments, please consider citing it.
+
 You can use the DOI provided bellow.
 
-[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772)
\ No newline at end of file
+[![DOI](https://zenodo.org/badge/99617772.svg)](https://zenodo.org/badge/latestdoi/99617772)
+
+
+Bugs or feature requests
+-----------------------------
+You are welcome to address any issues on the [github page](https://github.com/Hoohm/CITE-seq-Count/issues)
\ No newline at end of file

From ee6e93e23134b448eca0ff1a47ae1ec7825b600b Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 11:07:24 +0100
Subject: [PATCH 12/14] bumped version on the docs

---
 docs/docs/Installation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/Installation.md b/docs/docs/Installation.md
index dc1ef20..cf1dd6d 100644
--- a/docs/docs/Installation.md
+++ b/docs/docs/Installation.md
@@ -4,7 +4,7 @@ Installation with pip
 `CITE-seq-Count` is stored on pypi. You can install it using the following command:
 
 ```
-pip install CITE-seq-Count
+pip install CITE-seq-Count==1.4.2
 ```
 
 

From 87d56e42a4d1479dc20b3f74ea913c426a4244cf Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 11:07:34 +0100
Subject: [PATCH 13/14] bumped version on the README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b0a374a..718b2ed 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Installation
 -------------------------------------------
 
 ```
-pip install CITE-seq-Count==1.4.1
+pip install CITE-seq-Count==1.4.2
 ```
 
 

From a558ed9cab7680d34fff541a15d392509995f412 Mon Sep 17 00:00:00 2001
From: Patrick Roelli <patrick.roelli@gmail.com>
Date: Mon, 11 Mar 2019 11:12:42 +0100
Subject: [PATCH 14/14] updated documentation to read sparse matrix

---
 docs/docs/Reading-the-output.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/docs/docs/Reading-the-output.md b/docs/docs/Reading-the-output.md
index e8961b3..cf28e72 100644
--- a/docs/docs/Reading-the-output.md
+++ b/docs/docs/Reading-the-output.md
@@ -39,6 +39,7 @@ CITE-seq-Count Version: 1.4.0
 Reads processed: 50000000
 Percentage mapped: 95
 Percentage unmapped: 5
+Uncorrected cells: 33
 Correction:
 	Cell barcodes collapsing threshold: 1
 	Cell barcodes corrected: 20000
@@ -65,7 +66,25 @@ Packages to read MTX
 
 I recommend using `Seurat` and their `Read10x` function to read the results.
 
-Example: `Read10x('OUTFOLDER/umi_count', gene.column=1)`
+With Seurat V3:
+
+`Read10x('OUTFOLDER/umi_count/', gene.column=1)`
+
+
+With Matrix:
+
+```
+library(Matrix)
+matrix_dir = "/path_to_your_directory/out_cite_seq_count/umi_count/"
+barcode.path <- paste0(matrix_dir, "barcodes.tsv.gz")
+features.path <- paste0(matrix_dir, "features.tsv.gz")
+matrix.path <- paste0(matrix_dir, "matrix.mtx.gz")
+mat <- readMM(file = matrix.path)
+feature.names = read.delim(features.path, header = FALSE, stringsAsFactors = FALSE)
+barcode.names = read.delim(barcode.path, header = FALSE, stringsAsFactors = FALSE)
+colnames(mat) = barcode.names$V1
+rownames(mat) = feature.names$V1
+```
 
 **Python**