From 0be27f98e9ee3d58d9ff5357af1c9725af96925f Mon Sep 17 00:00:00 2001
From: Aitor Morales-Gregorio
 <43403140+morales-gregorio@users.noreply.github.com>
Date: Tue, 12 Jan 2021 16:50:00 +0100
Subject: [PATCH] Improve memory efficiency of _create_sparse_matrix in
 BinnedSpikeTrain class (#395)

Co-authored-by: kleinjohann <a.kleinjohann@fz-juelich.de>
---
 elephant/conversion.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/elephant/conversion.py b/elephant/conversion.py
index b35602acf..1f6ffa8b8 100644
--- a/elephant/conversion.py
+++ b/elephant/conversion.py
@@ -1065,11 +1065,23 @@ def _create_sparse_matrix(self, spiketrains):
             Spike trains to bin.
 
         """
+
+        # The data type for numeric values
+        data_dtype = np.int32
+
         if not _check_neo_spiketrain(spiketrains):
             # a binned numpy array
-            sparse_matrix = sps.csr_matrix(spiketrains, dtype=np.int32)
+            sparse_matrix = sps.csr_matrix(spiketrains, dtype=data_dtype)
             return sparse_matrix
 
+        # Get index dtype that can accommodate the largest index
+        # (this is the same dtype that will be used for the index arrays of the
+        #  sparse matrix, so already using it here avoids array duplication)
+        shape = (len(spiketrains), self.n_bins)
+        numtype = np.int32
+        if max(shape) > np.iinfo(numtype).max:
+            numtype = np.int64
+
         row_ids, column_ids = [], []
         # data
         counts = []
@@ -1089,21 +1101,29 @@ def _create_sparse_matrix(self, spiketrains):
             valid_bins = bins[bins < self.n_bins]
             n_discarded += len(bins) - len(valid_bins)
             f, c = np.unique(valid_bins, return_counts=True)
+            # f inherits the dtype np.int32 from bins, but c is created in
+            # np.unique with the default int dtype (usually np.int64)
+            c = c.astype(data_dtype)
             column_ids.append(f)
             counts.append(c)
-            row_ids.append(np.repeat(idx, repeats=len(f)))
+            row_ids.append(np.repeat(idx, repeats=len(f)).astype(numtype))
 
         if n_discarded > 0:
             warnings.warn("Binning discarded {} last spike(s) of the "
                           "input spiketrain".format(n_discarded))
 
+        # Stacking preserves the data type. In any case, while creating
+        # the sparse matrix, a copy is performed even if we set 'copy' to False
+        # explicitly (however, this might change in future scipy versions -
+        # this depends on scipy csr matrix initialization implementation).
         counts = np.hstack(counts)
-        row_ids = np.hstack(row_ids)
         column_ids = np.hstack(column_ids)
+        row_ids = np.hstack(row_ids)
 
         sparse_matrix = sps.csr_matrix((counts, (row_ids, column_ids)),
-                                       shape=(len(spiketrains), self.n_bins),
-                                       dtype=np.int32, copy=False)
+                                       shape=shape, dtype=data_dtype,
+                                       copy=False)
+
         return sparse_matrix