refactor: improve memory usage of concatenation of files
hmatalonga committed Sep 13, 2019
1 parent e0cadc8 commit f9a7f48
Showing 3 changed files with 8 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -5,7 +5,7 @@
 # tags from Docker Hub.
 FROM python:3.7-slim
 
-LABEL Name=dataset-converter Version=0.2.0
+LABEL Name=dataset-converter Version=0.2.1
 LABEL maintainer="Hugo Matalonga <[email protected]>"
 
 ARG UID=1000
8 changes: 6 additions & 2 deletions app/app.py
@@ -165,8 +165,12 @@ def load_multiple(options):
 
     if not options['partition']:
         print('Merging all processed chunks')
-        # concat the list into dataframe
-        return pd.concat(chunk_list)
+        # concat the list into dataframe
+        df = None
+        while chunk_list:
+            df = pd.concat([df, chunk_list.pop(0)], ignore_index=True)
+
+        return df
 
 
 def convert_df(params):
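The change trades the single `pd.concat(chunk_list)` call for pairwise concatenation: `chunk_list.pop(0)` removes each chunk from the list before merging it, so the list drops its reference and the chunk becomes garbage-collectible as soon as it has been folded into the accumulator. This lowers peak memory at the cost of re-copying the accumulator on every iteration. A minimal self-contained sketch of the pattern (the `merge_chunks` wrapper name is hypothetical; `pd.concat` silently ignores `None` entries, so the first iteration just copies the first chunk):

```python
import pandas as pd

def merge_chunks(chunk_list):
    """Fold DataFrame chunks into one result, one chunk at a time.

    Illustrative sketch of the commit's pattern: each pop(0) releases
    the list's reference to the chunk, so it can be freed once merged.
    """
    df = None
    while chunk_list:
        # pd.concat drops None objects silently, so no special case
        # is needed for the first iteration.
        df = pd.concat([df, chunk_list.pop(0)], ignore_index=True)
    return df

chunks = [pd.DataFrame({'x': [1, 2]}), pd.DataFrame({'x': [3, 4]})]
print(merge_chunks(chunks)['x'].tolist())  # [1, 2, 3, 4]
```

Note that `ignore_index=True` rebuilds a fresh 0..n-1 index on the merged frame, which the original single-call version did not do.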
1 change: 1 addition & 0 deletions config/samples.yml.example
@@ -4,6 +4,7 @@ chunksize: 1000000
 compression: true
 partition: false
 usecols:
+  - id
   - device_id
   - timestamp
   - app_version
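For context, these config keys follow the parameter names of pandas' chunked CSV reader. A hedged sketch of how `chunksize` and `usecols` plausibly drive the chunked load (the actual wiring in `app/app.py` is not shown in this diff, and the inline sample CSV is invented):

```python
import io
import pandas as pd

# Invented sample data; note the 'extra' column that usecols filters out.
csv = io.StringIO(
    "id,device_id,timestamp,app_version,extra\n"
    "1,7,100,1.0,x\n"
    "2,8,101,1.0,y\n"
)

# chunksize makes read_csv return an iterator of DataFrame chunks;
# usecols restricts parsing to the listed columns, cutting memory.
reader = pd.read_csv(
    csv,
    chunksize=1,
    usecols=['id', 'device_id', 'timestamp', 'app_version'],
)
chunk_list = list(reader)
print(len(chunk_list))  # 2
```

Restricting `usecols` this way reduces memory before concatenation even starts, complementing the incremental merge in `load_multiple`.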

0 comments on commit f9a7f48