diff --git a/README.md b/README.md
index 8e27870..60b7e71 100644
--- a/README.md
+++ b/README.md
@@ -46,9 +46,9 @@ s3select uses the same authentication and endpoint configuration as [aws-cli](ht
 First get some help:
 
 $ s3select -h
-usage: s3select [-h] [-w WHERE] [-d DELIMITER] [-l LIMIT] [-v] [-c] [-H]
-                [-o OUTPUT_FIELDS] [-t THREAD_COUNT] [--profile PROFILE]
-                [-M MAX_RETRIES]
+usage: s3select [-h] [-w WHERE] [-d FIELD_DELIMITER] [-D RECORD_DELIMITER]
+                [-l LIMIT] [-v] [-c] [-H] [-o OUTPUT_FIELDS] [-t THREAD_COUNT]
+                [--profile PROFILE] [-M MAX_RETRIES]
                 prefixes [prefixes ...]
 
 s3select makes s3 select querying API much easier and faster
@@ -61,9 +61,14 @@ optional arguments:
   -h, --help            show this help message and exit
   -w WHERE, --where WHERE
                         WHERE part of the SQL query
-  -d DELIMITER, --delimiter DELIMITER
-                        Delimiter to be used for CSV files. If specified CSV
-                        parsing will be used. By default we expect JSON input
+  -d FIELD_DELIMITER, --field_delimiter FIELD_DELIMITER
+                        Field delimiter to be used for CSV files. If specified
+                        CSV parsing will be used. By default we expect JSON
+                        input
+  -D RECORD_DELIMITER, --record_delimiter RECORD_DELIMITER
+                        Record delimiter to be used for CSV files. If
+                        specified CSV parsing will be used. By default we
+                        expect JSON input
   -l LIMIT, --limit LIMIT
                         Maximum number of results to return
   -v, --verbose         Be more verbose
diff --git a/s3select b/s3select
index aa72858..ed6ee17 100755
--- a/s3select
+++ b/s3select
@@ -67,12 +67,14 @@ class S3ListThread(threading.Thread):
 class ScanOneKey(threading.Thread):
     def __init__(
             self, files_queue, events_queue, s3, output_fields=None, count=None,
-            delimiter=None, where=None, limit=None, max_retries=None):
+            field_delimiter=None, record_delimiter=None, where=None, limit=None,
+            max_retries=None):
         threading.Thread.__init__(self)
         self.max_retries = max_retries
         self.limit = limit
         self.where = where
-        self.delimiter = delimiter
+        self.field_delimiter = field_delimiter
+        self.record_delimiter = record_delimiter
         self.count = count
         self.output_fields = output_fields
         self.files_queue = files_queue
@@ -99,10 +101,26 @@
                 return
             input_ser = {'JSON': {"Type": "Document"}}
             output_ser = {'JSON': {}}
-            if self.delimiter is not None:
-                input_ser = {'CSV': {"FieldDelimiter": self.delimiter,
-                                     "FileHeaderInfo": "NONE"}}
-                output_ser = {'CSV': {"FieldDelimiter": self.delimiter}}
+            if self.field_delimiter is not None or \
+                    self.record_delimiter is not None:
+
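+                # fill in whichever delimiter was not given with the
+                # standard CSV defaults: "," for fields, "\n" for records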
+                if self.field_delimiter is None:
+                    self.field_delimiter = ","
+                if self.record_delimiter is None:
+                    self.record_delimiter = "\n"
+
+                input_ser = {
+                    'CSV': {
+                        "FieldDelimiter": self.field_delimiter,
+                        "FileHeaderInfo": "NONE",
+                        "RecordDelimiter": self.record_delimiter,
+                        # empty QuoteCharacter: quotes get no special treatment
+                        "QuoteCharacter": ''
+                    }
+                }
+                output_ser = {'CSV': {"FieldDelimiter": self.field_delimiter}}
 
             if self.count:
                 # no need to parse JSON if we are only expecting the count of
@@ -221,15 +237,19 @@ def refresh_status_bar(
 
 
 def select(prefixes=None, verbose=False, profile=None, thread_count=150,
-           count=False, limit=0, output_fields=None, delimiter=None,
-           where=None, max_retries=20, with_filename=False):
-
+           count=False, limit=0, output_fields=None, field_delimiter=None,
+           record_delimiter=None, where=None, max_retries=20,
+           with_filename=False):
     if prefixes is None:
         raise Exception("S3 path prefix must be defined")
 
     # shortcut as specifying \t from command line is bit tricky with all escapes
-    if delimiter is not None and "\\t" in delimiter:
-        delimiter = '\t'
+    if field_delimiter is not None and "\\t" in field_delimiter:
+        field_delimiter = '\t'
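+    # same shortcut for the record delimiter: typing a real newline on the
+    # command line is just as tricky, so translate a literal "\n"
+    if record_delimiter is not None and "\\n" in record_delimiter:
+        record_delimiter = '\n'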
 
     if profile is not None:
         boto3.setup_default_session(profile_name=profile)
@@ -249,7 +265,8 @@ def select(prefixes=None, verbose=False, profile=None, thread_count=150,
         else:
             thread = ScanOneKey(
                 files_queue, events_queue, s3, count=count, limit=limit,
-                output_fields=output_fields, delimiter=delimiter, where=where,
+                output_fields=output_fields, field_delimiter=field_delimiter,
+                record_delimiter=record_delimiter, where=where,
                 max_retries=max_retries)
 
         # daemon threads allow for fast exit if max number of records has been
@@ -351,9 +368,15 @@ if __name__ == "__main__":
     )
     parser.add_argument(
         "-d",
-        "--delimiter",
-        help="Delimiter to be used for CSV files. If specified CSV parsing will"
-             " be used. By default we expect JSON input"
+        "--field_delimiter",
+        help="Field delimiter to be used for CSV files. If specified CSV "
+             "parsing will be used. By default we expect JSON input"
+    )
+    parser.add_argument(
+        "-D",
+        "--record_delimiter",
+        help="Record delimiter to be used for CSV files. If specified CSV "
+             "parsing will be used. By default we expect JSON input"
     )
     parser.add_argument(
         "-l",