merge from RBQL

mechatroner · Dec 3, 2022 · f41d12e · f41d12e
1 parent f3ef874
commit f41d12e
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 187 deletions.
diff --git a/README.md b/README.md
@@ -143,7 +143,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries.
 * Use Python or JavaScript expressions inside _SELECT_, _UPDATE_, _WHERE_ and _ORDER BY_ statements
 * Supports multiple input formats
 * Result set of any query immediately becomes a first-class table on its own
-* No need to provide FROM statement in the query - input table is defined by the current context
+* No need to provide a FROM statement in the query when the input table is defined by the current context
 * Supports all main SQL keywords
 * Supports aggregate functions and GROUP BY queries
 * Supports user-defined functions (UDF)
@@ -166,6 +166,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries.
 * GROUP BY
 * TOP _N_
 * LIMIT _N_
+* AS
 
 All keywords have the same meaning as in SQL queries. You can check them [online](https://www.w3schools.com/sql/default.asp)  
 
@@ -205,7 +206,7 @@ _COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
 Limitation: aggregate functions inside Python (or JS) expressions are not supported. Although you can use expressions inside aggregate functions.  
 E.g. `MAX(float(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid.  
 There is a workaround for the limitation above for _ARRAY_AGG_ function which supports an optional parameter - a callback function that can do something with the aggregated array. Example:  
-`select a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) group by a2` - Python; `select a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) group by a2` - JS
+`SELECT a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) GROUP BY a2` - Python; `SELECT a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) GROUP BY a2` - JS
 
 
 ### JOIN statements
@@ -249,26 +250,28 @@ You can define custom functions and/or import libraries in two special files:
 
 #### With Python expressions
 
-* `select top 100 a1, int(a2) * 10, len(a4) where a1 == "Buy" order by int(a2) desc`
-* `select * order by random.random()` - random sort
-* `select len(a.vehicle_price) / 10, a2 where int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] limit 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in"
-* `update set a3 = 'NPC' where a3.find('Non-playable character') != -1`
-* `select NR, *` - enumerate records, NR is 1-based
-* `select * where re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern
-* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
-* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query
-* `select *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines!
+* `SELECT TOP 100 a1, int(a2) * 10, len(a4) WHERE a1 == "Buy" ORDER BY int(a2) DESC`
+* `SELECT a.id, a.weight / 1000 AS weight_kg`
+* `SELECT * ORDER BY random.random()` - random sort
+* `SELECT len(a.vehicle_price) / 10, a2 WHERE int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] LIMIT 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in"
+* `UPDATE SET a3 = 'NPC' WHERE a3.find('Non-playable character') != -1`
+* `SELECT NR, *` - enumerate records, NR is 1-based
+* `SELECT * WHERE re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern
+* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
+* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
+* `SELECT *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines!
 
 #### With JavaScript expressions
 
-* `select top 100 a1, a2 * 10, a4.length where a1 == "Buy" order by parseInt(a2) desc`
-* `select * order by Math.random()` - random sort
-* `select top 20 a.vehicle_price.length / 10, a2 where parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
-* `update set a3 = 'NPC' where a3.indexOf('Non-playable character') != -1`
-* `select NR, *` - enumerate records, NR is 1-based
-* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
-* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query
-* `select ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!
+* `SELECT TOP 100 a1, a2 * 10, a4.length WHERE a1 == "Buy" ORDER BY parseInt(a2) DESC`
+* `SELECT a.id, a.weight / 1000 AS weight_kg`
+* `SELECT * ORDER BY Math.random()` - random sort
+* `SELECT TOP 20 a.vehicle_price.length / 10, a2 WHERE parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 LIMIT 20` - referencing columns by names from header
+* `UPDATE SET a3 = 'NPC' WHERE a3.indexOf('Non-playable character') != -1`
+* `SELECT NR, *` - enumerate records, NR is 1-based
+* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
+* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
+* `SELECT ...a1.split(':')` - Using JS "spread" syntax to split one column into many. Do not try this with other SQL engines!
 
 
 ### References

diff --git a/rbql-js/csv_utils.js b/rbql-js/csv_utils.js
@@ -39,6 +39,7 @@ function extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_ext
 
 
 function split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=false) {
+    // This function is newline-agnostic i.e. it can also split records with multiline fields.
     if (src.indexOf('"') == -1) // Optimization for most common case
         return [src.split(dlm), false];
     var result = [];
@@ -116,6 +117,41 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
 }
 
 
+class MultilineRecordAggregator { // Accumulates consecutive text lines until they form one complete record, judging completeness by double-quote parity.
+    constructor(comment_prefix) { // comment_prefix: lines starting with it are reported as comment lines; a falsy value disables that check.
+        this.comment_prefix = comment_prefix;
+        this.reset(); // Initializes rfc_line_buffer, has_full_record and has_comment_line.
+    }
+    add_line(line_text) { // Feeds the next line; the result is truthy once the buffered lines make up a full record. Throws Error if reset() was not called after a finished record or a comment line.
+        if (this.has_full_record || this.has_comment_line) { // The previous outcome has to be consumed and cleared with reset() first.
+            throw new Error('Invalid usage - record aggregator must be reset before adding new lines');
+        }
+        if (this.comment_prefix && this.rfc_line_buffer.length == 0 && line_text.startsWith(this.comment_prefix)) { // Only checked while the buffer is empty, so a line inside an open multiline record is never treated as a comment.
+            this.has_comment_line = true; // The comment line itself is not stored in the buffer.
+            return false;
+        }
+        let match_list = line_text.match(/"/g); // null when the line contains no double quotes.
+        let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1; // Odd quote count on this line; stays null when the line has no quotes at all.
+        this.rfc_line_buffer.push(line_text);
+        this.has_full_record = (!has_unbalanced_double_quote && this.rfc_line_buffer.length == 1) || (has_unbalanced_double_quote && this.rfc_line_buffer.length > 1); // Complete when a lone line has balanced quotes, or when a later line has an odd quote count. Evaluates to null (falsy) rather than false for a quote-free line added after the first.
+        return this.has_full_record;
+    }
+    is_inside_multiline_record() { // Truthy while lines are buffered but the record is still incomplete; yields 0 (not false) when the buffer is empty.
+        return this.rfc_line_buffer.length && !this.has_full_record;
+    }
+    get_full_line(line_separator) { // Returns all buffered lines joined with line_separator.
+        return this.rfc_line_buffer.join(line_separator);
+    }
+    get_num_lines_in_record() { // Number of lines buffered since the last reset().
+        return this.rfc_line_buffer.length;
+    }
+    reset() { // Drops the buffered lines and clears both status flags so the next record can be started.
+        this.rfc_line_buffer = [];
+        this.has_full_record = false;
+        this.has_comment_line = false;
+    }
+}
+
 
 module.exports.split_quoted_str = split_quoted_str;
 module.exports.split_whitespace_separated_str = split_whitespace_separated_str;
@@ -125,3 +161,4 @@ module.exports.rfc_quote_field = rfc_quote_field;
 module.exports.unquote_field = unquote_field;
 module.exports.unquote_fields = unquote_fields;
 module.exports.split_lines = split_lines;
+module.exports.MultilineRecordAggregator = MultilineRecordAggregator;