Skip to content

Commit

Permalink
merge from RBQL
Browse files Browse the repository at this point in the history
  • Loading branch information
mechatroner committed Dec 3, 2022
1 parent f3ef874 commit f41d12e
Show file tree
Hide file tree
Showing 8 changed files with 327 additions and 187 deletions.
41 changes: 22 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries.
* Use Python or JavaScript expressions inside _SELECT_, _UPDATE_, _WHERE_ and _ORDER BY_ statements
* Supports multiple input formats
* Result set of any query immediately becomes a first-class table on its own
* No need to provide FROM statement in the query - input table is defined by the current context
* No need to provide FROM statement in the query when the input table is defined by the current context.
* Supports all main SQL keywords
* Supports aggregate functions and GROUP BY queries
* Supports user-defined functions (UDF)
Expand All @@ -166,6 +166,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries.
* GROUP BY
* TOP _N_
* LIMIT _N_
* AS

All keywords have the same meaning as in SQL queries. You can check them [online](https://www.w3schools.com/sql/default.asp)

Expand Down Expand Up @@ -205,7 +206,7 @@ _COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
Limitation: aggregate functions inside Python (or JS) expressions are not supported. Although you can use expressions inside aggregate functions.
E.g. `MAX(float(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid.
There is a workaround for the limitation above for _ARRAY_AGG_ function which supports an optional parameter - a callback function that can do something with the aggregated array. Example:
`select a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) group by a2` - Python; `select a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) group by a2` - JS
`SELECT a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) GROUP BY a2` - Python; `SELECT a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) GROUP BY a2` - JS


### JOIN statements
Expand Down Expand Up @@ -249,26 +250,28 @@ You can define custom functions and/or import libraries in two special files:

#### With Python expressions

* `select top 100 a1, int(a2) * 10, len(a4) where a1 == "Buy" order by int(a2) desc`
* `select * order by random.random()` - random sort
* `select len(a.vehicle_price) / 10, a2 where int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] limit 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in"
* `update set a3 = 'NPC' where a3.find('Non-playable character') != -1`
* `select NR, *` - enumerate records, NR is 1-based
* `select * where re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern
* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query
* `select *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines!
* `SELECT TOP 100 a1, int(a2) * 10, len(a4) WHERE a1 == "Buy" ORDER BY int(a2) DESC`
* `SELECT a.id, a.weight / 1000 AS weight_kg`
* `SELECT * ORDER BY random.random()` - random sort
* `SELECT len(a.vehicle_price) / 10, a2 WHERE int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] limit 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in"
* `UPDATE SET a3 = 'NPC' WHERE a3.find('Non-playable character') != -1`
* `SELECT NR, *` - enumerate records, NR is 1-based
* `SELECT * WHERE re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern
* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
* `SELECT *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines!

#### With JavaScript expressions

* `select top 100 a1, a2 * 10, a4.length where a1 == "Buy" order by parseInt(a2) desc`
* `select * order by Math.random()` - random sort
* `select top 20 a.vehicle_price.length / 10, a2 where parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
* `update set a3 = 'NPC' where a3.indexOf('Non-playable character') != -1`
* `select NR, *` - enumerate records, NR is 1-based
* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query
* `select ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!
* `SELECT TOP 100 a1, a2 * 10, a4.length WHERE a1 == "Buy" ORDER BY parseInt(a2) DESC`
* `SELECT a.id, a.weight / 1000 AS weight_kg`
* `SELECT * ORDER BY Math.random()` - random sort
* `SELECT TOP 20 a.vehicle_price.length / 10, a2 WHERE parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
* `UPDATE SET a3 = 'NPC' WHERE a3.indexOf('Non-playable character') != -1`
* `SELECT NR, *` - enumerate records, NR is 1-based
* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
* `SELECT ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!


### References
Expand Down
37 changes: 37 additions & 0 deletions rbql-js/csv_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ function extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_ext


function split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=false) {
// This function is newline-agnostic i.e. it can also split records with multiline fields.
if (src.indexOf('"') == -1) // Optimization for most common case
return [src.split(dlm), false];
var result = [];
Expand Down Expand Up @@ -116,6 +117,41 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
}


class MultilineRecordAggregator {
constructor(comment_prefix) {
this.comment_prefix = comment_prefix;
this.reset();
}
add_line(line_text) {
if (this.has_full_record || this.has_comment_line) {
throw new Error('Invalid usage - record aggregator must be reset before adding new lines');
}
if (this.comment_prefix && this.rfc_line_buffer.length == 0 && line_text.startsWith(this.comment_prefix)) {
this.has_comment_line = true;
return false;
}
let match_list = line_text.match(/"/g);
let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
this.rfc_line_buffer.push(line_text);
this.has_full_record = (!has_unbalanced_double_quote && this.rfc_line_buffer.length == 1) || (has_unbalanced_double_quote && this.rfc_line_buffer.length > 1);
return this.has_full_record;
}
is_inside_multiline_record() {
return this.rfc_line_buffer.length && !this.has_full_record;
}
get_full_line(line_separator) {
return this.rfc_line_buffer.join(line_separator);
}
get_num_lines_in_record() {
return this.rfc_line_buffer.length;
}
reset() {
this.rfc_line_buffer = [];
this.has_full_record = false;
this.has_comment_line = false;
}
}


module.exports.split_quoted_str = split_quoted_str;
module.exports.split_whitespace_separated_str = split_whitespace_separated_str;
Expand All @@ -125,3 +161,4 @@ module.exports.rfc_quote_field = rfc_quote_field;
module.exports.unquote_field = unquote_field;
module.exports.unquote_fields = unquote_fields;
module.exports.split_lines = split_lines;
module.exports.MultilineRecordAggregator = MultilineRecordAggregator;
Loading

0 comments on commit f41d12e

Please sign in to comment.