diff --git a/README.md b/README.md index 1c14ed2..920366f 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries. * Use Python or JavaScript expressions inside _SELECT_, _UPDATE_, _WHERE_ and _ORDER BY_ statements * Supports multiple input formats * Result set of any query immediately becomes a first-class table on its own -* No need to provide FROM statement in the query - input table is defined by the current context +* No need to provide FROM statement in the query when the input table is defined by the current context. * Supports all main SQL keywords * Supports aggregate functions and GROUP BY queries * Supports user-defined functions (UDF) @@ -166,6 +166,7 @@ RBQL is distributed with CLI apps, text editor plugins, Python and JS libraries. * GROUP BY * TOP _N_ * LIMIT _N_ +* AS All keywords have the same meaning as in SQL queries. You can check them [online](https://www.w3schools.com/sql/default.asp) @@ -205,7 +206,7 @@ _COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_ Limitation: aggregate functions inside Python (or JS) expressions are not supported. Although you can use expressions inside aggregate functions. E.g. `MAX(float(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid. There is a workaround for the limitation above for _ARRAY_AGG_ function which supports an optional parameter - a callback function that can do something with the aggregated array. Example: -`select a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) group by a2` - Python; `select a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) group by a2` - JS +`SELECT a2, ARRAY_AGG(a1, lambda v: sorted(v)[:5]) GROUP BY a2` - Python; `SELECT a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) GROUP BY a2` - JS ### JOIN statements @@ -249,26 +250,28 @@ You can define custom functions and/or import libraries in two special files: #### With Python expressions -* `select top 100 a1, int(a2) * 10, len(a4) where a1 == "Buy" order by int(a2) desc` -* `select * order by random.random()` - random sort -* `select len(a.vehicle_price) / 10, a2 where int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] limit 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in" -* `update set a3 = 'NPC' where a3.find('Non-playable character') != -1` -* `select NR, *` - enumerate records, NR is 1-based -* `select * where re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern -* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query -* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query -* `select *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines! +* `SELECT TOP 100 a1, int(a2) * 10, len(a4) WHERE a1 == "Buy" ORDER BY int(a2) DESC` +* `SELECT a.id, a.weight / 1000 AS weight_kg` +* `SELECT * ORDER BY random.random()` - random sort +* `SELECT len(a.vehicle_price) / 10, a2 WHERE int(a.vehicle_price) < 500 and a['Vehicle type'] in ["car", "plane", "boat"] limit 20` - referencing columns by names from header and using Python's "in" to emulate SQL's "in" +* `UPDATE SET a3 = 'NPC' WHERE a3.find('Non-playable character') != -1` +* `SELECT NR, *` - enumerate records, NR is 1-based +* `SELECT * WHERE re.match(".*ab.*", a1) is not None` - select entries where first column has "ab" pattern +* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query +* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query +* `SELECT *a1.split(':')` - Using Python3 unpack operator to split one column into many. Do not try this with other SQL engines! #### With JavaScript expressions -* `select top 100 a1, a2 * 10, a4.length where a1 == "Buy" order by parseInt(a2) desc` -* `select * order by Math.random()` - random sort -* `select top 20 a.vehicle_price.length / 10, a2 where parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header -* `update set a3 = 'NPC' where a3.indexOf('Non-playable character') != -1` -* `select NR, *` - enumerate records, NR is 1-based -* `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query -* `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query -* `select ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines! +* `SELECT TOP 100 a1, a2 * 10, a4.length WHERE a1 == "Buy" ORDER BY parseInt(a2) DESC` +* `SELECT a.id, a.weight / 1000 AS weight_kg` +* `SELECT * ORDER BY Math.random()` - random sort +* `SELECT TOP 20 a.vehicle_price.length / 10, a2 WHERE parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header +* `UPDATE SET a3 = 'NPC' WHERE a3.indexOf('Non-playable character') != -1` +* `SELECT NR, *` - enumerate records, NR is 1-based +* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query +* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query +* `SELECT ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines! ### References diff --git a/rbql-js/csv_utils.js b/rbql-js/csv_utils.js index 2609288..a8e6dd9 100755 --- a/rbql-js/csv_utils.js +++ b/rbql-js/csv_utils.js @@ -39,6 +39,7 @@ function extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_ext function split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=false) { + // This function is newline-agnostic i.e. it can also split records with multiline fields. if (src.indexOf('"') == -1) // Optimization for most common case return [src.split(dlm), false]; var result = []; @@ -116,6 +117,41 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) { } +class MultilineRecordAggregator { + constructor(comment_prefix) { + this.comment_prefix = comment_prefix; + this.reset(); + } + add_line(line_text) { + if (this.has_full_record || this.has_comment_line) { + throw new Error('Invalid usage - record aggregator must be reset before adding new lines'); + } + if (this.comment_prefix && this.rfc_line_buffer.length == 0 && line_text.startsWith(this.comment_prefix)) { + this.has_comment_line = true; + return false; + } + let match_list = line_text.match(/"/g); + let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1; + this.rfc_line_buffer.push(line_text); + this.has_full_record = (!has_unbalanced_double_quote && this.rfc_line_buffer.length == 1) || (has_unbalanced_double_quote && this.rfc_line_buffer.length > 1); + return this.has_full_record; + } + is_inside_multiline_record() { + return this.rfc_line_buffer.length && !this.has_full_record; + } + get_full_line(line_separator) { + return this.rfc_line_buffer.join(line_separator); + } + get_num_lines_in_record() { + return this.rfc_line_buffer.length; + } + reset() { + this.rfc_line_buffer = []; + this.has_full_record = false; + this.has_comment_line = false; + } +} + module.exports.split_quoted_str = split_quoted_str; module.exports.split_whitespace_separated_str = split_whitespace_separated_str; @@ -125,3 +161,4 @@ module.exports.rfc_quote_field = rfc_quote_field; module.exports.unquote_field = unquote_field; module.exports.unquote_fields = unquote_fields; module.exports.split_lines = split_lines; +module.exports.MultilineRecordAggregator = MultilineRecordAggregator; diff --git a/rbql-js/rbql.js b/rbql-js/rbql.js index 490ec2a..3a3171b 100755 --- a/rbql-js/rbql.js +++ b/rbql-js/rbql.js @@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions. const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs'; -const RBQL_VERSION = '0.24.0'; +const RBQL_VERSION = '0.27.0'; function check_if_brackets_match(opening_bracket, closing_bracket) { @@ -128,28 +128,32 @@ function column_info_from_text_span(text_span, string_literals) { let attribute_match = /^([ab])\.([_a-zA-Z][_a-zA-Z0-9]*)$/.exec(text_span); let subscript_int_match = /^([ab])\[([0-9]+)\]$/.exec(text_span); let subscript_str_match = /^([ab])\[___RBQL_STRING_LITERAL([0-9]+)___\]$/.exec(text_span); + let as_alias_match = /^(.*) (as|AS) +([a-zA-Z][a-zA-Z0-9_]*) *$/.exec(text_span); + if (as_alias_match !== null) { + return {table_name: null, column_index: null, column_name: null, is_star: false, alias_name: as_alias_match[3]}; + } if (simple_var_match !== null) { if (text_span == rbql_star_marker) - return {table_name: null, column_index: null, column_name: null, is_star: true}; + return {table_name: null, column_index: null, column_name: null, is_star: true, alias_name: null}; if (text_span.startsWith('___RBQL_STRING_LITERAL')) return null; let match = /^([ab])([0-9]+)$/.exec(text_span); if (match !== null) { - return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false}; + return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false, alias_name: null}; } // Some examples for this branch: NR, NF - return {table_name: null, column_index: null, column_name: text_span, is_star: false}; + return {table_name: null, column_index: null, column_name: text_span, is_star: false, alias_name: null}; } else if (attribute_match !== null) { let table_name = attribute_match[1]; let column_name = attribute_match[2]; if (column_name == rbql_star_marker) { - return {table_name: table_name, column_index: null, column_name: null, is_star: true}; + return {table_name: table_name, column_index: null, column_name: null, is_star: true, alias_name: null}; } - return {table_name: null, column_index: null, column_name: column_name, is_star: false}; + return {table_name: null, column_index: null, column_name: column_name, is_star: false, alias_name: null}; } else if (subscript_int_match != null) { let table_name = subscript_int_match[1]; let column_index = parseInt(subscript_int_match[2]) - 1; - return {table_name: table_name, column_index: column_index, column_name: null, is_star: false}; + return {table_name: table_name, column_index: column_index, column_name: null, is_star: false, alias_name: null}; } else if (subscript_str_match != null) { let table_name = subscript_str_match[1]; let replaced_string_literal_id = subscript_str_match[2]; @@ -157,7 +161,7 @@ function column_info_from_text_span(text_span, string_literals) { let quoted_column_name = string_literals[replaced_string_literal_id]; let unquoted_column_name = unquote_string(quoted_column_name); if (unquoted_column_name !== null && unquoted_column_name !== undefined) { - return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false}; + return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false, alias_name: null}; } } } @@ -575,10 +579,10 @@ class TopWriter { this.top_count = top_count; } - write(record) { + async write(record) { if (this.top_count !== null && this.NW >= this.top_count) return false; - this.subwriter.write(record); + await this.subwriter.write(record); this.NW += 1; return true; } @@ -595,10 +599,10 @@ class UniqWriter { this.seen = new Set(); } - write(record) { + async write(record) { if (!add_to_set(this.seen, JSON.stringify(record))) return true; - if (!this.subwriter.write(record)) + if (!await this.subwriter.write(record)) return false; return true; } @@ -615,7 +619,7 @@ class UniqCountWriter { this.records = new Map(); } - write(record) { + async write(record) { var key = JSON.stringify(record); var old_val = this.records.get(key); if (old_val) { @@ -630,7 +634,7 @@ class UniqCountWriter { for (var [key, value] of this.records) { let [count, record] = value; record.unshift(count); - if (!this.subwriter.write(record)) + if (!await this.subwriter.write(record)) break; } await this.subwriter.finish(); @@ -645,7 +649,7 @@ class SortedWriter { this.unsorted_entries = []; } - write(stable_entry) { + async write(stable_entry) { this.unsorted_entries.push(stable_entry); return true; } @@ -657,7 +661,7 @@ class SortedWriter { unsorted_entries.reverse(); for (var i = 0; i < unsorted_entries.length; i++) { var entry = unsorted_entries[i]; - if (!this.subwriter.write(entry[entry.length - 1])) + if (!await this.subwriter.write(entry[entry.length - 1])) break; } await this.subwriter.finish(); @@ -681,7 +685,7 @@ class AggregateWriter { for (var ag of this.aggregators) { out_fields.push(ag.get_final(key)); } - if (!this.subwriter.write(out_fields)) + if (!await this.subwriter.write(out_fields)) break; } await this.subwriter.finish(); @@ -741,13 +745,13 @@ function select_except(src, except_fields) { } -function select_simple(sort_key, NR, out_fields) { +async function select_simple(sort_key, NR, out_fields) { if (query_context.sort_key_expression !== null) { var sort_entry = sort_key.concat([NR, out_fields]); - if (!query_context.writer.write(sort_entry)) + if (!await query_context.writer.write(sort_entry)) return false; } else { - if (!query_context.writer.write(out_fields)) + if (!await query_context.writer.write(out_fields)) return false; } return true; @@ -789,12 +793,12 @@ function select_aggregated(key, transparent_values) { } -function select_unnested(sort_key, NR, folded_fields) { +async function select_unnested(sort_key, NR, folded_fields) { let out_fields = folded_fields.slice(); let unnest_pos = folded_fields.findIndex(val => val instanceof UnnestMarker); for (var i = 0; i < query_context.unnest_list.length; i++) { out_fields[unnest_pos] = query_context.unnest_list[i]; - if (!select_simple(sort_key, NR, out_fields.slice())) + if (!await select_simple(sort_key, NR, out_fields.slice())) return false; } return true; @@ -811,10 +815,10 @@ if (__RBQLMP__where_expression) { } else { let sort_key = [__RBQLMP__sort_key_expression]; if (query_context.unnest_list !== null) { - if (!select_unnested(sort_key, NR, out_fields)) + if (!await select_unnested(sort_key, NR, out_fields)) stop_flag = true; } else { - if (!select_simple(sort_key, NR, out_fields)) + if (!await select_simple(sort_key, NR, out_fields)) stop_flag = true; } } @@ -855,7 +859,7 @@ if (join_matches.length == 1 && (__RBQLMP__where_expression)) { NU += 1; __RBQLMP__update_expressions } -if (!query_context.writer.write(up_fields)) +if (!await query_context.writer.write(up_fields)) stop_flag = true; `; @@ -867,7 +871,7 @@ if (__RBQLMP__where_expression) { NU += 1; __RBQLMP__update_expressions } -if (!query_context.writer.write(up_fields)) +if (!await query_context.writer.write(up_fields)) stop_flag = true; `; @@ -972,7 +976,7 @@ async function compile_and_run(query_context) { if (lower_case_query.indexOf(' like ') != -1) throw new SyntaxError(e.message + "\nRBQL doesn't support \"LIKE\" operator, use like() function instead e.g. ... WHERE like(a1, 'foo%bar') ... "); // UT JSON if (lower_case_query.indexOf(' from ') != -1) - throw new SyntaxError(e.message + "\nRBQL doesn't use \"FROM\" keyword, e.g. you can query 'SELECT *' without FROM"); // UT JSON + throw new SyntaxError(e.message + "\nTip: If input table is defined by the environment, RBQL query should not have \"FROM\" keyword"); // UT JSON if (e && e.message && String(e.message).toLowerCase().indexOf('unexpected identifier') != -1) { if (lower_case_query.indexOf(' and ') != -1) throw new SyntaxError(e.message + "\nDid you use 'and' keyword in your query?\nJavaScript backend doesn't support 'and' keyword, use '&&' operator instead!"); @@ -1255,8 +1259,8 @@ function generate_init_statements(query_text, variables_map, join_variables_map, function replace_star_count(aggregate_expression) { - var rgx = /(^|,) *COUNT\( *\* *\) *(?:$|(?=,))/ig; - var result = aggregate_expression.replace(rgx, '$1 COUNT(1)'); + var rgx = /(?:(?<=^)|(?<=,)) *COUNT\( *\* *\)/ig; + var result = aggregate_expression.replace(rgx, ' COUNT(1)'); return str_strip(result); } @@ -1328,9 +1332,11 @@ function translate_update_expression(update_expression, input_variables_map, str function translate_select_expression(select_expression) { - let expression_without_stars = replace_star_count(select_expression); - let translated = str_strip(replace_star_vars(expression_without_stars)); - let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_stars)); + let as_alias_replacement_regexp = / +(AS|as) +([a-zA-Z][a-zA-Z0-9_]*) *(?=$|,)/g; + let expression_without_counting_stars = replace_star_count(select_expression); + let expression_without_as_column_alias = expression_without_counting_stars.replace(as_alias_replacement_regexp, ''); + let translated = str_strip(replace_star_vars(expression_without_as_column_alias)); + let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_counting_stars)); if (!translated.length) throw new RbqlParsingError('"SELECT" expression is empty'); return [`[].concat([${translated}])`, translated_for_header]; @@ -1571,12 +1577,30 @@ function remove_redundant_table_name(query_text) { function select_output_header(input_header, join_header, query_column_infos) { - if (input_header === null && join_header === null) - return null; - if (input_header === null) + if (input_header === null) { + assert(join_header === null); + } + let query_has_star = false; + let query_has_column_alias = false; + for (let qci of query_column_infos) { + query_has_star = query_has_star || (qci !== null && qci.is_star); + query_has_column_alias = query_has_column_alias || (qci !== null && qci.alias_name !== null); + } + if (input_header === null) { + if (query_has_star && query_has_column_alias) { + throw new RbqlParsingError('Using both * (star) and AS alias in the same query is not allowed for input tables without header'); + } + if (!query_has_column_alias) { + // Input table has no header and query has no aliases therefore the output table will be without header. + return null; + } input_header = []; - if (join_header === null) join_header = []; + } + if (join_header === null) { + // This means there is no JOIN table. + join_header = []; + } let output_header = []; for (let qci of query_column_infos) { // TODO refactor this and python version: extract this code into a function instead to always return something @@ -1592,6 +1616,8 @@ function select_output_header(input_header, join_header, query_column_infos) { } } else if (qci.column_name !== null) { output_header.push(qci.column_name); + } else if (qci.alias_name !== null) { + output_header.push(qci.alias_name); } else if (qci.column_index !== null) { if (qci.table_name == 'a' && qci.column_index < input_header.length) { output_header.push(input_header[qci.column_index]); @@ -1608,20 +1634,20 @@ function select_output_header(input_header, join_header, query_column_infos) { } -function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) { - let keys = Object.keys(inconsistent_records_info); - let entries = []; - for (let i = 0; i < keys.length; i++) { - let key = keys[i]; - let record_id = inconsistent_records_info[key]; - entries.push([record_id, key]); - } - entries.sort(function(a, b) { return a[0] - b[0]; }); +function sample_first_two_inconsistent_records(inconsistent_records_info) { + let entries = Array.from(inconsistent_records_info.entries()); + entries.sort(function(a, b) { return a[1] - b[1]; }); assert(entries.length > 1); - let [record_1, num_fields_1] = entries[0]; - let [record_2, num_fields_2] = entries[1]; + let [num_fields_1, record_num_1] = entries[0]; + let [num_fields_2, record_num_2] = entries[1]; + return [record_num_1, num_fields_1, record_num_2, num_fields_2]; +} + + +function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) { + let [record_num_1, num_fields_1, record_num_2, num_fields_2] = sample_first_two_inconsistent_records(inconsistent_records_info); let warn_msg = `Number of fields in "${table_name}" table is not consistent: `; - warn_msg += `e.g. record ${record_1} -> ${num_fields_1} fields, record ${record_2} -> ${num_fields_2} fields`; + warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`; return warn_msg; } @@ -1652,7 +1678,7 @@ class RBQLInputIterator { class RBQLOutputWriter { constructor(){} - write(fields) { + async write(fields) { throw new Error("Unable to call the interface method"); } @@ -1691,7 +1717,7 @@ class TableIterator extends RBQLInputIterator { this.normalize_column_names = normalize_column_names; this.variable_prefix = variable_prefix; this.nr = 0; - this.fields_info = new Object(); + this.fields_info = new Map(); this.stopped = false; } @@ -1727,13 +1753,13 @@ class TableIterator extends RBQLInputIterator { let record = this.table[this.nr]; this.nr += 1; let num_fields = record.length; - if (!this.fields_info.hasOwnProperty(num_fields)) - this.fields_info[num_fields] = this.nr; + if (!this.fields_info.has(num_fields)) + this.fields_info.set(num_fields, this.nr); return record; }; get_warnings() { - if (Object.keys(this.fields_info).length > 1) + if (this.fields_info.size > 1) return [make_inconsistent_num_fields_warning('input', this.fields_info)]; return []; }; @@ -1751,7 +1777,7 @@ class TableWriter extends RBQLOutputWriter { this.header = null; } - write(fields) { + async write(fields) { this.table.push(fields); return true; }; @@ -1799,6 +1825,8 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables query_context.aggregation_key_expression = '[' + combine_string_literals(rb_actions[GROUP_BY]['text'], string_literals) + ']'; } + + let input_header = await input_iterator.get_header(); let join_variables_map = null; let join_header = null; if (rb_actions.hasOwnProperty(JOIN)) { @@ -1813,6 +1841,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables } join_variables_map = await join_record_iterator.get_variables_map(query_text); join_header = await join_record_iterator.get_header(); + if (input_header === null && join_header !== null) { + throw new RbqlIOHandlingError('Inconsistent modes: Input table doesn\'t have a header while the Join table has a header'); + } + if (input_header !== null && join_header === null) { + throw new RbqlIOHandlingError('Inconsistent modes: Input table has a header while the Join table doesn\'t have a header'); + } let [lhs_variables, rhs_indices] = resolve_join_variables(input_variables_map, join_variables_map, variable_pairs, string_literals); let sql_join_type = {'JOIN': InnerJoiner, 'INNER JOIN': InnerJoiner, 'LEFT JOIN': LeftJoiner, 'LEFT OUTER JOIN': LeftJoiner, 'STRICT LEFT JOIN': StrictLeftJoiner}[rb_actions[JOIN]['join_subtype']]; query_context.lhs_join_var_expression = lhs_variables.length == 1 ? lhs_variables[0] : 'JSON.stringify([' + lhs_variables.join(',') + '])'; @@ -1830,7 +1864,6 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables query_context.where_expression = combine_string_literals(where_expression, string_literals); } - let input_header = await input_iterator.get_header(); if (rb_actions.hasOwnProperty(UPDATE)) { var update_expression = translate_update_expression(rb_actions[UPDATE]['text'], input_variables_map, string_literals, ' '.repeat(8)); query_context.update_expressions = combine_string_literals(update_expression, string_literals); @@ -1840,13 +1873,16 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables if (rb_actions.hasOwnProperty(SELECT)) { query_context.top_count = find_top(rb_actions); if (rb_actions.hasOwnProperty(EXCEPT)) { + if (rb_actions.hasOwnProperty(JOIN)) { + throw new RbqlParsingError('EXCEPT and JOIN are not allowed in the same query'); + } let [output_header, select_expression] = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_header); query_context.select_expression = select_expression; query_context.writer.set_header(output_header); } else { - let [select_expression, select_expression_for_ast] = translate_select_expression(rb_actions[SELECT]['text']); + let [select_expression, select_expression_for_header] = translate_select_expression(rb_actions[SELECT]['text']); query_context.select_expression = combine_string_literals(select_expression, string_literals); - let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_ast, string_literals); + let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_header, string_literals); let output_header = select_output_header(input_header, join_header, column_infos); query_context.writer.set_header(output_header); } @@ -1947,5 +1983,6 @@ exports.adhoc_parse_select_expression_to_column_infos = adhoc_parse_select_expre exports.replace_star_count = replace_star_count; exports.replace_star_vars_for_header_parsing = replace_star_vars_for_header_parsing; exports.select_output_header = select_output_header; +exports.sample_first_two_inconsistent_records = sample_first_two_inconsistent_records; }(typeof exports === 'undefined' ? this.rbql = {} : exports)); diff --git a/rbql-js/rbql_csv.js b/rbql-js/rbql_csv.js index a798db0..6a1818a 100755 --- a/rbql-js/rbql_csv.js +++ b/rbql-js/rbql_csv.js @@ -62,19 +62,9 @@ function remove_utf8_bom(line, assumed_source_encoding) { function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) { - let keys = Object.keys(inconsistent_records_info); - let entries = []; - for (let i = 0; i < keys.length; i++) { - let key = keys[i]; - let record_id = inconsistent_records_info[key]; - entries.push([record_id, key]); - } - entries.sort(function(a, b) { return a[0] - b[0]; }); - assert(entries.length > 1); - let [record_1, num_fields_1] = entries[0]; - let [record_2, num_fields_2] = entries[1]; + let [record_num_1, num_fields_1, record_num_2, num_fields_2] = rbql.sample_first_two_inconsistent_records(inconsistent_records_info); let warn_msg = `Number of fields in "${table_name}" table is not consistent: `; - warn_msg += `e.g. record ${record_1} -> ${num_fields_1} fields, record ${record_2} -> ${num_fields_2} fields`; + warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`; return warn_msg; } @@ -182,7 +172,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { this.table_name = table_name; this.variable_prefix = variable_prefix; - this.comment_prefix = (comment_prefix !== null && comment_prefix.length) ? comment_prefix : null; + this.comment_prefix = comment_prefix; this.decoder = null; if (encoding == 'utf-8' && this.csv_path === null) { @@ -204,22 +194,25 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { this.utf8_bom_removed = false; // BOM doesn't get automatically removed by the decoder when utf-8 file is treated as latin-1 this.first_defective_line = null; - this.fields_info = new Object(); + this.fields_info = new Map(); this.NR = 0; // Record number this.NL = 0; // Line number (NL != NR when the CSV file has comments or multiline fields) - this.rfc_line_buffer = []; + this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix); this.partially_decoded_line = ''; this.partially_decoded_line_ends_with_cr = false; + // Holds an external "resolve" function which is called when everything is fine. this.resolve_current_record = null; + // Holds an external "reject" function which is called when error has occured. this.reject_current_record = null; + // Holds last exception if we don't have any reject callbacks from clients yet. this.current_exception = null; this.produced_records_queue = new RecordQueue(); - this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line; + this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line_simple; } @@ -236,18 +229,31 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { } - handle_exception(exception) { - if (this.reject_current_record) { + reset_external_callbacks() { + // Drop external callbacks simultaneously since promises can only resolve once, see: https://stackoverflow.com/a/18218542/2898283 + this.reject_current_record = null; + this.resolve_current_record = null; + } + + try_propagate_exception() { + if (this.current_exception && this.reject_current_record) { let reject = this.reject_current_record; - this.reject_current_record = null; - this.resolve_current_record = null; + let exception = this.current_exception; + this.reset_external_callbacks(); + this.current_exception = null; reject(exception); - } else { - this.current_exception = exception; } + } + + store_or_propagate_exception(exception) { + if (this.current_exception === null) + // Ignore subsequent exceptions if we already have an unreported error. This way we prioritize earlier errors over the more recent ones. + this.current_exception = exception; + this.try_propagate_exception(); } + async preread_first_record() { if (this.header_preread_complete) return; @@ -282,6 +288,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { try_resolve_next_record() { + this.try_propagate_exception(); if (this.resolve_current_record === null) return; @@ -296,8 +303,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { if (record === null && !this.input_exhausted) return; let resolve = this.resolve_current_record; - this.resolve_current_record = null; - this.reject_current_record = null; + this.reset_external_callbacks(); resolve(record); }; @@ -313,9 +319,6 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { parent_iterator.resolve_current_record = resolve; parent_iterator.reject_current_record = reject; }); - if (this.current_exception) { - this.reject_current_record(this.current_exception); - } this.try_resolve_next_record(); return current_record_promise; }; @@ -337,42 +340,38 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { }; - process_record_line(line) { - if (this.comment_prefix !== null && line.startsWith(this.comment_prefix)) + process_record_line_simple(line) { + if (this.comment_prefix && line.startsWith(this.comment_prefix)) return; // Just skip the line + this.process_record_line(line); + } + + + process_record_line(line) { this.NR += 1; var [record, warning] = csv_utils.smart_split(line, this.delim, this.policy, false); if (warning) { if (this.first_defective_line === null) { this.first_defective_line = this.NL; if (this.policy == 'quoted_rfc') - this.handle_exception(new RbqlIOHandlingError(`Inconsistent double quote escaping in ${this.table_name} table at record ${this.NR}, line ${this.NL}`)); + this.store_or_propagate_exception(new RbqlIOHandlingError(`Inconsistent double quote escaping in ${this.table_name} table at record ${this.NR}, line ${this.NL}`)); } } let num_fields = record.length; - if (!this.fields_info.hasOwnProperty(num_fields)) - this.fields_info[num_fields] = this.NR; + if (!this.fields_info.has(num_fields)) + this.fields_info.set(num_fields, this.NR); this.produced_records_queue.enqueue(record); this.try_resolve_next_record(); }; process_partial_rfc_record_line(line) { - if (this.comment_prefix !== null && this.rfc_line_buffer.length == 0 && line.startsWith(this.comment_prefix)) - return; // Just skip the line - let match_list = line.match(/"/g); - let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1; - if (this.rfc_line_buffer.length == 0 && !has_unbalanced_double_quote) { - this.process_record_line(line); - } else if (this.rfc_line_buffer.length == 0 && has_unbalanced_double_quote) { - this.rfc_line_buffer.push(line); - } else if (!has_unbalanced_double_quote) { - this.rfc_line_buffer.push(line); - } else { - this.rfc_line_buffer.push(line); - let multiline_row = this.rfc_line_buffer.join('\n'); - this.rfc_line_buffer = []; - this.process_record_line(multiline_row); + this.line_aggregator.add_line(line); + if (this.line_aggregator.has_comment_line) { + this.line_aggregator.reset(); + } else if (this.line_aggregator.has_full_record) { + this.process_record_line(this.line_aggregator.get_full_line('\n')); + this.line_aggregator.reset(); } }; @@ -397,9 +396,9 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { decoded_string = this.decoder.decode(data_chunk); } catch (e) { if (e instanceof TypeError) { - this.handle_exception(new RbqlIOHandlingError(utf_decoding_error)); + this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error)); } else { - this.handle_exception(e); + this.store_or_propagate_exception(e); } return; } @@ -419,14 +418,14 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { }; - process_data_bulk(data_chunk) { - let decoded_string = data_chunk.toString(this.encoding); + process_data_bulk(data_blob) { + let decoded_string = data_blob.toString(this.encoding); if (this.encoding == 'utf-8') { // Using hacky comparison method from here: https://stackoverflow.com/a/32279283/2898283 // TODO get rid of this once TextDecoder is really fixed or when alternative method of reliable decoding appears let control_buffer = Buffer.from(decoded_string, 'utf-8'); - if (Buffer.compare(data_chunk, control_buffer) != 0) { - this.handle_exception(new RbqlIOHandlingError(utf_decoding_error)); + if (Buffer.compare(data_blob, control_buffer) != 0) { + this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error)); return; } } @@ -436,8 +435,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { for (let i = 0; i < lines.length; i++) { this.process_line(lines[i]); } - if (this.rfc_line_buffer.length > 0) { - this.process_record_line(this.rfc_line_buffer.join('\n')); + if (this.line_aggregator.is_inside_multiline_record()) { + this.process_record_line(this.line_aggregator.get_full_line('\n')); } this.input_exhausted = true; this.try_resolve_next_record(); // Should be a NOOP here? @@ -451,8 +450,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { this.partially_decoded_line = ''; this.process_line(last_line); } - if (this.rfc_line_buffer.length > 0) { - this.process_record_line(this.rfc_line_buffer.join('\n')); + if (this.line_aggregator.is_inside_multiline_record()) { + this.process_record_line(this.line_aggregator.get_full_line('\n')); } this.try_resolve_next_record(); }; @@ -474,11 +473,11 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { } else { let parent_iterator = this; return new Promise(function(resolve, reject) { - fs.readFile(parent_iterator.csv_path, (err, data_chunk) => { + fs.readFile(parent_iterator.csv_path, (err, data_blob) => { if (err) { reject(err); } else { - parent_iterator.process_data_bulk(data_chunk); + parent_iterator.process_data_bulk(data_blob); resolve(); } }); @@ -493,8 +492,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { result.push(`Inconsistent double quote escaping in ${this.table_name} table. E.g. at line ${this.first_defective_line}`); if (this.utf8_bom_removed) result.push(`UTF-8 Byte Order Mark (BOM) was found and skipped in ${this.table_name} table`); - if (Object.keys(this.fields_info).length > 1) - result.push(make_inconsistent_num_fields_warning('input', this.fields_info)); + if (this.fields_info.size > 1) + result.push(make_inconsistent_num_fields_warning(this.table_name, this.fields_info)); return result; }; } @@ -507,6 +506,7 @@ class CSVWriter extends rbql.RBQLOutputWriter { this.encoding = encoding; if (encoding) this.stream.setDefaultEncoding(encoding); + this.stream.on('error', (error_obj) => { this.store_first_error(error_obj); }) this.delim = delim; this.policy = policy; this.line_separator = line_separator; @@ -517,6 +517,7 @@ class CSVWriter extends rbql.RBQLOutputWriter { this.null_in_output = false; this.delim_in_simple_output = false; this.header_len = null; + this.first_error = null; if (policy == 'simple') { this.polymorphic_join = this.simple_join; @@ -534,6 +535,12 @@ class CSVWriter extends rbql.RBQLOutputWriter { } + store_first_error(error_obj) { + // Store only first error because it is typically more important than the subsequent ones. + if (this.first_error === null) + this.first_error = error_obj; + } + set_header(header) { if (header !== null) { this.header_len = header.length; @@ -586,13 +593,20 @@ class CSVWriter extends rbql.RBQLOutputWriter { }; - write(fields) { + async write(fields) { if (this.header_len !== null && fields.length != this.header_len) throw new RbqlIOHandlingError(`Inconsistent number of columns in output header and the current record: ${this.header_len} != ${fields.length}`); this.normalize_fields(fields); this.stream.write(this.polymorphic_join(fields)); this.stream.write(this.line_separator); - return true; + let writer_error = this.first_error; + return new Promise(function(resolve, reject) { + if (writer_error !== null) { + reject(writer_error); + } else { + resolve(true); + } + }); }; @@ -607,7 +621,11 @@ class CSVWriter extends rbql.RBQLOutputWriter { let close_stream_on_finish = this.close_stream_on_finish; let output_stream = this.stream; let output_encoding = this.encoding; + let writer_error = this.first_error; let finish_promise = new Promise(function(resolve, reject) { + if (writer_error !== null) { + reject(writer_error); + } if (close_stream_on_finish) { output_stream.end('', output_encoding, () => { resolve(); }); } else { diff --git a/rbql/_version.py b/rbql/_version.py index 5554f57..0a2f27c 100644 --- a/rbql/_version.py +++ b/rbql/_version.py @@ -1,3 +1,3 @@ # Explanation of this file purpose: https://stackoverflow.com/a/16084844/2898283 -__version__ = '0.24.0' +__version__ = '0.27.0' diff --git a/rbql/csv_utils.py b/rbql/csv_utils.py index 3e7faff..c0e102a 100644 --- a/rbql/csv_utils.py +++ b/rbql/csv_utils.py @@ -34,6 +34,7 @@ def extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_external def split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=False): + # This function is newline-agnostic i.e. it can also split records with multiline fields. assert dlm != '"' if src.find('"') == -1: # Optimization for most common case return (src.split(dlm), False) diff --git a/rbql/rbql_csv.py b/rbql/rbql_csv.py index 9c0d455..307e7f4 100755 --- a/rbql/rbql_csv.py +++ b/rbql/rbql_csv.py @@ -13,24 +13,12 @@ PY3 = sys.version_info[0] == 3 - -default_csv_encoding = 'utf-8' - -user_home_dir = os.path.expanduser('~') -table_names_settings_path = os.path.join(user_home_dir, '.rbql_table_names') - - -# TODO performance improvement: replace smart_split() with polymorphic_split() - - polymorphic_xrange = range if PY3 else xrange - -debug_mode = False - - +default_csv_encoding = 'utf-8' ansi_reset_color_code = '\u001b[0m' +debug_mode = False try: broken_pipe_exception = BrokenPipeError @@ -138,6 +126,8 @@ def find_table_path(main_table_dir, table_id): candidate_path = os.path.join(main_table_dir, candidate_path) if os.path.exists(candidate_path): return candidate_path + user_home_dir = os.path.expanduser('~') + table_names_settings_path = os.path.join(user_home_dir, '.rbql_table_names') name_record = get_index_record(table_names_settings_path, table_id) if name_record is not None and len(name_record) > 1 and os.path.exists(name_record[1]): return name_record[1] diff --git a/rbql/rbql_engine.py b/rbql/rbql_engine.py index 9978600..ad13104 100755 --- a/rbql/rbql_engine.py +++ b/rbql/rbql_engine.py @@ -4,14 +4,14 @@ import sys import re -import random -import time import ast from collections import OrderedDict, defaultdict, namedtuple -import datetime # For date operations inside user queries. -import os # For system operations inside user queries. -import math # For math operations inside user queries. +import random # For usage inside user queries only. +import datetime # For usage inside user queries only. +import os # For usage inside user queries only. +import math # For usage inside user queries only. +import time # For usage inside user queries only. from ._version import __version__ @@ -42,6 +42,12 @@ ambiguous_error_msg = 'Ambiguous variable name: "{}" is present both in input and in join tables' invalid_keyword_in_aggregate_query_error_msg = '"ORDER BY", "UPDATE" and "DISTINCT" keywords are not allowed in aggregate queries' +wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside Python expressions is not allowed, see the docs' +numeric_conversion_error = 'Unable to convert value "{}" to int or float. MIN, MAX, SUM, AVG, MEDIAN and VARIANCE aggregate functions convert their string arguments to numeric values' + +PY3 = sys.version_info[0] == 3 + +RBQL_VERSION = __version__ debug_mode = False @@ -98,21 +104,11 @@ def __init__(self, input_iterator, output_writer, user_init_code): self.variables_init_code = None -RBQL_VERSION = __version__ - - -wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside Python expressions is not allowed, see the docs' -numeric_conversion_error = 'Unable to convert value "{}" to int or float. MIN, MAX, SUM, AVG, MEDIAN and VARIANCE aggregate functions convert their string arguments to numeric values' - - -PY3 = sys.version_info[0] == 3 - - def is_str6(val): return (PY3 and isinstance(val, str)) or (not PY3 and isinstance(val, basestring)) -QueryColumnInfo = namedtuple('QueryColumnInfo', ['table_name', 'column_index', 'column_name', 'is_star']) +QueryColumnInfo = namedtuple('QueryColumnInfo', ['table_name', 'column_index', 'column_name', 'is_star', 'alias_name']) def get_field(root, field_name): @@ -122,6 +118,30 @@ def get_field(root, field_name): return None +def search_for_as_alias_pseudo_function(root): + for node in ast.walk(root): + if not isinstance(node, ast.Call): + continue + func_root = get_field(node, 'func') + if not isinstance(func_root, ast.Name): + continue + func_id = get_field(func_root, 'id') + if (func_id != 'alias_column_as_pseudo_func'): + continue + # We found the function node. Since we created the node itself earlier it must have a very specific format: it is a free function call with a single id-like argument. + args_root = get_field(node, 'args') + if not args_root or len(args_root) != 1: + raise RbqlParsingError('Unable to parse "AS" column alias') # Should never happen + arg_name_node = args_root[0] + if not isinstance(arg_name_node, ast.Name): + raise RbqlParsingError('Unable to parse "AS" column alias') # Should never happen + alias_id = get_field(arg_name_node, 'id') + if not alias_id: + raise RbqlParsingError('Unable to parse "AS" column alias') # Should never happen + return alias_id + return None + + def column_info_from_node(root): rbql_star_marker = '__RBQL_INTERNAL_STAR' if isinstance(root, ast.Name): @@ -129,15 +149,15 @@ def column_info_from_node(root): if var_name is None: return None if var_name == rbql_star_marker: - return QueryColumnInfo(table_name=None, column_index=None, column_name=None, is_star=True) + return QueryColumnInfo(table_name=None, column_index=None, column_name=None, is_star=True, alias_name=None) good_column_name_rgx = '^([ab])([0-9][0-9]*)$' match_obj = re.match(good_column_name_rgx, var_name) if match_obj is not None: table_name = match_obj.group(1) column_index = int(match_obj.group(2)) - 1 - return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=None, is_star=False) + return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=None, is_star=False, alias_name=None) # Some examples for this branch: NR, NF - return QueryColumnInfo(table_name=None, column_index=None, column_name=var_name, is_star=False) + return QueryColumnInfo(table_name=None, column_index=None, column_name=var_name, is_star=False, alias_name=None) if isinstance(root, ast.Attribute): column_name = get_field(root, 'attr') if not column_name: @@ -151,8 +171,8 @@ def column_info_from_node(root): if table_name is None or table_name not in ['a', 'b']: return None if column_name == rbql_star_marker: - return QueryColumnInfo(table_name=table_name, column_index=None, column_name=None, is_star=True) - return QueryColumnInfo(table_name=None, column_index=None, column_name=column_name, is_star=False) + return QueryColumnInfo(table_name=table_name, column_index=None, column_name=None, is_star=True, alias_name=None) + return QueryColumnInfo(table_name=None, column_index=None, column_name=column_name, is_star=False, alias_name=None) if isinstance(root, ast.Subscript): var_root = get_field(root, 'value') if not isinstance(var_root, ast.Name): @@ -175,7 +195,10 @@ def column_info_from_node(root): return None if not PY3 and isinstance(column_name, str): column_name = column_name.decode('utf-8') - return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=column_name, is_star=False) + return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=column_name, is_star=False, alias_name=None) + column_alias_name = search_for_as_alias_pseudo_function(root) + if column_alias_name: + return QueryColumnInfo(table_name=None, column_index=None, column_name=None, is_star=False, alias_name=column_alias_name) return None @@ -928,7 +951,7 @@ def exception_to_error_info(e): if re.search(' like[ (]', error_msg, flags=re.IGNORECASE) is not None: error_msg += "\nRBQL doesn't support \"LIKE\" operator, use like() function instead e.g. ... WHERE like(a1, 'foo%bar') ... " # UT JSON if error_msg.lower().find(' from ') != -1: - error_msg += "\nRBQL doesn't use \"FROM\" keyword, e.g. you can query 'SELECT *' without FROM" # UT JSON + error_msg += "\nTip: If input table is defined by the environment, RBQL query should not have \"FROM\" keyword" # UT JSON return ('syntax error', error_msg) error_type = 'unexpected' error_msg = str(e) @@ -1121,7 +1144,7 @@ def generate_init_statements(query_text, variables_map, join_variables_map): def replace_star_count(aggregate_expression): - return re.sub(r'(^|(?<=,)) *COUNT\( *\* *\) *($|(?=,))', ' COUNT(1)', aggregate_expression, flags=re.IGNORECASE).lstrip(' ') + return re.sub(r'(?:(?<=^)|(?<=,)) *COUNT\( *\* *\)', ' COUNT(1)', aggregate_expression, flags=re.IGNORECASE).lstrip(' ') def replace_star_vars(rbql_expression): @@ -1178,9 +1201,19 @@ def translate_update_expression(update_expression, input_variables_map, string_l def translate_select_expression(select_expression): - expression_without_stars = replace_star_count(select_expression) - translated = replace_star_vars(expression_without_stars).strip() - translated_for_ast = replace_star_vars_for_ast(expression_without_stars).strip() + regexp_for_as_column_alias = r' +(AS|as) +([a-zA-Z][a-zA-Z0-9_]*) *(?=$|,)' + expression_without_counting_stars = replace_star_count(select_expression) + + # TODO the problem with these replaments is that they happen on global level, the right way to do this is to split the query into columns first by using stack-parsing. + # Or we can at least replace parentheses groups with literals e.g. `(.....)` -> `(PARENT_GROUP_1)` + + expression_without_as_column_alias = re.sub(regexp_for_as_column_alias, '', expression_without_counting_stars).strip() + translated = replace_star_vars(expression_without_as_column_alias).strip() + + expression_without_as_column_alias_for_ast = re.sub(regexp_for_as_column_alias, r' == alias_column_as_pseudo_func(\2)', expression_without_counting_stars).strip() + # Replace `as xyz` with `== alias_column_as_pseudo_func(xyz)` as a workaround to make it parsable to Python ast. + translated_for_ast = replace_star_vars_for_ast(expression_without_as_column_alias_for_ast).strip() + if not len(translated): raise RbqlParsingError('"SELECT" expression is empty') # UT JSON return ('[{}]'.format(translated), translated_for_ast) @@ -1376,11 +1409,23 @@ def remove_redundant_input_table_name(query_text): def select_output_header(input_header, join_header, query_column_infos): - if input_header is None and join_header is None: - return None if input_header is None: + assert join_header is None + query_has_star = False + query_has_column_alias = False + for qci in query_column_infos: + query_has_star = query_has_star or (qci is not None and qci.is_star) + query_has_column_alias = query_has_column_alias or (qci is not None and qci.alias_name is not None) + + if input_header is None: + if query_has_star and query_has_column_alias: + raise RbqlParsingError('Using both * (star) and AS alias in the same query is not allowed for input tables without header') + if not query_has_column_alias: + return None input_header = [] + join_header = [] if join_header is None: + # This means that there is no join table. join_header = [] output_header = [] for qci in query_column_infos: @@ -1395,6 +1440,8 @@ def select_output_header(input_header, join_header, query_column_infos): output_header += join_header elif qci.column_name is not None: output_header.append(qci.column_name) + elif qci.alias_name is not None: + output_header.append(qci.alias_name) elif qci.column_index is not None: if qci.table_name == 'a' and qci.column_index < len(input_header): output_header.append(input_header[qci.column_index]) @@ -1443,6 +1490,7 @@ def shallow_parse_input_query(query_text, input_iterator, tables_registry, query query_context.aggregation_key_expression = '({},)'.format(combine_string_literals(rb_actions[GROUP_BY]['text'], string_literals)) + input_header = input_iterator.get_header() join_variables_map = None join_header = None if JOIN in rb_actions: @@ -1456,8 +1504,12 @@ def shallow_parse_input_query(query_text, input_iterator, tables_registry, query join_record_iterator.handle_query_modifier(rb_actions[WITH]) join_variables_map = join_record_iterator.get_variables_map(query_text) join_header = join_record_iterator.get_header() - # TODO check ambiguous column names here instead of external check. + if input_header is None and join_header is not None: + raise RbqlIOHandlingError('Inconsistent modes: Input table doesn\'t have a header while the Join table has a header') + if input_header is not None and join_header is None: + raise RbqlIOHandlingError('Inconsistent modes: Input table has a header while the Join table doesn\'t have a header') + # TODO check ambiguous column names here instead of external check. lhs_variables, rhs_indices = resolve_join_variables(input_variables_map, join_variables_map, variable_pairs, string_literals) joiner_type = {JOIN: InnerJoiner, INNER_JOIN: InnerJoiner, LEFT_OUTER_JOIN: LeftJoiner, LEFT_JOIN: LeftJoiner, STRICT_LEFT_JOIN: StrictLeftJoiner}[rb_actions[JOIN]['join_subtype']] query_context.lhs_join_var_expression = lhs_variables[0] if len(lhs_variables) == 1 else '({})'.format(', '.join(lhs_variables)) @@ -1478,21 +1530,23 @@ def shallow_parse_input_query(query_text, input_iterator, tables_registry, query if UPDATE in rb_actions: update_expression = translate_update_expression(rb_actions[UPDATE]['text'], input_variables_map, string_literals) query_context.update_expressions = combine_string_literals(update_expression, string_literals) - query_context.writer.set_header(input_iterator.get_header()) + query_context.writer.set_header(input_header) if SELECT in rb_actions: query_context.top_count = find_top(rb_actions) if EXCEPT in rb_actions: - output_header, select_expression = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_iterator.get_header()) + if JOIN in rb_actions: + raise RbqlParsingError('EXCEPT and JOIN are not allowed in the same query') # UT JSON + output_header, select_expression = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_header) else: select_expression, select_expression_for_ast = translate_select_expression(rb_actions[SELECT]['text']) select_expression = combine_string_literals(select_expression, string_literals) # We need to add string literals back in order to have relevant errors in case of exceptions during parsing combined_select_expression_for_ast = combine_string_literals(select_expression_for_ast, string_literals) column_infos = ast_parse_select_expression_to_column_infos(combined_select_expression_for_ast) - output_header = select_output_header(input_iterator.get_header(), join_header, column_infos) + output_header = select_output_header(input_header, join_header, column_infos) query_context.select_expression = select_expression query_context.writer.set_header(output_header)