diff --git a/README.md b/README.md index 5b05781..1c14ed2 100644 --- a/README.md +++ b/README.md @@ -195,9 +195,7 @@ RBQL for CSV files provides the following variables which you can use in your qu ### UPDATE statement -_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query. This prevents accidental data loss from poorly written queries. -_UPDATE SET_ is synonym to _UPDATE_, because in RBQL there is no need to specify the source table. - +_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query. ### Aggregate functions and queries @@ -214,9 +212,9 @@ There is a workaround for the limitation above for _ARRAY_AGG_ function which su Join table B can be referenced either by its file path or by its name - an arbitrary string which the user should provide before executing the JOIN query. RBQL supports _STRICT LEFT JOIN_ which is like _LEFT JOIN_, but generates an error if any key in the left table "A" doesn't have exactly one matching key in the right table "B". +The path to table B can be relative to the working directory, relative to the directory of the main table, or absolute. Limitation: _JOIN_ statements can't contain Python/JS expressions and must have the following form: _ (/path/to/table.tsv | table_name ) ON a... == b... [AND a... == b... [AND ... ]]_ - ### SELECT EXCEPT statement SELECT EXCEPT can be used to select everything except specific columns. E.g. 
to select everything but columns 2 and 4, run: `SELECT * EXCEPT a2, a4` diff --git a/rbql-js/rbql.js b/rbql-js/rbql.js index 8ab6e63..be32704 100755 --- a/rbql-js/rbql.js +++ b/rbql-js/rbql.js @@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs'; -const RBQL_VERSION = '0.20.0'; +const RBQL_VERSION = '0.21.0'; function check_if_brackets_match(opening_bracket, closing_bracket) { @@ -156,7 +156,7 @@ function column_info_from_text_span(text_span, string_literals) { if (replaced_string_literal_id < string_literals.length) { let quoted_column_name = string_literals[replaced_string_literal_id]; let unquoted_column_name = unquote_string(quoted_column_name); - if (unquoted_column_name) { + if (unquoted_column_name !== null && unquoted_column_name !== undefined) { return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false}; } } @@ -1783,12 +1783,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables query_text = cleanup_query(query_text); var [format_expression, string_literals] = separate_string_literals(query_text); format_expression = remove_redundant_table_name(format_expression); - var input_variables_map = await input_iterator.get_variables_map(query_text); var rb_actions = separate_actions(format_expression); if (rb_actions.hasOwnProperty(WITH)) { input_iterator.handle_query_modifier(rb_actions[WITH]); } + var input_variables_map = await input_iterator.get_variables_map(query_text); if (rb_actions.hasOwnProperty(ORDER_BY) && rb_actions.hasOwnProperty(UPDATE)) throw new RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries'); diff --git a/rbql-js/rbql_csv.js b/rbql-js/rbql_csv.js index 8f202f0..a798db0 100755 --- a/rbql-js/rbql_csv.js +++ b/rbql-js/rbql_csv.js @@ -117,11 +117,18 @@ function 
get_index_record(index_path, key) { } -function find_table_path(table_id) { +function find_table_path(main_table_dir, table_id) { + // If table_id is a relative path it could be relative either to the current directory or to the main table dir. var candidate_path = expanduser(table_id); if (fs.existsSync(candidate_path)) { return candidate_path; } + if (main_table_dir && !path.isAbsolute(candidate_path)) { + candidate_path = path.join(main_table_dir, candidate_path); + if (fs.existsSync(candidate_path)) { + return candidate_path; + } + } let table_names_settings_path = path.join(os.homedir(), '.rbql_table_names'); var name_record = get_index_record(table_names_settings_path, table_id); if (name_record && name_record.length > 1 && fs.existsSync(name_record[1])) { @@ -261,7 +268,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator { rbql.parse_array_variables(query_text, this.variable_prefix, variable_map); await this.preread_first_record(); - if (this.first_record) { + if (this.has_header && this.first_record) { rbql.parse_attribute_variables(query_text, this.variable_prefix, this.first_record, 'CSV header line', variable_map); rbql.parse_dictionary_variables(query_text, this.variable_prefix, this.first_record, variable_map); } @@ -624,8 +631,9 @@ class CSVWriter extends rbql.RBQLOutputWriter { class FileSystemCSVRegistry extends rbql.RBQLTableRegistry { - constructor(delim, policy, encoding, has_header=false, comment_prefix=null, options=null) { + constructor(input_file_dir, delim, policy, encoding, has_header=false, comment_prefix=null, options=null) { super(); + this.input_file_dir = input_file_dir; this.delim = delim; this.policy = policy; this.encoding = encoding; @@ -640,7 +648,7 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry { } get_iterator_by_table_id(table_id) { - this.table_path = find_table_path(table_id); + this.table_path = find_table_path(this.input_file_dir, table_id); if (this.table_path === null) { throw new 
RbqlIOHandlingError(`Unable to find join table "${table_id}"`); } @@ -683,8 +691,8 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp if (user_init_code == '' && fs.existsSync(default_init_source_path)) { user_init_code = read_user_init_code(default_init_source_path); } - - let join_tables_registry = new FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options); + let input_file_dir = input_path ? path.dirname(input_path) : null; + let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options); let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix); let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy); diff --git a/rbql/_version.py b/rbql/_version.py index ae41f67..1a955db 100644 --- a/rbql/_version.py +++ b/rbql/_version.py @@ -1,3 +1,3 @@ # Explanation of this file purpose: https://stackoverflow.com/a/16084844/2898283 -__version__ = '0.20.0' +__version__ = '0.21.0' diff --git a/rbql/rbql_csv.py b/rbql/rbql_csv.py index 03b4972..3aa1a9c 100755 --- a/rbql/rbql_csv.py +++ b/rbql/rbql_csv.py @@ -129,10 +129,15 @@ def get_index_record(index_path, key): return None -def find_table_path(table_id): +def find_table_path(main_table_dir, table_id): + # If table_id is a relative path it could be relative either to the current directory or to the main table dir. 
candidate_path = os.path.expanduser(table_id) if os.path.exists(candidate_path): return candidate_path + if main_table_dir and not os.path.isabs(candidate_path): + candidate_path = os.path.join(main_table_dir, candidate_path) + if os.path.exists(candidate_path): + return candidate_path name_record = get_index_record(table_names_settings_path, table_id) if name_record is not None and len(name_record) > 1 and os.path.exists(name_record[1]): return name_record[1] @@ -378,7 +383,7 @@ def get_variables_map(self, query_text): variable_map = dict() rbql_engine.parse_basic_variables(query_text, self.variable_prefix, variable_map) rbql_engine.parse_array_variables(query_text, self.variable_prefix, variable_map) - if self.first_record is not None: + if self.has_header and self.first_record is not None: rbql_engine.parse_attribute_variables(query_text, self.variable_prefix, self.first_record, 'CSV header line', variable_map) rbql_engine.parse_dictionary_variables(query_text, self.variable_prefix, self.first_record, variable_map) return variable_map @@ -514,7 +519,8 @@ def get_warnings(self): class FileSystemCSVRegistry(rbql_engine.RBQLTableRegistry): - def __init__(self, delim, policy, encoding, has_header, comment_prefix): + def __init__(self, input_file_dir, delim, policy, encoding, has_header, comment_prefix): + self.input_file_dir = input_file_dir self.delim = delim self.policy = policy self.encoding = encoding @@ -525,7 +531,7 @@ def __init__(self, delim, policy, encoding, has_header, comment_prefix): self.table_path = None def get_iterator_by_table_id(self, table_id): - self.table_path = find_table_path(table_id) + self.table_path = find_table_path(self.input_file_dir, table_id) if self.table_path is None: raise rbql_engine.RbqlIOHandlingError('Unable to find join table "{}"'.format(table_id)) self.input_stream = open(self.table_path, 'rb') @@ -566,7 +572,8 @@ def query_csv(query_text, input_path, input_delim, input_policy, output_path, ou if user_init_code == '' and 
os.path.exists(default_init_source_path): user_init_code = read_user_init_code(default_init_source_path) - join_tables_registry = FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix) + input_file_dir = None if not input_path else os.path.dirname(input_path) + join_tables_registry = FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix) input_iterator = CSVRecordIterator(input_stream, csv_encoding, input_delim, input_policy, with_headers, comment_prefix=comment_prefix) output_writer = CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy, colorize_output=colorize_output) if debug_mode: diff --git a/rbql/rbql_engine.py b/rbql/rbql_engine.py index 98a5a93..91c3e3b 100755 --- a/rbql/rbql_engine.py +++ b/rbql/rbql_engine.py @@ -52,7 +52,7 @@ # TODO support 'AS' keyword -# FIXME consider disallowing to use values in the first row when header is not enabled (only a1, a2, ... should be allowed) and vice versa - Don't allow a1, a2 etc when header is enabled. This is to make sure that the user knows what query mode they are in. +# TODO Consider disabling a1, a2 etc variables when header is enabled. This is to make sure that the user knows what query mode they are in. 
GROUP_BY = 'GROUP BY' @@ -205,6 +205,8 @@ def column_info_from_node(root): column_index = get_field(slice_val_root, 'n') - 1 else: return None + if not PY3 and isinstance(column_name, str): + column_name = column_name.decode('utf-8') return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=column_name, is_star=False) return None @@ -1459,10 +1461,10 @@ def shallow_parse_input_query(query_text, input_iterator, join_tables_registry, query_text = cleanup_query(query_text) format_expression, string_literals = separate_string_literals(query_text) format_expression = remove_redundant_input_table_name(format_expression) - input_variables_map = input_iterator.get_variables_map(query_text) rb_actions = separate_actions(format_expression) if WITH in rb_actions: input_iterator.handle_query_modifier(rb_actions[WITH]) + input_variables_map = input_iterator.get_variables_map(query_text) if ORDER_BY in rb_actions and UPDATE in rb_actions: raise RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries') # UT JSON