Skip to content

Commit

Permalink
merge from RBQL
Browse files Browse the repository at this point in the history
  • Loading branch information
mechatroner committed Dec 4, 2021
1 parent 09fa518 commit 2da337d
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 21 deletions.
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,7 @@ RBQL for CSV files provides the following variables which you can use in your qu

### UPDATE statement

_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query. This prevents accidental data loss from poorly written queries.
_UPDATE SET_ is a synonym for _UPDATE_, because in RBQL there is no need to specify the source table.

_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query.

### Aggregate functions and queries

Expand All @@ -214,9 +212,9 @@ There is a workaround for the limitation above for _ARRAY_AGG_ function which su

Join table B can be referenced either by its file path or by its name - an arbitrary string which the user should provide before executing the JOIN query.
RBQL supports _STRICT LEFT JOIN_ which is like _LEFT JOIN_, but generates an error if any key in the left table "A" doesn't have exactly one matching key in the right table "B".
The path to table B can be either relative to the working directory, relative to the directory of the main table, or absolute.
Limitation: _JOIN_ statements can't contain Python/JS expressions and must have the following form: _<JOIN\_KEYWORD> (/path/to/table.tsv | table_name ) ON a... == b... [AND a... == b... [AND ... ]]_


### SELECT EXCEPT statement

SELECT EXCEPT can be used to select everything except specific columns. E.g. to select everything but columns 2 and 4, run: `SELECT * EXCEPT a2, a4`
Expand Down
6 changes: 3 additions & 3 deletions rbql-js/rbql.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions


const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
const RBQL_VERSION = '0.20.0';
const RBQL_VERSION = '0.21.0';


function check_if_brackets_match(opening_bracket, closing_bracket) {
Expand Down Expand Up @@ -156,7 +156,7 @@ function column_info_from_text_span(text_span, string_literals) {
if (replaced_string_literal_id < string_literals.length) {
let quoted_column_name = string_literals[replaced_string_literal_id];
let unquoted_column_name = unquote_string(quoted_column_name);
if (unquoted_column_name) {
if (unquoted_column_name !== null && unquoted_column_name !== undefined) {
return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false};
}
}
Expand Down Expand Up @@ -1783,12 +1783,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
query_text = cleanup_query(query_text);
var [format_expression, string_literals] = separate_string_literals(query_text);
format_expression = remove_redundant_table_name(format_expression);
var input_variables_map = await input_iterator.get_variables_map(query_text);

var rb_actions = separate_actions(format_expression);
if (rb_actions.hasOwnProperty(WITH)) {
input_iterator.handle_query_modifier(rb_actions[WITH]);
}
var input_variables_map = await input_iterator.get_variables_map(query_text);

if (rb_actions.hasOwnProperty(ORDER_BY) && rb_actions.hasOwnProperty(UPDATE))
throw new RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries');
Expand Down
20 changes: 14 additions & 6 deletions rbql-js/rbql_csv.js
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,18 @@ function get_index_record(index_path, key) {
}


function find_table_path(table_id) {
function find_table_path(main_table_dir, table_id) {
// If table_id is a relative path it could be relative either to the current directory or to the main table dir.
var candidate_path = expanduser(table_id);
if (fs.existsSync(candidate_path)) {
return candidate_path;
}
if (main_table_dir && !path.isAbsolute(candidate_path)) {
candidate_path = path.join(main_table_dir, candidate_path);
if (fs.existsSync(candidate_path)) {
return candidate_path;
}
}
let table_names_settings_path = path.join(os.homedir(), '.rbql_table_names');
var name_record = get_index_record(table_names_settings_path, table_id);
if (name_record && name_record.length > 1 && fs.existsSync(name_record[1])) {
Expand Down Expand Up @@ -261,7 +268,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
rbql.parse_array_variables(query_text, this.variable_prefix, variable_map);

await this.preread_first_record();
if (this.first_record) {
if (this.has_header && this.first_record) {
rbql.parse_attribute_variables(query_text, this.variable_prefix, this.first_record, 'CSV header line', variable_map);
rbql.parse_dictionary_variables(query_text, this.variable_prefix, this.first_record, variable_map);
}
Expand Down Expand Up @@ -624,8 +631,9 @@ class CSVWriter extends rbql.RBQLOutputWriter {


class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
constructor(delim, policy, encoding, has_header=false, comment_prefix=null, options=null) {
constructor(input_file_dir, delim, policy, encoding, has_header=false, comment_prefix=null, options=null) {
super();
this.input_file_dir = input_file_dir;
this.delim = delim;
this.policy = policy;
this.encoding = encoding;
Expand All @@ -640,7 +648,7 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
}

get_iterator_by_table_id(table_id) {
this.table_path = find_table_path(table_id);
this.table_path = find_table_path(this.input_file_dir, table_id);
if (this.table_path === null) {
throw new RbqlIOHandlingError(`Unable to find join table "${table_id}"`);
}
Expand Down Expand Up @@ -683,8 +691,8 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
if (user_init_code == '' && fs.existsSync(default_init_source_path)) {
user_init_code = read_user_init_code(default_init_source_path);
}

let join_tables_registry = new FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
let input_file_dir = input_path ? path.dirname(input_path) : null;
let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix);
let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy);

Expand Down
2 changes: 1 addition & 1 deletion rbql/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Explanation of this file purpose: https://stackoverflow.com/a/16084844/2898283
__version__ = '0.20.0'
__version__ = '0.21.0'

17 changes: 12 additions & 5 deletions rbql/rbql_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,15 @@ def get_index_record(index_path, key):
return None


def find_table_path(table_id):
def find_table_path(main_table_dir, table_id):
# If table_id is a relative path it could be relative either to the current directory or to the main table dir.
candidate_path = os.path.expanduser(table_id)
if os.path.exists(candidate_path):
return candidate_path
if main_table_dir and not os.path.isabs(candidate_path):
candidate_path = os.path.join(main_table_dir, candidate_path)
if os.path.exists(candidate_path):
return candidate_path
name_record = get_index_record(table_names_settings_path, table_id)
if name_record is not None and len(name_record) > 1 and os.path.exists(name_record[1]):
return name_record[1]
Expand Down Expand Up @@ -378,7 +383,7 @@ def get_variables_map(self, query_text):
variable_map = dict()
rbql_engine.parse_basic_variables(query_text, self.variable_prefix, variable_map)
rbql_engine.parse_array_variables(query_text, self.variable_prefix, variable_map)
if self.first_record is not None:
if self.has_header and self.first_record is not None:
rbql_engine.parse_attribute_variables(query_text, self.variable_prefix, self.first_record, 'CSV header line', variable_map)
rbql_engine.parse_dictionary_variables(query_text, self.variable_prefix, self.first_record, variable_map)
return variable_map
Expand Down Expand Up @@ -514,7 +519,8 @@ def get_warnings(self):


class FileSystemCSVRegistry(rbql_engine.RBQLTableRegistry):
def __init__(self, delim, policy, encoding, has_header, comment_prefix):
def __init__(self, input_file_dir, delim, policy, encoding, has_header, comment_prefix):
self.input_file_dir = input_file_dir
self.delim = delim
self.policy = policy
self.encoding = encoding
Expand All @@ -525,7 +531,7 @@ def __init__(self, delim, policy, encoding, has_header, comment_prefix):
self.table_path = None

def get_iterator_by_table_id(self, table_id):
self.table_path = find_table_path(table_id)
self.table_path = find_table_path(self.input_file_dir, table_id)
if self.table_path is None:
raise rbql_engine.RbqlIOHandlingError('Unable to find join table "{}"'.format(table_id))
self.input_stream = open(self.table_path, 'rb')
Expand Down Expand Up @@ -566,7 +572,8 @@ def query_csv(query_text, input_path, input_delim, input_policy, output_path, ou
if user_init_code == '' and os.path.exists(default_init_source_path):
user_init_code = read_user_init_code(default_init_source_path)

join_tables_registry = FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix)
input_file_dir = None if not input_path else os.path.dirname(input_path)
join_tables_registry = FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix)
input_iterator = CSVRecordIterator(input_stream, csv_encoding, input_delim, input_policy, with_headers, comment_prefix=comment_prefix)
output_writer = CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy, colorize_output=colorize_output)
if debug_mode:
Expand Down
6 changes: 4 additions & 2 deletions rbql/rbql_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

# TODO support 'AS' keyword

# FIXME consider disallowing the use of values from the first row when header is not enabled (only a1, a2, ... should be allowed) and, vice versa, disallowing a1, a2, etc. when header is enabled. This is to make sure that the user knows what query mode they are in.
# TODO Consider disabling a1, a2 etc variables when header is enabled. This is to make sure that the user knows what query mode they are in.


GROUP_BY = 'GROUP BY'
Expand Down Expand Up @@ -205,6 +205,8 @@ def column_info_from_node(root):
column_index = get_field(slice_val_root, 'n') - 1
else:
return None
if not PY3 and isinstance(column_name, str):
column_name = column_name.decode('utf-8')
return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=column_name, is_star=False)
return None

Expand Down Expand Up @@ -1459,10 +1461,10 @@ def shallow_parse_input_query(query_text, input_iterator, join_tables_registry,
query_text = cleanup_query(query_text)
format_expression, string_literals = separate_string_literals(query_text)
format_expression = remove_redundant_input_table_name(format_expression)
input_variables_map = input_iterator.get_variables_map(query_text)
rb_actions = separate_actions(format_expression)
if WITH in rb_actions:
input_iterator.handle_query_modifier(rb_actions[WITH])
input_variables_map = input_iterator.get_variables_map(query_text)

if ORDER_BY in rb_actions and UPDATE in rb_actions:
raise RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries') # UT JSON
Expand Down

0 comments on commit 2da337d

Please sign in to comment.