merge from RBQL

mechatroner · Dec 4, 2021 · 2da337d · 2da337d
1 parent 09fa518
commit 2da337d
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -195,9 +195,7 @@ RBQL for CSV files provides the following variables which you can use in your qu
 
 ### UPDATE statement
 
-_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query. This prevents accidental data loss from poorly written queries.  
-_UPDATE SET_ is synonym to _UPDATE_, because in RBQL there is no need to specify the source table.  
-
+_UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query.
 
 ### Aggregate functions and queries
 
@@ -214,9 +212,9 @@ There is a workaround for the limitation above for _ARRAY_AGG_ function which su
 
 Join table B can be referenced either by its file path or by its name - an arbitrary string which the user should provide before executing the JOIN query.  
 RBQL supports _STRICT LEFT JOIN_ which is like _LEFT JOIN_, but generates an error if any key in the left table "A" doesn't have exactly one matching key in the right table "B".  
+The path to table B can be absolute, relative to the current working directory, or relative to the directory that contains the main table.  
 Limitation: _JOIN_ statements can't contain Python/JS expressions and must have the following form: _<JOIN\_KEYWORD> (/path/to/table.tsv | table_name ) ON a... == b... [AND a... == b... [AND ... ]]_
 
-
 ### SELECT EXCEPT statement
 
 SELECT EXCEPT can be used to select everything except specific columns. E.g. to select everything but columns 2 and 4, run: `SELECT * EXCEPT a2, a4`  

diff --git a/rbql-js/rbql.js b/rbql-js/rbql.js
@@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions
 
 
 const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
-const RBQL_VERSION = '0.20.0';
+const RBQL_VERSION = '0.21.0';
 
 
 function check_if_brackets_match(opening_bracket, closing_bracket) {
@@ -156,7 +156,7 @@ function column_info_from_text_span(text_span, string_literals) {
         if (replaced_string_literal_id < string_literals.length) {
             let quoted_column_name = string_literals[replaced_string_literal_id];
             let unquoted_column_name = unquote_string(quoted_column_name);
-            if (unquoted_column_name) {
+            if (unquoted_column_name !== null && unquoted_column_name !== undefined) {
                 return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false};
             }
         }
@@ -1783,12 +1783,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
     query_text = cleanup_query(query_text);
     var [format_expression, string_literals] = separate_string_literals(query_text);
     format_expression = remove_redundant_table_name(format_expression);
-    var input_variables_map = await input_iterator.get_variables_map(query_text);
 
     var rb_actions = separate_actions(format_expression);
     if (rb_actions.hasOwnProperty(WITH)) {
         input_iterator.handle_query_modifier(rb_actions[WITH]);
     }
+    var input_variables_map = await input_iterator.get_variables_map(query_text);
 
     if (rb_actions.hasOwnProperty(ORDER_BY) && rb_actions.hasOwnProperty(UPDATE))
         throw new RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries');

diff --git a/rbql-js/rbql_csv.js b/rbql-js/rbql_csv.js
@@ -117,11 +117,18 @@ function get_index_record(index_path, key) {
 }
 
 
-function find_table_path(table_id) {
+function find_table_path(main_table_dir, table_id) {
+    // If table_id is a relative path, it may be relative either to the current working directory or to the main table's directory.
     var candidate_path = expanduser(table_id);
     if (fs.existsSync(candidate_path)) {
         return candidate_path;
     }
+    if (main_table_dir && !path.isAbsolute(candidate_path)) {
+        candidate_path = path.join(main_table_dir, candidate_path);
+        if (fs.existsSync(candidate_path)) {
+            return candidate_path;
+        }
+    }
     let table_names_settings_path = path.join(os.homedir(), '.rbql_table_names');
     var name_record = get_index_record(table_names_settings_path, table_id);
     if (name_record && name_record.length > 1 && fs.existsSync(name_record[1])) {
@@ -261,7 +268,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         rbql.parse_array_variables(query_text, this.variable_prefix, variable_map);
 
         await this.preread_first_record();
-        if (this.first_record) {
+        if (this.has_header && this.first_record) {
             rbql.parse_attribute_variables(query_text, this.variable_prefix, this.first_record, 'CSV header line', variable_map);
             rbql.parse_dictionary_variables(query_text, this.variable_prefix, this.first_record, variable_map);
         }
@@ -624,8 +631,9 @@ class CSVWriter extends rbql.RBQLOutputWriter {
 
 
 class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
-    constructor(delim, policy, encoding, has_header=false, comment_prefix=null, options=null) {
+    constructor(input_file_dir, delim, policy, encoding, has_header=false, comment_prefix=null, options=null) {
         super();
+        this.input_file_dir = input_file_dir;
         this.delim = delim;
         this.policy = policy;
         this.encoding = encoding;
@@ -640,7 +648,7 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
     }
 
     get_iterator_by_table_id(table_id) {
-        this.table_path = find_table_path(table_id);
+        this.table_path = find_table_path(this.input_file_dir, table_id);
         if (this.table_path === null) {
             throw new RbqlIOHandlingError(`Unable to find join table "${table_id}"`);
         }
@@ -683,8 +691,8 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
     if (user_init_code == '' && fs.existsSync(default_init_source_path)) {
         user_init_code = read_user_init_code(default_init_source_path);
     }
-
-    let join_tables_registry = new FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
+    let input_file_dir = input_path ? path.dirname(input_path) : null;
+    let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
     let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix);
     let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy);
 

diff --git a/rbql/_version.py b/rbql/_version.py
@@ -1,3 +1,3 @@
 # Explanation of this file purpose: https://stackoverflow.com/a/16084844/2898283
-__version__ = '0.20.0'
+__version__ = '0.21.0'
 
diff --git a/rbql/rbql_csv.py b/rbql/rbql_csv.py
@@ -129,10 +129,15 @@ def get_index_record(index_path, key):
     return None
 
 
-def find_table_path(table_id):
+def find_table_path(main_table_dir, table_id):
+    # If table_id is a relative path, it may be relative either to the current working directory or to the main table's directory.
     candidate_path = os.path.expanduser(table_id)
     if os.path.exists(candidate_path):
         return candidate_path
+    if main_table_dir and not os.path.isabs(candidate_path):
+        candidate_path = os.path.join(main_table_dir, candidate_path)
+        if os.path.exists(candidate_path):
+            return candidate_path
     name_record = get_index_record(table_names_settings_path, table_id)
     if name_record is not None and len(name_record) > 1 and os.path.exists(name_record[1]):
         return name_record[1]
@@ -378,7 +383,7 @@ def get_variables_map(self, query_text):
         variable_map = dict()
         rbql_engine.parse_basic_variables(query_text, self.variable_prefix, variable_map)
         rbql_engine.parse_array_variables(query_text, self.variable_prefix, variable_map)
-        if self.first_record is not None:
+        if self.has_header and self.first_record is not None:
             rbql_engine.parse_attribute_variables(query_text, self.variable_prefix, self.first_record, 'CSV header line', variable_map)
             rbql_engine.parse_dictionary_variables(query_text, self.variable_prefix, self.first_record, variable_map)
         return variable_map
@@ -514,7 +519,8 @@ def get_warnings(self):
 
 
 class FileSystemCSVRegistry(rbql_engine.RBQLTableRegistry):
-    def __init__(self, delim, policy, encoding, has_header, comment_prefix):
+    def __init__(self, input_file_dir, delim, policy, encoding, has_header, comment_prefix):
+        self.input_file_dir = input_file_dir
         self.delim = delim
         self.policy = policy
         self.encoding = encoding
@@ -525,7 +531,7 @@ def __init__(self, delim, policy, encoding, has_header, comment_prefix):
         self.table_path = None
 
     def get_iterator_by_table_id(self, table_id):
-        self.table_path = find_table_path(table_id)
+        self.table_path = find_table_path(self.input_file_dir, table_id)
         if self.table_path is None:
             raise rbql_engine.RbqlIOHandlingError('Unable to find join table "{}"'.format(table_id))
         self.input_stream = open(self.table_path, 'rb')
@@ -566,7 +572,8 @@ def query_csv(query_text, input_path, input_delim, input_policy, output_path, ou
         if user_init_code == '' and os.path.exists(default_init_source_path):
             user_init_code = read_user_init_code(default_init_source_path)
 
-        join_tables_registry = FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, with_headers, comment_prefix)
+        input_file_dir = None if not input_path else os.path.dirname(input_path)
+        join_tables_registry = FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix)
         input_iterator = CSVRecordIterator(input_stream, csv_encoding, input_delim, input_policy, with_headers, comment_prefix=comment_prefix)
         output_writer = CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy, colorize_output=colorize_output)
         if debug_mode:

diff --git a/rbql/rbql_engine.py b/rbql/rbql_engine.py
@@ -52,7 +52,7 @@
 
 # TODO support 'AS' keyword
 
-# FIXME consider disallowing to use values in the first row when header is not enabled (only a1, a2, ... should be allowed) and vice versa - Don't allow a1, a2 etc when header is enabled. This is to make sure that the user knows what query mode they are in.
+# TODO Consider disabling the a1, a2, etc. variables when the header is enabled. This is to make sure that the user knows what query mode they are in.
 
 
 GROUP_BY = 'GROUP BY'
@@ -205,6 +205,8 @@ def column_info_from_node(root):
             column_index = get_field(slice_val_root, 'n') - 1
         else:
             return None
+        if not PY3 and isinstance(column_name, str):
+            column_name = column_name.decode('utf-8')
         return QueryColumnInfo(table_name=table_name, column_index=column_index, column_name=column_name, is_star=False)
     return None
 
@@ -1459,10 +1461,10 @@ def shallow_parse_input_query(query_text, input_iterator, join_tables_registry,
     query_text = cleanup_query(query_text)
     format_expression, string_literals = separate_string_literals(query_text)
     format_expression = remove_redundant_input_table_name(format_expression)
-    input_variables_map = input_iterator.get_variables_map(query_text)
     rb_actions = separate_actions(format_expression)
     if WITH in rb_actions:
         input_iterator.handle_query_modifier(rb_actions[WITH])
+    input_variables_map = input_iterator.get_variables_map(query_text)
 
     if ORDER_BY in rb_actions and UPDATE in rb_actions:
         raise RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries') # UT JSON