Skip to content

Commit

Permalink
Begin databricks support
Browse files Browse the repository at this point in the history
  • Loading branch information
magbak committed Aug 26, 2024
1 parent 4a65832 commit c5858de
Show file tree
Hide file tree
Showing 17 changed files with 182 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:
env:
CARGO_TERM_COLOR: always
RUST_LOG: debug
MATURIN_VERSION: '1.7.0'
MATURIN_VERSION: '1.7.1'
RUST_TOOLCHAIN: nightly-2024-06-23

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python_query_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ on:
env:
CARGO_TERM_COLOR: always
RUST_LOG: debug
MATURIN_VERSION: '1.7.0'
MATURIN_VERSION: '1.7.1'
RUST_TOOLCHAIN: nightly-2024-06-23

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:
env:
CARGO_TERM_COLOR: always
RUST_TOOLCHAIN: nightly-2024-06-23
MATURIN_VERSION: '1.7.0'
MATURIN_VERSION: '1.7.1'
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}

jobs:
Expand Down
40 changes: 35 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ members = [
#pydf_io = { path = "../maplib/lib/pydf_io"}
#representation = { path = "../maplib/lib/representation", features = ["rdf-star"]}
#templates = { path = "../maplib/lib/templates"}
spargebra = { git = "https://github.com/DataTreehouse/maplib", rev="3aaabe8bd2326fbf204456f961f972d927afad78", features = ["rdf-star"]}
query_processing = { git = "https://github.com/DataTreehouse/maplib", rev="3aaabe8bd2326fbf204456f961f972d927afad78" }
pydf_io = { git = "https://github.com/DataTreehouse/maplib", rev="3aaabe8bd2326fbf204456f961f972d927afad78" }
representation = { git = "https://github.com/DataTreehouse/maplib", rev="3aaabe8bd2326fbf204456f961f972d927afad78", features = ["rdf-star"] }
templates = { git = "https://github.com/DataTreehouse/maplib", rev="3aaabe8bd2326fbf204456f961f972d927afad78" }
spargebra = { git = "https://github.com/DataTreehouse/maplib", rev="3bf75ac20a71c9afeab07a7b4e0196fe51e43c61", features = ["rdf-star"]}
query_processing = { git = "https://github.com/DataTreehouse/maplib", rev="3bf75ac20a71c9afeab07a7b4e0196fe51e43c61" }
pydf_io = { git = "https://github.com/DataTreehouse/maplib", rev="3bf75ac20a71c9afeab07a7b4e0196fe51e43c61" }
representation = { git = "https://github.com/DataTreehouse/maplib", rev="3bf75ac20a71c9afeab07a7b4e0196fe51e43c61", features = ["rdf-star"] }
templates = { git = "https://github.com/DataTreehouse/maplib", rev="3bf75ac20a71c9afeab07a7b4e0196fe51e43c61" }


sparesults = { version = "0.2.0-alpha.5", features = ["rdf-star"] }
Expand All @@ -44,6 +44,7 @@ gcp-bigquery-client = "0.20.0"
rayon = "1.10.0"
opcua = {version="0.12.0", features = ["vendored-openssl"]}
url = "2.5.2"
uuid = {version = "1.10.0", features = ["fast-rng", "v4"]}

[patch.crates-io]
oxrdf = { git = 'https://github.com/magbak/oxigraph.git', rev = "b13df973ed2785de2ac41066ca4b62d88d3f5d40"}
Expand Down
1 change: 1 addition & 0 deletions lib/chrontext/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ async-recursion.workspace = true
async-trait.workspace = true
oxigraph.workspace = true
filesize.workspace = true
uuid.workspace = true
11 changes: 10 additions & 1 deletion lib/chrontext/src/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use virtualization::{Virtualization, VirtualizedDatabase};
use virtualized_query::pushdown_setting::PushdownSetting;
use crate::rename_vars::rename_vars;

pub struct EngineConfig {
pub sparql_endpoint: Option<String>,
Expand Down Expand Up @@ -83,6 +84,7 @@ impl Engine {
let parsed_query = parse_sparql_select_query(query)?;
debug!("Parsed query: {}", parsed_query.to_string());
debug!("Parsed query algebra: {:?}", &parsed_query);
let (parsed_query, rename_map) = rename_vars(parsed_query);
let virtualized_iris = self.virtualization.get_virtualized_iris();
let first_level_virtualized_iris = self.virtualization.get_first_level_virtualized_iris();

Expand Down Expand Up @@ -112,10 +114,17 @@ impl Engine {
rewritten_filters,
self.virtualization.clone(),
);
let solution_mappings = combiner
let mut solution_mappings = combiner
.combine_static_and_time_series_results(static_queries_map, &preprocessed_query)
.await
.map_err(|x| ChrontextError::CombinerError(x))?;
for (original, renamed) in rename_map {
if let Some(dt) = solution_mappings.rdf_node_types.remove(&renamed) {
solution_mappings.mappings = solution_mappings.mappings.rename(&[renamed], &[original.clone()]);
solution_mappings.rdf_node_types.insert(original, dt);
}
}

let SolutionMappings {
mappings,
rdf_node_types,
Expand Down
1 change: 1 addition & 0 deletions lib/chrontext/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ pub mod rewriting;
pub mod sparql_database;
mod sparql_result_to_polars;
pub mod splitter;
mod rename_vars;
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub(crate) fn rewrite_filter_expression(
context,
pushdown_settings,
);
return (rewrite.expression.take(), rewrite.lost_value);
(rewrite.expression.take(), rewrite.lost_value)
}

pub(crate) fn try_recursive_rewrite_expression(
Expand Down
5 changes: 2 additions & 3 deletions lib/virtualization/src/opcua.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use oxrdf::vocab::xsd;
use oxrdf::{Literal, Variable};
use polars::export::chrono::{DateTime as ChronoDateTime, Duration, TimeZone, Utc};
use polars::prelude::{
concat, AnyValue, DataFrame, DataType, IntoLazy, IntoVec, NamedFrom, Series, UnionArgs,
concat, AnyValue, DataFrame, DataType, IntoLazy, NamedFrom, Series, UnionArgs,
};
use query_processing::constants::DATETIME_AS_SECONDS;
use representation::query_context::Context;
Expand Down Expand Up @@ -93,8 +93,7 @@ impl VirtualizedOPCUADatabase {
let mut grouping_col_lookup = HashMap::new();
let grouping_columns = vq.get_groupby_columns();
let grouping_col_name = if let Some(g) = grouping_columns.into_iter().next() {
#[allow(suspicious_double_ref_op)]
Some(g.deref().clone())
Some(g.deref())
} else {
None
};
Expand Down
11 changes: 10 additions & 1 deletion lib/virtualization/src/python/sql_translation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from sqlalchemy.dialects import postgresql
from sqlalchemy.sql.base import ColumnCollection
from sqlalchemy.sql.functions import GenericFunction
from sqlalchemy_bigquery.base import BigQueryDialect
from databricks.sqlalchemy import DatabricksDialect
from chrontext.vq import Expression, VirtualizedQuery, AggregateExpression
from sqlalchemy import ColumnElement, Column, Table, MetaData, Select, select, literal, DateTime, values, cast, \
Expand Down Expand Up @@ -34,6 +35,8 @@ def translate_sql(vq: VirtualizedQuery, dialect: Literal["bigquery", "postgres"]
use_dialect = BigQueryDialect()
case "postgres":
use_dialect = postgresql.dialect()
case "databricks":
use_dialect = DatabricksDialect()
compiled = q.compile(dialect=use_dialect, compile_kwargs={"literal_binds": True})
return str(compiled)
Expand Down Expand Up @@ -97,7 +100,7 @@ class SPARQLMapper:
).select_from(
table
)
if self.dialect == "postgres":
if self.dialect == "postgres" or self.dialect == "databricks":
values_sub = values(
Column("id"), Column(query.grouping_column_name),
name=self.inner_name()
Expand Down Expand Up @@ -349,6 +352,12 @@ class SPARQLMapper:
func.extract("EPOCH", sql_args[0]),
sql_args[1])
)
elif self.dialect == "databricks":
return func.TIMESTAMP_SECONDS(
func.UNIX_TIMESTAMP(sql_args[0]) - func.mod(
func.UNIX_TIMESTAMP(sql_args[0]),
sql_args[1])
)
elif self.dialect == "bigquery":
return func.TIMESTAMP_SECONDS(
func.UNIX_SECONDS(sql_args[0]) - func.mod(
Expand Down
2 changes: 1 addition & 1 deletion py_chrontext/chrontext/chrontext.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ class VirtualizedPythonDatabase:
def __init__(self,
database: Any,
resource_sql_map: Optional[Dict[str, Any]],
sql_dialect: Optional[LiteralType["postgres", "bigquery"]]):
sql_dialect: Optional[LiteralType["postgres", "bigquery", "databricks"]]):
"""
See the tutorial in README.md for guidance on how to use this class.
This API is subject to change, it will be possible to specify what parts of the SPARQL query may be pushed down into the database.
Expand Down
8 changes: 4 additions & 4 deletions py_chrontext/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[project]
name = "chrontext"
description = "Hybrid SPARQL query engine for timeseries data"
dependencies = ["polars>=0.20.2", "pyarrow>=7.0.0", "pandas", "sqlalchemy>=2.0.31", "sqlalchemy_bigquery==1.11.0"]
dependencies = ["polars>=0.20.2", "pyarrow>=7.0.0", "pandas", "sqlalchemy>=2.0.31", "sqlalchemy_bigquery==1.11.0", "databricks-sql-connector>=3.3.0"]
readme = "README.md"
authors = [{name = "Magnus Bakken", email = "[email protected]" }]
license = {file = "LICENSE"}
authors = [{ name = "Magnus Bakken", email = "[email protected]" }]
license = { file = "LICENSE" }
requires-python = ">=3.9"
keywords = ["rdf", "graph", "arrow", "sparql", "timeseries"]
classifiers = [
Expand All @@ -26,5 +26,5 @@ Repository = "https://github.com/DataTreehouse/chrontext"
Changelog = "https://github.com/DataTreehouse/chrontext/releases"

[build-system]
requires = ["maturin==1.5.1"]
requires = ["maturin==1.7.1"]
build-backend = "maturin"
2 changes: 0 additions & 2 deletions py_chrontext/src/errors.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use chrontext::combiner::CombinerError;
use chrontext::errors::ChrontextError as RustChrontextError;
use chrontext::splitter::QueryParseError;
use oxrdf::IriParseError;
use pyo3::{create_exception, exceptions::PyException, prelude::*};
use spargebra::SparqlSyntaxError;
Expand Down
1 change: 1 addition & 0 deletions py_chrontext/tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ requests>=2.32.0
sparqlwrapper==2.0.0
asyncua==1.0.4
duckdb>=1.0.0
pytest-mock==3.14.0
Loading

0 comments on commit c5858de

Please sign in to comment.