Skip to content

Commit

Permalink
Improve Databend compatibility (arrow-udf#22)
Browse files Browse the repository at this point in the history
* feat(arrow-udf-js): enable all intrinsics, enhance supported types and allow for alternate
extension key/values

- enables all rquickjs intrinsics
- allows for configurable arrow extension keys and values.
  It is not always practical to rebuild an already created record
  batch to add the field metadata values required by arrow-udf, so
  this makes them optionally configurable
- adds support for JSON in LargeBinary arrays for Databend compatibility
- adds support for Timestamp and Date32 arrow types as Date() type in JS
  for Databend compatibility
- adds support for Decimal128 and Decimal256 arrow types as BigDecimal in JS
  for Databend compatibility
- simplifies BigDecimal conversion by using `ctx.globals().get("BigDecimal")?` to access the BigDecimal initializer function rather than holding a persistent BigDecimal function handle on the runtime

* feat(arrow-udf-python): add alternate extension name and json in LargeBinary support

* feat(arrow-udf-js-deno): add alternate extension name and json in LargeBinary support

* chore: update readme

* chore(arrow-udf-js): cargo fmt

* chore(arrow-udf-js): refactor decimal128/256 parsing

- Prefer split_once over split for splitting decimal string into integer
  and fractional parts.
- Prefer pattern matching style over more imperative approach.

* chore: cargo fmt
  • Loading branch information
maxjustus authored May 14, 2024
1 parent fa36365 commit d0a21f0
Show file tree
Hide file tree
Showing 10 changed files with 2,055 additions and 1,285 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ In addition to the standard types defined by Arrow, these crates also support th
| JSON | Utf8 | `ARROW:extension:name` = `arrowudf.json` |
| Decimal | Utf8 | `ARROW:extension:name` = `arrowudf.decimal` |

Alternatively, you can configure the extension metadata key and values to look for when converting between Arrow and extension types:
```rust
let mut js_runtime = arrow_udf_js::Runtime::new().unwrap();

js_runtime.converter.set_arrow_extension_key(&"Extension".to_string());
js_runtime.converter.set_json_extension_name(&"Variant".to_string());
js_runtime.converter.set_decimal_extension_name(&"Decimal".to_string());
```

### JSON Type

The JSON type is stored in a string array in text form.
Expand Down
1 change: 1 addition & 0 deletions arrow-udf-js-deno/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,5 @@ The following table shows the type mapping between Arrow and JavaScript:
| Extension Type | Physical Type | Metadata | JS Type |
| -------------- | ------------- | ------------------------------------------- | ------------- |
| JSON | String | `ARROW:extension:name` = `arrowudf.json` | any (parsed by `JSON.parse(string)`) |
| JSON | LargeBinary | `ARROW:extension:name` = `arrowudf.json` | any (parsed by `JSON.parse(string)`) |
| Decimal | String | `ARROW:extension:name` = `arrowudf.decimal` | BigDecimal |
1,918 changes: 993 additions & 925 deletions arrow-udf-js-deno/src/deno_arrow.rs

Large diffs are not rendered by default.

25 changes: 17 additions & 8 deletions arrow-udf-js-deno/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ use arrow_udf_js_deno_runtime::deno_runtime;
use futures::{Future, Stream, StreamExt, TryStreamExt};
use into_field::IntoField;

use crate::deno_arrow::get_jsvalue;

mod deno_arrow;
mod into_field;
pub mod tokio_spawn_pinned;
Expand Down Expand Up @@ -66,6 +64,7 @@ pub struct InternalRuntime {
functions: HashMap<String, Function>,
deno_runtime: Rc<RefCell<deno_runtime::DenoRuntime>>,
big_decimal: ::v8::Global<::v8::Function>,
converter: deno_arrow::Converter,
}

#[derive(Clone)]
Expand Down Expand Up @@ -145,6 +144,7 @@ pub(crate) struct RecordBatchIterInternal {
input: RecordBatch,
function: Function,
big_decimal: ::v8::Global<::v8::Function>,
converter: deno_arrow::Converter,
schema: SchemaRef,
chunk_size: usize,
promise: Rc<RefCell<Option<::v8::Global<::v8::Promise>>>>,
Expand Down Expand Up @@ -298,6 +298,7 @@ impl InternalRuntime {
functions: HashMap::new(),
deno_runtime,
big_decimal,
converter: deno_arrow::Converter::new(),
}
}

Expand Down Expand Up @@ -399,9 +400,10 @@ impl InternalRuntime {
for i in 0..input.num_rows() {
args.clear();
for (column, field) in input.columns().iter().zip(input.schema().fields()) {
let val =
deno_arrow::get_jsvalue(try_catch, field, column, &self.big_decimal, i)
.context("failed to get jsvalue from arrow array")?;
let val = self
.converter
.get_jsvalue(try_catch, field, column, &self.big_decimal, i)
.context("failed to get jsvalue from arrow array")?;
args.push(val);
}

Expand Down Expand Up @@ -467,7 +469,9 @@ impl InternalRuntime {
let scope = &mut js_runtime.handle_scope();
let try_catch = &mut ::v8::TryCatch::new(scope);

let array = deno_arrow::build_array(&function.return_field, try_catch, results)
let array = self
.converter
.build_array(&function.return_field, try_catch, results)
.context("failed to build arrow array from return values")?;
let schema = Schema::new(vec![function.return_field.clone()]);
Ok(RecordBatch::try_new(Arc::new(schema), vec![array])?)
Expand Down Expand Up @@ -497,6 +501,7 @@ impl InternalRuntime {
generator: Rc::new(RefCell::new(None)),
promise: Rc::new(RefCell::new(None)),
state: RecordBatchIterState::Processing,
converter: self.converter.clone(),
})
}
}
Expand Down Expand Up @@ -677,7 +682,9 @@ impl Stream for RecordBatchIterInternal {
(inner.input.columns().iter()).zip(inner.input.schema().fields())
{
let r = inner.row.borrow();
let val = get_jsvalue(scope, field, &column, &inner.big_decimal, *r)
let val = inner
.converter
.get_jsvalue(scope, field, &column, &inner.big_decimal, *r)
.context("failed to get jsvalue from arrow array")?;

row.push(val);
Expand Down Expand Up @@ -807,7 +814,9 @@ impl Stream for RecordBatchIterInternal {
}

let indexes = Arc::new(indexes.finish());
let array = deno_arrow::build_array(&inner.function.return_field, scope, results)
let array = inner
.converter
.build_array(&inner.function.return_field, scope, results)
.context("failed to build arrow array from return values")?;

match RecordBatch::try_new(inner.schema.clone(), vec![indexes, array]) {
Expand Down
5 changes: 5 additions & 0 deletions arrow-udf-js/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ The following table shows the type mapping between Arrow and JavaScript:
| Float64 | number |
| String | string |
| LargeString | string |
| Date32 | Date |
| Timestamp | Date |
| Decimal128 | BigDecimal |
| Decimal256 | BigDecimal |
| Binary | Uint8Array |
| LargeBinary | Uint8Array |
| List(Int8) | Int8Array |
Expand All @@ -146,4 +150,5 @@ The following table shows the type mapping between Arrow and JavaScript:
| Extension Type | Physical Type | Metadata | JS Type |
| -------------- | ------------- | ------------------------------------------- | ------------- |
| JSON | String | `ARROW:extension:name` = `arrowudf.json` | any (parsed by `JSON.parse(string)`) |
| JSON | LargeBinary | `ARROW:extension:name` = `arrowudf.json` | any (parsed by `JSON.parse(string)`) |
| Decimal | String | `ARROW:extension:name` = `arrowudf.decimal` | BigDecimal |
Loading

0 comments on commit d0a21f0

Please sign in to comment.