-
Notifications
You must be signed in to change notification settings - Fork 186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
columnar support for Arrow tables #2030
Changes from all commits
013bd45
019a793
778b890
932186c
1f5ded9
4e729d0
f045315
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ export const reindex = Symbol("reindex"); | |
export function valueof(data, value, type) { | ||
const valueType = typeof value; | ||
return valueType === "string" | ||
? maybeTypedMap(data, field(value), type) | ||
? columnar(data, value, type) | ||
: valueType === "function" | ||
? maybeTypedMap(data, value, type) | ||
: valueType === "number" || value instanceof Date || valueType === "boolean" | ||
|
@@ -133,6 +133,7 @@ export function keyword(input, name, allowed) { | |
// Promotes the specified data to an array as needed. | ||
export function arrayify(values) { | ||
if (values == null || values instanceof Array || values instanceof TypedArray) return values; | ||
if (isArrowTable(values)) return arrowTableProxy(values); | ||
switch (values.type) { | ||
case "FeatureCollection": | ||
return values.features; | ||
|
@@ -575,3 +576,35 @@ export function maybeClip(clip) { | |
else if (clip != null) clip = keyword(clip, "clip", ["frame", "sphere"]); | ||
return clip; | ||
} | ||
|
||
// Duck typing Apache Arrow tables | ||
function isArrowTable(data) { | ||
return typeof data?.getChild === "function" && typeof data.numRows === "number" && typeof data.slice === "function"; | ||
} | ||
|
||
// Extract columnar data | ||
function columnar(data, name, type) { | ||
if (isArrowTable(data)) { | ||
const column = maybeTypedArrayify(data.getChild(name), type); | ||
if (Array.isArray(column) && String(data.schema?.fields?.find((d) => d.name === name)).endsWith("<MILLISECOND>")) | ||
column.find((d, i) => d != null && (column[i] = new Date(d))); | ||
Comment on lines +589 to +590
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This assignment will have no effect if |
||
return column; | ||
} | ||
return maybeTypedMap(data, field(name), type); | ||
} | ||
|
||
// Arrayify arrow tables. We try to avoid materializing the values, but the | ||
// Proxy might be used by the group reducer to construct groupData. | ||
function arrowTableProxy(data) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Doesn't this negate most of the benefit of using arrow? I think proxies add a lot of overhead and arrow already has fast proxies for e.g. iteration. Can you defer the conversion to the group reducer? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it works. We still read the named channels with There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, I see. So in most cases you use columnar access anyway. Then maybe ignore my comment. |
||
return new Proxy(data, { | ||
get(target, prop) { | ||
return prop === "length" | ||
? target.numRows | ||
: prop === "constructor" // for take/map | ||
? Array | ||
: typeof prop === "string" && !isNaN(prop) | ||
? target.get(prop) | ||
: target[prop]; // pass all other properties | ||
} | ||
}); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be calling `data.getChild(name).toArray()` here, or else `maybeTypedArrayify` will use the slowest iterable path rather than using the more efficient `Vector.toArray` (which could be zero-copy if the Arrow table only has one chunk).