Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-134: Add Validity functions #135

Merged
merged 8 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
# https://github.com/golangci/golangci-lint/issues/828
- id: golangci-lint-full
name: golangci-lint-full-arrow
entry: bash -c 'cd arrow && golangci-lint run --timeout=5m'
entry: bash -c 'cd arrow && golangci-lint run --timeout=10m'
- id: golangci-lint-full
name: golangci-lint-full-parquet
entry: bash -c 'cd parquet && golangci-lint run'
Expand Down
34 changes: 34 additions & 0 deletions arrow/array/struct.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,40 @@ func NewStructArray(cols []arrow.Array, names []string) (*Struct, error) {
return NewStructArrayWithNulls(cols, names, nil, 0, 0)
}

// NewStructArrayWithFields builds a new Struct Array using the passed columns
// and provided fields. As opposed to NewStructArray, this allows you to provide
// the full fields to utilize for the struct column instead of just the names.
func NewStructArrayWithFields(cols []arrow.Array, fields []arrow.Field) (*Struct, error) {
if len(cols) != len(fields) {
return nil, fmt.Errorf("%w: mismatching number of fields and child arrays", arrow.ErrInvalid)
}
if len(cols) == 0 {
return nil, fmt.Errorf("%w: can't infer struct array length with 0 child arrays", arrow.ErrInvalid)
}

length := cols[0].Len()
children := make([]arrow.ArrayData, len(cols))
for i, c := range cols {
if length != c.Len() {
return nil, fmt.Errorf("%w: mismatching child array lengths", arrow.ErrInvalid)
}
if !arrow.TypeEqual(fields[i].Type, c.DataType()) {
return nil, fmt.Errorf("%w: mismatching data type for child #%d, field says '%s', got '%s'",
arrow.ErrInvalid, i, fields[i].Type, c.DataType())
}
if !fields[i].Nullable && c.NullN() > 0 {
return nil, fmt.Errorf("%w: field says not-nullable, child #%d has nulls",
arrow.ErrInvalid, i)
}

children[i] = c.Data()
}

data := NewData(arrow.StructOf(fields...), length, []*memory.Buffer{nil}, children, 0, 0)
defer data.Release()
return NewStructData(data), nil
bkietz marked this conversation as resolved.
Show resolved Hide resolved
}

// NewStructArrayWithNulls is like NewStructArray as a convenience function,
// but also takes in a null bitmap, the number of nulls, and an optional offset
// to use for creating the Struct Array.
Expand Down
30 changes: 26 additions & 4 deletions arrow/compute/exprs/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ const (
SubstraitArithmeticFuncsURI = SubstraitDefaultURIPrefix + "functions_arithmetic.yaml"
// URI for official Substrait Comparison funcs extensions
SubstraitComparisonFuncsURI = SubstraitDefaultURIPrefix + "functions_comparison.yaml"
SubstraitBooleanFuncsURI = SubstraitDefaultURIPrefix + "functions_boolean.yaml"

TimestampTzTimezone = "UTC"
)
Expand Down Expand Up @@ -93,7 +94,7 @@ func init() {
}
}

for _, fn := range []string{"equal", "not_equal", "lt", "lte", "gt", "gte"} {
for _, fn := range []string{"equal", "not_equal", "lt", "lte", "gt", "gte", "is_null", "is_not_null", "is_nan"} {
err := DefaultExtensionIDRegistry.AddSubstraitScalarToArrow(
extensions.ID{URI: SubstraitComparisonFuncsURI, Name: fn},
simpleMapSubstraitToArrowFunc)
Expand All @@ -102,13 +103,30 @@ func init() {
}
}

for _, fn := range []string{"equal", "not_equal", "less", "less_equal", "greater", "greater_equal"} {
for _, fn := range []string{"equal", "not_equal", "less", "less_equal", "greater", "greater_equal", "is_null", "is_not_null", "is_nan"} {
err := DefaultExtensionIDRegistry.AddArrowToSubstrait(fn,
simpleMapArrowToSubstraitFunc(SubstraitComparisonFuncsURI))
if err != nil {
panic(err)
}
}

for _, fn := range []string{"and", "or"} {
err := DefaultExtensionIDRegistry.AddSubstraitScalarToArrow(
extensions.ID{URI: SubstraitBooleanFuncsURI, Name: fn},
simpleMapSubstraitToArrowFunc)
if err != nil {
panic(err)
}
}

for _, fn := range []string{"and_kleene", "or_kleene"} {
err := DefaultExtensionIDRegistry.AddArrowToSubstrait(fn,
simpleMapArrowToSubstraitFunc(SubstraitBooleanFuncsURI))
if err != nil {
panic(err)
}
}
}

type overflowBehavior string
Expand Down Expand Up @@ -168,13 +186,17 @@ var substraitToArrowFuncMap = map[string]string{
"gt": "greater",
"lte": "less_equal",
"gte": "greater_equal",
"or": "or_kleene",
"and": "and_kleene",
}

var arrowToSubstraitFuncMap = map[string]string{
"less": "lt",
"greater": "gt",
"less_equal": "lte",
"greater_equal": "gte",
"and_kleene": "and",
"or_kleene": "or",
}

func simpleMapSubstraitToArrowFunc(sf *expr.ScalarFunction) (fname string, opts compute.FunctionOptions, err error) {
Expand Down Expand Up @@ -553,9 +575,9 @@ func ToSubstraitType(dt arrow.DataType, nullable bool, ext ExtensionIDSet) (type
return &types.Float32Type{Nullability: nullability}, nil
case arrow.FLOAT64:
return &types.Float64Type{Nullability: nullability}, nil
case arrow.STRING:
case arrow.STRING, arrow.LARGE_STRING:
return &types.StringType{Nullability: nullability}, nil
case arrow.BINARY:
case arrow.BINARY, arrow.LARGE_BINARY:
return &types.BinaryType{Nullability: nullability}, nil
case arrow.DATE32:
return &types.DateType{Nullability: nullability}, nil
Expand Down
5 changes: 3 additions & 2 deletions arrow/compute/internal/kernels/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@ C2GOASM=c2goasm
CC=clang-11
CXX=clang++-11
C_FLAGS=-target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=5000 \
-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
-fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -fno-jump-tables \
-fno-math-errno -funsafe-math-optimizations -fno-rounding-math -fno-trapping-math -I_lib -I../../../../internal/utils/_lib
ASM_FLAGS_AVX2=-mavx2 -mfma
ASM_FLAGS_SSE4=-msse4
ASM_FLAGS_BMI2=-mbmi2
ASM_FLAGS_POPCNT=-mpopcnt

C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unwind-tables -mno-red-zone -mstackrealign -fno-exceptions \
-fno-rtti -fno-builtin -ffast-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib
-fno-rtti -fno-builtin -fno-math-errno -funsafe-math-optimizations -fno-rounding-math -fno-trapping-math -fno-jump-tables -I_lib -I../../../../internal/utils/_lib

GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go')
ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go')
Expand Down
Loading
Loading