diff --git a/bloom.go b/bloom.go index 2e50057..580ff5d 100644 --- a/bloom.go +++ b/bloom.go @@ -4,8 +4,10 @@ import ( "io" "github.com/segmentio/parquet-go/bloom" + "github.com/segmentio/parquet-go/bloom/xxhash" "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/bits" ) @@ -80,9 +82,13 @@ type BloomFilterColumn interface { // filter. Hash() bloom.Hash - // NewFilter constructs a new bloom filter configured to hold the given - // number of values and bits of filter per value. - NewFilter(numValues int64, bitsPerValue uint) bloom.MutableFilter + // Returns an encoding which can be used to write columns of values to the + // filter. + Encoding() encoding.Encoding + + // Returns the size of the filter needed to hold the given number of values, + // assuming each value will be encoded with the given number of bits. + Size(numValues int64, bitsPerValue uint) int } // SplitBlockFilter constructs a split block bloom filter object for the column @@ -91,10 +97,11 @@ func SplitBlockFilter(path ...string) BloomFilterColumn { return splitBlockFilte type splitBlockFilter []string -func (f splitBlockFilter) Path() []string { return f } -func (f splitBlockFilter) Hash() bloom.Hash { return bloom.XXH64{} } -func (f splitBlockFilter) NewFilter(numValues int64, bitsPerValue uint) bloom.MutableFilter { - return make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(numValues, bitsPerValue)) +func (f splitBlockFilter) Path() []string { return f } +func (f splitBlockFilter) Hash() bloom.Hash { return bloom.XXH64{} } +func (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} } +func (f splitBlockFilter) Size(numValues int64, bitsPerValue uint) int { + return bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, bitsPerValue) } // Creates a header from the given bloom filter. @@ -124,128 +131,128 @@ func searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) Bloom return nil } -// bloomFilterEncoder is an adapter type which implements the encoding.Encoder -// interface on top of a bloom filter. -type bloomFilterEncoder struct { - filter bloom.MutableFilter - hash bloom.Hash - keys [128]uint64 -} - -func newBloomFilterEncoder(filter bloom.MutableFilter, hash bloom.Hash) *bloomFilterEncoder { - return &bloomFilterEncoder{filter: filter, hash: hash} -} +const ( + // Size of the stack buffer used to perform bulk operations on bloom filters. + // + // This value was empirically determined to be a good default: + // 128 x uint64 makes a 1KiB buffer, which amortizes the cost of calling + // bloom filter methods without causing too much stack growth.
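+	//
+	// The encode functions below hash values into a buffer of this many
+	// entries and flush it to the filter with InsertBulk.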
+ filterEncodeBufferSize = 128 +) -func (e *bloomFilterEncoder) Bytes() []byte { - return e.filter.Bytes() +type splitBlockEncoding struct { + encoding.NotSupported } -func (e *bloomFilterEncoder) Reset(io.Writer) { - e.filter.Reset() +func (splitBlockEncoding) EncodeBoolean(dst []byte, src []bool) ([]byte, error) { + splitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), bits.BoolToBytes(src)) + return dst, nil } -func (e *bloomFilterEncoder) SetBitWidth(int) { +func (splitBlockEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), bits.Int32ToUint32(src)) + return dst, nil } -func (e *bloomFilterEncoder) EncodeBoolean(data []bool) error { - return e.insert8(bits.BoolToBytes(data)) +func (splitBlockEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) { + splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), bits.Int64ToUint64(src)) + return dst, nil } -func (e *bloomFilterEncoder) EncodeInt8(data []int8) error { - return e.insert8(bits.Int8ToBytes(data)) +func (e splitBlockEncoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) { + splitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), deprecated.Int96ToBytes(src), 12) + return dst, nil } -func (e *bloomFilterEncoder) EncodeInt16(data []int16) error { - return e.insert16(bits.Int16ToUint16(data)) +func (splitBlockEncoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) { + splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), bits.Float32ToUint32(src)) + return dst, nil } -func (e *bloomFilterEncoder) EncodeInt32(data []int32) error { - return e.insert32(bits.Int32ToUint32(data)) +func (splitBlockEncoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) { + splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), bits.Float64ToUint64(src)) + return dst, nil } -func (e *bloomFilterEncoder) EncodeInt64(data []int64) error { - return e.insert64(bits.Int64ToUint64(data)) -} +func (splitBlockEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) { + filter := bloom.MakeSplitBlockFilter(dst) + buffer := make([]uint64, 0, filterEncodeBufferSize) -func (e *bloomFilterEncoder) EncodeInt96(data []deprecated.Int96) error { - return e.EncodeFixedLenByteArray(12, deprecated.Int96ToBytes(data)) -} + err := plain.RangeByteArrays(src, func(value []byte) error { + if len(buffer) == cap(buffer) { + filter.InsertBulk(buffer) + buffer = buffer[:0] + } + buffer = append(buffer, xxhash.Sum64(value)) + return nil + }) -func (e *bloomFilterEncoder) EncodeFloat(data []float32) error { - return e.insert32(bits.Float32ToUint32(data)) + filter.InsertBulk(buffer) + return dst, err } -func (e *bloomFilterEncoder) EncodeDouble(data []float64) error { - return e.insert64(bits.Float64ToUint64(data)) +func (splitBlockEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + filter := bloom.MakeSplitBlockFilter(dst) + if size == 16 { + splitBlockEncodeUint128(filter, bits.BytesToUint128(src)) + } else { + splitBlockEncodeFixedLenByteArray(filter, src, size) + } + return dst, nil } -func (e *bloomFilterEncoder) EncodeByteArray(data encoding.ByteArrayList) error { - data.Range(func(v []byte) bool { e.insert(v); return true }) - return nil -} +func splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) { + buffer := make([]uint64, 0, filterEncodeBufferSize) -func (e *bloomFilterEncoder) EncodeFixedLenByteArray(size int, data []byte) error { - if size == 16 { - return 
e.insert128(bits.BytesToUint128(data)) - } for i, j := 0, size; j <= len(data); { - e.insert(data[i:j]) + if len(buffer) == cap(buffer) { + filter.InsertBulk(buffer) + buffer = buffer[:0] + } + buffer = append(buffer, xxhash.Sum64(data[i:j])) i += size j += size } - return nil -} -func (e *bloomFilterEncoder) insert(value []byte) { - e.filter.Insert(e.hash.Sum64(value)) + filter.InsertBulk(buffer) } -func (e *bloomFilterEncoder) insert8(data []uint8) error { - k := e.keys[:] - for i := 0; i < len(data); { - n := e.hash.MultiSum64Uint8(k, data[i:]) - e.filter.InsertBulk(k[:n:n]) - i += n - } - return nil -} +func splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) { + buffer := make([]uint64, filterEncodeBufferSize) -func (e *bloomFilterEncoder) insert16(data []uint16) error { - k := e.keys[:] - for i := 0; i < len(data); { - n := e.hash.MultiSum64Uint16(k, data[i:]) - e.filter.InsertBulk(k[:n:n]) + for i := 0; i < len(values); { + n := xxhash.MultiSum64Uint8(buffer, values[i:]) + filter.InsertBulk(buffer[:n]) i += n } - return nil } -func (e *bloomFilterEncoder) insert32(data []uint32) error { - k := e.keys[:] - for i := 0; i < len(data); { - n := e.hash.MultiSum64Uint32(k, data[i:]) - e.filter.InsertBulk(k[:n:n]) +func splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) { + buffer := make([]uint64, filterEncodeBufferSize) + + for i := 0; i < len(values); { + n := xxhash.MultiSum64Uint32(buffer, values[i:]) + filter.InsertBulk(buffer[:n]) i += n } - return nil } -func (e *bloomFilterEncoder) insert64(data []uint64) error { - k := e.keys[:] - for i := 0; i < len(data); { - n := e.hash.MultiSum64Uint64(k, data[i:]) - e.filter.InsertBulk(k[:n:n]) +func splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) { + buffer := make([]uint64, filterEncodeBufferSize) + + for i := 0; i < len(values); { + n := xxhash.MultiSum64Uint64(buffer, values[i:]) + filter.InsertBulk(buffer[:n]) i += n } - return nil } -func (e *bloomFilterEncoder) insert128(data [][16]byte) error { - k := e.keys[:] - for i := 0; i < len(data); { - n := e.hash.MultiSum64Uint128(k, data[i:]) - e.filter.InsertBulk(k[:n:n]) +func splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) { + buffer := make([]uint64, filterEncodeBufferSize) + + for i := 0; i < len(values); { + n := xxhash.MultiSum64Uint128(buffer, values[i:]) + filter.InsertBulk(buffer[:n]) i += n } - return nil } diff --git a/bloom/filter.go b/bloom/filter.go index 222462c..623b4a5 100644 --- a/bloom/filter.go +++ b/bloom/filter.go @@ -30,6 +30,14 @@ type MutableFilter interface { // to a storage medium. type SplitBlockFilter []Block +// MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte +// slice. +func MakeSplitBlockFilter(data []byte) SplitBlockFilter { + p := *(*unsafe.Pointer)(unsafe.Pointer(&data)) + n := len(data) / BlockSize + return unsafe.Slice((*Block)(p), n) +} + // NumSplitBlocksOf returns the number of blocks in a filter intended to hold // the given number of values and bits of filter per value. 
// diff --git a/bloom_test.go b/bloom_test.go index 760a832..03434b4 100644 --- a/bloom_test.go +++ b/bloom_test.go @@ -7,19 +7,18 @@ import ( "github.com/segmentio/parquet-go/bloom" "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" ) -func TestBloomFilterEncoder(t *testing.T) { - newFilter := func(numValues int) *bloomFilterEncoder { - return newBloomFilterEncoder( - make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(int64(numValues), 11)), - bloom.XXH64{}, - ) +func TestSplitBlockFilter(t *testing.T) { + newFilter := func(numValues int) bloom.SplitBlockFilter { + return make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(int64(numValues), 11)) } - check := func(e *bloomFilterEncoder, v Value) bool { - return e.filter.Check(v.hash(e.hash)) + encoding := SplitBlockFilter("$").Encoding() + + check := func(filter bloom.SplitBlockFilter, value Value) bool { + return filter.Check(value.hash(&bloom.XXH64{})) } tests := []struct { @@ -29,10 +28,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "BOOLEAN", function: func(values []bool) bool { - f := newFilter(len(values)) - f.EncodeBoolean(values) + filter := newFilter(len(values)) + encoding.EncodeBoolean(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -43,10 +42,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "INT32", function: func(values []int32) bool { - f := newFilter(len(values)) - f.EncodeInt32(values) + filter := newFilter(len(values)) + encoding.EncodeInt32(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -57,10 +56,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "INT64", function: func(values []int64) bool { - f := newFilter(len(values)) - f.EncodeInt64(values) + filter := newFilter(len(values)) + encoding.EncodeInt64(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -71,10 +70,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "INT96", function: func(values []deprecated.Int96) bool { - f := newFilter(len(values)) - f.EncodeInt96(values) + filter := newFilter(len(values)) + encoding.EncodeInt96(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -85,10 +84,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "FLOAT", function: func(values []float32) bool { - f := newFilter(len(values)) - f.EncodeFloat(values) + filter := newFilter(len(values)) + encoding.EncodeFloat(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -99,10 +98,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "DOUBLE", function: func(values []float64) bool { - f := newFilter(len(values)) - f.EncodeDouble(values) + filter := newFilter(len(values)) + encoding.EncodeDouble(filter.Bytes(), values) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -113,14 +112,14 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "BYTE_ARRAY", function: func(values [][]byte) bool { - a := encoding.ByteArrayList{} - for _, v := range values { - a.Push(v) + byteArrays := make([]byte, 0) + for _, value := range values { + 
byteArrays = plain.AppendByteArray(byteArrays, value) } - f := newFilter(len(values)) - f.EncodeByteArray(a) + filter := newFilter(len(values)) + encoding.EncodeByteArray(filter.Bytes(), byteArrays) for _, v := range values { - if !check(f, ValueOf(v)) { + if !check(filter, ValueOf(v)) { return false } } @@ -131,10 +130,10 @@ func TestBloomFilterEncoder(t *testing.T) { { scenario: "FIXED_LEN_BYTE_ARRAY", function: func(values []byte) bool { - f := newFilter(len(values)) - f.EncodeFixedLenByteArray(1, values) + filter := newFilter(len(values)) + encoding.EncodeFixedLenByteArray(filter.Bytes(), values, 1) for _, v := range values { - if !check(f, ValueOf([1]byte{v})) { + if !check(filter, ValueOf([1]byte{v})) { return false } } @@ -146,41 +145,16 @@ func TestBloomFilterEncoder(t *testing.T) { for _, test := range tests { t.Run(test.scenario, func(t *testing.T) { if err := quick.Check(test.function, nil); err != nil { - t.Fatal(err) + t.Error(err) } }) } - - t.Run("Reset", func(t *testing.T) { - f := newFilter(1) - f.EncodeBoolean([]bool{false, true}) - - allZeros := true - for _, b := range f.Bytes() { - if b != 0 { - allZeros = false - break - } - } - if allZeros { - t.Fatal("bloom filter bytes were all zero after encoding values") - } - - f.Reset(nil) - for i, b := range f.Bytes() { - if b != 0 { - t.Fatalf("bloom filter byte at index %d was not zero after resetting the encoder: %02X", i, b) - } - } - }) } -func BenchmarkBloomFilterEncoder(b *testing.B) { +func BenchmarkSplitBlockFilter(b *testing.B) { const N = 1000 - f := newBloomFilterEncoder( - make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10)), - bloom.XXH64{}, - ) + f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10)).Bytes() + e := SplitBlockFilter("$").Encoding() v := make([]int64, N) r := rand.NewSource(10) @@ -189,7 +163,7 @@ func BenchmarkBloomFilterEncoder(b *testing.B) { } for i := 0; i < b.N; i++ { - f.EncodeInt64(v) + e.EncodeInt64(f, v) } b.SetBytes(8 * N) diff --git a/buffer.go b/buffer.go index e47205b..0789cd2 100644 --- a/buffer.go +++ b/buffer.go @@ -68,7 +68,7 @@ func (buf *Buffer) configure(schema *Schema) { if isDictionaryEncoding(encoding) { bufferSize /= 2 - dictionary = columnType.NewDictionary(columnIndex, bufferSize) + dictionary = columnType.NewDictionary(columnIndex, 0, make([]byte, 0, bufferSize)) columnType = dictionary.Type() } diff --git a/buffer_test.go b/buffer_test.go index a38d581..2e8c0d8 100644 --- a/buffer_test.go +++ b/buffer_test.go @@ -161,7 +161,7 @@ func TestBuffer(t *testing.T) { typ parquet.Type }{ {scenario: "plain", typ: test.typ}, - {scenario: "indexed", typ: test.typ.NewDictionary(0, 0).Type()}, + {scenario: "indexed", typ: test.typ.NewDictionary(0, 0, nil).Type()}, } { t.Run(config.scenario, func(t *testing.T) { for _, mod := range [...]struct { @@ -196,23 +196,14 @@ func TestBuffer(t *testing.T) { } content := new(bytes.Buffer) - decoder := parquet.Plain.NewDecoder(content) - encoder := parquet.Plain.NewEncoder(content) - reader := config.typ.NewColumnReader(0, 32) buffer := parquet.NewBuffer(options...) 
- reset := func() { - content.Reset() - decoder.Reset(content) - encoder.Reset(content) - buffer.Reset() - } - for _, values := range test.values { t.Run("", func(t *testing.T) { - reset() + defer content.Reset() + defer buffer.Reset() fields := schema.Fields() - testBuffer(t, fields[0], reader, buffer, encoder, decoder, values, ordering.sortFunc) + testBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc) }) } }) @@ -237,7 +228,7 @@ func descending(typ parquet.Type, values []parquet.Value) { sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 }) } -func testBuffer(t *testing.T, node parquet.Node, reader parquet.ColumnReader, buffer *parquet.Buffer, encoder encoding.Encoder, decoder encoding.Decoder, values []interface{}, sortFunc sortFunc) { +func testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) { repetitionLevel := 0 definitionLevel := 0 if !node.Required() { @@ -297,11 +288,6 @@ func testBuffer(t *testing.T, node parquet.Node, reader parquet.ColumnReader, bu t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max) } - if err := page.WriteTo(encoder); err != nil { - t.Fatalf("flushing page writer: %v", err) - } - - reader.Reset(int(numValues), decoder) // We write a single value per row, so num values = num rows for all pages // including repeated ones, which makes it OK to slice the pages using the // number of values as a proxy for the row indexes. @@ -312,7 +298,6 @@ func testBuffer(t *testing.T, node parquet.Node, reader parquet.ColumnReader, bu values []parquet.Value reader parquet.ValueReader }{ - {"test", batch, reader}, {"page", batch, page.Values()}, {"head", batch[:halfValues], page.Slice(0, halfValues).Values()}, {"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()}, diff --git a/class_go18.go b/class_go18.go index 1b48b4a..f443ae1 100644 --- a/class_go18.go +++ b/class_go18.go @@ -7,9 +7,8 @@ import ( "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/internal/bits" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) type primitive interface { @@ -46,15 +45,13 @@ type class[T primitive] struct { kind Kind makeValue func(T) Value value func(Value) T - plain func(T) []byte compare func(T, T) int less func(T, T) bool order func([]T) int min func([]T) T max func([]T) T bounds func([]T) (T, T) - encode func(encoding.Encoder, []T) error - decode func(encoding.Decoder, []T) (int, error) + encode func(encoding.Encoding, []byte, []T) ([]byte, error) } var boolClass = class[bool]{ @@ -63,15 +60,13 @@ var boolClass = class[bool]{ kind: Boolean, makeValue: makeValueBoolean, value: Value.Boolean, - plain: plain.Boolean, compare: compareBool, less: func(a, b bool) bool { return a != b && !a }, order: bits.OrderOfBool, min: bits.MinBool, max: bits.MaxBool, bounds: bits.MinMaxBool, - encode: encoding.Encoder.EncodeBoolean, - decode: encoding.Decoder.DecodeBoolean, + encode: encoding.Encoding.EncodeBoolean, } var int32Class = class[int32]{ @@ -80,15 +75,13 @@ var int32Class = class[int32]{ kind: Int32, makeValue: makeValueInt32, value: Value.Int32, - plain: plain.Int32, compare: compare[int32], less: less[int32], order: bits.OrderOfInt32, min: bits.MinInt32, max: bits.MaxInt32, bounds: bits.MinMaxInt32, - 
encode: encoding.Encoder.EncodeInt32, - decode: encoding.Decoder.DecodeInt32, + encode: encoding.Encoding.EncodeInt32, } var int64Class = class[int64]{ @@ -97,15 +90,13 @@ var int64Class = class[int64]{ kind: Int64, makeValue: makeValueInt64, value: Value.Int64, - plain: plain.Int64, compare: compare[int64], less: less[int64], order: bits.OrderOfInt64, min: bits.MinInt64, max: bits.MaxInt64, bounds: bits.MinMaxInt64, - encode: encoding.Encoder.EncodeInt64, - decode: encoding.Decoder.DecodeInt64, + encode: encoding.Encoding.EncodeInt64, } var int96Class = class[deprecated.Int96]{ @@ -114,15 +105,13 @@ var int96Class = class[deprecated.Int96]{ kind: Int96, makeValue: makeValueInt96, value: Value.Int96, - plain: plain.Int96, compare: compareInt96, less: deprecated.Int96.Less, order: deprecated.OrderOfInt96, min: deprecated.MinInt96, max: deprecated.MaxInt96, bounds: deprecated.MinMaxInt96, - encode: encoding.Encoder.EncodeInt96, - decode: encoding.Decoder.DecodeInt96, + encode: encoding.Encoding.EncodeInt96, } var float32Class = class[float32]{ @@ -131,15 +120,13 @@ var float32Class = class[float32]{ kind: Float, makeValue: makeValueFloat, value: Value.Float, - plain: plain.Float, compare: compare[float32], less: less[float32], order: bits.OrderOfFloat32, min: bits.MinFloat32, max: bits.MaxFloat32, bounds: bits.MinMaxFloat32, - encode: encoding.Encoder.EncodeFloat, - decode: encoding.Decoder.DecodeFloat, + encode: encoding.Encoding.EncodeFloat, } var float64Class = class[float64]{ @@ -148,15 +135,13 @@ var float64Class = class[float64]{ kind: Double, makeValue: makeValueDouble, value: Value.Double, - plain: plain.Double, compare: compare[float64], less: less[float64], order: bits.OrderOfFloat64, min: bits.MinFloat64, max: bits.MaxFloat64, bounds: bits.MinMaxFloat64, - encode: encoding.Encoder.EncodeDouble, - decode: encoding.Decoder.DecodeDouble, + encode: encoding.Encoding.EncodeDouble, } var uint32Class = class[uint32]{ @@ -165,18 +150,14 @@ var uint32Class = class[uint32]{ kind: Int32, makeValue: func(v uint32) Value { return makeValueInt32(int32(v)) }, value: func(v Value) uint32 { return uint32(v.Int32()) }, - plain: func(v uint32) []byte { return plain.Int32(int32(v)) }, compare: compare[uint32], less: less[uint32], order: bits.OrderOfUint32, min: bits.MinUint32, max: bits.MaxUint32, bounds: bits.MinMaxUint32, - encode: func(e encoding.Encoder, v []uint32) error { - return e.EncodeInt32(cast.Slice[int32](v)) - }, - decode: func(d encoding.Decoder, v []uint32) (int, error) { - return d.DecodeInt32(cast.Slice[int32](v)) + encode: func(enc encoding.Encoding, dst []byte, src []uint32) ([]byte, error) { + return enc.EncodeInt32(dst, unsafecast.Slice[int32](src)) }, } @@ -186,17 +167,13 @@ var uint64Class = class[uint64]{ kind: Int64, makeValue: func(v uint64) Value { return makeValueInt64(int64(v)) }, value: func(v Value) uint64 { return uint64(v.Int64()) }, - plain: func(v uint64) []byte { return plain.Int64(int64(v)) }, compare: compare[uint64], less: less[uint64], order: bits.OrderOfUint64, min: bits.MinUint64, max: bits.MaxUint64, bounds: bits.MinMaxUint64, - encode: func(e encoding.Encoder, v []uint64) error { - return e.EncodeInt64(cast.Slice[int64](v)) - }, - decode: func(d encoding.Decoder, v []uint64) (int, error) { - return d.DecodeInt64(cast.Slice[int64](v)) + encode: func(enc encoding.Encoding, dst []byte, src []uint64) ([]byte, error) { + return enc.EncodeInt64(dst, unsafecast.Slice[int64](src)) }, } diff --git a/column.go b/column.go index 2bb95ea..1149db8 100644 --- 
a/column.go +++ b/column.go @@ -1,6 +1,7 @@ package parquet import ( + "encoding/binary" "fmt" "io" "reflect" @@ -9,6 +10,7 @@ import ( "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" ) // Column represents a column in a parquet file. @@ -99,7 +101,7 @@ func (c *Column) Pages() Pages { pages: make([]filePages, len(c.file.rowGroups)), } for i := range r.pages { - c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk).setPagesOn(&r.pages[i]) + r.pages[i].init(c.file.rowGroups[i].(*fileRowGroup).columns[c.index].(*fileColumnChunk)) } return r } @@ -125,8 +127,8 @@ func (r *columnPages) ReadPage() (Page, error) { func (r *columnPages) SeekToRow(rowIndex int64) error { r.index = 0 - for r.index < len(r.pages) && r.pages[r.index].column.rowGroup.NumRows >= rowIndex { - rowIndex -= r.pages[r.index].column.rowGroup.NumRows + for r.index < len(r.pages) && r.pages[r.index].chunk.rowGroup.NumRows >= rowIndex { + rowIndex -= r.pages[r.index].chunk.rowGroup.NumRows r.index++ } @@ -307,7 +309,7 @@ func (cl *columnLoader) open(file *File, path []string) (*Column, error) { // Pick the encoding and compression codec of the first chunk. // // Technically each column chunk may use a different compression - // codec, and each page of the columm chunk might have a different + // codec, and each page of the column chunk might have a different // encoding. Exposing these details does not provide a lot of value // to the end user. // @@ -471,6 +473,283 @@ func schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType return format.Required } +type dictPage struct { + values []byte +} + +type dataPage struct { + repetitionLevels []int8 + definitionLevels []int8 + data []byte + values []byte + dictionary Dictionary +} + +func (p *dataPage) decompress(codec compress.Codec, data []byte) (err error) { + p.values, err = codec.Decode(p.values, data) + p.data, p.values = p.values, p.data[:0] + return err +} + +func (p *dataPage) decode(typ Type, enc encoding.Encoding, data []byte) error { + // Note: I am not sold on this design, it parts ways from the way type + // specific behavior is implemented in other places based on the Type + // specializations. + // + // It was difficult to design an exported API that would optimize well + // for safety, ease of use, and performance. I decided that I was lacking + // enough information about how the code would be used to make the right + // call, so I resorted to an internal mechanism which does not require + // exporting new APIs. The current approach will be less disruptive to + // revisit this decision in the future if needed. 
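+	//
+	// Each typed decode method below reinterprets the reusable p.values byte
+	// buffer as the page's element type where needed, decodes into it, and
+	// stores the result back as bytes so the buffer can be reused for the
+	// next page.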
+ switch typ.Kind() { + case Boolean: + return p.decodeBooleanPage(enc, data) + case Int32: + return p.decodeInt32Page(enc, data) + case Int64: + return p.decodeInt64Page(enc, data) + case Int96: + return p.decodeInt96Page(enc, data) + case Float: + return p.decodeFloatPage(enc, data) + case Double: + return p.decodeDoublePage(enc, data) + case ByteArray: + return p.decodeByteArrayPage(enc, data) + case FixedLenByteArray: + return p.decodeFixedLenByteArrayPage(enc, data, typ.Length()) + default: + return nil + } +} + +func (p *dataPage) decodeBooleanPage(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeBoolean(bits.BytesToBool(p.values), data) + p.values = bits.BoolToBytes(values) + return err +} + +func (p *dataPage) decodeInt32Page(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeInt32(bits.BytesToInt32(p.values), data) + p.values = bits.Int32ToBytes(values) + return err +} + +func (p *dataPage) decodeInt64Page(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeInt64(bits.BytesToInt64(p.values), data) + p.values = bits.Int64ToBytes(values) + return err +} + +func (p *dataPage) decodeInt96Page(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeInt96(deprecated.BytesToInt96(p.values), data) + p.values = deprecated.Int96ToBytes(values) + return err +} + +func (p *dataPage) decodeFloatPage(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeFloat(bits.BytesToFloat32(p.values), data) + p.values = bits.Float32ToBytes(values) + return err +} + +func (p *dataPage) decodeDoublePage(enc encoding.Encoding, data []byte) error { + values, err := enc.DecodeDouble(bits.BytesToFloat64(p.values), data) + p.values = bits.Float64ToBytes(values) + return err +} + +func (p *dataPage) decodeByteArrayPage(enc encoding.Encoding, data []byte) (err error) { + p.values, err = enc.DecodeByteArray(p.values, data) + return err +} + +func (p *dataPage) decodeFixedLenByteArrayPage(enc encoding.Encoding, data []byte, size int) (err error) { + p.values, err = enc.DecodeFixedLenByteArray(p.values, data, size) + return err +} + +// DecodeDataPageV1 decodes a data page from the header, compressed data, and +// optional dictionary passed as arguments. 
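+//
+// A sketch of intended use, with variable names assumed for illustration
+// rather than taken from this change:
+//
+//	page, err := column.DecodeDataPageV1(header, pageData, dictionary)
+//	if err != nil {
+//		// handle the malformed data page
+//	}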
+func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, data []byte, dict Dictionary) (Page, error) { + return c.decodeDataPageV1(header, &dataPage{data: data, dictionary: dict}) +} + +func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *dataPage) (Page, error) { + var err error + + if isCompressed(c.compression) { + if err := page.decompress(c.compression, page.data); err != nil { + return nil, fmt.Errorf("decompressing data page v1: %w", err) + } + } + + numValues := header.NumValues() + data := page.data + page.repetitionLevels = page.repetitionLevels[:0] + page.definitionLevels = page.definitionLevels[:0] + + if c.maxRepetitionLevel > 0 { + encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) + page.repetitionLevels, data, err = decodeLevelsV1(encoding, numValues, page.repetitionLevels, data) + if err != nil { + return nil, fmt.Errorf("decoding repetition levels of data page v1: %w", err) + } + } + + if c.maxDefinitionLevel > 0 { + encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel) + page.definitionLevels, data, err = decodeLevelsV1(encoding, numValues, page.definitionLevels, data) + if err != nil { + return nil, fmt.Errorf("decoding definition levels of data page v1: %w", err) + } + + // Data pages v1 did not embed the number of null values, + // so we have to compute it from the definition levels. + numValues -= int64(countLevelsNotEqual(page.definitionLevels, c.maxDefinitionLevel)) + } + + return c.decodeDataPage(header, numValues, page, data) +} + +// DecodeDataPageV2 decodes a data page from the header, compressed data, and +// optional dictionary passed as arguments. +func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, data []byte, dict Dictionary) (Page, error) { + return c.decodeDataPageV2(header, &dataPage{data: data, dictionary: dict}) +} + +func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *dataPage) (Page, error) { + var numValues = header.NumValues() + var err error + var data = page.data + page.repetitionLevels = page.repetitionLevels[:0] + page.definitionLevels = page.definitionLevels[:0] + + if c.maxRepetitionLevel > 0 { + encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) + length := header.RepetitionLevelsByteLength() + page.repetitionLevels, data, err = decodeLevelsV2(encoding, numValues, page.repetitionLevels, data, length) + if err != nil { + return nil, fmt.Errorf("decoding repetition levels of data page v2: %w", err) + } + } + + if c.maxDefinitionLevel > 0 { + encoding := lookupLevelEncoding(header.DefinitionLevelEncoding(), c.maxDefinitionLevel) + length := header.DefinitionLevelsByteLength() + page.definitionLevels, data, err = decodeLevelsV2(encoding, numValues, page.definitionLevels, data, length) + if err != nil { + return nil, fmt.Errorf("decoding definition levels of data page v2: %w", err) + } + } + + if isCompressed(c.compression) && header.IsCompressed() { + if err := page.decompress(c.compression, data); err != nil { + return nil, fmt.Errorf("decompressing data page v2: %w", err) + } + data = page.data + } + + numValues -= header.NumNulls() + return c.decodeDataPage(header, numValues, page, data) +} + +func (c *Column) decodeDataPage(header DataPageHeader, numValues int64, page *dataPage, data []byte) (Page, error) { + encoding := LookupEncoding(header.Encoding()) + pageType := c.Type() + + if isDictionaryEncoding(encoding) {
+ // In some legacy configurations, the PLAIN_DICTIONARY encoding is used + // on data page headers to indicate that the page contains indexes into + // the dictionary page, but the page is still encoded using the RLE + // encoding in this case, so we convert it to RLE_DICTIONARY. + pageType, encoding = Int32Type, &RLEDictionary + } + + if err := page.decode(pageType, encoding, data); err != nil { + return nil, err + } + + var newPage Page + if page.dictionary != nil { + newPage = newIndexedPage(page.dictionary, int16(c.index), int32(numValues), page.values) + } else { + newPage = pageType.NewPage(c.Index(), int(numValues), page.values) + } + switch { + case c.maxRepetitionLevel > 0: + newPage = newRepeatedPage(newPage.Buffer(), c.maxRepetitionLevel, c.maxDefinitionLevel, page.repetitionLevels, page.definitionLevels) + case c.maxDefinitionLevel > 0: + newPage = newOptionalPage(newPage.Buffer(), c.maxDefinitionLevel, page.definitionLevels) + } + return newPage, nil +} + +func decodeLevelsV1(enc encoding.Encoding, numValues int64, levels []int8, data []byte) ([]int8, []byte, error) { + if len(data) < 4 { + return nil, data, io.ErrUnexpectedEOF + } + i := 4 + j := 4 + int(binary.LittleEndian.Uint32(data)) + if j > len(data) { + return nil, data, io.ErrUnexpectedEOF + } + levels, err := decodeLevels(enc, numValues, levels, data[i:j]) + return levels, data[j:], err +} + +func decodeLevelsV2(enc encoding.Encoding, numValues int64, levels []int8, data []byte, length int64) ([]int8, []byte, error) { + if length > int64(len(data)) { + return nil, data, io.ErrUnexpectedEOF + } + levels, err := decodeLevels(enc, numValues, levels, data[:length]) + return levels, data[length:], err +} + +func decodeLevels(enc encoding.Encoding, numValues int64, levels []int8, data []byte) ([]int8, error) { + if cap(levels) < int(numValues) { + levels = make([]int8, numValues) + } + levels, err := enc.DecodeInt8(levels, data) + if err == nil { + switch { + case len(levels) < int(numValues): + err = fmt.Errorf("decoding levels: expected %d values but got only %d", numValues, len(levels)) + case len(levels) > int(numValues): + levels = levels[:numValues] + } + } + return levels, err +} + +// DecodeDictionary decodes a dictionary page from the header and compressed +// data passed as arguments. +func (c *Column) DecodeDictionary(header DictionaryPageHeader, data []byte) (Dictionary, error) { + return c.decodeDictionary(header, &dataPage{data: data}, &dictPage{}) +} + +func (c *Column) decodeDictionary(header DictionaryPageHeader, page *dataPage, dict *dictPage) (Dictionary, error) { + if isCompressed(c.compression) { + if err := page.decompress(c.compression, page.data); err != nil { + return nil, fmt.Errorf("decompressing dictionary page: %w", err) + } + } + + pageType := c.Type() + encoding := header.Encoding() + if encoding == format.PlainDictionary { + encoding = format.Plain + } + if err := page.decode(pageType, LookupEncoding(encoding), page.data); err != nil { + return nil, err + } + + dict.values = append(dict.values[:0], page.values...)
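+	// The copy above gives the dictionary its own backing memory, so it does
+	// not alias page.values, which is reused as a scratch buffer during
+	// decoding.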
+ return pageType.NewDictionary(int(c.index), int(header.NumValues()), dict.values), nil +} + var ( _ Node = (*Column)(nil) ) diff --git a/column_buffer.go b/column_buffer.go index 0a6fb07..70670fe 100644 --- a/column_buffer.go +++ b/column_buffer.go @@ -6,7 +6,6 @@ import ( "io" "sort" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/encoding/plain" ) @@ -673,7 +672,8 @@ type byteArrayColumnBuffer struct { func newByteArrayColumnBuffer(typ Type, columnIndex int16, bufferSize int) *byteArrayColumnBuffer { return &byteArrayColumnBuffer{ byteArrayPage: byteArrayPage{ - values: encoding.MakeByteArrayList(bufferSize / 16), + offsets: make([]uint32, 0, bufferSize/8), + values: make([]byte, 0, bufferSize/2), columnIndex: ^columnIndex, }, typ: typ, @@ -683,7 +683,8 @@ func newByteArrayColumnBuffer(typ Type, columnIndex int16, bufferSize int) *byte func (col *byteArrayColumnBuffer) Clone() ColumnBuffer { return &byteArrayColumnBuffer{ byteArrayPage: byteArrayPage{ - values: col.values.Clone(), + offsets: col.cloneOffsets(), + values: col.cloneValues(), columnIndex: col.columnIndex, }, typ: col.typ, @@ -708,15 +709,23 @@ func (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) } func (col *byteArrayColumnBuffer) Page() BufferedPage { return &col.byteArrayPage } -func (col *byteArrayColumnBuffer) Reset() { col.values.Reset() } +func (col *byteArrayColumnBuffer) Reset() { + col.offsets, col.values = col.offsets[:0], col.values[:0] +} -func (col *byteArrayColumnBuffer) Cap() int { return col.values.Cap() } +func (col *byteArrayColumnBuffer) Cap() int { return cap(col.offsets) } -func (col *byteArrayColumnBuffer) Len() int { return col.values.Len() } +func (col *byteArrayColumnBuffer) Len() int { return len(col.offsets) } -func (col *byteArrayColumnBuffer) Less(i, j int) bool { return col.values.Less(i, j) } +func (col *byteArrayColumnBuffer) Less(i, j int) bool { + a := col.valueAt(col.offsets[i]) + b := col.valueAt(col.offsets[j]) + return bytes.Compare(a, b) < 0 +} -func (col *byteArrayColumnBuffer) Swap(i, j int) { col.values.Swap(i, j) } +func (col *byteArrayColumnBuffer) Swap(i, j int) { + col.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i] +} func (col *byteArrayColumnBuffer) Write(b []byte) (int, error) { _, n, err := col.writeByteArrays(b) @@ -732,19 +741,20 @@ func (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) { return n, err } -func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (c, n int, err error) { - err = plain.RangeByteArrays(values, func(v []byte) error { - col.values.Push(v) - n += plain.ByteArrayLengthSize + len(v) - c++ +func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) { + baseCount, baseBytes := len(col.offsets), len(col.values) + + err = plain.RangeByteArrays(values, func(value []byte) error { + col.append(value) return nil }) - return c, n, err + + return len(col.offsets) - baseCount, len(col.values) - baseBytes, err } func (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) { - for _, v := range values { - col.values.Push(v.ByteArray()) + for _, value := range values { + col.append(value.ByteArray()) } return len(values), nil } @@ -753,12 +763,12 @@ func (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n i := int(offset) switch { case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(col.values.Len())) - case i >= col.values.Len(): + return 0, 
errRowIndexOutOfBounds(offset, int64(len(col.offsets))) + case i >= len(col.offsets): return 0, io.EOF default: - for n < len(values) && i < col.values.Len() { - values[n] = makeValueBytes(ByteArray, col.values.Index(i)) + for n < len(values) && i < len(col.offsets) { + values[n] = makeValueBytes(ByteArray, col.valueAt(col.offsets[i])) values[n].columnIndex = col.columnIndex n++ i++ diff --git a/column_buffer_go18.go b/column_buffer_go18.go index 8c21354..c5756e5 100644 --- a/column_buffer_go18.go +++ b/column_buffer_go18.go @@ -6,7 +6,7 @@ import ( "fmt" "io" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) type columnBuffer[T primitive] struct { @@ -76,7 +76,7 @@ func (col *columnBuffer[T]) Write(b []byte) (int, error) { if (len(b) % sizeof[T]()) != 0 { return 0, fmt.Errorf("cannot write %s values from input of size %d", col.class.name, len(b)) } - n, err := col.WriteRequired(cast.BytesToSlice[T](b)) + n, err := col.WriteRequired(unsafecast.BytesToSlice[T](b)) return sizeof[T]() * n, err } diff --git a/column_index.go b/column_index.go index 3f36bcb..62e536d 100644 --- a/column_index.go +++ b/column_index.go @@ -1,7 +1,7 @@ package parquet import ( - "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/bits" ) @@ -209,8 +209,8 @@ func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, type byteArrayColumnIndexer struct { baseColumnIndexer sizeLimit int - minValues encoding.ByteArrayList - maxValues encoding.ByteArrayList + minValues []byte + maxValues []byte } func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { @@ -219,23 +219,25 @@ func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { func (i *byteArrayColumnIndexer) Reset() { i.reset() - i.minValues.Reset() - i.maxValues.Reset() + i.minValues = i.minValues[:0] + i.maxValues = i.maxValues[:0] } func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { i.observe(numValues, numNulls) - i.minValues.Push(min.ByteArray()) - i.maxValues.Push(max.ByteArray()) + minValue := min.ByteArray() + maxValue := max.ByteArray() + if i.sizeLimit > 0 { + minValue = truncateLargeMinByteArrayValue(minValue, i.sizeLimit) + maxValue = truncateLargeMaxByteArrayValue(maxValue, i.sizeLimit) + } + i.minValues = plain.AppendByteArray(i.minValues, minValue) + i.maxValues = plain.AppendByteArray(i.maxValues, maxValue) } func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := i.minValues.Split() - maxValues := i.maxValues.Split() - if i.sizeLimit > 0 { - truncateLargeMinByteArrayValues(minValues, i.sizeLimit) - truncateLargeMaxByteArrayValues(maxValues, i.sizeLimit) - } + minValues := splitByteArrays(i.minValues) + maxValues := splitByteArrays(i.maxValues) return i.columnIndex( minValues, maxValues, @@ -244,70 +246,6 @@ func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { ) } -func truncateLargeMinByteArrayValues(values [][]byte, sizeLimit int) { - for i, v := range values { - if len(v) > sizeLimit { - values[i] = v[:sizeLimit] - } - } -} - -func truncateLargeMaxByteArrayValues(values [][]byte, sizeLimit int) { - if !hasLongerValuesThanSizeLimit(values, sizeLimit) { - return - } - - // Rather than allocating a new byte slice for each value that exceeds the - // limit, a single buffer is allocated to hold all 
the values. This makes - // the GC cost of this function a constant rather than being linear to the - // number of values in the input slice. - b := make([]byte, len(values)*sizeLimit) - - for i, v := range values { - if len(v) > sizeLimit { - // If v is the max value we cannot truncate it since there are no - // shorter byte sequence with a greater value. This condition should - // never occur unless the input was especially constructed to trigger - // it. - if !isMaxByteArrayValue(v) { - j := (i + 0) * sizeLimit - k := (i + 1) * sizeLimit - x := b[j:k:k] - copy(x, v) - values[i] = nextByteArrayValue(x) - } - } - } -} - -func hasLongerValuesThanSizeLimit(values [][]byte, sizeLimit int) bool { - for _, v := range values { - if len(v) > sizeLimit { - return true - } - } - return false -} - -func isMaxByteArrayValue(value []byte) bool { - for i := range value { - if value[i] != 0xFF { - return false - } - } - return true -} - -func nextByteArrayValue(value []byte) []byte { - for i := len(value) - 1; i > 0; i-- { - if value[i]++; value[i] != 0 { - break - } - // Overflow: increment the next byte - } - return value -} - type fixedLenByteArrayColumnIndexer struct { baseColumnIndexer size int @@ -336,11 +274,15 @@ func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, mi } func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { - minValues := splitFixedLenByteArrayList(i.size, i.minValues) - maxValues := splitFixedLenByteArrayList(i.size, i.maxValues) - if i.sizeLimit > 0 && i.sizeLimit < i.size { - truncateLargeMinByteArrayValues(minValues, i.sizeLimit) - truncateLargeMaxByteArrayValues(maxValues, i.sizeLimit) + minValues := splitFixedLenByteArrays(i.minValues, i.size) + maxValues := splitFixedLenByteArrays(i.maxValues, i.size) + if sizeLimit := i.sizeLimit; sizeLimit > 0 { + for i, v := range minValues { + minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) + } + for i, v := range maxValues { + maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) + } } return i.columnIndex( minValues, @@ -350,7 +292,47 @@ func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { ) } -func splitFixedLenByteArrayList(size int, data []byte) [][]byte { +func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte { + if len(value) > sizeLimit { + value = value[:sizeLimit] + } + return value +} + +func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte { + if len(value) > sizeLimit && !isMaxByteArrayValue(value) { + value = value[:sizeLimit] + } + return value +} + +func isMaxByteArrayValue(value []byte) bool { + for i := range value { + if value[i] != 0xFF { + return false + } + } + return true +} + +func splitByteArrays(data []byte) [][]byte { + length := 0 + plain.RangeByteArrays(data, func([]byte) error { + length++ + return nil + }) + buffer := make([]byte, 0, len(data)-(4*length)) + values := make([][]byte, 0, length) + plain.RangeByteArrays(data, func(value []byte) error { + offset := len(buffer) + buffer = append(buffer, value...) 
+ values = append(values, buffer[offset:]) + return nil + }) + return values +} + +func splitFixedLenByteArrays(data []byte, size int) [][]byte { data = copyBytes(data) values := make([][]byte, len(data)/size) for i := range values { diff --git a/column_index_default.go b/column_index_default.go index a1f7c83..33ae35e 100644 --- a/column_index_default.go +++ b/column_index_default.go @@ -112,8 +112,8 @@ func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Val func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(1, bits.BoolToBytes(i.minValues)), - splitFixedLenByteArrayList(1, bits.BoolToBytes(i.maxValues)), + splitFixedLenByteArrays(bits.BoolToBytes(i.minValues), 1), + splitFixedLenByteArrays(bits.BoolToBytes(i.maxValues), 1), bits.OrderOfBool(i.minValues), bits.OrderOfBool(i.maxValues), ) @@ -143,8 +143,8 @@ func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(4, bits.Int32ToBytes(i.minValues)), - splitFixedLenByteArrayList(4, bits.Int32ToBytes(i.maxValues)), + splitFixedLenByteArrays(bits.Int32ToBytes(i.minValues), 4), + splitFixedLenByteArrays(bits.Int32ToBytes(i.maxValues), 4), bits.OrderOfInt32(i.minValues), bits.OrderOfInt32(i.maxValues), ) @@ -174,8 +174,8 @@ func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(8, bits.Int64ToBytes(i.minValues)), - splitFixedLenByteArrayList(8, bits.Int64ToBytes(i.maxValues)), + splitFixedLenByteArrays(bits.Int64ToBytes(i.minValues), 8), + splitFixedLenByteArrays(bits.Int64ToBytes(i.maxValues), 8), bits.OrderOfInt64(i.minValues), bits.OrderOfInt64(i.maxValues), ) @@ -205,8 +205,8 @@ func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(12, deprecated.Int96ToBytes(i.minValues)), - splitFixedLenByteArrayList(12, deprecated.Int96ToBytes(i.maxValues)), + splitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12), + splitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 12), deprecated.OrderOfInt96(i.minValues), deprecated.OrderOfInt96(i.maxValues), ) @@ -236,8 +236,8 @@ func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(4, bits.Float32ToBytes(i.minValues)), - splitFixedLenByteArrayList(4, bits.Float32ToBytes(i.maxValues)), + splitFixedLenByteArrays(bits.Float32ToBytes(i.minValues), 4), + splitFixedLenByteArrays(bits.Float32ToBytes(i.maxValues), 4), bits.OrderOfFloat32(i.minValues), bits.OrderOfFloat32(i.maxValues), ) @@ -267,8 +267,8 @@ func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Valu func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { return i.columnIndex( - splitFixedLenByteArrayList(8, bits.Float64ToBytes(i.minValues)), - splitFixedLenByteArrayList(8, bits.Float64ToBytes(i.maxValues)), + splitFixedLenByteArrays(bits.Float64ToBytes(i.minValues), 8), + splitFixedLenByteArrays(bits.Float64ToBytes(i.maxValues), 8), bits.OrderOfFloat64(i.minValues), bits.OrderOfFloat64(i.maxValues), ) @@ -284,8 +284,8 @@ func (i uint32ColumnIndexer) ColumnIndex() 
format.ColumnIndex { minValues := bits.Int32ToUint32(i.minValues) maxValues := bits.Int32ToUint32(i.maxValues) return i.columnIndex( - splitFixedLenByteArrayList(4, bits.Uint32ToBytes(minValues)), - splitFixedLenByteArrayList(4, bits.Uint32ToBytes(maxValues)), + splitFixedLenByteArrays(bits.Uint32ToBytes(minValues), 4), + splitFixedLenByteArrays(bits.Uint32ToBytes(maxValues), 4), bits.OrderOfUint32(minValues), bits.OrderOfUint32(maxValues), ) @@ -301,8 +301,8 @@ func (i uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { minValues := bits.Int64ToUint64(i.minValues) maxValues := bits.Int64ToUint64(i.maxValues) return i.columnIndex( - splitFixedLenByteArrayList(8, bits.Uint64ToBytes(minValues)), - splitFixedLenByteArrayList(8, bits.Uint64ToBytes(maxValues)), + splitFixedLenByteArrays(bits.Uint64ToBytes(minValues), 8), + splitFixedLenByteArrays(bits.Uint64ToBytes(maxValues), 8), bits.OrderOfUint64(minValues), bits.OrderOfUint64(maxValues), ) diff --git a/column_index_go18.go b/column_index_go18.go index 46885ce..d89993a 100644 --- a/column_index_go18.go +++ b/column_index_go18.go @@ -4,7 +4,7 @@ package parquet import ( "github.com/segmentio/parquet-go/format" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) type columnIndex[T primitive] struct{ page *page[T] } @@ -44,8 +44,8 @@ func (i *columnIndexer[T]) IndexPage(numValues, numNulls int64, min, max Value) } func (i *columnIndexer[T]) ColumnIndex() format.ColumnIndex { - minValues := splitFixedLenByteArrayList(sizeof[T](), cast.SliceToBytes(i.minValues)) - maxValues := splitFixedLenByteArrayList(sizeof[T](), cast.SliceToBytes(i.maxValues)) + minValues := splitFixedLenByteArrays(unsafecast.SliceToBytes(i.minValues), sizeof[T]()) + maxValues := splitFixedLenByteArrays(unsafecast.SliceToBytes(i.maxValues), sizeof[T]()) minOrder := i.class.order(i.minValues) maxOrder := i.class.order(i.maxValues) return format.ColumnIndex{ diff --git a/column_reader.go b/column_reader.go deleted file mode 100644 index 3ce8b5a..0000000 --- a/column_reader.go +++ /dev/null @@ -1,458 +0,0 @@ -package parquet - -import ( - "fmt" - "io" - - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/encoding/plain" - "github.com/segmentio/parquet-go/internal/bits" -) - -// ColumnReader is an interface implemented by types which support reading -// columns of values. The interface extends ValueReader to work on top of -// parquet encodings. -// -// Implementations of ColumnReader may also provide extensions that the -// application can detect using type assertions. For example, readers for -// columns of INT32 values may implement the parquet.Int32Reader interface -// as a mechanism to provide a type safe and more efficient access to the -// column values. -type ColumnReader interface { - ValueReader - - // Returns the type of values read. - Type() Type - - // Returns the column number of values read. - Column() int - - // Resets the reader state to read numValues values from the given decoder. - // - // Column readers created from parquet types are initialized to an empty - // state and will return io.EOF on every read until a decoder is installed - // via a call to Reset. 
- Reset(numValues int, decoder encoding.Decoder) -} - -type fileColumnReader struct { - remain int - numValues int - maxRepetitionLevel int8 - maxDefinitionLevel int8 - repetitions levelReader - definitions levelReader - values ColumnReader -} - -func newFileColumnReader(values ColumnReader, maxRepetitionLevel, maxDefinitionLevel int8, bufferSize int) *fileColumnReader { - repetitionBufferSize := 0 - definitionBufferSize := 0 - - switch { - case maxRepetitionLevel > 0 && maxDefinitionLevel > 0: - repetitionBufferSize = bufferSize / 2 - definitionBufferSize = bufferSize / 2 - - case maxRepetitionLevel > 0: - repetitionBufferSize = bufferSize - - case maxDefinitionLevel > 0: - definitionBufferSize = bufferSize - } - - return &fileColumnReader{ - maxRepetitionLevel: maxRepetitionLevel, - maxDefinitionLevel: maxDefinitionLevel, - repetitions: makeLevelReader(repetitionBufferSize), - definitions: makeLevelReader(definitionBufferSize), - values: values, - } -} - -func (r *fileColumnReader) Type() Type { return r.values.Type() } - -func (r *fileColumnReader) Column() int { return r.values.Column() } - -func (r *fileColumnReader) ReadValues(values []Value) (int, error) { - if r.values == nil { - return 0, io.EOF - } - read := 0 - columnIndex := ^int16(r.Column()) - - for r.remain > 0 && len(values) > 0 { - var err error - var repetitionLevels []int8 - var definitionLevels []int8 - var numValues = r.remain - - if len(values) < numValues { - numValues = len(values) - } - - if r.maxRepetitionLevel > 0 { - repetitionLevels, err = r.repetitions.peekLevels() - if err != nil { - return read, fmt.Errorf("decoding repetition level from data page of column %d: %w", r.Column(), err) - } - if len(repetitionLevels) < numValues { - numValues = len(repetitionLevels) - } - } - - if r.maxDefinitionLevel > 0 { - definitionLevels, err = r.definitions.peekLevels() - if err != nil { - return read, fmt.Errorf("decoding definition level from data page of column %d: %w", r.Column(), err) - } - if len(definitionLevels) < numValues { - numValues = len(definitionLevels) - } - } - - if len(repetitionLevels) > 0 { - repetitionLevels = repetitionLevels[:numValues] - } - if len(definitionLevels) > 0 { - definitionLevels = definitionLevels[:numValues] - } - numNulls := countLevelsNotEqual(definitionLevels, r.maxDefinitionLevel) - wantRead := numValues - numNulls - n, err := r.values.ReadValues(values[:wantRead]) - if n < wantRead && err != nil { - return read, fmt.Errorf("read error after decoding %d/%d values from data page of column %d: %w", r.numValues-r.remain, r.numValues, r.Column(), err) - } - - for i, j := n-1, len(definitionLevels)-1; j >= 0; j-- { - if definitionLevels[j] != r.maxDefinitionLevel { - values[j] = Value{columnIndex: columnIndex} - } else { - values[j] = values[i] - i-- - } - } - - for i, lvl := range repetitionLevels { - values[i].repetitionLevel = lvl - } - - for i, lvl := range definitionLevels { - values[i].definitionLevel = lvl - } - - values = values[numValues:] - r.repetitions.discardLevels(len(repetitionLevels)) - r.definitions.discardLevels(len(definitionLevels)) - r.remain -= numValues - read += numValues - } - - if r.remain == 0 { - return read, io.EOF - } - - return read, nil -} - -func (r *fileColumnReader) reset(numValues int, repetitions, definitions, values encoding.Decoder) { - if repetitions != nil { - repetitions.SetBitWidth(bits.Len8(r.maxRepetitionLevel)) - } - if definitions != nil { - definitions.SetBitWidth(bits.Len8(r.maxDefinitionLevel)) - } - r.remain = numValues - r.numValues = 
numValues - r.repetitions.reset(repetitions) - r.definitions.reset(definitions) - r.values.Reset(numValues, values) -} - -func (r *fileColumnReader) Reset(int, encoding.Decoder) { - panic("BUG: parquet.fileColumnReader.Reset must not be called") -} - -type levelReader struct { - decoder encoding.Decoder - levels []int8 - offset int - count int -} - -func makeLevelReader(bufferSize int) levelReader { - return levelReader{ - levels: make([]int8, 0, bufferSize), - } -} - -func (r *levelReader) readLevel() (int8, error) { - for { - if r.offset < len(r.levels) { - lvl := r.levels[r.offset] - r.offset++ - return lvl, nil - } - if err := r.decodeLevels(); err != nil { - return -1, err - } - } -} - -func (r *levelReader) peekLevels() ([]int8, error) { - if r.offset == len(r.levels) { - if err := r.decodeLevels(); err != nil { - return nil, err - } - } - return r.levels[r.offset:], nil -} - -func (r *levelReader) discardLevels(n int) { - remain := len(r.levels) - r.offset - switch { - case n > remain: - panic("BUG: cannot discard more levels than buffered") - case n == remain: - r.levels = r.levels[:0] - r.offset = 0 - default: - r.offset += n - } -} - -func (r *levelReader) decodeLevels() error { - n, err := r.decoder.DecodeInt8(r.levels[:cap(r.levels)]) - if n == 0 { - return err - } - r.levels = r.levels[:n] - r.offset = 0 - r.count += n - return nil -} - -func (r *levelReader) reset(decoder encoding.Decoder) { - r.decoder = decoder - r.levels = r.levels[:0] - r.offset = 0 - r.count = 0 -} - -type byteArrayColumnReader struct { - typ Type - decoder encoding.Decoder - buffer encoding.ByteArrayList - offset int - remain int - columnIndex int16 -} - -func newByteArrayColumnReader(typ Type, columnIndex int16, bufferSize int) *byteArrayColumnReader { - return &byteArrayColumnReader{ - typ: typ, - buffer: encoding.MakeByteArrayList(atLeastOne(bufferSize / 16)), - columnIndex: ^columnIndex, - } -} - -func (r *byteArrayColumnReader) Type() Type { return r.typ } - -func (r *byteArrayColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *byteArrayColumnReader) readByteArrays(do func([]byte) bool) (n int, err error) { - for { - for r.remain > 0 && r.offset < r.buffer.Len() { - if !do(r.buffer.Index(r.offset)) { - return n, nil - } - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - - r.buffer.Reset() - r.offset = 0 - - d, err := r.decoder.DecodeByteArray(&r.buffer) - if d == 0 { - return n, err - } - } -} - -func (r *byteArrayColumnReader) ReadRequired(values []byte) (int, error) { - return r.ReadByteArrays(values) -} - -func (r *byteArrayColumnReader) ReadByteArrays(values []byte) (int, error) { - i := 0 - n, err := r.readByteArrays(func(b []byte) bool { - k := plain.ByteArrayLengthSize + len(b) - if k > (len(values) - i) { - return false - } - plain.PutByteArrayLength(values[i:], len(b)) - copy(values[i+plain.ByteArrayLengthSize:], b) - i += k - return true - }) - if i == 0 && err == nil { - err = io.ErrShortBuffer - } - return n, err -} - -func (r *byteArrayColumnReader) ReadValues(values []Value) (int, error) { - i := 0 - return r.readByteArrays(func(b []byte) (ok bool) { - if ok = i < len(values); ok { - values[i] = makeValueBytes(ByteArray, copyBytes(b)) - values[i].columnIndex = r.columnIndex - i++ - } - return ok - }) -} - -func (r *byteArrayColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer.Reset() - r.offset = 0 - r.remain = numValues -} - -type fixedLenByteArrayColumnReader struct { - 
typ Type - decoder encoding.Decoder - buffer []byte - offset int - remain int - size int - bufferSize int - columnIndex int16 -} - -func newFixedLenByteArrayColumnReader(typ Type, columnIndex int16, bufferSize int) *fixedLenByteArrayColumnReader { - return &fixedLenByteArrayColumnReader{ - typ: typ, - size: typ.Length(), - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *fixedLenByteArrayColumnReader) Type() Type { return r.typ } - -func (r *fixedLenByteArrayColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *fixedLenByteArrayColumnReader) ReadRequired(values []byte) (int, error) { - return r.ReadFixedLenByteArrays(values) -} - -func (r *fixedLenByteArrayColumnReader) ReadFixedLenByteArrays(values []byte) (n int, err error) { - if (len(values) % r.size) != 0 { - return 0, fmt.Errorf("cannot read FIXED_LEN_BYTE_ARRAY values of size %d into buffer of size %d", r.size, len(values)) - } - if r.offset < len(r.buffer) { - i := copy(values, r.buffer[r.offset:]) - n = i / r.size - r.offset += i - r.remain -= i - values = values[i:] - } - if r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeFixedLenByteArray(r.size, values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *fixedLenByteArrayColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]byte, 0, atLeast((r.bufferSize/r.size)*r.size, r.size)) - } - - for { - for (r.offset+r.size) <= len(r.buffer) && n < len(values) { - values[n] = makeValueBytes(FixedLenByteArray, copyBytes(r.buffer[r.offset:r.offset+r.size])) - values[n].columnIndex = r.columnIndex - r.offset += r.size - r.remain -= r.size - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeFixedLenByteArray(r.size, buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d*r.size] - r.offset = 0 - } -} - -func (r *fixedLenByteArrayColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = r.size * numValues -} - -type nullColumnReader struct { - typ Type - remain int - columnIndex int16 -} - -func newNullColumnReader(typ Type, columnIndex int16) *nullColumnReader { - return &nullColumnReader{ - typ: typ, - columnIndex: ^columnIndex, - } -} - -func (r *nullColumnReader) Type() Type { - return r.typ -} - -func (r *nullColumnReader) Column() int { - return int(^r.columnIndex) -} - -func (r *nullColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.remain = numValues -} - -func (r *nullColumnReader) ReadValues(values []Value) (n int, err error) { - values = values[:min(r.remain, len(values))] - for i := range values { - values[i] = Value{columnIndex: r.columnIndex} - } - r.remain -= len(values) - if r.remain == 0 { - err = io.EOF - } - return len(values), err -} diff --git a/column_reader_default.go b/column_reader_default.go deleted file mode 100644 index 97887a4..0000000 --- a/column_reader_default.go +++ /dev/null @@ -1,533 +0,0 @@ -//go:build !go1.18 - -package parquet - -import ( - "io" - - "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" -) - -// The types below are implementations of the ColumnReader interface for each -// primitive 
type supported by parquet. -// -// The readers use an in-memory intermediary buffer to support decoding arrays -// of values from the underlying decoder, which are then boxed into the []Value -// buffer passed to ReadValues. When the program converts type checks the -// readers for more specific interfaces (e.g. parquet.Int32Reader), the values -// can be decoded directly from the underlying decoder. There is no need for -// the intermediary buffers so they are lazily allocated only if the ReadValues -// methods are called. - -type booleanColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []bool - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newBooleanColumnReader(typ Type, columnIndex int16, bufferSize int) *booleanColumnReader { - return &booleanColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *booleanColumnReader) Type() Type { return r.typ } - -func (r *booleanColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *booleanColumnReader) ReadBooleans(values []bool) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - values = values[:min(r.remain, len(values))] - d, err := r.decoder.DecodeBoolean(values) - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - return n + d, err -} - -func (r *booleanColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]bool, 0, atLeastOne(r.bufferSize)) - } - - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValueBoolean(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeBoolean(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *booleanColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -type int32ColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []int32 - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newInt32ColumnReader(typ Type, columnIndex int16, bufferSize int) *int32ColumnReader { - return &int32ColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *int32ColumnReader) Type() Type { return r.typ } - -func (r *int32ColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *int32ColumnReader) ReadInt32s(values []int32) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeInt32(values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *int32ColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]int32, 0, atLeastOne(r.bufferSize)) - } - - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = 
makeValueInt32(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeInt32(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *int32ColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -type int64ColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []int64 - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newInt64ColumnReader(typ Type, columnIndex int16, bufferSize int) *int64ColumnReader { - return &int64ColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *int64ColumnReader) Type() Type { return r.typ } - -func (r *int64ColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *int64ColumnReader) ReadInt64s(values []int64) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeInt64(values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *int64ColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]int64, 0, atLeastOne(r.bufferSize)) - } - - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValueInt64(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeInt64(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *int64ColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -type int96ColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []deprecated.Int96 - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newInt96ColumnReader(typ Type, columnIndex int16, bufferSize int) *int96ColumnReader { - return &int96ColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *int96ColumnReader) Type() Type { return r.typ } - -func (r *int96ColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *int96ColumnReader) ReadInt96s(values []deprecated.Int96) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeInt96(values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *int96ColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]deprecated.Int96, 0, atLeastOne(r.bufferSize)) - } - - for 
{ - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValueInt96(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeInt96(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *int96ColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -type floatColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []float32 - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newFloatColumnReader(typ Type, columnIndex int16, bufferSize int) *floatColumnReader { - return &floatColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *floatColumnReader) Type() Type { return r.typ } - -func (r *floatColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *floatColumnReader) ReadFloats(values []float32) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeFloat(values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *floatColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]float32, 0, atLeastOne(r.bufferSize)) - } - - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValueFloat(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeFloat(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *floatColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -type doubleColumnReader struct { - typ Type - decoder encoding.Decoder - buffer []float64 - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newDoubleColumnReader(typ Type, columnIndex int16, bufferSize int) *doubleColumnReader { - return &doubleColumnReader{ - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *doubleColumnReader) Type() Type { return r.typ } - -func (r *doubleColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *doubleColumnReader) ReadDoubles(values []float64) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - err = io.EOF - } else { - var d int - values = values[:min(r.remain, len(values))] - d, err = r.decoder.DecodeDouble(values) - n += d - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - } - return n, err -} - -func (r *doubleColumnReader) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = 
make([]float64, 0, atLeastOne(r.bufferSize)) - } - - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValueDouble(r.buffer[r.offset]) - values[n].columnIndex = r.columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.decoder.DecodeDouble(buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *doubleColumnReader) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - -var ( - _ BooleanReader = (*booleanColumnReader)(nil) - _ Int32Reader = (*int32ColumnReader)(nil) - _ Int64Reader = (*int64ColumnReader)(nil) - _ Int96Reader = (*int96ColumnReader)(nil) - _ FloatReader = (*floatColumnReader)(nil) - _ DoubleReader = (*doubleColumnReader)(nil) - _ ByteArrayReader = (*byteArrayColumnReader)(nil) - _ FixedLenByteArrayReader = (*fixedLenByteArrayColumnReader)(nil) -) diff --git a/column_reader_go18.go b/column_reader_go18.go deleted file mode 100644 index aa6f470..0000000 --- a/column_reader_go18.go +++ /dev/null @@ -1,93 +0,0 @@ -//go:build go1.18 - -package parquet - -import ( - "io" - - "github.com/segmentio/parquet-go/encoding" -) - -type columnReader[T primitive] struct { - class *class[T] - typ Type - decoder encoding.Decoder - buffer []T - offset int - remain int - bufferSize int - columnIndex int16 -} - -func newColumnReader[T primitive](typ Type, columnIndex int16, bufferSize int, class *class[T]) *columnReader[T] { - return &columnReader[T]{ - class: class, - typ: typ, - bufferSize: bufferSize, - columnIndex: ^columnIndex, - } -} - -func (r *columnReader[T]) Type() Type { return r.typ } - -func (r *columnReader[T]) Column() int { return int(^r.columnIndex) } - -func (r *columnReader[T]) ReadRequired(values []T) (n int, err error) { - if r.offset < len(r.buffer) { - n = copy(values, r.buffer[r.offset:]) - r.offset += n - r.remain -= n - values = values[n:] - } - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - d, err := r.class.decode(r.decoder, values) - r.remain -= d - if r.remain == 0 && err == nil { - err = io.EOF - } - return n + d, err -} - -func (r *columnReader[T]) ReadValues(values []Value) (n int, err error) { - if cap(r.buffer) == 0 { - r.buffer = make([]T, 0, atLeastOne(r.bufferSize)) - } - - makeValue := r.class.makeValue - columnIndex := r.columnIndex - for { - for r.offset < len(r.buffer) && n < len(values) { - values[n] = makeValue(r.buffer[r.offset]) - values[n].columnIndex = columnIndex - r.offset++ - r.remain-- - n++ - } - - if r.remain == 0 || r.decoder == nil { - return n, io.EOF - } - if n == len(values) { - return n, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - d, err := r.class.decode(r.decoder, buffer) - if d == 0 { - return n, err - } - - r.buffer = buffer[:d] - r.offset = 0 - } -} - -func (r *columnReader[T]) Reset(numValues int, decoder encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} diff --git a/column_test.go b/column_test.go index c522023..f5085a1 100644 --- a/column_test.go +++ b/column_test.go @@ -5,7 +5,6 @@ import ( "math/rand" "testing" "testing/quick" - "time" "github.com/google/uuid" "github.com/segmentio/parquet-go" @@ -103,7 +102,9 @@ func TestColumnPageIndex(t *testing.T) 
{ }, } { t.Run(test.scenario, func(t *testing.T) { - if err := quick.Check(test.function(t), nil); err != nil { + if err := quick.Check(test.function(t), &quick.Config{ + Rand: rand.New(rand.NewSource(0)), + }); err != nil { t.Error(err) } }) @@ -154,10 +155,10 @@ func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error { indexMax := columnIndex.MaxValue(pagesRead) if !parquet.Equal(pageMin, indexMin) { - return fmt.Errorf("max page value mismatch: index=%x page=%x", indexMin, pageMin) + return fmt.Errorf("min page value mismatch: index=%q page=%q", indexMin, pageMin) } if !parquet.Equal(pageMax, indexMax) { - return fmt.Errorf("max page value mismatch: index=%x page=%x", indexMax, pageMax) + return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax) } numNulls := int64(0) @@ -250,9 +251,11 @@ func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error { func testColumnPageIndexWithFile(t *testing.T, rows rows) bool { if len(rows) > 0 { - r := rand.New(rand.NewSource(time.Now().UnixNano())) - size := parquet.PageBufferSize(r.Intn(49) + 1) - f, err := createParquetFile(rows, size) + r := rand.New(rand.NewSource(5)) + f, err := createParquetFile(rows, + parquet.PageBufferSize(r.Intn(49)+1), + parquet.ColumnIndexSizeLimit(4096), + ) if err != nil { t.Error(err) return false @@ -427,15 +430,15 @@ func newColumnStats(columnType parquet.Type) *columnStats { func (c *columnStats) observe(value parquet.Value) { if c.page >= len(c.minValues) { - c.minValues = append(c.minValues, value) + c.minValues = append(c.minValues, value.Clone()) } else if c.columnType.Compare(c.minValues[c.page], value) > 0 { - c.minValues[c.page] = value + c.minValues[c.page] = value.Clone() } if c.page >= len(c.maxValues) { - c.maxValues = append(c.maxValues, value) + c.maxValues = append(c.maxValues, value.Clone()) } else if c.columnType.Compare(c.maxValues[c.page], value) < 0 { - c.maxValues[c.page] = value + c.maxValues[c.page] = value.Clone() } } diff --git a/compress.go b/compress.go index 2757878..f942b72 100644 --- a/compress.go +++ b/compress.go @@ -90,3 +90,7 @@ func (u *unsupported) Decode(dst, src []byte) ([]byte, error) { func (u *unsupported) error() error { return fmt.Errorf("unsupported compression codec: %s", u.codec) } + +func isCompressed(c compress.Codec) bool { + return c != nil && c.CompressionCodec() != format.Uncompressed +} diff --git a/dictionary.go b/dictionary.go index a292849..fd026cf 100644 --- a/dictionary.go +++ b/dictionary.go @@ -2,7 +2,6 @@ package parquet import ( "bytes" - "fmt" "io" "github.com/segmentio/parquet-go/encoding" @@ -68,58 +67,33 @@ type byteArrayDictionary struct { index map[string]int32 } -func newByteArrayDictionary(typ Type, columnIndex int16, bufferSize int) *byteArrayDictionary { +func newByteArrayDictionary(typ Type, columnIndex int16, numValues int32, values []byte) *byteArrayDictionary { return &byteArrayDictionary{ typ: typ, byteArrayPage: byteArrayPage{ - values: encoding.MakeByteArrayList(dictCap(bufferSize, 16)), + offsets: makeByteArrayOffsets(numValues, values), + values: values, columnIndex: ^columnIndex, }, } } -func readByteArrayDictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (Dictionary, error) { - d := &byteArrayDictionary{ - typ: typ, - byteArrayPage: byteArrayPage{ - values: encoding.MakeByteArrayList(atLeastOne(numValues)), - columnIndex: ^columnIndex, - }, - } - - for { - if d.values.Len() == d.values.Cap() { - d.values.Grow(d.values.Len()) - } - _, err :=
decoder.DecodeByteArray(&d.values) - if err != nil { - if err == io.EOF { - err = nil - } - return d, err - } - } -} - func (d *byteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) } -func (d *byteArrayDictionary) Len() int { return d.values.Len() } +func (d *byteArrayDictionary) Len() int { return len(d.offsets) } func (d *byteArrayDictionary) Index(i int32) Value { - return makeValueBytes(ByteArray, d.values.Index(int(i))) + return makeValueBytes(ByteArray, d.valueAt(d.offsets[i])) } func (d *byteArrayDictionary) Insert(indexes []int32, values []Value) { _ = indexes[:len(values)] if d.index == nil { - index := int32(0) - d.index = make(map[string]int32, d.values.Cap()) - d.values.Range(func(v []byte) bool { - d.index[bits.BytesToString(v)] = index - index++ - return true - }) + d.index = make(map[string]int32, cap(d.offsets)) + for index, offset := range d.offsets { + d.index[bits.BytesToString(d.valueAt(offset))] = int32(index) + } } for i, v := range values { @@ -127,9 +101,9 @@ func (d *byteArrayDictionary) Insert(indexes []int32, values []Value) { index, exists := d.index[string(value)] if !exists { - d.values.Push(value) - index = int32(d.values.Len() - 1) - stringValue := bits.BytesToString(d.values.Index(int(index))) + index = int32(len(d.offsets)) + d.append(value) + stringValue := bits.BytesToString(d.valueAt(d.offsets[index])) d.index[stringValue] = index } @@ -145,11 +119,11 @@ func (d *byteArrayDictionary) Lookup(indexes []int32, values []Value) { func (d *byteArrayDictionary) Bounds(indexes []int32) (min, max Value) { if len(indexes) > 0 { - minValue := d.values.Index(int(indexes[0])) + minValue := d.valueAt(d.offsets[indexes[0]]) maxValue := minValue for _, i := range indexes[1:] { - value := d.values.Index(int(i)) + value := d.valueAt(d.offsets[i]) switch { case bytes.Compare(value, minValue) < 0: minValue = value @@ -165,7 +139,8 @@ func (d *byteArrayDictionary) Bounds(indexes []int32) (min, max Value) { } func (d *byteArrayDictionary) Reset() { - d.values.Reset() + d.offsets = d.offsets[:0] + d.values = d.values[:0] d.index = nil } @@ -179,52 +154,18 @@ type fixedLenByteArrayDictionary struct { index map[string]int32 } -func newFixedLenByteArrayDictionary(typ Type, columnIndex int16, bufferSize int) *fixedLenByteArrayDictionary { +func newFixedLenByteArrayDictionary(typ Type, columnIndex int16, numValues int32, data []byte) *fixedLenByteArrayDictionary { size := typ.Length() return &fixedLenByteArrayDictionary{ typ: typ, fixedLenByteArrayPage: fixedLenByteArrayPage{ size: size, - data: make([]byte, 0, dictCap(bufferSize, size)*size), + data: data, columnIndex: ^columnIndex, }, } } -func readFixedLenByteArrayDictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (Dictionary, error) { - size := typ.Length() - - d := &fixedLenByteArrayDictionary{ - typ: typ, - fixedLenByteArrayPage: fixedLenByteArrayPage{ - size: size, - data: make([]byte, 0, atLeastOne(numValues)*size), - columnIndex: ^columnIndex, - }, - } - - for { - if len(d.data) == cap(d.data) { - newValues := make([]byte, len(d.data), 2*cap(d.data)) - copy(newValues, d.data) - d.data = newValues - } - - n, err := decoder.DecodeFixedLenByteArray(d.size, d.data[len(d.data):cap(d.data)]) - if n > 0 { - d.data = d.data[:len(d.data)+(n*d.size)] - } - - if err == io.EOF { - return d, nil - } - if err != nil { - return nil, fmt.Errorf("reading parquet dictionary of fixed-length binary values of size %d: %w", d.size, err) - } - } - -} - func (d *fixedLenByteArrayDictionary) Type() Type 
{ return newIndexedType(d.typ, d) } func (d *fixedLenByteArrayDictionary) Len() int { return len(d.data) / d.size } @@ -312,8 +253,8 @@ func (t *indexedType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer return newIndexedColumnBuffer(t.dict, t, makeColumnIndex(columnIndex), bufferSize) } -func (t *indexedType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newIndexedColumnReader(t.dict, t, makeColumnIndex(columnIndex), bufferSize) +func (t *indexedType) NewPage(columnIndex, numValues int, data []byte) Page { + return newIndexedPage(t.dict, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type indexedPage struct { @@ -322,6 +263,21 @@ type indexedPage struct { columnIndex int16 } +func newIndexedPage(dict Dictionary, columnIndex int16, numValues int32, data []byte) *indexedPage { + values := bits.BytesToInt32(data) + for len(values) < int(numValues) { + values = append(values, 0) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &indexedPage{ + dict: dict, + values: values, + columnIndex: ^columnIndex, + } +} + func (page *indexedPage) Column() int { return int(^page.columnIndex) } func (page *indexedPage) Dictionary() Dictionary { return page.dict } @@ -363,14 +319,14 @@ func (page *indexedPage) RepetitionLevels() []int8 { return nil } func (page *indexedPage) DefinitionLevels() []int8 { return nil } -func (page *indexedPage) WriteTo(e encoding.Encoder) error { - return e.EncodeInt32(page.values) -} - func (page *indexedPage) Values() ValueReader { return &indexedPageReader{page: page} } func (page *indexedPage) Buffer() BufferedPage { return page } +func (page *indexedPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeInt32(dst, page.values) +} + type indexedPageReader struct { page *indexedPage offset int @@ -499,85 +455,6 @@ func (col *indexedColumnBuffer) ReadRowAt(row Row, index int64) (Row, error) { } } -type indexedColumnReader struct { - dict Dictionary - typ Type - decoder encoding.Decoder - buffer []int32 - offset int - remain int - columnIndex int16 -} - -func newIndexedColumnReader(dict Dictionary, typ Type, columnIndex int16, bufferSize int) *indexedColumnReader { - return &indexedColumnReader{ - dict: dict, - typ: typ, - buffer: make([]int32, 0, atLeastOne(bufferSize)), - columnIndex: ^columnIndex, - } -} - -func (r *indexedColumnReader) Type() Type { return r.typ } - -func (r *indexedColumnReader) Column() int { return int(^r.columnIndex) } - -func (r *indexedColumnReader) ReadValues(values []Value) (int, error) { - i := 0 - for { - for r.offset < len(r.buffer) && i < len(values) { - count := len(r.buffer) - r.offset - limit := len(values) - i - - if count > limit { - count = limit - } - - indexes := r.buffer[r.offset : r.offset+count] - dictLen := r.dict.Len() - for _, index := range indexes { - if index < 0 || int(index) >= dictLen { - return i, fmt.Errorf("reading value from indexed page: index out of bounds: %d/%d", index, dictLen) - } - } - - r.dict.Lookup(indexes, values[i:]) - r.offset += count - r.remain -= count - - j := i + int(count) - for i < j { - values[i].columnIndex = r.columnIndex - i++ - } - } - - if r.remain == 0 { - return i, io.EOF - } - if i == len(values) { - return i, nil - } - - length := min(r.remain, cap(r.buffer)) - buffer := r.buffer[:length] - n, err := r.decoder.DecodeInt32(buffer) - if n == 0 { - return i, err - } - - r.buffer = buffer[:n] - r.offset = 0 - } -} - -func (r *indexedColumnReader) Reset(numValues int, decoder 
encoding.Decoder) { - r.decoder = decoder - r.buffer = r.buffer[:0] - r.offset = 0 - r.remain = numValues -} - type indexedColumnIndex struct{ col *indexedColumnBuffer } func (index indexedColumnIndex) NumPages() int { return 1 } diff --git a/dictionary_default.go b/dictionary_default.go index 0e09dd3..c295875 100644 --- a/dictionary_default.go +++ b/dictionary_default.go @@ -3,11 +3,8 @@ package parquet import ( - "fmt" - "io" - "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/internal/bits" ) // The boolean dictionary always contains two values for true and false. @@ -17,46 +14,14 @@ type booleanDictionary struct { index map[bool]int32 } -func newBooleanDictionary(typ Type, columnIndex int16, bufferSize int) *booleanDictionary { +func newBooleanDictionary(typ Type, columnIndex int16, numValues int32, data []byte) *booleanDictionary { return &booleanDictionary{ typ: typ, booleanPage: booleanPage{ - values: make([]bool, 0, atLeastOne(bufferSize)), - columnIndex: ^columnIndex, - }, - } -} - -func readBooleanDictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*booleanDictionary, error) { - d := &booleanDictionary{ - typ: typ, - booleanPage: booleanPage{ - values: make([]bool, 0, atLeastOne(numValues)), + values: bits.BytesToBool(data), columnIndex: ^columnIndex, }, } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]bool, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeBoolean(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of int32 values: %w", err) - } - return d, err - } - } } func (d *booleanDictionary) Type() Type { return newIndexedType(d.typ, d) } @@ -131,46 +96,14 @@ type int32Dictionary struct { index map[int32]int32 } -func newInt32Dictionary(typ Type, columnIndex int16, bufferSize int) *int32Dictionary { +func newInt32Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *int32Dictionary { return &int32Dictionary{ typ: typ, int32Page: int32Page{ - values: make([]int32, 0, dictCap(bufferSize, 4)), - columnIndex: ^columnIndex, - }, - } -} - -func readInt32Dictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*int32Dictionary, error) { - d := &int32Dictionary{ - typ: typ, - int32Page: int32Page{ - values: make([]int32, 0, atLeastOne(numValues)), + values: bits.BytesToInt32(data), columnIndex: ^columnIndex, }, } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]int32, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeInt32(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of int32 values: %w", err) - } - return d, err - } - } } func (d *int32Dictionary) Type() Type { return newIndexedType(d.typ, d) } @@ -245,46 +178,14 @@ type int64Dictionary struct { index map[int64]int32 } -func newInt64Dictionary(typ Type, columnIndex int16, bufferSize int) *int64Dictionary { +func newInt64Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *int64Dictionary { return &int64Dictionary{ typ: typ, int64Page: int64Page{ - values: 
make([]int64, 0, dictCap(bufferSize, 8)), - columnIndex: ^columnIndex, - }, - } -} - -func readInt64Dictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*int64Dictionary, error) { - d := &int64Dictionary{ - typ: typ, - int64Page: int64Page{ - values: make([]int64, 0, atLeastOne(numValues)), + values: bits.BytesToInt64(data), columnIndex: ^columnIndex, }, } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]int64, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeInt64(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of int64 values: %w", err) - } - return d, err - } - } } func (d *int64Dictionary) Type() Type { return newIndexedType(d.typ, d) } @@ -359,48 +260,16 @@ type int96Dictionary struct { index map[deprecated.Int96]int32 } -func newInt96Dictionary(typ Type, columnIndex int16, bufferSize int) *int96Dictionary { +func newInt96Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *int96Dictionary { return &int96Dictionary{ typ: typ, int96Page: int96Page{ - values: make([]deprecated.Int96, 0, dictCap(bufferSize, 12)), + values: deprecated.BytesToInt96(data), columnIndex: ^columnIndex, }, } } -func readInt96Dictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*int96Dictionary, error) { - d := &int96Dictionary{ - typ: typ, - int96Page: int96Page{ - values: make([]deprecated.Int96, 0, atLeastOne(numValues)), - columnIndex: ^columnIndex, - }, - } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]deprecated.Int96, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeInt96(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of int96 values: %w", err) - } - return d, err - } - } -} - func (d *int96Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *int96Dictionary) Len() int { return len(d.values) } @@ -473,48 +342,16 @@ type floatDictionary struct { index map[float32]int32 } -func newFloatDictionary(typ Type, columnIndex int16, bufferSize int) *floatDictionary { +func newFloatDictionary(typ Type, columnIndex int16, numValues int32, data []byte) *floatDictionary { return &floatDictionary{ typ: typ, floatPage: floatPage{ - values: make([]float32, 0, dictCap(bufferSize, 4)), + values: bits.BytesToFloat32(data), columnIndex: ^columnIndex, }, } } -func readFloatDictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*floatDictionary, error) { - d := &floatDictionary{ - typ: typ, - floatPage: floatPage{ - values: make([]float32, 0, atLeastOne(numValues)), - columnIndex: ^columnIndex, - }, - } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]float32, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeFloat(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of float values: %w", err) - } - return d, err - } - } -} - func (d *floatDictionary) Type() Type { return 
newIndexedType(d.typ, d) } func (d *floatDictionary) Len() int { return len(d.values) } @@ -587,48 +424,16 @@ type doubleDictionary struct { index map[float64]int32 } -func newDoubleDictionary(typ Type, columnIndex int16, bufferSize int) *doubleDictionary { +func newDoubleDictionary(typ Type, columnIndex int16, numValues int32, data []byte) *doubleDictionary { return &doubleDictionary{ typ: typ, doublePage: doublePage{ - values: make([]float64, 0, dictCap(bufferSize, 8)), + values: bits.BytesToFloat64(data), columnIndex: ^columnIndex, }, } } -func readDoubleDictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (*doubleDictionary, error) { - d := &doubleDictionary{ - typ: typ, - doublePage: doublePage{ - values: make([]float64, 0, atLeastOne(numValues)), - columnIndex: ^columnIndex, - }, - } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]float64, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := decoder.DecodeDouble(d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of double values: %w", err) - } - return d, err - } - } -} - func (d *doubleDictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *doubleDictionary) Len() int { return len(d.values) } @@ -697,13 +502,8 @@ func (d *doubleDictionary) Page() BufferedPage { type uint32Dictionary struct{ *int32Dictionary } -func newUint32Dictionary(typ Type, columnIndex int16, bufferSize int) uint32Dictionary { - return uint32Dictionary{newInt32Dictionary(typ, columnIndex, bufferSize)} -} - -func readUint32Dictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (uint32Dictionary, error) { - d, err := readInt32Dictionary(typ, columnIndex, numValues, decoder) - return uint32Dictionary{d}, err +func newUint32Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) uint32Dictionary { + return uint32Dictionary{newInt32Dictionary(typ, columnIndex, numValues, data)} } func (d uint32Dictionary) Type() Type { return newIndexedType(d.typ, d) } @@ -735,13 +535,8 @@ func (d uint32Dictionary) Page() BufferedPage { type uint64Dictionary struct{ *int64Dictionary } -func newUint64Dictionary(typ Type, columnIndex int16, bufferSize int) uint64Dictionary { - return uint64Dictionary{newInt64Dictionary(typ, columnIndex, bufferSize)} -} - -func readUint64Dictionary(typ Type, columnIndex int16, numValues int, decoder encoding.Decoder) (uint64Dictionary, error) { - d, err := readInt64Dictionary(typ, columnIndex, numValues, decoder) - return uint64Dictionary{d}, err +func newUint64Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) uint64Dictionary { + return uint64Dictionary{newInt64Dictionary(typ, columnIndex, numValues, data)} } func (d uint64Dictionary) Type() Type { return newIndexedType(d.typ, d) } diff --git a/dictionary_go18.go b/dictionary_go18.go index 2bd84b8..ce54327 100644 --- a/dictionary_go18.go +++ b/dictionary_go18.go @@ -4,9 +4,8 @@ package parquet import ( "fmt" - "io" - "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/internal/unsafecast" ) type dictionary[T primitive] struct { @@ -15,48 +14,19 @@ type dictionary[T primitive] struct { index map[T]int32 } -func newDictionary[T primitive](typ Type, columnIndex int16, bufferSize int, class *class[T]) *dictionary[T] { - return &dictionary[T]{ - 
typ: typ, - page: page[T]{ - class: class, - values: make([]T, 0, dictCap(bufferSize, sizeof[T]())), - columnIndex: ^columnIndex, - }, +func newDictionary[T primitive](typ Type, columnIndex int16, numValues int32, data []byte, class *class[T]) *dictionary[T] { + values := unsafecast.Slice[T](data) + if len(values) != int(numValues) { + panic(fmt.Errorf("number of values mismatch in numValues and data arguments: %d != %d", numValues, len(values))) } -} - -func readDictionary[T primitive](typ Type, columnIndex int16, numValues int, decoder encoding.Decoder, class *class[T]) (*dictionary[T], error) { - d := &dictionary[T]{ + return &dictionary[T]{ typ: typ, page: page[T]{ class: class, - values: make([]T, 0, atLeastOne(numValues)), + values: values, columnIndex: ^columnIndex, }, } - - for { - if len(d.values) == cap(d.values) { - newValues := make([]T, len(d.values), 2*cap(d.values)) - copy(newValues, d.values) - d.values = newValues - } - - n, err := d.class.decode(decoder, d.values[len(d.values):cap(d.values)]) - if n > 0 { - d.values = d.values[:len(d.values)+n] - } - - if err != nil { - if err == io.EOF { - err = nil - } else { - err = fmt.Errorf("reading parquet dictionary of %s values: %w", d.class.name, err) - } - return d, err - } - } } func (d *dictionary[T]) Type() Type { return newIndexedType(d.typ, d) } diff --git a/dictionary_test.go b/dictionary_test.go index 67fd023..ac1e8b4 100644 --- a/dictionary_test.go +++ b/dictionary_test.go @@ -42,7 +42,7 @@ func benchmarkDictionary(b *testing.B, do func(*testing.B, parquet.Type)) { func randDictionaryPage(typ parquet.Type, values []parquet.Value) parquet.BufferedPage { const bufferSize = 64 * 1024 - dict := typ.NewDictionary(0, 4*bufferSize) + dict := typ.NewDictionary(0, 0, make([]byte, 0, 4*bufferSize)) buf := dict.Type().NewColumnBuffer(0, bufferSize) buf.WriteValues(values) return buf.Page() diff --git a/encoding.go b/encoding.go index f5402d6..da3110d 100644 --- a/encoding.go +++ b/encoding.go @@ -1,14 +1,13 @@ package parquet import ( - "sort" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/encoding/bytestreamsplit" "github.com/segmentio/parquet-go/encoding/delta" "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/encoding/rle" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" ) var ( @@ -51,6 +50,19 @@ var ( format.DeltaByteArray: &DeltaByteArray, format.ByteStreamSplit: &ByteStreamSplit, } + + // Table indexing RLE encodings for repetition and definition levels of + // all supported bit widths. 
+ levelEncodings = [...]rle.Encoding{ + 0: {BitWidth: 1}, + 1: {BitWidth: 2}, + 2: {BitWidth: 3}, + 3: {BitWidth: 4}, + 4: {BitWidth: 5}, + 5: {BitWidth: 6}, + 6: {BitWidth: 7}, + 7: {BitWidth: 8}, + } ) func isDictionaryEncoding(encoding encoding.Encoding) bool { @@ -75,31 +87,37 @@ func LookupEncoding(enc format.Encoding) encoding.Encoding { return encoding.NotSupported{} } -func sortEncodings(encodings []encoding.Encoding) { - if len(encodings) > 1 { - sort.Slice(encodings, func(i, j int) bool { - return encodings[i].Encoding() < encodings[j].Encoding() - }) +func lookupLevelEncoding(enc format.Encoding, max int8) encoding.Encoding { + switch enc { + case format.RLE: + return &levelEncodings[bits.Len8(max)-1] + default: + return encoding.NotSupported{} } } -func dedupeSortedEncodings(encodings []encoding.Encoding) []encoding.Encoding { - if len(encodings) > 1 { - i := 0 - - for _, c := range encodings[1:] { - if c.Encoding() != encodings[i].Encoding() { - i++ - encodings[i] = c - } - } - - clear := encodings[i+1:] - for i := range clear { - clear[i] = nil - } - - encodings = encodings[:i+1] +func canEncode(e encoding.Encoding, k Kind) bool { + if isDictionaryEncoding(e) { + return true + } + switch k { + case Boolean: + return encoding.CanEncodeBoolean(e) + case Int32: + return encoding.CanEncodeInt32(e) + case Int64: + return encoding.CanEncodeInt64(e) + case Int96: + return encoding.CanEncodeInt96(e) + case Float: + return encoding.CanEncodeFloat(e) + case Double: + return encoding.CanEncodeDouble(e) + case ByteArray: + return encoding.CanEncodeByteArray(e) + case FixedLenByteArray: + return encoding.CanEncodeFixedLenByteArray(e) + default: + return false } - return encodings } diff --git a/encoding/bytearray.go b/encoding/bytearray.go deleted file mode 100644 index 776a1b0..0000000 --- a/encoding/bytearray.go +++ /dev/null @@ -1,157 +0,0 @@ -package encoding - -import ( - "sort" -) - -// ByteArrayList is a container similar to [][]byte with a smaller memory -// overhead. Where using a byte slices introduces ~24 bytes of overhead per -// element, ByteArrayList requires only 8 bytes per element. Extra efficiency -// also comes from reducing GC pressure by using contiguous areas of memory -// instead of allocating individual slices for each element. For lists with -// many small-size elements, the memory footprint can be reduced by 40-80%. 
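For concreteness, here is how the lookupLevelEncoding helper added in encoding.go above selects an entry of the levelEncodings table: a column whose maximum definition level is 3 needs 2 bits per level, and bits.Len8(3)-1 yields index 1, which holds the {BitWidth: 2} encoding. A minimal sketch, using math/bits.Len8 as a stand-in for the internal bits.Len8 helper (an assumption; the internal helper is not shown in this diff):

package main

import (
	"fmt"
	"math/bits"
)

// len8 is a hypothetical stand-in for the internal bits.Len8 helper: the
// number of bits needed to represent v.
func len8(v int8) int { return bits.Len8(uint8(v)) }

func main() {
	// Max definition level 3 requires 2-bit levels, so lookupLevelEncoding
	// would pick levelEncodings[len8(3)-1], the entry with BitWidth: 2.
	fmt.Println(len8(3) - 1) // 1
}
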
-type ByteArrayList struct { - slices []slice - values []byte -} - -type slice struct{ i, j uint32 } - -func (s slice) len() int { return int(s.j - s.i) } - -func MakeByteArrayList(capacity int) ByteArrayList { - return ByteArrayList{ - slices: make([]slice, 0, capacity), - values: make([]byte, 0, 8*capacity), - } -} - -func (list *ByteArrayList) Clone() ByteArrayList { - size := 0 - for _, s := range list.slices { - size += s.len() - } - clone := ByteArrayList{ - slices: make([]slice, 0, len(list.slices)), - values: make([]byte, 0, size), - } - for _, s := range list.slices { - clone.Push(list.slice(s)) - } - return clone -} - -func (list *ByteArrayList) Split() [][]byte { - clone := ByteArrayList{ - slices: list.slices, - values: make([]byte, len(list.values)), - } - copy(clone.values, list.values) - split := make([][]byte, clone.Len()) - for i := range split { - split[i] = clone.Index(i) - } - return split -} - -func (list *ByteArrayList) Slice(i, j int) ByteArrayList { - return ByteArrayList{ - slices: list.slices[i:j:j], - values: list.values, - } -} - -func (list *ByteArrayList) Grow(n int) { - if n > (cap(list.slices) - len(list.slices)) { - newCap := 2 * cap(list.slices) - newLen := len(list.slices) + n - for newCap < newLen { - newCap *= 2 - } - newSlices := make([]slice, len(list.slices), newCap) - copy(newSlices, list.slices) - list.slices = newSlices - } -} - -func (list *ByteArrayList) Reset() { - list.slices = list.slices[:0] - list.values = list.values[:0] -} - -func (list *ByteArrayList) Push(v []byte) { - list.slices = append(list.slices, slice{ - i: uint32(len(list.values)), - j: uint32(len(list.values) + len(v)), - }) - list.values = append(list.values, v...) -} - -func (list *ByteArrayList) PushSize(n int) []byte { - i := len(list.values) - j := len(list.values) + n - - list.slices = append(list.slices, slice{ - i: uint32(i), - j: uint32(j), - }) - - if j <= cap(list.values) { - list.values = list.values[:j] - } else { - newCap := 2 * cap(list.values) - newLen := j - for newCap < newLen { - newCap *= 2 - } - newValues := make([]byte, newLen, newCap) - copy(newValues, list.values) - list.values = newValues - } - - return list.values[i:j:j] -} - -func (list *ByteArrayList) Index(i int) []byte { - return list.slice(list.slices[i]) -} - -func (list *ByteArrayList) Range(f func([]byte) bool) { - for _, s := range list.slices { - if !f(list.slice(s)) { - break - } - } -} - -func (list *ByteArrayList) Size() int64 { - size := int64(0) - for _, s := range list.slices { - size += 8 + int64(s.len()) - } - return size -} - -func (list *ByteArrayList) Cap() int { - return cap(list.slices) -} - -func (list *ByteArrayList) Len() int { - return len(list.slices) -} - -func (list *ByteArrayList) Less(i, j int) bool { - return string(list.Index(i)) < string(list.Index(j)) -} - -func (list *ByteArrayList) Swap(i, j int) { - list.slices[i], list.slices[j] = list.slices[j], list.slices[i] -} - -func (list *ByteArrayList) slice(s slice) []byte { - return list.values[s.i:s.j:s.j] -} - -var ( - _ sort.Interface = (*ByteArrayList)(nil) -) diff --git a/encoding/bytestreamsplit/bytestreamsplit.go b/encoding/bytestreamsplit/bytestreamsplit.go index 8bd0a47..84b4e40 100644 --- a/encoding/bytestreamsplit/bytestreamsplit.go +++ b/encoding/bytestreamsplit/bytestreamsplit.go @@ -1,7 +1,7 @@ package bytestreamsplit import ( - "io" + "math" "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" @@ -9,24 +9,136 @@ import ( // This encoder implements a version 
of the Byte Stream Split encoding as described // in https://github.com/apache/parquet-format/blob/master/Encodings.md#byte-stream-split-byte_stream_split--9 -type Encoding struct{} +type Encoding struct { + encoding.NotSupported +} + +func (e *Encoding) String() string { + return "BYTE_STREAM_SPLIT" +} func (e *Encoding) Encoding() format.Encoding { return format.ByteStreamSplit } -func (e *Encoding) CanEncode(t format.Type) bool { - return t == format.Float || t == format.Double +func (e *Encoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) { + n := 4 * len(src) + if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + b0 := dst[0*len(src) : 1*len(src)] + b1 := dst[1*len(src) : 2*len(src)] + b2 := dst[2*len(src) : 3*len(src)] + b3 := dst[3*len(src) : 4*len(src)] + + for i, f := range src { + bits := math.Float32bits(f) + b0[i] = byte(bits) + b1[i] = byte(bits >> 8) + b2[i] = byte(bits >> 16) + b3[i] = byte(bits >> 24) + } + + return dst, nil } -func (e *Encoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewEncoder(w) +func (e *Encoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) { + n := 8 * len(src) + if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + b0 := dst[0*len(src) : 1*len(src)] + b1 := dst[1*len(src) : 2*len(src)] + b2 := dst[2*len(src) : 3*len(src)] + b3 := dst[3*len(src) : 4*len(src)] + b4 := dst[4*len(src) : 5*len(src)] + b5 := dst[5*len(src) : 6*len(src)] + b6 := dst[6*len(src) : 7*len(src)] + b7 := dst[7*len(src) : 8*len(src)] + + for i, f := range src { + bits := math.Float64bits(f) + b0[i] = byte(bits) + b1[i] = byte(bits >> 8) + b2[i] = byte(bits >> 16) + b3[i] = byte(bits >> 24) + b4[i] = byte(bits >> 32) + b5[i] = byte(bits >> 40) + b6[i] = byte(bits >> 48) + b7[i] = byte(bits >> 56) + } + + return dst, nil } -func (e *Encoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewDecoder(r) +func (e *Encoding) DecodeFloat(dst []float32, src []byte) ([]float32, error) { + if (len(src) % 4) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "FLOAT", len(src)) + } + + n := len(src) / 4 + if cap(dst) < n { + dst = make([]float32, n) + } else { + dst = dst[:n] + } + + b0 := src[0*n : 1*n] + b1 := src[1*n : 2*n] + b2 := src[2*n : 3*n] + b3 := src[3*n : 4*n] + + for i := range dst { + dst[i] = math.Float32frombits( + uint32(b0[i]) | + uint32(b1[i])<<8 | + uint32(b2[i])<<16 | + uint32(b3[i])<<24, + ) + } + + return dst, nil } -func (e *Encoding) String() string { - return "BYTE_STREAM_SPLIT" +func (e *Encoding) DecodeDouble(dst []float64, src []byte) ([]float64, error) { + if (len(src) % 8) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "DOUBLE", len(src)) + } + + n := len(src) / 8 + if cap(dst) < n { + dst = make([]float64, n) + } else { + dst = dst[:n] + } + + b0 := src[0*n : 1*n] + b1 := src[1*n : 2*n] + b2 := src[2*n : 3*n] + b3 := src[3*n : 4*n] + b4 := src[4*n : 5*n] + b5 := src[5*n : 6*n] + b6 := src[6*n : 7*n] + b7 := src[7*n : 8*n] + + for i := range dst { + dst[i] = math.Float64frombits( + uint64(b0[i]) | + uint64(b1[i])<<8 | + uint64(b2[i])<<16 | + uint64(b3[i])<<24 | + uint64(b4[i])<<32 | + uint64(b5[i])<<40 | + uint64(b6[i])<<48 | + uint64(b7[i])<<56, + ) + } + + return dst, nil } diff --git a/encoding/bytestreamsplit/bytestreamsplit_test.go b/encoding/bytestreamsplit/bytestreamsplit_test.go new file mode 100644 index 0000000..a40c0cb --- /dev/null +++ b/encoding/bytestreamsplit/bytestreamsplit_test.go @@ -0,0 +1,19 @@ +//go:build go1.18 +// +build go1.18 
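The byte-plane transposition performed by EncodeFloat above can be verified with a short standalone program: byte k of every float32 is grouped into the k-th quarter of the output. A minimal sketch mirroring the logic of EncodeFloat (the expected bytes match the example used by the removed encoder test further down):

package main

import (
	"fmt"
	"math"
)

// Minimal illustration of the BYTE_STREAM_SPLIT layout: byte k of each
// value lands in the k-th plane of the destination buffer.
func main() {
	src := []float32{1.0, 2.0, 3.0}
	dst := make([]byte, 4*len(src))
	b0 := dst[0*len(src) : 1*len(src)]
	b1 := dst[1*len(src) : 2*len(src)]
	b2 := dst[2*len(src) : 3*len(src)]
	b3 := dst[3*len(src) : 4*len(src)]
	for i, f := range src {
		bits := math.Float32bits(f)
		b0[i] = byte(bits)
		b1[i] = byte(bits >> 8)
		b2[i] = byte(bits >> 16)
		b3[i] = byte(bits >> 24)
	}
	// 1.0=0x3F800000, 2.0=0x40000000, 3.0=0x40400000, so the planes are:
	// [0 0 0 0 0 0 128 0 64 63 64 64]
	fmt.Println(dst)
}
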
+ +package bytestreamsplit_test + +import ( + "testing" + + "github.com/segmentio/parquet-go/encoding/bytestreamsplit" + "github.com/segmentio/parquet-go/encoding/fuzz" +) + +func FuzzEncodeFloat(f *testing.F) { + fuzz.EncodeFloat(f, new(bytestreamsplit.Encoding)) +} + +func FuzzEncodeDouble(f *testing.F) { + fuzz.EncodeDouble(f, new(bytestreamsplit.Encoding)) +} diff --git a/encoding/bytestreamsplit/decoder.go b/encoding/bytestreamsplit/decoder.go deleted file mode 100644 index b0ecbf7..0000000 --- a/encoding/bytestreamsplit/decoder.go +++ /dev/null @@ -1,108 +0,0 @@ -package bytestreamsplit - -import ( - "bytes" - "io" - "math" - - "github.com/segmentio/parquet-go/encoding" -) - -type Decoder struct { - encoding.NotSupportedDecoder - reader io.Reader - buffer bytes.Buffer - offset int -} - -func NewDecoder(r io.Reader) *Decoder { - return &Decoder{reader: r} -} - -func (d *Decoder) Reset(r io.Reader) { - d.reader = r - d.offset = 0 - d.buffer.Reset() -} - -func (d *Decoder) DecodeFloat(data []float32) (int, error) { - if err := d.read(); err != nil { - return 0, err - } - - return d.decode32(data) -} - -func (d *Decoder) DecodeDouble(data []float64) (int, error) { - if err := d.read(); err != nil { - return 0, err - } - - return d.decode64(data) -} - -func (d *Decoder) read() error { - var err error - - if d.buffer.Len() == 0 { - d.buffer.ReadFrom(d.reader) - } - - return err -} - -func (d *Decoder) decode32(data []float32) (int, error) { - if d.offset*4 >= d.buffer.Len() { - return 0, io.EOF - } - - length := len(data) - - padding := d.buffer.Len() / 4 // float32 size - - for i := 0; i < length; i++ { - data[i] = d.float32frombits(i+d.offset, padding) - } - - d.offset += length - - return length, nil -} - -func (d *Decoder) float32frombits(idx, padding int) float32 { - return math.Float32frombits( - uint32(d.buffer.Bytes()[idx]) | - uint32(d.buffer.Bytes()[idx+padding])<<8 | - uint32(d.buffer.Bytes()[idx+padding*2])<<16 | - uint32(d.buffer.Bytes()[idx+padding*3])<<24) -} - -func (d *Decoder) decode64(data []float64) (int, error) { - if d.offset*8 >= d.buffer.Len() { - return 0, io.EOF - } - - length := len(data) - - padding := d.buffer.Len() / 8 // float64 size - - for i := 0; i < length; i++ { - data[i] = d.float64frombits(i+d.offset, padding) - } - - d.offset += length - - return length, nil -} - -func (d *Decoder) float64frombits(idx, padding int) float64 { - return math.Float64frombits( - uint64(d.buffer.Bytes()[idx]) | - uint64(d.buffer.Bytes()[idx+padding])<<8 | - uint64(d.buffer.Bytes()[idx+padding*2])<<16 | - uint64(d.buffer.Bytes()[idx+padding*3])<<24 | - uint64(d.buffer.Bytes()[idx+padding*4])<<32 | - uint64(d.buffer.Bytes()[idx+padding*5])<<40 | - uint64(d.buffer.Bytes()[idx+padding*6])<<48 | - uint64(d.buffer.Bytes()[idx+padding*7])<<56) -} diff --git a/encoding/bytestreamsplit/encoder.go b/encoding/bytestreamsplit/encoder.go deleted file mode 100644 index 9bc9953..0000000 --- a/encoding/bytestreamsplit/encoder.go +++ /dev/null @@ -1,85 +0,0 @@ -package bytestreamsplit - -import ( - "io" - "math" - - "github.com/segmentio/parquet-go/encoding" -) - -type Encoder struct { - encoding.NotSupportedEncoder - writer io.Writer - buffer []byte -} - -func NewEncoder(w io.Writer) *Encoder { - return &Encoder{ - writer: w, - } -} - -func (e *Encoder) Write(b []byte) (int, error) { - return e.writer.Write(b) -} - -func (e *Encoder) Reset(w io.Writer) { - e.writer = w - e.buffer = e.buffer[:0] -} - -func (e *Encoder) EncodeFloat(data []float32) error { -
_, err := e.writer.Write(e.encode32(data)) - return err -} - -func (e *Encoder) EncodeDouble(data []float64) error { - _, err := e.writer.Write(e.encode64(data)) - return err -} - -func (e *Encoder) encode32(data []float32) []byte { - length := len(data) - if length == 0 { - return []byte{} - } - - if len(e.buffer) < length*4 { - e.buffer = make([]byte, length*4) - } - - for i, f := range data { - bits := math.Float32bits(f) - e.buffer[i] = byte(bits) - e.buffer[i+length] = byte(bits >> 8) - e.buffer[i+length*2] = byte(bits >> 16) - e.buffer[i+length*3] = byte(bits >> 24) - } - - return e.buffer[:length*4] -} - -func (e *Encoder) encode64(data []float64) []byte { - length := len(data) - if length == 0 { - return []byte{} - } - - if len(e.buffer) < length*8 { - e.buffer = make([]byte, length*8) - } - - for i, f := range data { - bits := math.Float64bits(f) - e.buffer[i] = byte(bits) - e.buffer[i+length] = byte(bits >> 8) - e.buffer[i+length*2] = byte(bits >> 16) - e.buffer[i+length*3] = byte(bits >> 24) - e.buffer[i+length*4] = byte(bits >> 32) - e.buffer[i+length*5] = byte(bits >> 40) - e.buffer[i+length*6] = byte(bits >> 48) - e.buffer[i+length*7] = byte(bits >> 56) - } - - return e.buffer[:length*8] -} diff --git a/encoding/bytestreamsplit/encoder_test.go b/encoding/bytestreamsplit/encoder_test.go deleted file mode 100644 index 5d657c6..0000000 --- a/encoding/bytestreamsplit/encoder_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package bytestreamsplit - -import ( - "bytes" - "reflect" - "testing" -) - -func TestEncoding(t *testing.T) { - e := &Encoder{} - - data := []float32{1.0, 2.0, 3.0} - - expected := []byte{0, 0, 0, 0, 0, 0, 128, 0, 64, 63, 64, 64} - - encoded := e.encode32(data) - - if !bytes.Equal(encoded, expected) { - t.Error("encoding result not expected") - t.Logf("got: %v", encoded) - t.Logf("expected: %v", expected) - } - - d := &Decoder{ - reader: bytes.NewReader(encoded), - } - - final := make([]float32, 3) - - if err := d.read(); err != nil { - t.Error(err) - } - - if _, err := d.decode32(final); err != nil { - t.Error(err) - } - - if !reflect.DeepEqual(data, final) { - t.Error("decoding result not expected") - t.Logf("got: %v", final) - t.Logf("expected: %v", data) - } -} diff --git a/encoding/delta/binary_packed.go b/encoding/delta/binary_packed.go index 01ce3fc..5006743 100644 --- a/encoding/delta/binary_packed.go +++ b/encoding/delta/binary_packed.go @@ -1,31 +1,354 @@ package delta import ( + "encoding/binary" + "fmt" "io" + "math" "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" +) + +const ( + blockSize = 128 + numMiniBlocks = 4 + miniBlockSize = blockSize / numMiniBlocks + // The parquet spec does not enforce a limit to the block size, but we need + // one otherwise invalid inputs may result in unbounded memory allocations. + // + // 65K+ values should be enough for any valid use case. 
+	maxSupportedBlockSize = 65536
 )

 type BinaryPackedEncoding struct {
+	encoding.NotSupported
+}
+
+func (e *BinaryPackedEncoding) String() string {
+	return "DELTA_BINARY_PACKED"
 }

 func (e *BinaryPackedEncoding) Encoding() format.Encoding {
 	return format.DeltaBinaryPacked
 }

-func (e *BinaryPackedEncoding) CanEncode(t format.Type) bool {
-	return t == format.Int32 || t == format.Int64
+func (e *BinaryPackedEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {
+	return e.encodeInt32(dst[:0], src)
 }

-func (e *BinaryPackedEncoding) NewDecoder(r io.Reader) encoding.Decoder {
-	return NewBinaryPackedDecoder(r)
+func (e *BinaryPackedEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {
+	return e.encodeInt64(dst[:0], src)
 }

-func (e *BinaryPackedEncoding) NewEncoder(w io.Writer) encoding.Encoder {
-	return NewBinaryPackedEncoder(w)
+func (e *BinaryPackedEncoding) encodeInt32(dst []byte, src []int32) ([]byte, error) {
+	return e.encode(dst, len(src), func(i int) int64 { return int64(src[i]) })
 }

-func (e *BinaryPackedEncoding) String() string {
-	return "DELTA_BINARY_PACKED"
+func (e *BinaryPackedEncoding) encodeInt64(dst []byte, src []int64) ([]byte, error) {
+	return e.encode(dst, len(src), func(i int) int64 { return src[i] })
+}
+
+func (e *BinaryPackedEncoding) encode(dst []byte, totalValues int, valueAt func(int) int64) ([]byte, error) {
+	firstValue := int64(0)
+	if totalValues > 0 {
+		firstValue = valueAt(0)
+	}
+	dst = appendBinaryPackedHeader(dst, blockSize, numMiniBlocks, totalValues, firstValue)
+	if totalValues < 2 {
+		return dst, nil
+	}
+
+	lastValue := firstValue
+	for i := 1; i < totalValues; {
+		block := make([]int64, blockSize)
+		n := blockSize
+		r := totalValues - i
+		if n > r {
+			n = r
+		}
+		block = block[:n]
+		for j := range block {
+			block[j] = valueAt(i)
+			i++
+		}
+
+		for j, v := range block {
+			block[j], lastValue = v-lastValue, v
+		}
+
+		minDelta := bits.MinInt64(block)
+		bits.SubInt64(block, minDelta)
+
+		// blockSize x 8: we store at most `blockSize` count of values, which
+		// might be up to 64 bits in length, which is why we multiply by 8.
+		//
+		// Technically we could size the buffer to a smaller size when the
+		// bit width requires less than 8 bytes per value, but it would cause
+		// the buffer to be put on the heap since the compiler wouldn't know
+		// how much stack space it needs in advance.
+		miniBlock := make([]byte, blockSize*8)
+		bitWidths := make([]byte, numMiniBlocks)
+		bitOffset := uint(0)
+		miniBlockLength := 0
+
+		for i := range bitWidths {
+			j := (i + 0) * miniBlockSize
+			k := (i + 1) * miniBlockSize
+
+			if k > len(block) {
+				k = len(block)
+			}
+
+			bitWidth := uint(bits.MaxLen64(block[j:k]))
+			if bitWidth != 0 {
+				bitWidths[i] = byte(bitWidth)
+
+				for _, bits := range block[j:k] {
+					for b := uint(0); b < bitWidth; b++ {
+						x := bitOffset / 8
+						y := bitOffset % 8
+						miniBlock[x] |= byte(((bits >> b) & 1) << y)
+						bitOffset++
+					}
+				}
+
+				miniBlockLength += (miniBlockSize * int(bitWidth)) / 8
+			}
+
+			if k == len(block) {
+				break
+			}
+		}
+
+		miniBlock = miniBlock[:miniBlockLength]
+		dst = appendBinaryPackedBlock(dst, int64(minDelta), bitWidths)
+		dst = append(dst, miniBlock...)
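+
+		// At this point dst holds the block header (min delta and mini-block
+		// bit widths) followed by this block's bit-packed payload; the loop
+		// then moves on to the next block of up to blockSize values.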
+ } + + return dst, nil +} + +func (e *BinaryPackedEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + dst, _, err := e.decodeInt32(dst[:0], src) + return dst, e.wrap(err) +} + +func (e *BinaryPackedEncoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) { + dst, _, err := e.decodeInt64(dst[:0], src) + return dst, e.wrap(err) +} + +func (e *BinaryPackedEncoding) decodeInt32(dst []int32, src []byte) ([]int32, []byte, error) { + src, err := e.decode(src, func(value int64) { dst = append(dst, int32(value)) }) + return dst, src, err +} + +func (e *BinaryPackedEncoding) decodeInt64(dst []int64, src []byte) ([]int64, []byte, error) { + src, err := e.decode(src, func(value int64) { dst = append(dst, value) }) + return dst, src, err +} + +func (e *BinaryPackedEncoding) decode(src []byte, observe func(int64)) ([]byte, error) { + blockSize, numMiniBlocks, totalValues, firstValue, src, err := decodeBinaryPackedHeader(src) + if err != nil { + return src, err + } + if totalValues == 0 { + return src, nil + } + + observe(firstValue) + totalValues-- + lastValue := firstValue + numValuesInMiniBlock := blockSize / numMiniBlocks + + block := make([]int64, 128) + if cap(block) < blockSize { + block = make([]int64, blockSize) + } else { + block = block[:blockSize] + } + + miniBlockData := make([]byte, 256) + + for totalValues > 0 && len(src) > 0 { + var minDelta int64 + var bitWidths []byte + minDelta, bitWidths, src, err = decodeBinaryPackedBlock(src, numMiniBlocks) + if err != nil { + return src, err + } + + blockOffset := 0 + for i := range block { + block[i] = 0 + } + + for _, bitWidth := range bitWidths { + if bitWidth == 0 { + n := numValuesInMiniBlock + if n > totalValues { + n = totalValues + } + blockOffset += n + totalValues -= n + } else { + miniBlockSize := (numValuesInMiniBlock * int(bitWidth)) / 8 + if cap(miniBlockData) < miniBlockSize { + miniBlockData = make([]byte, miniBlockSize, miniBlockSize) + } else { + miniBlockData = miniBlockData[:miniBlockSize] + } + + n := copy(miniBlockData, src) + src = src[n:] + bitOffset := uint(0) + + for count := numValuesInMiniBlock; count > 0 && totalValues > 0; count-- { + delta := int64(0) + + for b := uint(0); b < uint(bitWidth); b++ { + x := (bitOffset + b) / 8 + y := (bitOffset + b) % 8 + delta |= int64((miniBlockData[x]>>y)&1) << b + } + + block[blockOffset] = delta + blockOffset++ + totalValues-- + bitOffset += uint(bitWidth) + } + } + + if totalValues == 0 { + break + } + } + + bits.AddInt64(block, minDelta) + block[0] += lastValue + for i := 1; i < len(block); i++ { + block[i] += block[i-1] + } + if values := block[:blockOffset]; len(values) > 0 { + for _, v := range values { + observe(v) + } + lastValue = values[len(values)-1] + } + } + + if totalValues > 0 { + return src, fmt.Errorf("%d missing values: %w", totalValues, io.ErrUnexpectedEOF) + } + + return src, nil +} + +func (e *BinaryPackedEncoding) wrap(err error) error { + if err != nil { + err = encoding.Error(e, err) + } + return err +} + +func appendBinaryPackedHeader(dst []byte, blockSize, numMiniBlocks, totalValues int, firstValue int64) []byte { + b := [4 * binary.MaxVarintLen64]byte{} + n := 0 + n += binary.PutUvarint(b[n:], uint64(blockSize)) + n += binary.PutUvarint(b[n:], uint64(numMiniBlocks)) + n += binary.PutUvarint(b[n:], uint64(totalValues)) + n += binary.PutVarint(b[n:], firstValue) + return append(dst, b[:n]...) 
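+	// Note: the header written above is the DELTA_BINARY_PACKED preamble,
+	// four varints laid out as: <block size> <number of mini blocks>
+	// <total value count> <first value (zigzag varint)>.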
+} + +func appendBinaryPackedBlock(dst []byte, minDelta int64, bitWidths []byte) []byte { + b := [binary.MaxVarintLen64]byte{} + n := binary.PutVarint(b[:], minDelta) + dst = append(dst, b[:n]...) + dst = append(dst, bitWidths...) + return dst +} + +func decodeBinaryPackedHeader(src []byte) (blockSize, numMiniBlocks, totalValues int, firstValue int64, next []byte, err error) { + u := uint64(0) + n := 0 + i := 0 + + if u, n, err = decodeUvarint(src[i:], "block size"); err != nil { + return + } + i += n + blockSize = int(u) + + if u, n, err = decodeUvarint(src[i:], "number of mini-blocks"); err != nil { + return + } + i += n + numMiniBlocks = int(u) + + if u, n, err = decodeUvarint(src[i:], "total values"); err != nil { + return + } + i += n + totalValues = int(u) + + if firstValue, n, err = decodeVarint(src[i:], "first value"); err != nil { + return + } + i += n + + if numMiniBlocks == 0 { + err = fmt.Errorf("invalid number of mini block (%d)", numMiniBlocks) + } else if (blockSize <= 0) || (blockSize%128) != 0 { + err = fmt.Errorf("invalid block size is not a multiple of 128 (%d)", blockSize) + } else if blockSize > maxSupportedBlockSize { + err = fmt.Errorf("invalid block size is too large (%d)", blockSize) + } else if miniBlockSize := blockSize / numMiniBlocks; (numMiniBlocks <= 0) || (miniBlockSize%32) != 0 { + err = fmt.Errorf("invalid mini block size is not a multiple of 32 (%d)", miniBlockSize) + } else if totalValues < 0 { + err = fmt.Errorf("invalid total number of values is negative (%d)", totalValues) + } else if totalValues > math.MaxInt32 { + err = fmt.Errorf("too many values: %d", totalValues) + } + + return blockSize, numMiniBlocks, totalValues, firstValue, src[i:], err +} + +func decodeBinaryPackedBlock(src []byte, numMiniBlocks int) (minDelta int64, bitWidths, next []byte, err error) { + minDelta, n, err := decodeVarint(src, "min delta") + if err != nil { + return 0, nil, src, err + } + src = src[n:] + if len(src) < numMiniBlocks { + bitWidths, next = src, nil + } else { + bitWidths, next = src[:numMiniBlocks], src[numMiniBlocks:] + } + return minDelta, bitWidths, next, nil +} + +func decodeUvarint(buf []byte, what string) (u uint64, n int, err error) { + u, n = binary.Uvarint(buf) + if n == 0 { + return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) + } + if n < 0 { + return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) + } + return u, n, nil +} + +func decodeVarint(buf []byte, what string) (v int64, n int, err error) { + v, n = binary.Varint(buf) + if n == 0 { + return 0, 0, fmt.Errorf("decoding %s: %w", what, io.ErrUnexpectedEOF) + } + if n < 0 { + return 0, 0, fmt.Errorf("overflow decoding %s (read %d/%d bytes)", what, -n, len(buf)) + } + return v, n, nil } diff --git a/encoding/delta/binary_packed_decoder.go b/encoding/delta/binary_packed_decoder.go deleted file mode 100644 index c573756..0000000 --- a/encoding/delta/binary_packed_decoder.go +++ /dev/null @@ -1,281 +0,0 @@ -package delta - -import ( - "bufio" - "encoding/binary" - "fmt" - "io" - - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/internal/bits" -) - -type BinaryPackedDecoder struct { - encoding.NotSupportedDecoder - reader *bufio.Reader - blockSize int - numMiniBlock int - miniBlockSize int - totalValues int - lastValue int64 - bitWidths []byte - blockValues []int64 - valueIndex int - blockIndex int - miniBlocks bits.Reader -} - -func NewBinaryPackedDecoder(r io.Reader) *BinaryPackedDecoder { - d := 
&BinaryPackedDecoder{} - d.Reset(r) - return d -} - -func (d *BinaryPackedDecoder) Reset(r io.Reader) { - *d = BinaryPackedDecoder{ - reader: d.reader, - bitWidths: d.bitWidths[:0], - blockValues: d.blockValues[:0], - valueIndex: -1, - } - - if cap(d.blockValues) == 0 { - d.blockValues = make([]int64, 0, blockSize32) - } - - if rbuf, _ := r.(*bufio.Reader); rbuf != nil { - d.reader = rbuf - } else if d.reader != nil { - d.reader.Reset(r) - } else if r != nil { - d.reader = bufio.NewReaderSize(r, defaultBufferSize) - } - - d.miniBlocks.Reset(d.reader) -} - -func (d *BinaryPackedDecoder) DecodeInt32(data []int32) (int, error) { - decoded := 0 - - for len(data) > 0 { - if err := d.decode(); err != nil { - if err == io.EOF && decoded > 0 { - break - } - return decoded, err - } - - i := d.blockIndex - j := len(d.blockValues) - remain := d.totalValues - d.valueIndex - - if (j - i) > remain { - j = i + remain - } - - n := j - i - if n > len(data) { - n = len(data) - j = i + n - } - - for i, v := range d.blockValues[i:j] { - data[i] = int32(v) - } - - data = data[n:] - decoded += n - d.valueIndex += n - d.blockIndex += n - } - - return decoded, nil -} - -func (d *BinaryPackedDecoder) DecodeInt64(data []int64) (int, error) { - decoded := 0 - - for len(data) > 0 { - if err := d.decode(); err != nil { - if err == io.EOF && decoded > 0 { - break - } - return decoded, err - } - - n := copy(data, d.blockValues[d.blockIndex:]) - data = data[n:] - decoded += n - d.valueIndex += n - d.blockIndex += n - } - - return decoded, nil -} - -func (d *BinaryPackedDecoder) decode() error { - if d.valueIndex < 0 { - blockSize, numMiniBlock, totalValues, firstValue, err := d.decodeHeader() - if err != nil { - return err - } - - d.blockSize = blockSize - d.numMiniBlock = numMiniBlock - d.miniBlockSize = blockSize / numMiniBlock - d.totalValues = totalValues - d.lastValue = firstValue - d.valueIndex = 0 - d.blockIndex = 0 - - if d.totalValues > 0 { - d.blockValues = append(d.blockValues[:0], firstValue) - } - - return nil - } - - if d.valueIndex == d.totalValues { - return io.EOF - } - - if d.blockIndex == 0 || d.blockIndex == len(d.blockValues) { - if err := d.decodeBlock(); err != nil { - return err - } - d.blockIndex = 0 - } - - return nil -} - -func (d *BinaryPackedDecoder) decodeHeader() (blockSize, numMiniBlock, totalValues int, firstValue int64, err error) { - var u uint64 - - if u, err = binary.ReadUvarint(d.reader); err != nil { - if err != io.EOF { - err = fmt.Errorf("DELTA_BINARY_PACKED: reading block size: %w", err) - } - return - } else { - blockSize = int(u) - } - if u, err = binary.ReadUvarint(d.reader); err != nil { - err = fmt.Errorf("DELTA_BINARY_PACKED: reading number of mini blocks: %w", dontExpectEOF(err)) - return - } else { - numMiniBlock = int(u) - } - if u, err = binary.ReadUvarint(d.reader); err != nil { - err = fmt.Errorf("DELTA_BINARY_PACKED: reading number of values: %w", dontExpectEOF(err)) - return - } else { - totalValues = int(u) - } - if firstValue, err = binary.ReadVarint(d.reader); err != nil { - err = fmt.Errorf("DELTA_BINARY_PACKED: reading first value: %w", dontExpectEOF(err)) - return - } - - if numMiniBlock == 0 { - err = fmt.Errorf("DELTA_BINARY_PACKED: invalid number of mini block (%d)", numMiniBlock) - } else if (blockSize <= 0) || (blockSize%128) != 0 { - err = fmt.Errorf("DELTA_BINARY_PACKED: invalid block size is not a multiple of 128 (%d)", blockSize) - } else if miniBlockSize := blockSize / numMiniBlock; (numMiniBlock <= 0) || (miniBlockSize%32) != 0 { - err = 
fmt.Errorf("DELTA_BINARY_PACKED: invalid mini block size is not a multiple of 32 (%d)", miniBlockSize) - } else if totalValues < 0 { - err = fmt.Errorf("DETLA_BINARY_PACKED: invalid total number of values is negative (%d)", totalValues) - } - return -} - -func (d *BinaryPackedDecoder) decodeBlock() error { - minDelta, err := binary.ReadVarint(d.reader) - if err != nil { - return fmt.Errorf("DELTA_BINARY_PACKED: reading min delta (%d): %w", minDelta, err) - } - - if cap(d.bitWidths) < d.numMiniBlock { - d.bitWidths = make([]byte, d.numMiniBlock) - } else { - d.bitWidths = d.bitWidths[:d.numMiniBlock] - } - - if _, err := io.ReadFull(d.reader, d.bitWidths); err != nil { - return fmt.Errorf("DELTA_BINARY_PACKED: reading bit widths: %w", err) - } - - if cap(d.blockValues) < d.blockSize { - d.blockValues = make([]int64, d.blockSize) - } else { - d.blockValues = d.blockValues[:d.blockSize] - } - - for i := range d.blockValues { - d.blockValues[i] = 0 - } - - i := 0 - j := d.miniBlockSize - remain := d.totalValues - d.valueIndex - - for _, bitWidth := range d.bitWidths { - if bitWidth != 0 { - for k := range d.blockValues[i:j] { - v, nbits, err := d.miniBlocks.ReadBits(uint(bitWidth)) - if err != nil { - // In some cases, the last mini block seems to be missing - // trailing bytes when all values have already been decoded. - // - // The spec is unclear on the topic, it says that no padding - // is added for the miniblocks that contain no values, tho - // it is not explicit on whether the last miniblock is - // allowed to be incomplete. - // - // When we remove padding on the miniblock containing the - // last value, parquet-tools sometimes fails to read the - // column. However, if we don't handle the case where EOF - // is reached before reading the full last miniblock, we - // are unable to read some of the reference files from the - // parquet-testing repository. - if err == io.EOF && (i+k) >= remain { - break - } - err = dontExpectEOF(err) - err = fmt.Errorf("DELTA_BINARY_PACKED: reading mini blocks: %w", err) - return err - } - if nbits != uint(bitWidth) { - panic("BUG: wrong number of bits read from DELTA_BINARY_PACKED miniblock") - } - d.blockValues[i+k] = int64(v) - } - } - - if j >= remain { - break - } - - i += d.miniBlockSize - j += d.miniBlockSize - } - - if remain < len(d.blockValues) { - d.blockValues = d.blockValues[:remain] - } - - bits.AddInt64(d.blockValues, minDelta) - d.blockValues[0] += d.lastValue - for i := 1; i < len(d.blockValues); i++ { - d.blockValues[i] += d.blockValues[i-1] - } - d.lastValue = d.blockValues[len(d.blockValues)-1] - return nil -} - -func dontExpectEOF(err error) error { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return err -} diff --git a/encoding/delta/binary_packed_encoder.go b/encoding/delta/binary_packed_encoder.go deleted file mode 100644 index 4fd4933..0000000 --- a/encoding/delta/binary_packed_encoder.go +++ /dev/null @@ -1,197 +0,0 @@ -package delta - -import ( - "encoding/binary" - "io" - - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/internal/bits" -) - -// TODO: figure out better heuristics to determine those values, -// right now they are optimized for keeping the memory footprint -// of the encoder/decoder at ~8KB. 
-const ( - blockSize64 = 128 - numMiniBlock64 = 4 // (blockSize64 / numMiniBlock64) % 32 == 0 - miniBlockSize64 = blockSize64 / numMiniBlock64 - - blockSize32 = 2 * blockSize64 - numMiniBlock32 = 2 * numMiniBlock64 - miniBlockSize32 = blockSize32 / numMiniBlock32 - - headerBufferSize = 32 - blockBufferSize = 8 * blockSize64 - bitWidthsBufferSize = 2 * numMiniBlock64 -) - -type BinaryPackedEncoder struct { - encoding.NotSupportedEncoder - writer io.Writer - header [headerBufferSize]byte - block [blockBufferSize]byte - bitWidths [bitWidthsBufferSize]byte - miniBlock bits.Writer -} - -func NewBinaryPackedEncoder(w io.Writer) *BinaryPackedEncoder { - e := &BinaryPackedEncoder{} - e.Reset(w) - return e -} - -func (e *BinaryPackedEncoder) Reset(w io.Writer) { - e.writer = w - e.miniBlock.Reset(w) -} - -func (e *BinaryPackedEncoder) EncodeInt32(data []int32) error { - firstValue := int32(0) - if len(data) > 0 { - firstValue = data[0] - } - - if err := e.encodeHeader(blockSize32, numMiniBlock32, len(data), int64(firstValue)); err != nil { - return err - } - - if len(data) <= 1 { - return nil - } - - data = data[1:] - lastValue := firstValue - - for len(data) > 0 { - block := bits.BytesToInt32(e.block[:]) - for i := range block { - block[i] = 0 - } - - n := copy(block, data) - data = data[n:] - - for i, v := range block[:n] { - block[i], lastValue = v-lastValue, v - } - - minDelta := bits.MinInt32(block[:n]) - bits.SubInt32(block[:n], minDelta) - - bitWidths := e.bitWidths[:numMiniBlock32] - for i := range bitWidths { - j := (i + 0) * miniBlockSize32 - k := (i + 1) * miniBlockSize32 - bitWidths[i] = byte(bits.MaxLen32(block[j:k])) - } - - if err := e.encodeBlock(int64(minDelta), bitWidths); err != nil { - return err - } - - for i, bitWidth := range bitWidths { - j := (i + 0) * miniBlockSize32 - k := (i + 1) * miniBlockSize32 - if bitWidth != 0 { - for _, bits := range block[j:k] { - e.miniBlock.WriteBits(uint64(bits), uint(bitWidth)) - } - } - if k >= n { - break - } - } - - if err := e.miniBlock.Flush(); err != nil { - return err - } - } - - return nil -} - -func (e *BinaryPackedEncoder) EncodeInt64(data []int64) error { - firstValue := int64(0) - if len(data) > 0 { - firstValue = data[0] - } - - if err := e.encodeHeader(blockSize64, numMiniBlock64, len(data), firstValue); err != nil { - return err - } - - if len(data) <= 1 { - return nil - } - - data = data[1:] - lastValue := firstValue - - for len(data) > 0 { - block := bits.BytesToInt64(e.block[:]) - for i := range block { - block[i] = 0 - } - - n := copy(block, data) - data = data[n:] - - for i, v := range block[:n] { - block[i], lastValue = v-lastValue, v - } - - minDelta := bits.MinInt64(block) - bits.SubInt64(block, minDelta) - - bitWidths := e.bitWidths[:numMiniBlock64] - for i := range bitWidths { - j := (i + 0) * miniBlockSize64 - k := (i + 1) * miniBlockSize64 - bitWidths[i] = byte(bits.MaxLen64(block[j:k])) - } - - if err := e.encodeBlock(minDelta, bitWidths); err != nil { - return err - } - - for i, bitWidth := range bitWidths { - j := (i + 0) * miniBlockSize64 - k := (i + 1) * miniBlockSize64 - if bitWidth != 0 { - for _, bits := range block[j:k] { - e.miniBlock.WriteBits(uint64(bits), uint(bitWidth)) - } - } - if k >= n { - break - } - } - - if err := e.miniBlock.Flush(); err != nil { - return err - } - } - - return nil -} - -func (e *BinaryPackedEncoder) encodeHeader(blockSize, numMiniBlock, totalValues int, firstValue int64) error { - b := e.header[:] - n := 0 - n += binary.PutUvarint(b[n:], uint64(blockSize)) - n += 
binary.PutUvarint(b[n:], uint64(numMiniBlock)) - n += binary.PutUvarint(b[n:], uint64(totalValues)) - n += binary.PutVarint(b[n:], firstValue) - _, err := e.writer.Write(b[:n]) - return err -} - -func (e *BinaryPackedEncoder) encodeBlock(minDelta int64, bitWidths []byte) error { - b := e.header[:] - n := binary.PutVarint(b, minDelta) - if _, err := e.writer.Write(b[:n]); err != nil { - return err - } - _, err := e.writer.Write(bitWidths) - return err -} diff --git a/encoding/delta/byte_array.go b/encoding/delta/byte_array.go index 49c2abc..941833c 100644 --- a/encoding/delta/byte_array.go +++ b/encoding/delta/byte_array.go @@ -1,35 +1,201 @@ package delta import ( - "io" + "bytes" + "fmt" + "math" "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/format" ) type ByteArrayEncoding struct { + encoding.NotSupported +} + +func (e *ByteArrayEncoding) String() string { + return "DELTA_BYTE_ARRAY" } func (e *ByteArrayEncoding) Encoding() format.Encoding { return format.DeltaByteArray } -func (e *ByteArrayEncoding) CanEncode(t format.Type) bool { +func (e *ByteArrayEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) { + lastOffset := int32(0) + + offset := getInt32Buffer() + defer putInt32Buffer(offset) + + err := plain.RangeByteArrays(src, func(value []byte) error { + offset.values = append(offset.values, lastOffset) + lastOffset += 4 + int32(len(value)) + return nil + }) + if err != nil { + return dst[:0], encoding.Error(e, err) + } + + return e.encode(dst[:0], len(offset.values), func(i int) []byte { + j := int(offset.values[i]) + k := j + plain.ByteArrayLength(src[j:]) + j += 4 + k += 4 + return src[j:k:k] + }) +} + +func (e *ByteArrayEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { // The parquet specs say that this encoding is only supported for BYTE_ARRAY // values, but the reference Java implementation appears to support // FIXED_LEN_BYTE_ARRAY as well: // https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java#L211 - return t == format.ByteArray || t == format.FixedLenByteArray + if size < 0 || size > encoding.MaxFixedLenByteArraySize { + return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) + } + if (len(src) % size) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src)) + } + return e.encode(dst[:0], len(src)/size, func(i int) []byte { + j := (i + 0) * size + k := (i + 1) * size + return src[j:k:k] + }) } -func (e *ByteArrayEncoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewByteArrayDecoder(r) +func (e *ByteArrayEncoding) encode(dst []byte, numValues int, valueAt func(int) []byte) ([]byte, error) { + prefix := getInt32Buffer() + defer putInt32Buffer(prefix) + + length := getInt32Buffer() + defer putInt32Buffer(length) + + var lastValue []byte + for i := 0; i < numValues; i++ { + value := valueAt(i) + if len(value) > math.MaxInt32 { + return dst, encoding.Errorf(e, "byte array of length %d is too large to be encoded", len(value)) + } + n := prefixLength(lastValue, value) + prefix.values = append(prefix.values, int32(n)) + length.values = append(length.values, int32(len(value)-n)) + lastValue = value + } + + var binpack BinaryPackedEncoding + var err error + dst, err = binpack.encodeInt32(dst, prefix.values) + if err != nil { + return dst, err + } + dst, err = binpack.encodeInt32(dst, 
length.values) + if err != nil { + return dst, err + } + for i, p := range prefix.values { + dst = append(dst, valueAt(i)[p:]...) + } + return dst, nil } -func (e *ByteArrayEncoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewByteArrayEncoder(w) +func (e *ByteArrayEncoding) DecodeByteArray(dst, src []byte) ([]byte, error) { + dst = dst[:0] + err := e.decode(src, func(value []byte) error { + dst = plain.AppendByteArray(dst, value) + return nil + }) + if err != nil { + err = encoding.Error(e, err) + } + return dst, err } -func (e *ByteArrayEncoding) String() string { - return "DELTA_BYTE_ARRAY" +func (e *ByteArrayEncoding) DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + if size < 0 || size > encoding.MaxFixedLenByteArraySize { + return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) + } + dst = dst[:0] + err := e.decode(src, func(value []byte) error { + if len(value) != size { + return fmt.Errorf("cannot decode value of size %d into fixed-length byte array of size %d", len(value), size) + } + dst = append(dst, value...) + return nil + }) + if err != nil { + err = encoding.Error(e, err) + } + return dst, err +} + +func (e *ByteArrayEncoding) decode(src []byte, observe func([]byte) error) error { + prefix := getInt32Buffer() + defer putInt32Buffer(prefix) + + length := getInt32Buffer() + defer putInt32Buffer(length) + + var binpack BinaryPackedEncoding + var err error + prefix.values, src, err = binpack.decodeInt32(prefix.values, src) + if err != nil { + return err + } + length.values, src, err = binpack.decodeInt32(length.values, src) + if err != nil { + return err + } + if len(prefix.values) != len(length.values) { + return fmt.Errorf("number of prefix and lengths mismatch: %d != %d", len(prefix.values), len(length.values)) + } + + value := getBytesBuffer() + defer putBytesBuffer(value) + + for i, n := range length.values { + if int(n) < 0 { + return fmt.Errorf("invalid negative value length: %d", n) + } + if int(n) > len(src) { + return fmt.Errorf("value length is larger than the input size: %d > %d", n, len(src)) + } + + p := prefix.values[i] + if int(p) < 0 { + return fmt.Errorf("invalid negative prefix length: %d", p) + } + if int(p) > value.Len() { + return fmt.Errorf("prefix length %d is larger than the last value of size %d", p, value.Len()) + } + + value.Truncate(int(p)) + value.Write(src[:n]) + src = src[n:] + + if err := observe(value.Bytes()); err != nil { + return err + } + } + + return nil +} + +func prefixLength(base, data []byte) int { + return binarySearchPrefixLength(len(base)/2, base, data) +} + +func binarySearchPrefixLength(max int, base, data []byte) int { + for len(base) > 0 { + if bytes.HasPrefix(data, base[:max]) { + if max == len(base) { + return max + } + max += (len(base)-max)/2 + 1 + } else { + base = base[:max-1] + max /= 2 + } + } + return 0 } diff --git a/encoding/delta/byte_array_decoder.go b/encoding/delta/byte_array_decoder.go deleted file mode 100644 index 6205148..0000000 --- a/encoding/delta/byte_array_decoder.go +++ /dev/null @@ -1,104 +0,0 @@ -package delta - -import ( - "bufio" - "fmt" - "io" - - "github.com/segmentio/parquet-go/encoding" -) - -type ByteArrayDecoder struct { - encoding.NotSupportedDecoder - deltas BinaryPackedDecoder - arrays LengthByteArrayDecoder - previous []byte - prefixes []int32 -} - -func NewByteArrayDecoder(r io.Reader) *ByteArrayDecoder { - d := &ByteArrayDecoder{prefixes: make([]int32, defaultBufferSize/4)} - d.Reset(r) - return d -} - -func (d *ByteArrayDecoder) Reset(r 
io.Reader) { - if _, ok := r.(*bufio.Reader); !ok { - r = bufio.NewReaderSize(r, defaultBufferSize) - } - d.deltas.Reset(r) - d.arrays.Reset(r) - d.previous = d.previous[:0] - d.prefixes = d.prefixes[:0] -} - -func (d *ByteArrayDecoder) DecodeByteArray(data *encoding.ByteArrayList) (int, error) { - return d.decode(data.Cap()-data.Len(), func(n int) ([]byte, error) { return data.PushSize(n), nil }) -} - -func (d *ByteArrayDecoder) DecodeFixedLenByteArray(size int, data []byte) (int, error) { - if size <= 0 { - return 0, fmt.Errorf("DELTA_BYTE_ARRAY: %w: size of decoded FIXED_LEN_BYTE_ARRAY must be positive", encoding.ErrInvalidArgument) - } - - i := 0 - return d.decode(len(data)/size, func(n int) ([]byte, error) { - if n != size { - return nil, fmt.Errorf("decoding fixed length byte array of size %d but a value of length %d was found", size, n) - } - v := data[i : i+n] - i += n - return v, nil - }) -} - -func (d *ByteArrayDecoder) decode(limit int, push func(int) ([]byte, error)) (int, error) { - if d.arrays.index < 0 { - if err := d.decodePrefixes(); err != nil { - return 0, fmt.Errorf("DELTA_BYTE_ARRAY: decoding prefix lengths: %w", err) - } - if err := d.arrays.decodeLengths(); err != nil { - return 0, fmt.Errorf("DELTA_BYTE_ARRAY: decoding byte array lengths: %w", err) - } - } - - if d.arrays.index == len(d.arrays.lengths) { - return 0, io.EOF - } - - decoded := 0 - for d.arrays.index < len(d.arrays.lengths) && decoded < limit { - prefixLength := len(d.previous) - suffixLength := int(d.arrays.lengths[d.arrays.index]) - length := prefixLength + suffixLength - - value, err := push(length) - if err != nil { - return decoded, fmt.Errorf("DELTA_BYTE_ARRAY: %w", err) - } - - copy(value, d.previous[:prefixLength]) - if err := d.arrays.readFull(value[prefixLength:]); err != nil { - return decoded, fmt.Errorf("DELTA_BYTE_ARRAY: decoding byte array at index %d/%d: %w", d.arrays.index, len(d.arrays.lengths), err) - } - - if i := d.arrays.index + 1; i < len(d.prefixes) { - j := int(d.prefixes[i]) - k := len(value) - if j > k { - return decoded, fmt.Errorf("DELTA_BYTE_ARRAY: next prefix is longer than the last decoded byte array (%d>%d)", j, k) - } - d.previous = append(d.previous[:0], value[:j]...) 
- } - - decoded++ - d.arrays.index++ - } - - return decoded, nil -} - -func (d *ByteArrayDecoder) decodePrefixes() (err error) { - d.prefixes, err = appendDecodeInt32(&d.deltas, d.prefixes[:0]) - return err -} diff --git a/encoding/delta/byte_array_encoder.go b/encoding/delta/byte_array_encoder.go deleted file mode 100644 index 9df92bf..0000000 --- a/encoding/delta/byte_array_encoder.go +++ /dev/null @@ -1,83 +0,0 @@ -package delta - -import ( - "bytes" - "fmt" - "io" - "math" - - "github.com/segmentio/parquet-go/encoding" -) - -type ByteArrayEncoder struct { - encoding.NotSupportedEncoder - deltas BinaryPackedEncoder - arrays LengthByteArrayEncoder - prefixes []int32 - suffixes encoding.ByteArrayList -} - -func NewByteArrayEncoder(w io.Writer) *ByteArrayEncoder { - e := &ByteArrayEncoder{prefixes: make([]int32, defaultBufferSize/4)} - e.Reset(w) - return e -} - -func (e *ByteArrayEncoder) Reset(w io.Writer) { - e.deltas.Reset(w) - e.arrays.Reset(w) - e.prefixes = e.prefixes[:0] - e.suffixes.Reset() -} - -func (e *ByteArrayEncoder) EncodeByteArray(data encoding.ByteArrayList) error { - return e.encode(data.Len(), data.Index) -} - -func (e *ByteArrayEncoder) EncodeFixedLenByteArray(size int, data []byte) error { - if size <= 0 { - return fmt.Errorf("DELTA_BYTE_ARRAY: %w: size of encoded FIXED_LEN_BYTE_ARRAY must be positive", encoding.ErrInvalidArgument) - } - return e.encode(len(data)/size, func(i int) []byte { return data[i*size : (i+1)*size] }) -} - -func (e *ByteArrayEncoder) encode(count int, valueAt func(int) []byte) error { - lastValue := ([]byte)(nil) - e.prefixes = e.prefixes[:0] - e.suffixes.Reset() - - for i := 0; i < count; i++ { - value := valueAt(i) - if len(value) > math.MaxInt32 { - return fmt.Errorf("DELTA_BYTE_ARRAY: byte array of length %d is too large to be encoded", len(value)) - } - n := prefixLength(lastValue, value) - e.prefixes = append(e.prefixes, int32(n)) - e.suffixes.Push(value[n:]) - lastValue = value - } - - if err := e.deltas.EncodeInt32(e.prefixes); err != nil { - return err - } - return e.arrays.EncodeByteArray(e.suffixes) -} - -func prefixLength(base, data []byte) int { - return binarySearchPrefixLength(len(base)/2, base, data) -} - -func binarySearchPrefixLength(max int, base, data []byte) int { - for len(base) > 0 { - if bytes.HasPrefix(data, base[:max]) { - if max == len(base) { - return max - } - max += (len(base)-max)/2 + 1 - } else { - base = base[:max-1] - max /= 2 - } - } - return 0 -} diff --git a/encoding/delta/delta.go b/encoding/delta/delta.go index b2a6b66..cf21d72 100644 --- a/encoding/delta/delta.go +++ b/encoding/delta/delta.go @@ -1,34 +1,49 @@ package delta import ( - "io" - - "github.com/segmentio/parquet-go/encoding" + "bytes" + "sync" ) const ( defaultBufferSize = 4096 ) -func appendDecodeInt32(d encoding.Decoder, data []int32) ([]int32, error) { - for { - if len(data) == cap(data) { - if cap(data) == 0 { - data = make([]int32, 0, blockSize32) - } else { - newData := make([]int32, len(data), 2*cap(data)) - copy(newData, data) - data = newData - } - } +type int32Buffer struct { + values []int32 +} - n, err := d.DecodeInt32(data[len(data):cap(data)]) - data = data[:len(data)+n] - if err != nil { - if err == io.EOF { - err = nil - } - return data, err +var ( + int32BufferPool sync.Pool // *int32Buffer + bytesBufferPool sync.Pool // *bytes.Buffer +) + +func getInt32Buffer() *int32Buffer { + b, _ := int32BufferPool.Get().(*int32Buffer) + if b != nil { + b.values = b.values[:0] + } else { + b = &int32Buffer{ + values: 
make([]int32, 0, 1024), } } + return b +} + +func putInt32Buffer(b *int32Buffer) { + int32BufferPool.Put(b) +} + +func getBytesBuffer() *bytes.Buffer { + b, _ := bytesBufferPool.Get().(*bytes.Buffer) + if b != nil { + b.Reset() + } else { + b = new(bytes.Buffer) + } + return b +} + +func putBytesBuffer(b *bytes.Buffer) { + bytesBufferPool.Put(b) } diff --git a/encoding/delta/delta_test.go b/encoding/delta/delta_test.go new file mode 100644 index 0000000..3d3b2fa --- /dev/null +++ b/encoding/delta/delta_test.go @@ -0,0 +1,27 @@ +//go:build go1.18 +// +build go1.18 + +package delta_test + +import ( + "testing" + + "github.com/segmentio/parquet-go/encoding/delta" + "github.com/segmentio/parquet-go/encoding/fuzz" +) + +func FuzzDeltaBinaryPackedInt32(f *testing.F) { + fuzz.EncodeInt32(f, new(delta.BinaryPackedEncoding)) +} + +func FuzzDeltaBinaryPackedInt64(f *testing.F) { + fuzz.EncodeInt64(f, new(delta.BinaryPackedEncoding)) +} + +func FuzzDeltaLengthByteArray(f *testing.F) { + fuzz.EncodeByteArray(f, new(delta.LengthByteArrayEncoding)) +} + +func FuzzDeltaByteArray(f *testing.F) { + fuzz.EncodeByteArray(f, new(delta.ByteArrayEncoding)) +} diff --git a/encoding/delta/length_byte_array.go b/encoding/delta/length_byte_array.go index 8939ea3..ec7869d 100644 --- a/encoding/delta/length_byte_array.go +++ b/encoding/delta/length_byte_array.go @@ -1,31 +1,82 @@ package delta import ( - "io" + "fmt" + "math" "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/format" ) type LengthByteArrayEncoding struct { + encoding.NotSupported +} + +func (e *LengthByteArrayEncoding) String() string { + return "DELTA_LENGTH_BYTE_ARRAY" } func (e *LengthByteArrayEncoding) Encoding() format.Encoding { return format.DeltaLengthByteArray } -func (e *LengthByteArrayEncoding) CanEncode(t format.Type) bool { - return t == format.ByteArray +func (e *LengthByteArrayEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) { + return e.encodeByteArray(dst[:0], src) } -func (e *LengthByteArrayEncoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewLengthByteArrayDecoder(r) +func (e *LengthByteArrayEncoding) encodeByteArray(dst, src []byte) ([]byte, error) { + b := getInt32Buffer() + defer putInt32Buffer(b) + + err := plain.RangeByteArrays(src, func(value []byte) error { + if len(value) > math.MaxInt32 { + return fmt.Errorf("byte array of length %d is too large to be encoded", len(value)) + } + b.values = append(b.values, int32(len(value))) + return nil + }) + if err != nil { + return dst, encoding.Error(e, err) + } + + binpack := BinaryPackedEncoding{} + dst, err = binpack.encodeInt32(dst, b.values) + if err != nil { + return dst, encoding.Error(e, err) + } + plain.RangeByteArrays(src, func(value []byte) error { + dst = append(dst, value...) 
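+		// src was fully validated by the first pass above, so this second
+		// pass cannot fail and its error result is deliberately ignored.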
+ return nil + }) + return dst, nil } -func (e *LengthByteArrayEncoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewLengthByteArrayEncoder(w) +func (e *LengthByteArrayEncoding) DecodeByteArray(dst, src []byte) ([]byte, error) { + return e.decodeByteArray(dst[:0], src) } -func (e *LengthByteArrayEncoding) String() string { - return "DELTA_LENGTH_BYTE_ARRAY" +func (e *LengthByteArrayEncoding) decodeByteArray(dst, src []byte) ([]byte, error) { + length := getInt32Buffer() + defer putInt32Buffer(length) + + var binpack BinaryPackedEncoding + var err error + length.values, src, err = binpack.decodeInt32(length.values, src) + if err != nil { + return dst, err + } + + for _, n := range length.values { + if int(n) < 0 { + return dst, encoding.Errorf(e, "invalid negative value length: %d", n) + } + if int(n) > len(src) { + return dst, encoding.Errorf(e, "value length is larger than the input size: %d > %d", n, len(src)) + } + dst = plain.AppendByteArray(dst, src[:n]) + src = src[n:] + } + + return dst, nil } diff --git a/encoding/delta/length_byte_array_decoder.go b/encoding/delta/length_byte_array_decoder.go deleted file mode 100644 index ec9b682..0000000 --- a/encoding/delta/length_byte_array_decoder.go +++ /dev/null @@ -1,66 +0,0 @@ -package delta - -import ( - "fmt" - "io" - - "github.com/segmentio/parquet-go/encoding" -) - -type LengthByteArrayDecoder struct { - encoding.NotSupportedDecoder - binpack BinaryPackedDecoder - lengths []int32 - index int -} - -func NewLengthByteArrayDecoder(r io.Reader) *LengthByteArrayDecoder { - d := &LengthByteArrayDecoder{lengths: make([]int32, defaultBufferSize/4)} - d.Reset(r) - return d -} - -func (d *LengthByteArrayDecoder) Reset(r io.Reader) { - d.binpack.Reset(r) - d.lengths = d.lengths[:0] - d.index = -1 -} - -func (d *LengthByteArrayDecoder) DecodeByteArray(data *encoding.ByteArrayList) (n int, err error) { - if d.index < 0 { - if err := d.decodeLengths(); err != nil { - return 0, err - } - } - - n = data.Len() - for data.Len() < data.Cap() && d.index < len(d.lengths) { - value := data.PushSize(int(d.lengths[d.index])) - _, err := io.ReadFull(d.binpack.reader, value) - if err != nil { - err = fmt.Errorf("DELTA_LENGTH_BYTE_ARRAY: decoding byte array at index %d/%d: %w", d.index, len(d.lengths), dontExpectEOF(err)) - break - } - d.index++ - } - - if d.index == len(d.lengths) { - err = io.EOF - } - - return data.Len() - n, err -} - -func (d *LengthByteArrayDecoder) decodeLengths() (err error) { - d.lengths, err = appendDecodeInt32(&d.binpack, d.lengths[:0]) - if err != nil { - return err - } - d.index = 0 - return nil -} - -func (d *LengthByteArrayDecoder) readFull(b []byte) error { - _, err := io.ReadFull(d.binpack.reader, b) - return dontExpectEOF(err) -} diff --git a/encoding/delta/length_byte_array_encoder.go b/encoding/delta/length_byte_array_encoder.go deleted file mode 100644 index efbed42..0000000 --- a/encoding/delta/length_byte_array_encoder.go +++ /dev/null @@ -1,50 +0,0 @@ -package delta - -import ( - "fmt" - "io" - "math" - - "github.com/segmentio/parquet-go/encoding" -) - -type LengthByteArrayEncoder struct { - encoding.NotSupportedEncoder - binpack BinaryPackedEncoder - lengths []int32 -} - -func NewLengthByteArrayEncoder(w io.Writer) *LengthByteArrayEncoder { - e := &LengthByteArrayEncoder{lengths: make([]int32, defaultBufferSize/4)} - e.Reset(w) - return e -} - -func (e *LengthByteArrayEncoder) Reset(w io.Writer) { - e.binpack.Reset(w) -} - -func (e *LengthByteArrayEncoder) EncodeByteArray(data 
encoding.ByteArrayList) (err error) { - e.lengths = e.lengths[:0] - - data.Range(func(value []byte) bool { - if len(value) > math.MaxInt32 { - err = fmt.Errorf("DELTA_LENGTH_BYTE_ARRAY: byte array of length %d is too large to be encoded", len(value)) - return false - } - e.lengths = append(e.lengths, int32(len(value))) - return true - }) - if err != nil { - return err - } - if err = e.binpack.EncodeInt32(e.lengths); err != nil { - return err - } - - data.Range(func(value []byte) bool { - _, err = e.binpack.writer.Write(value) - return err == nil - }) - return err -} diff --git a/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/2404234dd7e87c04303eb7e58208d5b2ccb04fb616c18f3254e2375c4bc327e3 b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/2404234dd7e87c04303eb7e58208d5b2ccb04fb616c18f3254e2375c4bc327e3 new file mode 100644 index 0000000..72f66a7 --- /dev/null +++ b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/2404234dd7e87c04303eb7e58208d5b2ccb04fb616c18f3254e2375c4bc327e3 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x80\xf8\xa9\xaf\x14\xfc\r\rR1000") +int64(13) diff --git a/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/4cf9c92e5a2096e3d6c42eaf9b1e31d2567854d33e06c8d2d7a8c46437345850 b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/4cf9c92e5a2096e3d6c42eaf9b1e31d2567854d33e06c8d2d7a8c46437345850 new file mode 100644 index 0000000..bc5d42b --- /dev/null +++ b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/4cf9c92e5a2096e3d6c42eaf9b1e31d2567854d33e06c8d2d7a8c46437345850 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\xa1\xa1\xa1\xa1\xa1\xa1\xa1\xa1\xa100") +int64(-180) diff --git a/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/9b210529f5e34e2dea5824929bf0d8242dc9c3165c0dce10bb376c50e21b38cc b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/9b210529f5e34e2dea5824929bf0d8242dc9c3165c0dce10bb376c50e21b38cc new file mode 100644 index 0000000..e571e4e --- /dev/null +++ b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/9b210529f5e34e2dea5824929bf0d8242dc9c3165c0dce10bb376c50e21b38cc @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x800000\xc9\xc9\xc9\xc9\xc9\xc9\xc9\xc9\xc900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000") +int64(-79) diff --git a/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/fbe137144bcda3a149c8ea109703f3242192c5480ea1e82dde0ea24e94f3afef b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/fbe137144bcda3a149c8ea109703f3242192c5480ea1e82dde0ea24e94f3afef new file mode 100644 index 0000000..3742aad --- /dev/null +++ b/encoding/delta/testdata/fuzz/FuzzDeltaByteArray/fbe137144bcda3a149c8ea109703f3242192c5480ea1e82dde0ea24e94f3afef @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x8000000") +int64(-97) diff --git a/encoding/encoding.go b/encoding/encoding.go index ee27659..1377181 100644 --- a/encoding/encoding.go +++ b/encoding/encoding.go @@ -3,12 +3,16 @@ package encoding import ( - "io" + "math" 
"github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/format" ) +const ( + MaxFixedLenByteArraySize = math.MaxInt16 +) + // The Encoding interface is implemented by types representing parquet column // encodings. // @@ -20,170 +24,41 @@ type Encoding interface { // Returns the parquet code representing the encoding. Encoding() format.Encoding - // Checks whether the encoding is capable of serializing parquet values of - // the given type. - CanEncode(format.Type) bool - - // Creates a decoder reading encoded values to the io.Reader passed as - // argument. + // Encode methods serialize the source sequence of values into the + // destination buffer, potentially reallocating it if it was too short to + // contain the output. // - // The io.Reader may be nil, in which case the decoder's Reset method must - // be called with a non-nil io.Reader prior to decoding values. - NewDecoder(io.Reader) Decoder - - // Creates an encoder writing values to the io.Writer passed as argument. + // When encoding columns of byte array values, the input is expected to be + // formatted with the PLAIN encoding (a sequence of 4-bytes length prefix + // followed by the data). // - // The io.Writer may be nil, in which case the encoder's Reset method must - // be called with a non-nil io.Writer prior to encoding values. - NewEncoder(io.Writer) Encoder -} - -// The Encoder interface is implemented by encoders types. -// -// Some encodings only support partial -type Encoder interface { - // Calling Reset clears the encoder state and changes the io.Writer where - // encoded values are written to the one given as argument. - // - // The io.Writer may be nil, in which case the encoder must not be used - // until Reset is called again with a non-nil writer. - // - // Calling Reset does not override the bit-width configured on the encoder. - Reset(io.Writer) - - // Encodes an array of boolean values using this encoder. - EncodeBoolean(data []bool) error - - // Encodes an array of 8 bits integer values using this encoder. - // - // The parquet type system does not have a 8 bits integers, this method - // is intended to encode INT32 values but receives them as an array of - // int8 values to enable greater memory efficiency when the application - // knows that all values can fit in 8 bits. - EncodeInt8(data []int8) error - - // Encodes an array of boolean values using this encoder. - // - // The parquet type system does not have a 16 bits integers, this method - // is intended to encode INT32 values but receives them as an array of - // int8 values to enable greater memory efficiency when the application - // knows that all values can fit in 16 bits. - EncodeInt16(data []int16) error - - // Encodes an array of 32 bit integer values using this encoder. - EncodeInt32(data []int32) error - - // Encodes an array of 64 bit integer values using this encoder. - EncodeInt64(data []int64) error - - // Encodes an array of 96 bit integer values using this encoder. - EncodeInt96(data []deprecated.Int96) error - - // Encodes an array of 32 bit floating point values using this encoder. - EncodeFloat(data []float32) error - - // Encodes an array of 64 bit floating point values using this encoder. - EncodeDouble(data []float64) error - - // Encodes an array of variable length byte array values using this encoder. - EncodeByteArray(data ByteArrayList) error - - // Encodes an array of fixed length byte array values using this encoder. 
+ // When encoding fixed-length byte array values, each value is expected to + // be found back-to-back in chunks of the given size. + EncodeBoolean(dst []byte, src []bool) ([]byte, error) + EncodeInt8(dst []byte, src []int8) ([]byte, error) + EncodeInt32(dst []byte, src []int32) ([]byte, error) + EncodeInt64(dst []byte, src []int64) ([]byte, error) + EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) + EncodeFloat(dst []byte, src []float32) ([]byte, error) + EncodeDouble(dst []byte, src []float64) ([]byte, error) + EncodeByteArray(dst, src []byte) ([]byte, error) + EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) + + // Decode methods deserialize from the source buffer into the destination + // slice, potentially growing it if it was too short to contain the result. // - // The list is encoded contiguously in the `data` byte slice, in chunks of - // `size` elements - EncodeFixedLenByteArray(size int, data []byte) error - - // Configures the bit-width on the encoder. - // - // Not all encodings require declaring the bit-width, but applications that - // use the Encoder abstraction should not make assumptions about the - // underlying type of the encoder, and therefore should call SetBitWidth - // prior to encoding repetition and definition levels. - SetBitWidth(bitWidth int) -} - -// The Decoder interface is implemented by decoder types. -type Decoder interface { - // Calling Reset clears the decoder state and changes the io.Reader where - // decoded values are written to the one given as argument. - // - // The io.Reader may be nil, in which case the decoder must not be used - // until Reset is called again with a non-nil reader. - // - // Calling Reset does not override the bit-width configured on the decoder. - Reset(io.Reader) - - // Decodes an array of boolean values using this decoder, returning - // the number of decoded values, and io.EOF if the end of the underlying - // io.Reader was reached. - DecodeBoolean(data []bool) (int, error) - - // Decodes an array of 8 bits integer values using this decoder, returning - // the number of decoded values, and io.EOF if the end of the underlying - // io.Reader was reached. - // - // The parquet type system does not have a 8 bits integers, this method - // is intended to decode INT32 values but receives them as an array of - // int8 values to enable greater memory efficiency when the application - // knows that all values can fit in 8 bits. - DecodeInt8(data []int8) (int, error) - - // Decodes an array of 16 bits integer values using this decoder, returning - // the number of decoded values, and io.EOF if the end of the underlying - // io.Reader was reached. - // - // The parquet type system does not have a 16 bits integers, this method - // is intended to decode INT32 values but receives them as an array of - // int8 values to enable greater memory efficiency when the application - // knows that all values can fit in 16 bits. - DecodeInt16(data []int16) (int, error) - - // Decodes an array of 32 bits integer values using this decoder, returning - // the number of decoded values, and io.EOF if the end of the underlying - // io.Reader was reached. - DecodeInt32(data []int32) (int, error) - - // Decodes an array of 64 bits integer values using this decoder, returning - // the number of decoded values, and io.EOF if the end of the underlying - // io.Reader was reached. 
-	DecodeInt64(data []int64) (int, error)
-
-	// Decodes an array of 96 bits integer values using this decoder, returning
-	// the number of decoded values, and io.EOF if the end of the underlying
-	// io.Reader was reached.
-	DecodeInt96(data []deprecated.Int96) (int, error)
-
-	// Decodes an array of 32 bits floating point values using this decoder,
-	// returning the number of decoded values, and io.EOF if the end of the
-	// underlying io.Reader was reached.
-	DecodeFloat(data []float32) (int, error)
-
-	// Decodes an array of 64 bits floating point values using this decoder,
-	// returning the number of decoded values, and io.EOF if the end of the
-	// underlying io.Reader was reached.
-	DecodeDouble(data []float64) (int, error)
-
-	// Decodes an array of variable length byte array values using this decoder,
-	// returning the number of decoded values, and io.EOF if the end of the
-	// underlying io.Reader was reached.
-	//
-	// The values are written to the `data` buffer by calling the Push method,
-	// the method returns the number of values written. DecodeByteArray will
-	// stop pushing value to the output ByteArrayList if its total capacity is
-	// reached.
-	DecodeByteArray(data *ByteArrayList) (int, error)
-
-	// Decodes an array of fixed length byte array values using this decoder,
-	// returning the number of decoded values, and io.EOF if the end of the
-	// underlying io.Reader was reached.
-	DecodeFixedLenByteArray(size int, data []byte) (int, error)
-
-	// Configures the bit-width on the decoder.
+	// When decoding columns of byte array values, the values are written to the
+	// output buffer using the PLAIN encoding.
+	//
-	// Not all encodings require declaring the bit-width, but applications that
-	// use the Decoder abstraction should not make assumptions about the
-	// underlying type of the decoder, and therefore should call SetBitWidth
-	// prior to decoding repetition and definition levels.
-	SetBitWidth(bitWidth int)
+	// When decoding fixed-length byte array values, each value is written
+	// back-to-back in chunks of the given size to the output buffer.
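+	//
+	// For example, a round trip through an encoding might look like:
+	//
+	//	buf, err := enc.EncodeInt32(nil, values)
+	//	...
+	//	values, err = enc.DecodeInt32(values[:0], buf)
+	//
+	// with nil or recycled destination buffers being grown as needed.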
+ DecodeBoolean(dst []bool, src []byte) ([]bool, error) + DecodeInt8(dst []int8, src []byte) ([]int8, error) + DecodeInt32(dst []int32, src []byte) ([]int32, error) + DecodeInt64(dst []int64, src []byte) ([]int64, error) + DecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error) + DecodeFloat(dst []float32, src []byte) ([]float32, error) + DecodeDouble(dst []float64, src []byte) ([]float64, error) + DecodeByteArray(dst, src []byte) ([]byte, error) + DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) } diff --git a/encoding/encoding_test.go b/encoding/encoding_test.go index 10e1055..0fe83fd 100644 --- a/encoding/encoding_test.go +++ b/encoding/encoding_test.go @@ -2,9 +2,10 @@ package encoding_test import ( "bytes" - "errors" "io" "math" + "math/rand" + "reflect" "testing" "github.com/segmentio/parquet-go/deprecated" @@ -13,10 +14,17 @@ import ( "github.com/segmentio/parquet-go/encoding/delta" "github.com/segmentio/parquet-go/encoding/plain" "github.com/segmentio/parquet-go/encoding/rle" - "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/bits" ) +func repeatInt64(seq []int64, n int) []int64 { + rep := make([]int64, len(seq)*n) + for i := 0; i < n; i++ { + copy(rep[i*len(seq):], seq) + } + return rep +} + var booleanTests = [...][]bool{ {}, {true}, @@ -49,32 +57,7 @@ var int8Tests = [...][]int8{ {}, {0}, {1}, - {-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt8, math.MaxInt8, 0}, - { // repeating 24x - 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, - }, - { // never repeating - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, - 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, - }, - { // streaks of repeating values - 0, 0, 0, 0, 1, 1, 1, 1, - 2, 2, 2, 2, 3, 3, 3, 3, - 4, 4, 4, 4, 5, 5, 5, 5, - 6, 6, 6, 7, 7, 7, 8, 8, - 8, 9, 9, 9, - }, -} - -var int16Tests = [...][]int16{ - {}, - {0}, - {1}, - {-1, 0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt16, math.MaxInt16, 0}, + {0, 1, 0, 2, 3, 4, 5, 6, math.MaxInt8, math.MaxInt8, 0}, { // repeating 24x 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, @@ -153,6 +136,16 @@ var int64Tests = [...][]int64{ 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, }, + repeatInt64( // a sequence resulting in 64 bits words in the delta binary packed encoding + []int64{ + math.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64, + math.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64, + + 0, math.MaxInt64, math.MinInt64, math.MaxInt64, + math.MinInt64, math.MaxInt64, math.MinInt64, math.MaxInt64, + }, + 5, + ), } var int96Tests = [...][]deprecated.Int96{ @@ -194,51 +187,53 @@ var fixedLenByteArrayTests = [...]struct { {size: 8, data: []byte("ABCDEFGH")}, } -func TestEncoding(t *testing.T) { - for _, test := range [...]struct { - scenario string - encoding encoding.Encoding - }{ - { - scenario: "PLAIN", - encoding: new(plain.Encoding), - }, +var encodings = [...]struct { + scenario string + encoding encoding.Encoding +}{ + { + scenario: "PLAIN", + encoding: new(plain.Encoding), + }, - { - scenario: "RLE", - encoding: new(rle.Encoding), - }, + { + scenario: "RLE", + encoding: new(rle.Encoding), + }, - { - scenario: "PLAIN_DICTIONARY", - encoding: new(plain.DictionaryEncoding), - }, + { + scenario: "PLAIN_DICTIONARY", + encoding: new(plain.DictionaryEncoding), + }, - { - scenario: "RLE_DICTIONARY", - encoding: 
new(rle.DictionaryEncoding), - }, + { + scenario: "RLE_DICTIONARY", + encoding: new(rle.DictionaryEncoding), + }, - { - scenario: "DELTA_BINARY_PACKED", - encoding: new(delta.BinaryPackedEncoding), - }, + { + scenario: "DELTA_BINARY_PACKED", + encoding: new(delta.BinaryPackedEncoding), + }, - { - scenario: "DELTA_LENGTH_BYTE_ARRAY", - encoding: new(delta.LengthByteArrayEncoding), - }, + { + scenario: "DELTA_LENGTH_BYTE_ARRAY", + encoding: new(delta.LengthByteArrayEncoding), + }, - { - scenario: "DELTA_BYTE_ARRAY", - encoding: new(delta.ByteArrayEncoding), - }, + { + scenario: "DELTA_BYTE_ARRAY", + encoding: new(delta.ByteArrayEncoding), + }, - { - scenario: "BYTE_STREAM_SPLIT", - encoding: new(bytestreamsplit.Encoding), - }, - } { + { + scenario: "BYTE_STREAM_SPLIT", + encoding: new(bytestreamsplit.Encoding), + }, +} + +func TestEncoding(t *testing.T) { + for _, test := range encodings { t.Run(test.scenario, func(t *testing.T) { testEncoding(t, test.encoding) }) } } @@ -258,11 +253,6 @@ func testEncoding(t *testing.T, e encoding.Encoding) { function: testInt8Encoding, }, - { - scenario: "int16", - function: testInt16Encoding, - }, - { scenario: "int32", function: testInt32Encoding, @@ -302,530 +292,630 @@ func testEncoding(t *testing.T, e encoding.Encoding) { } } -func testBooleanEncoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Boolean) { - t.Skipf("%s cannot encode boolean values", e) +func setBitWidth(e encoding.Encoding, bitWidth int) { + if r, ok := e.(*rle.Encoding); ok { + r.BitWidth = bitWidth } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]bool{} +func testBooleanEncoding(t *testing.T, e encoding.Encoding) { + testCanEncodeBoolean(t, e) + buffer := []byte{} + values := []bool{} + setBitWidth(e, 1) for _, test := range booleanTests { t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - - if err := enc.EncodeBoolean(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) + var err error + buffer, err = e.EncodeBoolean(buffer, test) + if err != nil { + t.Fatal(err) } - - for i, want := range test { - n, err := dec.DecodeBoolean(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if got := tmp[0]; got != want { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, want, got) - } + values, err = e.DecodeBoolean(values, buffer) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(test, values[:len(test)]) { + t.Fatalf("values mismatch:\nwant = %+v\ngot = %+v", test, values) } - // Boolean encodings may pad their output with up to 7 bits, so we // count the distance from the last decoded value to the EOF error, // and ensure that it's always smaller than 8. 
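+			// (A byte holds 8 booleans, so byte-boundary padding can add at
+			// most 7 spurious trailing values to the decoded output.)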
- extra := 0 - for { - if extra == 8 { - t.Fatal("nil error returned for more than 7 tailing bits") - break - } - if n, err := dec.DecodeBoolean(tmp[:]); err == io.EOF { - break - } else if n != 1 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - extra++ + if extra := len(values) - len(test); extra > 7 { + t.Fatal("nil error returned for more than 7 tailing bits") } }) } } func testInt8Encoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Int32) { - t.Skipf("%s cannot encode int32 values", e) - } - - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]int8{} + testCanEncodeInt8(t, e) + buffer := []byte{} + values := []int8{} for _, test := range int8Tests { + setBitWidth(e, bits.MaxLen8(test)) + t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() + var err error + buffer, err = e.EncodeInt8(buffer, test) + assertNoError(t, err) + values, err = e.DecodeInt8(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) + }) + } +} - bitWidth := bits.MaxLen8(test) - if bitWidth == 0 { - bitWidth = 1 - } - enc.SetBitWidth(bitWidth) - dec.SetBitWidth(bitWidth) - - if err := enc.EncodeInt8(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func testInt32Encoding(t *testing.T, e encoding.Encoding) { + testCanEncodeInt32(t, e) + buffer := []byte{} + values := []int32{} - for i := range test { - n, err := dec.DecodeInt8(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[0]) - } - } + for _, test := range int32Tests { + setBitWidth(e, bits.MaxLen32(test)) - if n, err := dec.DecodeInt8(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } + t.Run("", func(t *testing.T) { + var err error + buffer, err = e.EncodeInt32(buffer, test) + assertNoError(t, err) + values, err = e.DecodeInt32(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) }) } } -func testInt16Encoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Int32) { - t.Skipf("%s cannot encode int32 values", e) - } +func testInt64Encoding(t *testing.T, e encoding.Encoding) { + testCanEncodeInt64(t, e) + buffer := []byte{} + values := []int64{} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]int16{} + for _, test := range int64Tests { + setBitWidth(e, bits.MaxLen64(test)) - for _, test := range int16Tests { t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - - bitWidth := bits.MaxLen16(test) - if bitWidth == 0 { - bitWidth = 1 - } - enc.SetBitWidth(bitWidth) - dec.SetBitWidth(bitWidth) - - if err := enc.EncodeInt16(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } + var err error + buffer, err = e.EncodeInt64(buffer, test) + assertNoError(t, err) + values, err = e.DecodeInt64(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) + }) + } +} - for i := range test { - n, err := dec.DecodeInt16(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: 
%d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[0]) - } - } +func testInt96Encoding(t *testing.T, e encoding.Encoding) { + testCanEncodeInt96(t, e) + buffer := []byte{} + values := []deprecated.Int96{} - if n, err := dec.DecodeInt16(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } + for _, test := range int96Tests { + t.Run("", func(t *testing.T) { + var err error + buffer, err = e.EncodeInt96(buffer, test) + assertNoError(t, err) + values, err = e.DecodeInt96(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) }) } } -func testInt32Encoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Int32) { - t.Skipf("%s cannot encode int32 values", e) +func testFloatEncoding(t *testing.T, e encoding.Encoding) { + testCanEncodeFloat(t, e) + buffer := []byte{} + values := []float32{} + + for _, test := range floatTests { + t.Run("", func(t *testing.T) { + var err error + buffer, err = e.EncodeFloat(buffer, test) + assertNoError(t, err) + values, err = e.DecodeFloat(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) + }) } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]int32{} +func testDoubleEncoding(t *testing.T, e encoding.Encoding) { + testCanEncodeDouble(t, e) + buffer := []byte{} + values := []float64{} - for _, test := range int32Tests { + for _, test := range doubleTests { t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() + var err error + buffer, err = e.EncodeDouble(buffer, test) + assertNoError(t, err) + values, err = e.DecodeDouble(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, test, values) + }) + } +} - bitWidth := bits.MaxLen32(test) - if bitWidth == 0 { - bitWidth = 1 - } - enc.SetBitWidth(bitWidth) - dec.SetBitWidth(bitWidth) - - if err := enc.EncodeInt32(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func testByteArrayEncoding(t *testing.T, e encoding.Encoding) { + testCanEncodeByteArray(t, e) + buffer := []byte{} + values := []byte{} + byteArrays := []byte{} - for i := range test { - n, err := dec.DecodeInt32(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[0]) - } - } + for _, test := range byteArrayTests { + byteArrays = byteArrays[:0] - if n, err := dec.DecodeInt32(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } + for _, value := range test { + byteArrays = plain.AppendByteArray(byteArrays, value) + } + + t.Run("", func(t *testing.T) { + var err error + buffer, err = e.EncodeByteArray(buffer, byteArrays) + assertNoError(t, err) + values, err = e.DecodeByteArray(values, buffer) + assertNoError(t, err) + assertDeepEqual(t, byteArrays, values) }) } } -func testInt64Encoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Int64) { - t.Skipf("%s cannot encode int64 values", e) - } - - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]int64{} 
+func testFixedLenByteArrayEncoding(t *testing.T, e encoding.Encoding) { + testCanEncodeFixedLenByteArray(t, e) + buffer := []byte{} + values := []byte{} - for _, test := range int64Tests { + for _, test := range fixedLenByteArrayTests { t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() + var err error + buffer, err = e.EncodeFixedLenByteArray(buffer, test.data, test.size) + assertNoError(t, err) + values, err = e.DecodeFixedLenByteArray(values, buffer, test.size) + assertNoError(t, err) + assertDeepEqual(t, test.data, values) + }) + } +} - bitWidth := bits.MaxLen64(test) - if bitWidth == 0 { - bitWidth = 1 - } - enc.SetBitWidth(bitWidth) - dec.SetBitWidth(bitWidth) - - if err := enc.EncodeInt64(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func testCanEncodeBoolean(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeBoolean) +} - for i := range test { - n, err := dec.DecodeInt64(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[0]) - } - } +func testCanEncodeInt8(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeInt8) +} - if n, err := dec.DecodeInt64(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) - } +func testCanEncodeInt32(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeInt32) } -func testInt96Encoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Int96) { - t.Skipf("%s cannot encode int96 values", e) - } +func testCanEncodeInt64(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeInt64) +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]deprecated.Int96{} +func testCanEncodeInt96(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeInt96) +} - for _, test := range int96Tests { - t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() +func testCanEncodeFloat(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeFloat) +} - bitWidth := deprecated.MaxLenInt96(test) - if bitWidth == 0 { - bitWidth = 1 - } - enc.SetBitWidth(bitWidth) - dec.SetBitWidth(bitWidth) - - if err := enc.EncodeInt96(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func testCanEncodeDouble(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeDouble) +} - for i := range test { - n, err := dec.DecodeInt96(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[i]) - } - } +func testCanEncodeByteArray(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeByteArray) +} - if n, err := dec.DecodeInt96(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) +func 
testCanEncodeFixedLenByteArray(t testing.TB, e encoding.Encoding) { + testCanEncode(t, e, encoding.CanEncodeFixedLenByteArray) +} + +func testCanEncode(t testing.TB, e encoding.Encoding, test func(encoding.Encoding) bool) { + if !test(e) { + t.Skip("encoding not supported") } } -func testFloatEncoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Float) { - t.Skipf("%s cannot encode float values", e) +func assertNoError(t *testing.T, err error) { + if err != nil { + t.Fatal(err) } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]float32{} +func assertDeepEqual(t *testing.T, want, got interface{}) { + if !reflect.DeepEqual(want, got) { + t.Fatalf("values mismatch:\nwant = %+v\ngot = %+v", want, got) + } +} - for _, test := range floatTests { - t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - - if err := enc.EncodeFloat(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +const ( + benchmarkNumValues = 10e3 +) - for i := range test { - n, err := dec.DecodeFloat(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[i]) - } - } +func newRand() *rand.Rand { + return rand.New(rand.NewSource(1)) +} - if n, err := dec.DecodeFloat(tmp[:]); err != nil && err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) +func BenchmarkEncode(b *testing.B) { + for _, test := range encodings { + b.Run(test.scenario, func(b *testing.B) { benchmarkEncode(b, test.encoding) }) } } -func testDoubleEncoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.Double) { - t.Skipf("%s cannot encode double values", e) +func benchmarkEncode(b *testing.B, e encoding.Encoding) { + for _, test := range [...]struct { + scenario string + function func(*testing.B, encoding.Encoding) + }{ + { + scenario: "boolean", + function: benchmarkEncodeBoolean, + }, + { + scenario: "int8", + function: benchmarkEncodeInt8, + }, + { + scenario: "int32", + function: benchmarkEncodeInt32, + }, + { + scenario: "int64", + function: benchmarkEncodeInt64, + }, + { + scenario: "float", + function: benchmarkEncodeFloat, + }, + { + scenario: "double", + function: benchmarkEncodeDouble, + }, + { + scenario: "byte array", + function: benchmarkEncodeByteArray, + }, + { + scenario: "fixed length byte array", + function: benchmarkEncodeFixedLenByteArray, + }, + } { + b.Run(test.scenario, func(b *testing.B) { test.function(b, e) }) } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := [1]float64{} +func benchmarkEncodeBoolean(b *testing.B, e encoding.Encoding) { + testCanEncodeBoolean(b, e) + buffer := make([]byte, 0) + values := generateBooleanValues(benchmarkNumValues, newRand()) + setBitWidth(e, 1) - for _, test := range doubleTests { - t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - - if err := enc.EncodeDouble(test); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeBoolean(buffer, values) + }) - for i := range test { - n, err := 
dec.DecodeDouble(tmp[:]) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - if tmp[0] != test[i] { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], tmp[i]) - } - } + b.SetBytes(1 * int64(len(values))) +} - if n, err := dec.DecodeDouble(tmp[:]); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) +func benchmarkEncodeInt8(b *testing.B, e encoding.Encoding) { + testCanEncodeInt8(b, e) + buffer := make([]byte, 0) + values := generateInt8Values(benchmarkNumValues, newRand()) + setBitWidth(e, bits.MaxLen8(values)) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeInt8(buffer, values) + }) + + b.SetBytes(1 * int64(len(values))) +} + +func benchmarkEncodeInt32(b *testing.B, e encoding.Encoding) { + testCanEncodeInt32(b, e) + buffer := make([]byte, 0) + values := generateInt32Values(benchmarkNumValues, newRand()) + setBitWidth(e, bits.MaxLen32(values)) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeInt32(buffer, values) + }) + + b.SetBytes(4 * int64(len(values))) +} + +func benchmarkEncodeInt64(b *testing.B, e encoding.Encoding) { + testCanEncodeInt64(b, e) + buffer := make([]byte, 0) + values := generateInt64Values(benchmarkNumValues, newRand()) + setBitWidth(e, bits.MaxLen64(values)) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeInt64(buffer, values) + }) + + b.SetBytes(8 * int64(len(values))) +} + +func benchmarkEncodeFloat(b *testing.B, e encoding.Encoding) { + testCanEncodeFloat(b, e) + buffer := make([]byte, 0) + values := generateFloatValues(benchmarkNumValues, newRand()) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeFloat(buffer, values) + }) + + b.SetBytes(4 * int64(len(values))) +} + +func benchmarkEncodeDouble(b *testing.B, e encoding.Encoding) { + testCanEncodeDouble(b, e) + buffer := make([]byte, 0) + values := generateDoubleValues(benchmarkNumValues, newRand()) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeDouble(buffer, values) + }) + + b.SetBytes(8 * int64(len(values))) +} + +func benchmarkEncodeByteArray(b *testing.B, e encoding.Encoding) { + testCanEncodeByteArray(b, e) + buffer := make([]byte, 0) + values := generateByteArrayValues(benchmarkNumValues, newRand()) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeByteArray(buffer, values) + }) + + b.SetBytes(int64(len(values))) +} + +func benchmarkEncodeFixedLenByteArray(b *testing.B, e encoding.Encoding) { + testCanEncodeFixedLenByteArray(b, e) + const size = 16 + buffer := make([]byte, 0) + values := generateFixedLenByteArrayValues(benchmarkNumValues, newRand(), size) + + benchmarkZeroAllocsPerRun(b, func() { + buffer, _ = e.EncodeFixedLenByteArray(buffer, values, size) + }) + + b.SetBytes(int64(len(values))) +} + +func BenchmarkDecode(b *testing.B) { + for _, test := range encodings { + b.Run(test.scenario, func(b *testing.B) { benchmarkDecode(b, test.encoding) }) } } -func testByteArrayEncoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.ByteArray) { - t.Skipf("%s cannot encode byte array values", e) +func benchmarkDecode(b *testing.B, e encoding.Encoding) { + for _, test := range [...]struct { + scenario string + function func(*testing.B, encoding.Encoding) + }{ + { + scenario: "boolean", + function: benchmarkDecodeBoolean, + }, + { + scenario: "int8", + function: 
benchmarkDecodeInt8, + }, + { + scenario: "int32", + function: benchmarkDecodeInt32, + }, + { + scenario: "int64", + function: benchmarkDecodeInt64, + }, + { + scenario: "float", + function: benchmarkDecodeFloat, + }, + { + scenario: "double", + function: benchmarkDecodeDouble, + }, + { + scenario: "byte array", + function: benchmarkDecodeByteArray, + }, + { + scenario: "fixed length byte array", + function: benchmarkDecodeFixedLenByteArray, + }, + } { + b.Run(test.scenario, func(b *testing.B) { test.function(b, e) }) } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) - tmp := encoding.MakeByteArrayList(1) +func benchmarkDecodeBoolean(b *testing.B, e encoding.Encoding) { + testCanEncodeBoolean(b, e) + values := generateBooleanValues(benchmarkNumValues, newRand()) + output := make([]bool, 0) + setBitWidth(e, 1) + buffer, _ := e.EncodeBoolean(nil, values) - for _, test := range byteArrayTests { - t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - defer tmp.Reset() + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeBoolean(output, buffer) + }) - for _, v := range test { - tmp.Push(v) - } + b.SetBytes(1 * int64(len(values))) +} - if err := enc.EncodeByteArray(tmp); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func benchmarkDecodeInt8(b *testing.B, e encoding.Encoding) { + testCanEncodeInt8(b, e) + values := generateInt8Values(benchmarkNumValues, newRand()) + output := make([]int8, 0) + setBitWidth(e, bits.MaxLen8(values)) + buffer, _ := e.EncodeInt8(nil, values) - for i := 0; i < len(test); { - tmp.Reset() - n, err := dec.DecodeByteArray(&tmp) - if err != nil && (err != io.EOF || n != len(test)) { - t.Fatal("decode:", n, err) - } - if n == 0 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - tmp.Range(func(value []byte) bool { - if !bytes.Equal(value, test[i]) { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, test[i], value) - } - i++ - return true - }) - } + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeInt8(output, buffer) + }) - tmp.Reset() - if n, err := dec.DecodeByteArray(&tmp); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) - } + b.SetBytes(1 * int64(len(values))) } -func testFixedLenByteArrayEncoding(t *testing.T, e encoding.Encoding) { - if !e.CanEncode(format.FixedLenByteArray) { - t.Skipf("%s cannot encode fixed-length byte array values", e) +func benchmarkDecodeInt32(b *testing.B, e encoding.Encoding) { + testCanEncodeInt32(b, e) + values := generateInt32Values(benchmarkNumValues, newRand()) + output := make([]int32, 0) + setBitWidth(e, bits.MaxLen32(values)) + buffer, _ := e.EncodeInt32(nil, values) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeInt32(output, buffer) + }) + + b.SetBytes(4 * int64(len(values))) +} + +func benchmarkDecodeInt64(b *testing.B, e encoding.Encoding) { + testCanEncodeInt64(b, e) + values := generateInt64Values(benchmarkNumValues, newRand()) + output := make([]int64, 0) + setBitWidth(e, bits.MaxLen64(values)) + buffer, _ := e.EncodeInt64(nil, values) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeInt64(output, buffer) + }) + + b.SetBytes(8 * int64(len(values))) +} + +func benchmarkDecodeFloat(b *testing.B, e encoding.Encoding) { + testCanEncodeFloat(b, e) + values := 
generateFloatValues(benchmarkNumValues, newRand()) + output := make([]float32, 0) + buffer, _ := e.EncodeFloat(nil, values) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeFloat(output, buffer) + }) + + b.SetBytes(4 * int64(len(values))) +} + +func benchmarkDecodeDouble(b *testing.B, e encoding.Encoding) { + testCanEncodeDouble(b, e) + values := generateDoubleValues(benchmarkNumValues, newRand()) + output := make([]float64, 0) + buffer, _ := e.EncodeDouble(nil, values) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeDouble(output, buffer) + }) + + b.SetBytes(8 * int64(len(values))) +} + +func benchmarkDecodeByteArray(b *testing.B, e encoding.Encoding) { + testCanEncodeByteArray(b, e) + values := generateByteArrayValues(benchmarkNumValues, newRand()) + output := make([]byte, 0) + buffer, _ := e.EncodeByteArray(nil, values) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeByteArray(output, buffer) + }) + + b.SetBytes(int64(len(values))) +} + +func benchmarkDecodeFixedLenByteArray(b *testing.B, e encoding.Encoding) { + testCanEncodeFixedLenByteArray(b, e) + const size = 16 + values := generateFixedLenByteArrayValues(benchmarkNumValues, newRand(), size) + output := make([]byte, 0) + buffer, _ := e.EncodeFixedLenByteArray(nil, values, size) + + benchmarkZeroAllocsPerRun(b, func() { + output, _ = e.DecodeFixedLenByteArray(output, buffer, size) + }) + + b.SetBytes(int64(len(values))) +} + +func benchmarkZeroAllocsPerRun(b *testing.B, f func()) { + if allocs := testing.AllocsPerRun(b.N, f); allocs != 0 { + b.Errorf("too many memory allocations: %g", allocs) } +} - buf := new(bytes.Buffer) - enc := e.NewEncoder(buf) - dec := e.NewDecoder(buf) +func generateBooleanValues(n int, r *rand.Rand) []bool { + values := make([]bool, n) + for i := range values { + values[i] = r.Float64() > 0.5 + } + return values +} - for _, test := range fixedLenByteArrayTests { - t.Run("", func(t *testing.T) { - defer dec.Reset(buf) - defer enc.Reset(buf) - defer buf.Reset() - tmp := make([]byte, test.size) - - if err := enc.EncodeFixedLenByteArray(test.size, test.data); err != nil { - if errors.Is(err, encoding.ErrNotSupported) { - t.Skip(err) - } - t.Fatal("encode:", err) - } +func generateInt8Values(n int, r *rand.Rand) []int8 { + values := make([]int8, n) + for i := range values { + values[i] = int8(r.Intn(6)) + } + return values +} - for i := 0; i < (len(test.data) / test.size); i++ { - n, err := dec.DecodeFixedLenByteArray(test.size, tmp) - if err != nil { - t.Fatal("decode:", err) - } - if n != 1 { - t.Fatalf("decoder decoded the wrong number of items: %d", n) - } - want := test.data[i*test.size : (i+1)*test.size] - if !bytes.Equal(want, tmp) { - t.Fatalf("decoder decoded the wrong value at index %d:\nwant = %#v\ngot = %#v", i, want, tmp) - } - } +func generateInt32Values(n int, r *rand.Rand) []int32 { + values := make([]int32, n) + for i := range values { + values[i] = r.Int31n(100) + } + return values +} - if n, err := dec.DecodeFixedLenByteArray(test.size, tmp); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) +func generateInt64Values(n int, r *rand.Rand) []int64 { + values := make([]int64, n) + for i := range values { + values[i] = r.Int63n(100) } + return values } -func BenchmarkFloatEncoding(b *testing.B) { - buf := new(bytes.Buffer) - enc := bytestreamsplit.NewEncoder(buf) - dec := bytestreamsplit.NewDecoder(buf) +func 
generateFloatValues(n int, r *rand.Rand) []float32 { + values := make([]float32, n) + for i := range values { + values[i] = r.Float32() + } + return values +} - for n := 0; n < b.N; n++ { - for _, test := range floatTests { - tmp := make([]float32, len(test)) +func generateDoubleValues(n int, r *rand.Rand) []float64 { + values := make([]float64, n) + for i := range values { + values[i] = r.Float64() + } + return values +} - if err := enc.EncodeFloat(test); err != nil { - b.Fatal(err) - } +func generateByteArrayValues(n int, r *rand.Rand) []byte { + values := make([]byte, n*21) + length := 0 - if _, err := dec.DecodeFloat(tmp); err != nil && err != io.EOF { - b.Fatal(err) - } - } + for i := 0; i < n; i++ { + k := r.Intn(20) + 1 + plain.PutByteArrayLength(values[length:], k) + length += 4 + io.ReadFull(r, values[length:length+k]) + length += k } + + return values[:length] +} + +func generateFixedLenByteArrayValues(n int, r *rand.Rand, size int) []byte { + values := make([]byte, n*size) + io.ReadFull(r, values) + return values } diff --git a/encoding/fuzz/fuzz.go b/encoding/fuzz/fuzz.go new file mode 100644 index 0000000..dd02bc0 --- /dev/null +++ b/encoding/fuzz/fuzz.go @@ -0,0 +1,156 @@ +//go:build go1.18 +// +build go1.18 + +// Package fuzz contains functions to help fuzz test parquet encodings. +package fuzz + +import ( + "bytes" + "math/rand" + "testing" + + "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/encoding/plain" + "github.com/segmentio/parquet-go/internal/unsafecast" +) + +func EncodeBoolean(f *testing.F, e encoding.Encoding) { + var err error + var buf = make([]bool, 64*1024) + var src = make([]bool, 64*1024) + var dst = make([]byte, 64*1024) + + f.Fuzz(func(t *testing.T, input []byte) { + src = src[:0] + for _, c := range input { + src = append(src, (c&1) == 1) + } + dst, err = e.EncodeBoolean(dst, src) + if err != nil { + t.Error(err) + return + } + buf, err = e.DecodeBoolean(buf, dst) + if err != nil { + t.Error(err) + return + } + if !bytes.Equal(unsafecast.Slice[byte](buf), unsafecast.Slice[byte](src)) { + t.Error("decoded output does not match the original input") + return + } + // Likely invalid inputs, look for panics. 
+ buf, _ = e.DecodeBoolean(buf, input) + }) +} + +func EncodeInt8(f *testing.F, e encoding.Encoding) { + encode(f, e, + encoding.Encoding.EncodeInt8, + encoding.Encoding.DecodeInt8, + ) +} + +func EncodeInt32(f *testing.F, e encoding.Encoding) { + encode(f, e, + encoding.Encoding.EncodeInt32, + encoding.Encoding.DecodeInt32, + ) +} + +func EncodeInt64(f *testing.F, e encoding.Encoding) { + encode(f, e, + encoding.Encoding.EncodeInt64, + encoding.Encoding.DecodeInt64, + ) +} + +func EncodeFloat(f *testing.F, e encoding.Encoding) { + encode(f, e, + encoding.Encoding.EncodeFloat, + encoding.Encoding.DecodeFloat, + ) +} + +func EncodeDouble(f *testing.F, e encoding.Encoding) { + encode(f, e, + encoding.Encoding.EncodeDouble, + encoding.Encoding.DecodeDouble, + ) +} + +func EncodeByteArray(f *testing.F, e encoding.Encoding) { + var err error + var buf = make([]byte, 64*1024) + var dst = make([]byte, 64*1024) + var src = make([]byte, 64*1024) + var prng = rand.New(rand.NewSource(0)) + + f.Fuzz(func(t *testing.T, input []byte, seed int64) { + prng.Seed(seed) + src = generatePlainByteArrayList(src[:0], input, prng) + + dst, err = e.EncodeByteArray(dst, src) + if err != nil { + t.Error(err) + return + } + buf, err = e.DecodeByteArray(buf, dst) + if err != nil { + t.Error(err) + return + } + if !bytes.Equal(buf, src) { + t.Error("decoded output does not match the original input") + return + } + // Likely invalid inputs, look for panics. + buf, _ = e.DecodeByteArray(buf, input) + }) +} + +type encodeFunc[T any] func(encoding.Encoding, []byte, []T) ([]byte, error) + +type decodeFunc[T any] func(encoding.Encoding, []T, []byte) ([]T, error) + +func encode[T any](f *testing.F, e encoding.Encoding, encode encodeFunc[T], decode decodeFunc[T]) { + var err error + var buf = make([]T, 16*1024) + var dst = make([]byte, 64*1024) + + f.Fuzz(func(t *testing.T, input []byte) { + var src = unsafecast.Slice[T](input) + dst, err = encode(e, dst, src) + if err != nil { + t.Error(err) + return + } + buf, err = decode(e, buf, dst) + if err != nil { + t.Error(err) + return + } + if !bytes.Equal(unsafecast.Slice[byte](buf), unsafecast.Slice[byte](src)) { + t.Error("decoded output does not match the original input") + return + } + // Likely invalid inputs, look for panics. 
+ buf, _ = decode(e, buf, input) + }) +} + +func generatePlainByteArrayList(dst, src []byte, prng *rand.Rand) []byte { + limit := len(src)/10 + 1 + + for i := 0; i < len(src); { + n := prng.Intn(limit) + 1 + r := len(src) - i + if n > r { + n = r + } + dst = plain.AppendByteArray(dst, src[i:i+n]) + i += n + } + + return dst +} diff --git a/encoding/fuzz_test.go b/encoding/fuzz_test.go deleted file mode 100644 index 063d084..0000000 --- a/encoding/fuzz_test.go +++ /dev/null @@ -1,412 +0,0 @@ -//go:build go1.18 -// +build go1.18 - -package encoding_test - -import ( - "bytes" - "encoding/binary" - "errors" - "io" - "math/rand" - "testing" - - "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/encoding/bytestreamsplit" - "github.com/segmentio/parquet-go/encoding/delta" - "github.com/segmentio/parquet-go/encoding/plain" - "github.com/segmentio/parquet-go/encoding/rle" - "github.com/segmentio/parquet-go/format" - "github.com/segmentio/parquet-go/internal/bits" -) - -func fuzzEncoding(fuzz func(e encoding.Encoding)) { - for _, test := range [...]struct { - scenario string - encoding encoding.Encoding - }{ - { - scenario: "PLAIN", - encoding: new(plain.Encoding), - }, - - { - scenario: "RLE", - encoding: new(rle.Encoding), - }, - - { - scenario: "PLAIN_DICTIONARY", - encoding: new(plain.DictionaryEncoding), - }, - - { - scenario: "RLE_DICTIONARY", - encoding: new(rle.DictionaryEncoding), - }, - - { - scenario: "DELTA_BINARY_PACKED", - encoding: new(delta.BinaryPackedEncoding), - }, - - { - scenario: "DELTA_LENGTH_BYTE_ARRAY", - encoding: new(delta.LengthByteArrayEncoding), - }, - - { - scenario: "DELTA_BYTE_ARRAY", - encoding: new(delta.ByteArrayEncoding), - }, - - { - scenario: "BYTE_STREAM_SPLIT", - encoding: new(bytestreamsplit.Encoding), - }, - } { - fuzz(test.encoding) - } -} - -func FuzzAllEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, input []byte, size int) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzBooleanDecoding(t, makeRandBoolean(input, size), e) - fuzzByteArrayDecoding(t, input, e) - fuzzFixedLenByteArrayDecoding(t, size, input, e) - fuzzFloatDecoding(t, makeRandFloat(input, size), e) - fuzzDoubleDecoding(t, makeRandDouble(input, size), e) - fuzzInt32Decoding(t, makeRandInt32(input, size), e) - fuzzInt64Decoding(t, makeRandInt64(input, size), e) - fuzzInt96Decoding(t, makeRandInt96(input, size), e) - }) - }) -} - -func FuzzBooleanEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, input []byte, count int) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzBooleanDecoding(t, makeRandBoolean(input, count), e) - }) - }) -} - -func FuzzByteArrayEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, input []byte) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzByteArrayDecoding(t, input, e) - }) - }) -} - -func FuzzFixedLenByteArrayEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, size int, input []byte) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzFixedLenByteArrayDecoding(t, size, input, e) - }) - }) -} - -func FuzzFloatEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, size int, input []byte) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzFloatDecoding(t, makeRandFloat(input, size), e) - fuzzDoubleDecoding(t, makeRandDouble(input, size), e) - }) - }) -} - -func FuzzIntEncoding(f *testing.F) { - f.Fuzz(func(t *testing.T, size int, input []byte) { - fuzzEncoding(func(e encoding.Encoding) { - fuzzInt32Decoding(t, 
makeRandInt32(input, size), e) - fuzzInt64Decoding(t, makeRandInt64(input, size), e) - fuzzInt96Decoding(t, makeRandInt96(input, size), e) - }) - }) -} - -func fuzzBooleanDecoding(t *testing.T, input []bool, e encoding.Encoding) { - if !e.CanEncode(format.Boolean) { - return - } - - buf := new(bytes.Buffer) - dec := e.NewDecoder(buf) - tmp := make([]bool, 1) - - for { - _, err := dec.DecodeBoolean(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding boolean: %s", e, err) - break - } - } -} - -func fuzzFixedLenByteArrayDecoding(t *testing.T, size int, input []byte, e encoding.Encoding) { - if !e.CanEncode(format.FixedLenByteArray) { - return - } - - dec := e.NewDecoder(bytes.NewReader(input)) - tmp := make([]byte, 1) - for { - _, err := dec.DecodeFixedLenByteArray(size, tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding fixed len byte array: %s", e, err) - break - } - } -} - -func fuzzByteArrayDecoding(t *testing.T, input []byte, e encoding.Encoding) { - if !e.CanEncode(format.ByteArray) { - return - } - - dec := e.NewDecoder(bytes.NewReader(input)) - tmp := encoding.MakeByteArrayList(1) - for { - _, err := dec.DecodeByteArray(&tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding byte array: %s", e, err) - break - } - } -} - -func fuzzFloatDecoding(t *testing.T, input []float32, e encoding.Encoding) { - if !e.CanEncode(format.Float) { - return - } - - buf := new(bytes.Buffer) - dec := e.NewDecoder(buf) - tmp := make([]float32, 1) - for { - _, err := dec.DecodeFloat(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding float: %s", e, err) - break - } - } -} - -func fuzzDoubleDecoding(t *testing.T, input []float64, e encoding.Encoding) { - if !e.CanEncode(format.Double) { - return - } - - buf := new(bytes.Buffer) - dec := e.NewDecoder(buf) - tmp := make([]float64, 1) - for { - _, err := dec.DecodeDouble(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding double: %s", e, err) - break - } - } -} - -func fuzzInt32Decoding(t *testing.T, input []int32, e encoding.Encoding) { - if !e.CanEncode(format.Int32) { - return - } - - buf := new(bytes.Buffer) - - dec := e.NewDecoder(buf) - if e.String() == "RLE" { - bitWidth := bits.MaxLen32(input) - if bitWidth == 0 { - bitWidth = 1 - } - dec.SetBitWidth(bitWidth) - } - - tmp := make([]int32, 1) - for { - _, err := dec.DecodeInt32(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding int32: %s", e, err) - break - } - } -} - -func fuzzInt64Decoding(t *testing.T, input []int64, e encoding.Encoding) { - if !e.CanEncode(format.Int64) { - return - } - - buf := new(bytes.Buffer) - - dec := e.NewDecoder(buf) - if e.String() == "RLE" { - bitWidth := bits.MaxLen64(input) - if bitWidth == 0 { - bitWidth = 1 - } - dec.SetBitWidth(bitWidth) - } - - tmp := make([]int64, 1) - for { - _, err := dec.DecodeInt64(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decoding int64: %s", e, err) - break - } - } -} - -func fuzzInt96Decoding(t *testing.T, input []deprecated.Int96, e encoding.Encoding) { - if !e.CanEncode(format.Int96) { - return - } - - buf := new(bytes.Buffer) - dec := e.NewDecoder(buf) - tmp := make([]deprecated.Int96, 1) - for { - _, err := dec.DecodeInt96(tmp) - if err != nil { - if errors.Is(err, io.EOF) { - break - } - t.Logf("encoding:%s, decode: %s", e, err) 
- break - } - } -} - -func makeRandBoolean(data []byte, count int) []bool { - if count < 1 { - return nil - } - src := rand.New(newByteSource(data)) - b := make([]bool, count) - for i := 0; i < count; i++ { - b[i] = src.Int63()&0x01 == 1 - } - return b -} - -func makeRandFloat(data []byte, count int) []float32 { - if count < 1 { - return nil - } - src := rand.New(newByteSource(data)) - f := make([]float32, count) - for i := 0; i < count; i++ { - f[i] = src.Float32() - } - return f -} - -func makeRandDouble(data []byte, count int) []float64 { - if count < 1 { - return nil - } - src := rand.New(newByteSource(data)) - f := make([]float64, count) - for i := 0; i < count; i++ { - f[i] = src.Float64() - } - - return f -} - -func makeRandInt32(data []byte, count int) []int32 { - if count < 1 { - return nil - } - - src := rand.New(newByteSource(data)) - a := make([]int32, count) - for i := 0; i < count; i++ { - a[i] = int32(src.Int63()) - } - return a -} - -func makeRandInt64(data []byte, count int) []int64 { - if count < 1 { - return nil - } - - src := rand.New(newByteSource(data)) - a := make([]int64, count) - for i := 0; i < count; i++ { - a[i] = src.Int63() - } - return a -} - -func makeRandInt96(data []byte, count int) []deprecated.Int96 { - if count < 1 { - return nil - } - - src := rand.New(newByteSource(data)) - a := make([]deprecated.Int96, count) - for i := 0; i < count; i++ { - a[i] = deprecated.Int96{ - uint32(src.Int63()), - uint32(src.Int63()), - uint32(src.Int63()), - } - } - return a -} - -// byteSource is used to compose fuzz tests from a byte array. -// This is to workaround the current stblib limitations. -type byteSource struct { - *bytes.Reader -} - -func newByteSource(data []byte) *byteSource { - return &byteSource{ - Reader: bytes.NewReader(data), - } -} - -func (s *byteSource) Uint64() uint64 { - var bytes [8]byte - if _, err := s.Read(bytes[:]); err != nil && !errors.Is(err, io.EOF) { - panic("byteSource: failed to read bytes") - } - return binary.BigEndian.Uint64(bytes[:]) -} - -func (s *byteSource) Int63() int64 { - return int64(s.Uint64() >> 1) -} - -func (s *byteSource) Seed(seed int64) {} diff --git a/encoding/notsupported.go b/encoding/notsupported.go index 64d14df..7a71096 100644 --- a/encoding/notsupported.go +++ b/encoding/notsupported.go @@ -3,7 +3,6 @@ package encoding import ( "errors" "fmt" - "io" "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/format" @@ -27,151 +26,168 @@ var ( ErrInvalidArgument = errors.New("invalid argument") ) -// NotSupported is a type satisfying the Encoding interface which does not -// support encoding nor decoding any value types. -type NotSupported struct { +// Error constructs an error which wraps err and indicates that it originated +// from the given encoding. +func Error(e Encoding, err error) error { + return fmt.Errorf("%s: %w", e, err) } -func (NotSupported) Encoding() format.Encoding { - return -1 +// Errorf is like Error but constructs the error message from the given format +// and arguments. +func Errorf(e Encoding, msg string, args ...interface{}) error { + return Error(e, fmt.Errorf(msg, args...)) } -func (NotSupported) CanEncode(format.Type) bool { - return false +// ErrInvalidInputSize constructs an error indicating that decoding failed due +// to the size of the input. 
+func ErrInvalidInputSize(e Encoding, typ string, size int) error { + return Errorf(e, "cannot decode %s from input of size %d: %w", typ, size, ErrInvalidArgument) } -func (NotSupported) NewDecoder(io.Reader) Decoder { - return NotSupportedDecoder{} +// CanEncodeBoolean reports whether e can encode BOOLEAN values. +func CanEncodeBoolean(e Encoding) bool { + _, err := e.EncodeBoolean(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupported) NewEncoder(io.Writer) Encoder { - return NotSupportedEncoder{} +// CanEncodeInt8 reports whether e can encode INT8 values. +func CanEncodeInt8(e Encoding) bool { + _, err := e.EncodeInt8(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupported) String() string { - return "NOT_SUPPORTED" +// CanEncodeInt32 reports whether e can encode INT32 values. +func CanEncodeInt32(e Encoding) bool { + _, err := e.EncodeInt32(nil, nil) + return !errors.Is(err, ErrNotSupported) } -// NotSupportedDecoder is an implementation of the Decoder interface which does -// not support decoding any value types. -// -// Many parquet encodings only support decoding a subset of the parquet types, -// they can embed this type to default to not supporting any decoding, then -// override specific Decode* methods to provide implementations for the types -// they do support. -type NotSupportedDecoder struct { +// CanEncodeInt64 reports whether e can encode INT64 values. +func CanEncodeInt64(e Encoding) bool { + _, err := e.EncodeInt64(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) Encoding() format.Encoding { - return -1 +// CanEncodeInt96 reports whether e can encode INT96 values. +func CanEncodeInt96(e Encoding) bool { + _, err := e.EncodeInt96(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) Reset(io.Reader) { +// CanEncodeFloat reports whether e can encode FLOAT values. +func CanEncodeFloat(e Encoding) bool { + _, err := e.EncodeFloat(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) DecodeBoolean([]bool) (int, error) { - return 0, errNotSupported("BOOLEAN") +// CanEncodeDouble reports whether e can encode DOUBLE values. +func CanEncodeDouble(e Encoding) bool { + _, err := e.EncodeDouble(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) DecodeInt8([]int8) (int, error) { - return 0, errNotSupported("INT8") +// CanEncodeByteArray reports whether e can encode BYTE_ARRAY values. +func CanEncodeByteArray(e Encoding) bool { + _, err := e.EncodeByteArray(nil, nil) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) DecodeInt16([]int16) (int, error) { - return 0, errNotSupported("INT16") +// CanEncodeFixedLenByteArray reports whether e can encode +// FIXED_LEN_BYTE_ARRAY values. +func CanEncodeFixedLenByteArray(e Encoding) bool { + _, err := e.EncodeFixedLenByteArray(nil, nil, 1) + return !errors.Is(err, ErrNotSupported) } -func (NotSupportedDecoder) DecodeInt32([]int32) (int, error) { - return 0, errNotSupported("INT32") -} - -func (NotSupportedDecoder) DecodeInt64([]int64) (int, error) { - return 0, errNotSupported("INT64") +// NotSupported is a type satisfying the Encoding interface which does not +// support encoding nor decoding any value types. 
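
Aside, not part of the patch: the comment above describes the embedding pattern the package relies on. A minimal sketch, under the post-change interface, of an encoding that inherits "not supported" errors for everything except the one method it overrides; the int32Only name and its body are hypothetical, but the pattern is the same one plain.DictionaryEncoding follows later in this diff. The patch resumes below with the NotSupported type itself.

package myencoding

import (
	"encoding/binary"

	"github.com/segmentio/parquet-go/encoding"
)

// int32Only supports only INT32; every other Encode*/Decode* method falls
// through to the embedded NotSupported and returns ErrNotSupported.
type int32Only struct {
	encoding.NotSupported
}

func (int32Only) EncodeInt32(dst []byte, src []int32) ([]byte, error) {
	dst = dst[:0]
	for _, v := range src {
		var b [4]byte
		binary.LittleEndian.PutUint32(b[:], uint32(v)) // plain little-endian layout
		dst = append(dst, b[:]...)
	}
	return dst, nil
}
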
+type NotSupported struct { } -func (NotSupportedDecoder) DecodeInt96([]deprecated.Int96) (int, error) { - return 0, errNotSupported("INT96") +func (NotSupported) String() string { + return "NOT_SUPPORTED" } -func (NotSupportedDecoder) DecodeFloat([]float32) (int, error) { - return 0, errNotSupported("FLOAT") +func (NotSupported) Encoding() format.Encoding { + return -1 } -func (NotSupportedDecoder) DecodeDouble([]float64) (int, error) { - return 0, errNotSupported("DOUBLE") +func (NotSupported) EncodeBoolean(dst []byte, src []bool) ([]byte, error) { + return dst[:0], errNotSupported("BOOLEAN") } -func (NotSupportedDecoder) DecodeByteArray(*ByteArrayList) (int, error) { - return 0, errNotSupported("BYTE_ARRAY") +func (NotSupported) EncodeInt8(dst []byte, src []int8) ([]byte, error) { + return dst[:0], errNotSupported("INT8") } -func (NotSupportedDecoder) DecodeFixedLenByteArray(size int, data []byte) (int, error) { - return 0, errNotSupported("FIXED_LEN_BYTE_ARRAY") +func (NotSupported) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + return dst[:0], errNotSupported("INT32") } -func (NotSupportedDecoder) SetBitWidth(int) { +func (NotSupported) EncodeInt64(dst []byte, src []int64) ([]byte, error) { + return dst[:0], errNotSupported("INT64") } -// NotSupportedEncoder is an implementation of the Encoder interface which does -// not support encoding any value types. -// -// Many parquet encodings only support encoding a subset of the parquet types, -// they can embed this type to default to not supporting any encoding, then -// override specific Encode* methods to provide implementations for the types -// they do support. -type NotSupportedEncoder struct { +func (NotSupported) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) { + return dst[:0], errNotSupported("INT96") } -func (NotSupportedEncoder) Encoding() format.Encoding { - return -1 +func (NotSupported) EncodeFloat(dst []byte, src []float32) ([]byte, error) { + return dst[:0], errNotSupported("FLOAT") } -func (NotSupportedEncoder) Reset(io.Writer) { +func (NotSupported) EncodeDouble(dst []byte, src []float64) ([]byte, error) { + return dst[:0], errNotSupported("DOUBLE") } -func (NotSupportedEncoder) EncodeBoolean([]bool) error { - return errNotSupported("BOOLEAN") +func (NotSupported) EncodeByteArray(dst, src []byte) ([]byte, error) { + return dst[:0], errNotSupported("BYTE_ARRAY") } -func (NotSupportedEncoder) EncodeInt8([]int8) error { - return errNotSupported("INT8") +func (NotSupported) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + return dst[:0], errNotSupported("FIXED_LEN_BYTE_ARRAY") } -func (NotSupportedEncoder) EncodeInt16([]int16) error { - return errNotSupported("INT16") +func (NotSupported) DecodeBoolean(dst []bool, src []byte) ([]bool, error) { + return dst[:0], errNotSupported("BOOLEAN") } -func (NotSupportedEncoder) EncodeInt32([]int32) error { - return errNotSupported("INT32") +func (NotSupported) DecodeInt8(dst []int8, src []byte) ([]int8, error) { + return dst[:0], errNotSupported("INT8") } -func (NotSupportedEncoder) EncodeInt64([]int64) error { - return errNotSupported("INT64") +func (NotSupported) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + return dst[:0], errNotSupported("INT32") } -func (NotSupportedEncoder) EncodeInt96([]deprecated.Int96) error { - return errNotSupported("INT96") +func (NotSupported) DecodeInt64(dst []int64, src []byte) ([]int64, error) { + return dst[:0], errNotSupported("INT64") } -func (NotSupportedEncoder) EncodeFloat([]float32) 
error { - return errNotSupported("FLOAT") +func (NotSupported) DecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error) { + return dst[:0], errNotSupported("INT96") } -func (NotSupportedEncoder) EncodeDouble([]float64) error { - return errNotSupported("DOUBLE") +func (NotSupported) DecodeFloat(dst []float32, src []byte) ([]float32, error) { + return dst[:0], errNotSupported("FLOAT") } -func (NotSupportedEncoder) EncodeByteArray(ByteArrayList) error { - return errNotSupported("BYTE_ARRAY") +func (NotSupported) DecodeDouble(dst []float64, src []byte) ([]float64, error) { + return dst[:0], errNotSupported("DOUBLE") } -func (NotSupportedEncoder) EncodeFixedLenByteArray(int, []byte) error { - return errNotSupported("FIXED_LEN_BYTE_ARRAY") +func (NotSupported) DecodeByteArray(dst, src []byte) ([]byte, error) { + return dst[:0], errNotSupported("BYTE_ARRAY") } -func (NotSupportedEncoder) SetBitWidth(int) { +func (NotSupported) DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + return dst[:0], errNotSupported("FIXED_LEN_BYTE_ARRAY") } func errNotSupported(typ string) error { return fmt.Errorf("%w for type %s", ErrNotSupported, typ) } + +var ( + _ Encoding = NotSupported{} +) diff --git a/encoding/plain/decoder.go b/encoding/plain/decoder.go deleted file mode 100644 index f924bb9..0000000 --- a/encoding/plain/decoder.go +++ /dev/null @@ -1,111 +0,0 @@ -package plain - -import ( - "encoding/binary" - "fmt" - "io" - - "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/encoding/rle" - "github.com/segmentio/parquet-go/internal/bits" -) - -type Decoder struct { - encoding.NotSupportedDecoder - reader io.Reader - buffer [4]byte - rle *rle.Decoder -} - -func NewDecoder(r io.Reader) *Decoder { - return &Decoder{reader: r} -} - -func (d *Decoder) Reset(r io.Reader) { - d.reader = r - - if d.rle != nil { - d.rle.Reset(r) - } -} - -func (d *Decoder) DecodeBoolean(data []bool) (int, error) { - if d.rle == nil { - d.rle = rle.NewDecoder(d.reader) - } - return d.rle.DecodeBoolean(data) -} - -func (d *Decoder) DecodeInt32(data []int32) (int, error) { - return readFull(d.reader, 4, bits.Int32ToBytes(data)) -} - -func (d *Decoder) DecodeInt64(data []int64) (int, error) { - return readFull(d.reader, 8, bits.Int64ToBytes(data)) -} - -func (d *Decoder) DecodeInt96(data []deprecated.Int96) (int, error) { - return readFull(d.reader, 12, deprecated.Int96ToBytes(data)) -} - -func (d *Decoder) DecodeFloat(data []float32) (int, error) { - return readFull(d.reader, 4, bits.Float32ToBytes(data)) -} - -func (d *Decoder) DecodeDouble(data []float64) (int, error) { - return readFull(d.reader, 8, bits.Float64ToBytes(data)) -} - -func (d *Decoder) DecodeByteArray(data *encoding.ByteArrayList) (n int, err error) { - n = data.Len() - - for data.Len() < data.Cap() { - if _, err = io.ReadFull(d.reader, d.buffer[:4]); err != nil { - break - } - if value := data.PushSize(int(binary.LittleEndian.Uint32(d.buffer[:4]))); len(value) > 0 { - if _, err = io.ReadFull(d.reader, value); err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - break - } - } - } - - return data.Len() - n, err -} - -func (d *Decoder) DecodeFixedLenByteArray(size int, data []byte) (int, error) { - if size <= 0 { - return 0, fmt.Errorf("PLAIN: %w: size of decoded FIXED_LEN_BYTE_ARRAY must be positive", encoding.ErrInvalidArgument) - } - - if (len(data) % size) != 0 { - return 0, fmt.Errorf("PLAIN: %w: length of 
decoded FIXED_LEN_BYTE_ARRAY must be a multiple of its size: size=%d length=%d", encoding.ErrInvalidArgument, size, len(data)) - } - - return readFull(d.reader, size, data) -} - -func (d *Decoder) SetBitWidth(bitWidth int) {} - -func readFull(r io.Reader, scale int, data []byte) (int, error) { - n, err := io.ReadFull(r, data) - if err == io.ErrUnexpectedEOF && (n%scale) == 0 { - err = io.EOF - } - return n / scale, err -} - -func prepend(dst, src []byte) (ret []byte) { - if (cap(dst) - len(dst)) < len(src) { - ret = make([]byte, len(src)+len(dst)) - } else { - ret = dst[:len(src)+len(dst)] - } - copy(ret[len(src):], dst) - copy(ret, src) - return ret -} diff --git a/encoding/plain/dictionary.go b/encoding/plain/dictionary.go index 80e11c9..87598f9 100644 --- a/encoding/plain/dictionary.go +++ b/encoding/plain/dictionary.go @@ -1,31 +1,30 @@ package plain import ( - "io" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" ) type DictionaryEncoding struct { + encoding.NotSupported } -func (e DictionaryEncoding) Encoding() format.Encoding { - return format.PlainDictionary -} - -func (e DictionaryEncoding) CanEncode(t format.Type) bool { - return true +func (e *DictionaryEncoding) String() string { + return "PLAIN_DICTIONARY" } -func (e DictionaryEncoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewDecoder(r) +func (e *DictionaryEncoding) Encoding() format.Encoding { + return format.PlainDictionary } -func (e DictionaryEncoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewEncoder(w) +func (e *DictionaryEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + return append(dst[:0], bits.Int32ToBytes(src)...), nil } -func (e DictionaryEncoding) String() string { - return "PLAIN_DICTIONARY" +func (e *DictionaryEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + if (len(src) % 4) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "INT32", len(src)) + } + return append(dst[:0], bits.BytesToInt32(src)...), nil } diff --git a/encoding/plain/encoder.go b/encoding/plain/encoder.go deleted file mode 100644 index d44afd0..0000000 --- a/encoding/plain/encoder.go +++ /dev/null @@ -1,92 +0,0 @@ -package plain - -import ( - "encoding/binary" - "fmt" - "io" - - "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/encoding/rle" - "github.com/segmentio/parquet-go/internal/bits" -) - -type Encoder struct { - encoding.NotSupportedEncoder - writer io.Writer - buffer [4]byte - rle *rle.Encoder -} - -func NewEncoder(w io.Writer) *Encoder { - return &Encoder{writer: w} -} - -func (e *Encoder) Reset(w io.Writer) { - e.writer = w - - if e.rle != nil { - e.rle.Reset(w) - } -} - -func (e *Encoder) EncodeBoolean(data []bool) error { - if e.rle == nil { - e.rle = rle.NewEncoder(e.writer) - } - return e.rle.EncodeBoolean(data) -} - -func (e *Encoder) EncodeInt32(data []int32) error { - _, err := e.writer.Write(bits.Int32ToBytes(data)) - return err -} - -func (e *Encoder) EncodeInt64(data []int64) error { - _, err := e.writer.Write(bits.Int64ToBytes(data)) - return err -} - -func (e *Encoder) EncodeInt96(data []deprecated.Int96) error { - _, err := e.writer.Write(deprecated.Int96ToBytes(data)) - return err -} - -func (e *Encoder) EncodeFloat(data []float32) error { - _, err := e.writer.Write(bits.Float32ToBytes(data)) - return err -} - -func (e *Encoder) 
EncodeDouble(data []float64) error { - _, err := e.writer.Write(bits.Float64ToBytes(data)) - return err -} - -func (e *Encoder) EncodeByteArray(data encoding.ByteArrayList) (err error) { - data.Range(func(value []byte) bool { - binary.LittleEndian.PutUint32(e.buffer[:4], uint32(len(value))) - if _, err = e.writer.Write(e.buffer[:4]); err != nil { - return false - } - if _, err = e.writer.Write(value); err != nil { - return false - } - return true - }) - return err -} - -func (e *Encoder) EncodeFixedLenByteArray(size int, data []byte) error { - if size <= 0 { - return fmt.Errorf("PLAIN: %w: size of encoded FIXED_LEN_BYTE_ARRAY must be positive", encoding.ErrInvalidArgument) - } - - if (len(data) % size) != 0 { - return fmt.Errorf("PLAIN: %w: length of encoded FIXED_LEN_BYTE_ARRAY must be a multiple of its size: size=%d length=%d", encoding.ErrInvalidArgument, size, len(data)) - } - - _, err := e.writer.Write(data) - return err -} - -func (e *Encoder) SetBitWidth(bitWidth int) {} diff --git a/encoding/plain/plain.go b/encoding/plain/plain.go index 65738b9..c91594a 100644 --- a/encoding/plain/plain.go +++ b/encoding/plain/plain.go @@ -12,6 +12,7 @@ import ( "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" ) const ( @@ -21,24 +22,173 @@ const ( type Encoding struct { } +func (e *Encoding) String() string { + return "PLAIN" +} + func (e *Encoding) Encoding() format.Encoding { return format.Plain } -func (e *Encoding) CanEncode(format.Type) bool { - return true +func (e *Encoding) EncodeBoolean(dst []byte, src []bool) ([]byte, error) { + dst = dst[:0] + b := byte(0) + i := 0 + n := (len(src) / 8) * 8 + + for i < n { + b = 0 + if src[i+7] { + b |= 1 << 7 + } + if src[i+6] { + b |= 1 << 6 + } + if src[i+5] { + b |= 1 << 5 + } + if src[i+4] { + b |= 1 << 4 + } + if src[i+3] { + b |= 1 << 3 + } + if src[i+2] { + b |= 1 << 2 + } + if src[i+1] { + b |= 1 << 1 + } + if src[i+0] { + b |= 1 << 0 + } + dst = append(dst, b) + i += 8 + } + + if i < len(src) { + b = 0 + for j := uint(0); i < len(src); j++ { + if src[i] { + b |= 1 << j + } + i++ + } + dst = append(dst, b) + } + + return dst, nil } -func (e *Encoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewDecoder(r) +func (e *Encoding) EncodeInt8(dst []byte, src []int8) ([]byte, error) { + return append(dst[:0], bits.Int8ToBytes(src)...), nil } -func (e *Encoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewEncoder(w) +func (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + return append(dst[:0], bits.Int32ToBytes(src)...), nil } -func (e *Encoding) String() string { - return "PLAIN" +func (e *Encoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) { + return append(dst[:0], bits.Int64ToBytes(src)...), nil +} + +func (e *Encoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) { + return append(dst[:0], deprecated.Int96ToBytes(src)...), nil +} + +func (e *Encoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) { + return append(dst[:0], bits.Float32ToBytes(src)...), nil +} + +func (e *Encoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) { + return append(dst[:0], bits.Float64ToBytes(src)...), nil +} + +func (e *Encoding) EncodeByteArray(dst []byte, src []byte) ([]byte, error) { + if err := RangeByteArrays(src, func([]byte) error { return nil }); err != nil { + return dst[:0], encoding.Error(e, err) + } + 
return append(dst[:0], src...), nil +} + +func (e *Encoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + if size < 0 || size > encoding.MaxFixedLenByteArraySize { + return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) + } + return append(dst[:0], src...), nil +} + +func (e *Encoding) DecodeBoolean(dst []bool, src []byte) ([]bool, error) { + dst = dst[:0] + for _, b := range src { + dst = append(dst, + ((b>>0)&1) != 0, + ((b>>1)&1) != 0, + ((b>>2)&1) != 0, + ((b>>3)&1) != 0, + ((b>>4)&1) != 0, + ((b>>5)&1) != 0, + ((b>>6)&1) != 0, + ((b>>7)&1) != 0, + ) + } + return dst, nil +} + +func (e *Encoding) DecodeInt8(dst []int8, src []byte) ([]int8, error) { + return append(dst[:0], bits.BytesToInt8(src)...), nil +} + +func (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + if (len(src) % 4) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "INT32", len(src)) + } + return append(dst[:0], bits.BytesToInt32(src)...), nil +} + +func (e *Encoding) DecodeInt64(dst []int64, src []byte) ([]int64, error) { + if (len(src) % 8) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "INT64", len(src)) + } + return append(dst[:0], bits.BytesToInt64(src)...), nil +} + +func (e *Encoding) DecodeInt96(dst []deprecated.Int96, src []byte) ([]deprecated.Int96, error) { + if (len(src) % 12) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "INT96", len(src)) + } + return append(dst[:0], deprecated.BytesToInt96(src)...), nil +} + +func (e *Encoding) DecodeFloat(dst []float32, src []byte) ([]float32, error) { + if (len(src) % 4) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "FLOAT", len(src)) + } + return append(dst[:0], bits.BytesToFloat32(src)...), nil +} + +func (e *Encoding) DecodeDouble(dst []float64, src []byte) ([]float64, error) { + if (len(src) % 8) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "DOUBLE", len(src)) + } + return append(dst[:0], bits.BytesToFloat64(src)...), nil +} + +func (e *Encoding) DecodeByteArray(dst, src []byte) ([]byte, error) { + if err := RangeByteArrays(src, func([]byte) error { return nil }); err != nil { + return dst[:0], encoding.Error(e, err) + } + return append(dst[:0], src...), nil +} + +func (e *Encoding) DecodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { + if size < 0 || size > encoding.MaxFixedLenByteArraySize { + return dst[:0], encoding.Error(e, encoding.ErrInvalidArgument) + } + if (len(src) % size) != 0 { + return dst[:0], encoding.ErrInvalidInputSize(e, "FIXED_LEN_BYTE_ARRAY", len(src)) + } + return append(dst[:0], src...), nil } func Boolean(v bool) []byte { return AppendBoolean(nil, v) } @@ -105,6 +255,10 @@ func AppendByteArray(b, v []byte) []byte { return b } +func ByteArrayLength(b []byte) int { + return int(binary.LittleEndian.Uint32(b)) +} + func PutByteArrayLength(b []byte, n int) { binary.LittleEndian.PutUint32(b, uint32(n)) } @@ -130,5 +284,5 @@ func NextByteArray(b []byte) (v, r []byte, err error) { if n > len(b) { return nil, b, fmt.Errorf("input of length %d is too short to contain a PLAIN encoded byte array of length %d: %w", len(b)-4, n-4, io.ErrUnexpectedEOF) } - return b[4:n], b[n:], nil + return b[4:n:n], b[n:len(b):len(b)], nil } diff --git a/encoding/rle/bitpack.go b/encoding/rle/bitpack.go deleted file mode 100644 index 6bb6422..0000000 --- a/encoding/rle/bitpack.go +++ /dev/null @@ -1,211 +0,0 @@ -package rle - -import ( - "fmt" - "io" - . 
"math/bits" - - "github.com/segmentio/parquet-go/internal/bits" -) - -const ( - unlimited = ^uint(0) -) - -type bitPackRunDecoder struct { - source io.LimitedReader - reader bits.Reader - remain uint - bitWidth uint -} - -func (d *bitPackRunDecoder) String() string { return "BIT_PACK" } - -func (d *bitPackRunDecoder) reset(r io.Reader, bitWidth, numValues uint) { - if numValues == unlimited { - d.reader.Reset(r) - } else { - d.source.R = r - d.source.N = int64(bits.ByteCount(numValues * bitWidth)) - d.reader.Reset(&d.source) - } - d.remain = numValues - d.bitWidth = bitWidth -} - -func (d *bitPackRunDecoder) decode(dst []byte, dstWidth uint) (n int, err error) { - dstBitCount := bits.BitCount(len(dst)) - - if dstWidth < 8 || dstWidth > 64 || OnesCount(dstWidth) != 1 { - return 0, fmt.Errorf("BIT_PACK decoder expects the output size to be a power of 8 bits but got %d bits", dstWidth) - } - - if (dstBitCount & (dstWidth - 1)) != 0 { // (dstBitCount % dstWidth) != 0 - return 0, fmt.Errorf("BIT_PACK decoder expects the input size to be a multiple of the destination width: bit-count=%d bit-width=%d", - dstBitCount, dstWidth) - } - - if dstWidth < d.bitWidth { - return 0, fmt.Errorf("BIT_PACK decoder cannot encode %d bits values to %d bits: the source width must be less or equal to the destination width", - d.bitWidth, dstWidth) - } - - switch dstWidth { - case 8: - n, err = d.decodeInt8(bits.BytesToInt8(dst), d.bitWidth) - case 16: - n, err = d.decodeInt16(bits.BytesToInt16(dst), d.bitWidth) - case 32: - n, err = d.decodeInt32(bits.BytesToInt32(dst), d.bitWidth) - case 64: - n, err = d.decodeInt64(bits.BytesToInt64(dst), d.bitWidth) - default: - panic("BUG: unsupported destination bit-width") - } - - if d.remain != unlimited { - if d.remain -= uint(n); d.remain == 0 { - err = io.EOF - } else if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - } - } - - return n, err -} - -func (d *bitPackRunDecoder) decodeInt8(dst []int8, bitWidth uint) (n int, err error) { - if uint(len(dst)) > d.remain { - dst = dst[:d.remain] - } - for n < len(dst) { - b, _, err := d.reader.ReadBits(bitWidth) - if err != nil { - return n, err - } - dst[n] = int8(b) - n++ - } - return n, nil -} - -func (d *bitPackRunDecoder) decodeInt16(dst []int16, bitWidth uint) (n int, err error) { - if uint(len(dst)) > d.remain { - dst = dst[:d.remain] - } - for n < len(dst) { - b, _, err := d.reader.ReadBits(bitWidth) - if err != nil { - return n, err - } - dst[n] = int16(b) - n++ - } - return n, nil -} - -func (d *bitPackRunDecoder) decodeInt32(dst []int32, bitWidth uint) (n int, err error) { - if uint(len(dst)) > d.remain { - dst = dst[:d.remain] - } - for n < len(dst) { - b, _, err := d.reader.ReadBits(bitWidth) - if err != nil { - return n, err - } - dst[n] = int32(b) - n++ - } - return n, nil -} - -func (d *bitPackRunDecoder) decodeInt64(dst []int64, bitWidth uint) (n int, err error) { - if uint(len(dst)) > d.remain { - dst = dst[:d.remain] - } - for n < len(dst) { - b, _, err := d.reader.ReadBits(bitWidth) - if err != nil { - return n, err - } - dst[n] = int64(b) - n++ - } - return n, nil -} - -type bitPackRunEncoder struct { - writer bits.Writer - bitWidth uint -} - -func (e *bitPackRunEncoder) reset(w io.Writer, bitWidth uint) { - e.writer.Reset(w) - e.bitWidth = bitWidth -} - -func (e *bitPackRunEncoder) flush() error { - return e.writer.Flush() -} - -func (e *bitPackRunEncoder) encode(src []byte, srcWidth uint) error { - srcBitCount := bits.BitCount(len(src)) - - if srcWidth < 8 || srcWidth > 64 || 
OnesCount(srcWidth) != 1 { - return fmt.Errorf("BIT_PACK encoder expects the input size to be a power of 8 bits but got %d bits", srcWidth) - } - - if (srcBitCount & (srcWidth - 1)) != 0 { // (srcBitCount % srcWidth) != 0 - return fmt.Errorf("BIT_PACK encoder expects the input size to be a multiple of the source width: bit-count=%d bit-width=%d", srcBitCount, srcWidth) - } - - if ((srcBitCount / srcWidth) % 8) != 0 { - return fmt.Errorf("BIT_PACK encoder expects sequences of 8 values but %d were written", srcBitCount/srcWidth) - } - - if srcWidth < e.bitWidth { - return fmt.Errorf("BIT_PACK encoder cannot encode %d bits values to %d bits: the source width must be less or equal to the destination width", - srcWidth, e.bitWidth) - } - - switch srcWidth { - case 8: - e.encodeInt8(bits.BytesToInt8(src), e.bitWidth) - case 16: - e.encodeInt16(bits.BytesToInt16(src), e.bitWidth) - case 32: - e.encodeInt32(bits.BytesToInt32(src), e.bitWidth) - case 64: - e.encodeInt64(bits.BytesToInt64(src), e.bitWidth) - default: - panic("BUG: unsupported source bit-width") - } - - return e.flush() -} - -func (e *bitPackRunEncoder) encodeInt8(src []int8, bitWidth uint) { - for _, v := range src { - e.writer.WriteBits(uint64(v), bitWidth) - } -} - -func (e *bitPackRunEncoder) encodeInt16(src []int16, bitWidth uint) { - for _, v := range src { - e.writer.WriteBits(uint64(v), bitWidth) - } -} - -func (e *bitPackRunEncoder) encodeInt32(src []int32, bitWidth uint) { - for _, v := range src { - e.writer.WriteBits(uint64(v), bitWidth) - } -} - -func (e *bitPackRunEncoder) encodeInt64(src []int64, bitWidth uint) { - for _, v := range src { - e.writer.WriteBits(uint64(v), bitWidth) - } -} diff --git a/encoding/rle/bitpack_test.go b/encoding/rle/bitpack_test.go deleted file mode 100644 index d55129c..0000000 --- a/encoding/rle/bitpack_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package rle - -import ( - "bytes" - "fmt" - "io" - "math/rand" - "testing" - - "github.com/segmentio/parquet-go/internal/bits" -) - -func TestBitPack(t *testing.T) { - data := make([]uint64, 4096) - prng := rand.New(rand.NewSource(0)) - prng.Read(bits.Uint64ToBytes(data)) - - buf := new(bytes.Buffer) - enc := bitPackRunEncoder{} - dec := bitPackRunDecoder{} - tmp := [1]uint64{} - - for bitWidth := uint(1); bitWidth <= 64; bitWidth++ { - t.Run(fmt.Sprintf("bitWidth=%d", bitWidth), func(t *testing.T) { - enc.reset(buf, bitWidth) - dec.reset(buf, bitWidth, uint(len(data))) - - if err := enc.encode(bits.Uint64ToBytes(data), 64); err != nil { - t.Fatal("encoding:", err) - } - - mask := uint64((1 << bitWidth) - 1) - - for i, value := range data { - n, err := dec.decode(bits.Uint64ToBytes(tmp[:]), 64) - if err != nil { - if err != io.EOF || n == 0 { - t.Fatal("decoding:", err) - } - } - if n != 1 { - t.Fatalf("wrong number of values decoded at index %d/%d: want=1 got=%d", i, len(data), n) - } - v1 := mask & value - v2 := mask & tmp[0] - if v1 != v2 { - t.Fatalf("wrong value at index %d/%d: want=%08b got=%08b (mask=%08b)", i, len(data), v1, v2, mask) - } - } - - if n, err := dec.decode(bits.Uint64ToBytes(tmp[:]), 64); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) - } -} diff --git a/encoding/rle/decoder.go b/encoding/rle/decoder.go deleted file mode 100644 index c573c34..0000000 --- a/encoding/rle/decoder.go +++ /dev/null @@ -1,128 +0,0 @@ -package rle - -import ( - "encoding/binary" - "fmt" - "io" - - 
"github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/internal/bits" -) - -type Decoder struct { - encoding.NotSupportedDecoder - reader io.Reader - buffer [1]byte - bitWidth uint - decoder hybridDecoder - runLength runLengthRunDecoder - bitPack bitPackRunDecoder -} - -func NewDecoder(r io.Reader) *Decoder { - return &Decoder{reader: r} -} - -func (d *Decoder) BitWidth() int { - return int(d.bitWidth) -} - -func (d *Decoder) SetBitWidth(bitWidth int) { - d.bitWidth = uint(bitWidth) -} - -func (d *Decoder) Reset(r io.Reader) { - d.reader, d.decoder = r, nil -} - -func (d *Decoder) Read(b []byte) (int, error) { - return d.reader.Read(b) -} - -func (d *Decoder) ReadByte() (byte, error) { - _, err := d.Read(d.buffer[:1]) - return d.buffer[0], err -} - -func (d *Decoder) DecodeBoolean(data []bool) (int, error) { - // When decoding booleans with the RLE encoding, only the BIT_PACKED version - // is used, which skips encoding of the varint header, and consumes bits - // until EOF is reached. - if d.decoder == nil { - d.bitPack.reset(d.reader, 1, unlimited) - d.decoder = &d.bitPack - } - return d.decode(bits.BoolToBytes(data), 8, 1) -} - -func (d *Decoder) DecodeInt8(data []int8) (int, error) { - return d.decode(bits.Int8ToBytes(data), 8, d.bitWidth) -} - -func (d *Decoder) DecodeInt16(data []int16) (int, error) { - return d.decode(bits.Int16ToBytes(data), 16, d.bitWidth) -} - -func (d *Decoder) DecodeInt32(data []int32) (int, error) { - return d.decode(bits.Int32ToBytes(data), 32, d.bitWidth) -} - -func (d *Decoder) DecodeInt64(data []int64) (int, error) { - return d.decode(bits.Int64ToBytes(data), 64, d.bitWidth) -} - -func (d *Decoder) decode(data []byte, dstWidth, srcWidth uint) (int, error) { - if srcWidth == 0 { - return 0, fmt.Errorf("the source bit-width must be configured on a RLE decoder before reading %d bits integer values", dstWidth) - } - decoded := 0 - wordSize := bits.ByteCount(dstWidth) - - for len(data) >= wordSize { - if d.decoder == nil { - u, err := binary.ReadUvarint(d) - switch err { - case nil: - count, bitpack := uint(u>>1), (u&1) != 0 - if bitpack { - d.bitPack.reset(d.reader, srcWidth, count*8) - d.decoder = &d.bitPack - } else { - d.runLength.reset(d.reader, srcWidth, count) - d.decoder = &d.runLength - } - case io.EOF: - if decoded > 0 { - err = nil - } - return decoded, err - default: - return decoded, fmt.Errorf("decoding RLE run length: %w", err) - } - } - - n, err := d.decoder.decode(data, dstWidth) - decoded += n - - if err != nil { - if err == io.EOF { - d.decoder = nil - } else { - return decoded, fmt.Errorf("decoding RLE values from %s encoded run: %w", d.decoder, err) - } - } - - data = data[n*wordSize:] - } - - return decoded, nil -} - -type hybridDecoder interface { - decode(dst []byte, dstWidth uint) (int, error) -} - -var ( - _ io.ByteReader = (*Decoder)(nil) - _ io.Reader = (*Decoder)(nil) -) diff --git a/encoding/rle/dictionary.go b/encoding/rle/dictionary.go index b5302fe..ca3b316 100644 --- a/encoding/rle/dictionary.go +++ b/encoding/rle/dictionary.go @@ -1,110 +1,43 @@ package rle import ( - "fmt" - "io" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/bits" ) type DictionaryEncoding struct { -} - -func (e *DictionaryEncoding) Encoding() format.Encoding { - return format.RLEDictionary -} - -func (e *DictionaryEncoding) CanEncode(t format.Type) bool { - return true -} - -func (e *DictionaryEncoding) NewDecoder(r 
io.Reader) encoding.Decoder { - return dictionaryDecoder{rle: NewDecoder(r)} -} - -func (e *DictionaryEncoding) NewEncoder(w io.Writer) encoding.Encoder { - return dictionaryEncoder{rle: NewEncoder(w)} + encoding.NotSupported } func (e *DictionaryEncoding) String() string { return "RLE_DICTIONARY" } -type dictionaryDecoder struct { - encoding.NotSupportedDecoder - rle *Decoder - zero bool -} - -func (d dictionaryDecoder) Reset(r io.Reader) { - d.rle.Reset(r) - d.rle.SetBitWidth(0) - d.zero = false +func (e *DictionaryEncoding) Encoding() format.Encoding { + return format.RLEDictionary } -func (d dictionaryDecoder) DecodeInt32(data []int32) (int, error) { - if d.zero { - clearInt32(data) - return len(data), nil - } - if d.rle.BitWidth() == 0 { - bitWidth, err := d.decodeBitWidth() - if err != nil { - return 0, err - } - // Sometimes, when the dictionary contains only a single value, the page - // can be encoded as a zero bit width to indicate that all indexes will - // be zero. - if bitWidth == 0 { - d.zero = true - clearInt32(data) - return len(data), nil - } - d.rle.SetBitWidth(bitWidth) - } - return d.rle.DecodeInt32(data) +func (e *DictionaryEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + bitWidth := bits.MaxLen32(src) + dst = append(dst[:0], byte(bitWidth)) + dst, err := encodeInt32(dst, src, uint(bitWidth)) + return dst, e.wrap(err) } -func (d dictionaryDecoder) decodeBitWidth() (int, error) { - b, err := d.rle.ReadByte() - switch err { - case nil: - if b > 32 { - return 0, fmt.Errorf("decoding RLE bit width: %d>32", b) - } - return int(b), nil - case io.EOF: - return 0, err - default: - return 0, fmt.Errorf("decoding RLE bit width: %w", err) +func (e *DictionaryEncoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + if len(src) == 0 { + return dst[:0], nil } + dst, err := decodeInt32(dst[:0], src[1:], uint(src[0])) + return dst, e.wrap(err) } -type dictionaryEncoder struct { - encoding.NotSupportedEncoder - rle *Encoder -} - -func (e dictionaryEncoder) Reset(w io.Writer) { - e.rle.Reset(w) -} - -func (e dictionaryEncoder) EncodeInt32(data []int32) error { - bitWidth := bits.MaxLen32(data) - if bitWidth == 0 { - bitWidth = 1 +func (e *DictionaryEncoding) wrap(err error) error { + if err != nil { + err = encoding.Error(e, err) } - if err := e.encodeBitWidth(bitWidth); err != nil { - return err - } - e.rle.SetBitWidth(bitWidth) - return e.rle.EncodeInt32(data) -} - -func (e dictionaryEncoder) encodeBitWidth(bitWidth int) error { - return e.rle.WriteByte(byte(bitWidth)) + return err } func clearInt32(data []int32) { diff --git a/encoding/rle/encoder.go b/encoding/rle/encoder.go deleted file mode 100644 index 11631ff..0000000 --- a/encoding/rle/encoder.go +++ /dev/null @@ -1,164 +0,0 @@ -package rle - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - - "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/internal/bits" -) - -type Encoder struct { - encoding.NotSupportedEncoder - writer io.Writer - bitWidth uint - buffer [64]byte - runLength runLengthRunEncoder - bitPack bitPackRunEncoder -} - -func NewEncoder(w io.Writer) *Encoder { - return &Encoder{writer: w} -} - -func (e *Encoder) Write(b []byte) (int, error) { - return e.writer.Write(b) -} - -func (e *Encoder) WriteByte(b byte) error { - e.buffer[0] = b - _, err := e.Write(e.buffer[:1]) - return err -} - -func (e *Encoder) WriteUvarint(u uint64) (int, error) { - n := binary.PutUvarint(e.buffer[:], u) - return e.Write(e.buffer[:n]) -} - -func (e *Encoder) 
BitWidth() int { - return int(e.bitWidth) -} - -func (e *Encoder) SetBitWidth(bitWidth int) { - e.bitWidth = uint(bitWidth) -} - -func (e *Encoder) Reset(w io.Writer) { - e.writer = w -} - -func (e *Encoder) EncodeBoolean(data []bool) error { - // When encoding booleans, the BIT_PACKED encoding is used without the - // varint header. - e.bitPack.reset(e.writer, 1) - bytes := bits.BoolToBytes(data) - int8s := bits.BytesToInt8(bytes) - e.bitPack.encodeInt8(int8s, 1) - return e.bitPack.flush() -} - -func (e *Encoder) EncodeInt8(data []int8) error { - return e.encode(bits.Int8ToBytes(data), e.bitWidth, 8) -} - -func (e *Encoder) EncodeInt16(data []int16) error { - return e.encode(bits.Int16ToBytes(data), e.bitWidth, 16) -} - -func (e *Encoder) EncodeInt32(data []int32) error { - return e.encode(bits.Int32ToBytes(data), e.bitWidth, 32) -} - -func (e *Encoder) EncodeInt64(data []int64) error { - return e.encode(bits.Int64ToBytes(data), e.bitWidth, 64) -} - -func (e *Encoder) encode(data []byte, dstWidth, srcWidth uint) error { - if dstWidth == 0 { - return fmt.Errorf("the destination bit-width must be configured on a RLE encoder before writing %d bits integer values", srcWidth) - } - - wordSize := uint(bits.ByteCount(srcWidth)) - eightWordSize := 8 * wordSize - i := uint(0) - n := uint(len(data)) - pattern := e.buffer[:eightWordSize] - - for i < n { - j := i - k := i + eightWordSize - fill(pattern, data[i:i+wordSize]) - - for k <= n && !bytes.Equal(data[j:k], pattern) { - j += eightWordSize - k += eightWordSize - } - - if i < j { - if err := e.encodeBitPack(data[i:j], dstWidth, srcWidth); err != nil { - return err - } - } else { - if k <= n { - j += eightWordSize - k += eightWordSize - } - - for k <= n && bytes.Equal(data[j:k], pattern) { - j += eightWordSize - k += eightWordSize - } - - k = j + wordSize - for k <= n && bytes.Equal(data[j:k], pattern[:wordSize]) { - j += wordSize - k += wordSize - } - - if i < j { - if err := e.encodeRunLength(data[i:j], dstWidth, srcWidth); err != nil { - return err - } - } - } - - i = j - } - - return nil -} - -func (e *Encoder) encodeBitPack(run []byte, dstWidth, srcWidth uint) error { - if _, err := e.WriteUvarint((uint64(len(run)/(8*bits.ByteCount(srcWidth))) << 1) | 1); err != nil { - return err - } - e.bitPack.reset(e.writer, dstWidth) - return e.bitPack.encode(run, srcWidth) -} - -func (e *Encoder) encodeRunLength(run []byte, dstWidth, srcWidth uint) error { - if _, err := e.WriteUvarint(uint64(len(run)/bits.ByteCount(srcWidth)) << 1); err != nil { - return err - } - e.runLength.reset(e.writer, dstWidth) - return e.runLength.encode(run, srcWidth) -} - -func fill(b, v []byte) int { - n := copy(b, v) - - for i := n; i < len(b); { - n += copy(b[i:], b[:i]) - i *= 2 - } - - return n -} - -var ( - _ io.ByteWriter = (*Encoder)(nil) - _ io.Writer = (*Encoder)(nil) -) diff --git a/encoding/rle/rle.go b/encoding/rle/rle.go index 8fc175f..0915fd0 100644 --- a/encoding/rle/rle.go +++ b/encoding/rle/rle.go @@ -6,31 +6,411 @@ package rle import ( + "bytes" + "encoding/binary" + "fmt" "io" + "unsafe" "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" + "github.com/segmentio/parquet-go/internal/bits" +) + +const ( + // This limit is intended to prevent unbounded memory allocations when + // decoding runs. + // + // We use a generous limit which allows for over a million values per page + // if there is only one run to encode the repetition or definition levels + // (this should be uncommon). 
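+ // As an illustration of the header format this limit guards: each run + // starts with a uvarint u where u>>1 is the count and u&1 selects + // bit-packing, so the two-byte header 0x80 0x10 declares a run-length + // block of 0x800>>1 = 1024 values, and the decoders below reject any + // count above maxSupportedValueCount.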
+ maxSupportedValueCount = 1024 * 1024 ) type Encoding struct { + encoding.NotSupported + BitWidth int +} + +func (e *Encoding) String() string { + return "RLE" } func (e *Encoding) Encoding() format.Encoding { return format.RLE } -func (e *Encoding) CanEncode(t format.Type) bool { - return t == format.Boolean || t == format.Int32 || t == format.Int64 +func (e *Encoding) EncodeBoolean(dst []byte, src []bool) ([]byte, error) { + // When encoding boolean values, the parquet format expects the output to + // carry a 4 byte length prefix. We add these bytes as a placeholder before + // appending the encoded data. + dst = append(dst[:0], 0, 0, 0, 0) + dst, err := encodeInt8(dst, bits.BytesToInt8(bits.BoolToBytes(src)), 1) + binary.LittleEndian.PutUint32(dst, uint32(len(dst))-4) + return dst, e.wrap(err) } -func (e *Encoding) NewDecoder(r io.Reader) encoding.Decoder { - return NewDecoder(r) +func (e *Encoding) EncodeInt8(dst []byte, src []int8) ([]byte, error) { + dst, err := encodeInt8(dst[:0], src, uint(e.BitWidth)) + return dst, e.wrap(err) } -func (e *Encoding) NewEncoder(w io.Writer) encoding.Encoder { - return NewEncoder(w) +func (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { + dst, err := encodeInt32(dst[:0], src, uint(e.BitWidth)) + return dst, e.wrap(err) } -func (e *Encoding) String() string { - return "RLE" +func (e *Encoding) DecodeBoolean(dst []bool, src []byte) ([]bool, error) { + if len(src) == 4 { + return dst[:0], nil + } + if len(src) < 4 { + return dst[:0], fmt.Errorf("input shorter than 4 bytes: %w", io.ErrUnexpectedEOF) + } + n := int(binary.LittleEndian.Uint32(src)) + src = src[4:] + if n > len(src) { + return dst[:0], fmt.Errorf("input shorter than length prefix: %d < %d: %w", len(src), n, io.ErrUnexpectedEOF) + } + buf := bits.BytesToInt8(bits.BoolToBytes(dst)) + buf, err := decodeInt8(buf[:0], src[:n], 1) + return bits.BytesToBool(bits.Int8ToBytes(buf)), e.wrap(err) +} + +func (e *Encoding) DecodeInt8(dst []int8, src []byte) ([]int8, error) { + dst, err := decodeInt8(dst[:0], src, uint(e.BitWidth)) + return dst, e.wrap(err) +} + +func (e *Encoding) DecodeInt32(dst []int32, src []byte) ([]int32, error) { + dst, err := decodeInt32(dst[:0], src, uint(e.BitWidth)) + return dst, e.wrap(err) +} + +func (e *Encoding) wrap(err error) error { + if err != nil { + err = encoding.Error(e, err) + } + return err +} + +func encodeInt8(dst []byte, src []int8, bitWidth uint) ([]byte, error) { + if bitWidth > 8 { + return dst, errEncodeInvalidBitWidth("INT8", bitWidth) + } + if bitWidth == 0 { + if !isZeroInt8(src) { + return dst, errEncodeInvalidBitWidth("INT8", bitWidth) + } + return appendUvarint(dst, uint64(len(src))<<1), nil + } + + bitMask := uint64(1<<bitWidth) - 1 + byteCount := bits.ByteCount(8 * bitWidth) + + if len(src) >= 8 { + words := unsafe.Slice((*uint64)(unsafe.Pointer(&src[0])), len(src)/8) + + for i := 0; i < len(words); { + j := i + pattern := broadcast8x8(words[i] & 0xFF) + + for j < len(words) && words[j] == pattern { + j++ + } + + if i < j { + dst = appendUvarint(dst, uint64(8*(j-i))<<1) + dst = append(dst, byte(pattern)) + } else { + j++ + + for j < len(words) && words[j] != broadcast8x8(words[j-1]) { + j++ + } + + dst = appendUvarint(dst, uint64(j-i)<<1|1) + + for _, word := range words[i:j] { + word = (word & bitMask) | + (((word >> 8) & bitMask) << (1 * bitWidth)) | + (((word >> 16) & bitMask) << (2 * bitWidth)) | + (((word >> 24) & bitMask) << (3 * bitWidth)) | + (((word >> 32) & bitMask) << (4 * bitWidth)) | + (((word >> 40) & bitMask) << (5 * bitWidth)) | + (((word >> 48) & bitMask) << (6 * bitWidth)) | + (((word >> 56) & bitMask) << (7 * bitWidth)) + bits := [8]byte{} + binary.LittleEndian.PutUint64(bits[:], word) + dst = append(dst, bits[:byteCount]...) + } + } + + i = j + } + } + + for i := (len(src) / 8) * 8; i < len(src); { + j := i + 1 + + for j < len(src) && src[i] == src[j] { + j++ + } + + dst = appendUvarint(dst, uint64(j-i)<<1) + dst = append(dst, byte(src[i])) + i = j + } + + return dst, nil +} +
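To make the hybrid run-length/bit-packed layout concrete, here is a minimal round-trip sketch against the rewritten Encoding in this patch (the expected output bytes are derived by hand from encodeInt8 above, not taken from the repository):

package main

import (
	"fmt"

	"github.com/segmentio/parquet-go/encoding/rle"
)

func main() {
	enc := &rle.Encoding{BitWidth: 3}

	// Sixteen repeated values fit in a single run-length block: one uvarint
	// header (16<<1 == 0x20) followed by one byte holding the 3-bit value.
	src := make([]int8, 16)
	for i := range src {
		src[i] = 5
	}

	buf, err := enc.EncodeInt8(nil, src)
	if err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", buf) // expected: 20 05

	dst, err := enc.DecodeInt8(nil, buf)
	if err != nil {
		panic(err)
	}
	fmt.Println(dst) // sixteen 5s
}

Bit-packed runs use the same header scheme with u&1 set, packing eight values per group exactly as the word-swizzling loop in encodeInt8 does.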
+func encodeInt32(dst []byte, src []int32, bitWidth uint) ([]byte, error) { + if bitWidth > 32 { + return dst, errEncodeInvalidBitWidth("INT32", bitWidth) + } + if bitWidth == 0 { + if !isZeroInt32(src) { + return dst, errEncodeInvalidBitWidth("INT32", bitWidth) + } + return appendUvarint(dst, uint64(len(src))<<1), nil + } + + bitMask := uint32(1<<bitWidth) - 1 + byteCount := bits.ByteCount(8 * bitWidth) + + if len(src) >= 8 { + words := unsafe.Slice((*[8]int32)(unsafe.Pointer(&src[0])), len(src)/8) + + for i := 0; i < len(words); { + j := i + pattern := broadcast32x8(words[i][0]) + + for j < len(words) && words[j] == pattern { + j++ + } + + if i < j { + dst = appendUvarint(dst, uint64(8*(j-i))<<1) + dst = appendInt32(dst, pattern[0], bitWidth) + } else { + j++ + + for j < len(words) && words[j] != broadcast32x8(words[j-1][0]) { + j++ + } + + dst = appendUvarint(dst, uint64(j-i)<<1|1) + + for _, word := range words[i:j] { + bits := [9]uint32{} + bitOffset := uint(0) + + for _, value := range word { + i := bitOffset / 32 + j := bitOffset % 32 + bits[i+0] |= (uint32(value) & bitMask) << j + bits[i+1] |= (uint32(value) >> (32 - j)) + bitOffset += bitWidth + } + + b := unsafe.Slice((*byte)(unsafe.Pointer(&bits[0])), byteCount) + dst = append(dst, b...) + } + } + + i = j + } + } + + for i := (len(src) / 8) * 8; i < len(src); { + j := i + 1 + + for j < len(src) && src[i] == src[j] { + j++ + } + + dst = appendUvarint(dst, uint64(j-i)<<1) + dst = appendInt32(dst, src[i], bitWidth) + i = j + } + + return dst, nil +} + +func decodeInt8(dst []int8, src []byte, bitWidth uint) ([]int8, error) { + if bitWidth > 8 { + return dst, errDecodeInvalidBitWidth("INT8", bitWidth) + } + + bitMask := uint64(1<<bitWidth) - 1 + byteCount := bits.ByteCount(8 * bitWidth) + + for i := 0; i < len(src); { + u, n := binary.Uvarint(src[i:]) + if n <= 0 { + return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) + } + i += n + + count, bitpack := uint(u>>1), (u&1) != 0 + if count > maxSupportedValueCount { + return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) + } + if !bitpack { + if bitWidth != 0 && (i+1) > len(src) { + return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) + } + + word := int8(0) + if bitWidth != 0 { + word = int8(src[i]) + i++ + } + + for count > 0 { + dst = append(dst, word) + count-- + } + } else { + for n := uint(0); n < count; n++ { + j := i + byteCount + + if j > len(src) { + return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF) + } + + bits := [8]byte{} + copy(bits[:], src[i:j]) + word := binary.LittleEndian.Uint64(bits[:]) + + dst = append(dst, + int8((word>>(0*bitWidth))&bitMask), + int8((word>>(1*bitWidth))&bitMask), + int8((word>>(2*bitWidth))&bitMask), + int8((word>>(3*bitWidth))&bitMask), + int8((word>>(4*bitWidth))&bitMask), + int8((word>>(5*bitWidth))&bitMask), + int8((word>>(6*bitWidth))&bitMask), + int8((word>>(7*bitWidth))&bitMask), + ) + + i = j + } + } + } + + return dst, nil +} + +func decodeInt32(dst []int32, src []byte, bitWidth uint) ([]int32, error) { + if bitWidth > 32 { + return dst, errDecodeInvalidBitWidth("INT32", bitWidth) + } + + bitMask := uint64(1<<bitWidth) - 1 + byteCount1 := bits.ByteCount(bitWidth) + byteCount8 := bits.ByteCount(8 * bitWidth) + + for i := 0; i < len(src); { + u, n := binary.Uvarint(src[i:]) + if n <= 0 { + return dst, fmt.Errorf("decoding run-length block header: %w", io.ErrUnexpectedEOF) + } + i += n + + count, bitpack := uint(u>>1), (u&1) != 0 + if count > maxSupportedValueCount { + return dst, fmt.Errorf("decoded run-length block cannot have more than %d values", maxSupportedValueCount) + } + if !bitpack { + j := i +
byteCount1 + + if j > len(src) { + return dst, fmt.Errorf("decoding run-length block of %d values: %w", count, io.ErrUnexpectedEOF) + } + + bits := [4]byte{} + copy(bits[:], src[i:j]) + + word := binary.LittleEndian.Uint32(bits[:]) + i = j + + for count > 0 { + dst = append(dst, int32(word)) + count-- + } + } else { + for n := uint(0); n < count; n++ { + j := i + byteCount8 + + if j > len(src) { + return dst, fmt.Errorf("decoding bit-packed block of %d values: %w", 8*count, io.ErrUnexpectedEOF) + } + + value := uint64(0) + bitOffset := uint(0) + + for _, b := range src[i:j] { + value |= uint64(b) << bitOffset + + for bitOffset += 8; bitOffset >= bitWidth; { + dst = append(dst, int32(value&bitMask)) + value >>= bitWidth + bitOffset -= bitWidth + } + } + + i = j + } + } + } + + return dst, nil +} + +func errEncodeInvalidBitWidth(typ string, bitWidth uint) error { + return errInvalidBitWidth("encode", typ, bitWidth) +} + +func errDecodeInvalidBitWidth(typ string, bitWidth uint) error { + return errInvalidBitWidth("decode", typ, bitWidth) +} + +func errInvalidBitWidth(op, typ string, bitWidth uint) error { + return fmt.Errorf("cannot %s %s with invalid bit-width=%d", op, typ, bitWidth) +} + +func appendUvarint(dst []byte, u uint64) []byte { + var b [binary.MaxVarintLen64]byte + var n = binary.PutUvarint(b[:], u) + return append(dst, b[:n]...) +} + +func appendInt32(dst []byte, v int32, bitWidth uint) []byte { + var b [4]byte + binary.LittleEndian.PutUint32(b[:], uint32(v)) + return append(dst, b[:bits.ByteCount(bitWidth)]...) +} + +func broadcast8x8(v uint64) uint64 { + return v | v<<8 | v<<16 | v<<24 | v<<32 | v<<40 | v<<48 | v<<56 +} + +func broadcast32x8(v int32) [8]int32 { + return [8]int32{v, v, v, v, v, v, v, v} +} + +func isZeroInt8(data []int8) bool { + return bytes.Count(bits.Int8ToBytes(data), []byte{0}) == len(data) +} + +func isZeroInt32(data []int32) bool { + return bytes.Count(bits.Int32ToBytes(data), []byte{0}) == (4 * len(data)) } diff --git a/encoding/rle/rle_test.go b/encoding/rle/rle_test.go new file mode 100644 index 0000000..ed45083 --- /dev/null +++ b/encoding/rle/rle_test.go @@ -0,0 +1,23 @@ +//go:build go1.18 +// +build go1.18 + +package rle_test + +import ( + "testing" + + "github.com/segmentio/parquet-go/encoding/fuzz" + "github.com/segmentio/parquet-go/encoding/rle" +) + +func FuzzEncodeBoolean(f *testing.F) { + fuzz.EncodeBoolean(f, &rle.Encoding{BitWidth: 1}) +} + +func FuzzEncodeInt8(f *testing.F) { + fuzz.EncodeInt8(f, &rle.Encoding{BitWidth: 8}) +} + +func FuzzEncodeInt32(f *testing.F) { + fuzz.EncodeInt32(f, &rle.Encoding{BitWidth: 32}) +} diff --git a/encoding/rle/runlength.go b/encoding/rle/runlength.go deleted file mode 100644 index b74bc9e..0000000 --- a/encoding/rle/runlength.go +++ /dev/null @@ -1,83 +0,0 @@ -package rle - -import ( - "encoding/binary" - "io" - - "github.com/segmentio/parquet-go/internal/bits" -) - -type runLengthRunDecoder struct { - reader io.Reader - remain uint - length uint - bitWidth uint - buffer [8]byte -} - -func (d *runLengthRunDecoder) String() string { return "RLE" } - -func (d *runLengthRunDecoder) reset(r io.Reader, bitWidth, numValues uint) { - d.reader = r - d.remain = numValues - d.length = uint(bits.ByteCount(bitWidth)) - d.bitWidth = bitWidth - d.buffer = [8]byte{} -} - -func (d *runLengthRunDecoder) decode(dst []byte, dstWidth uint) (int, error) { - if d.remain == 0 { - return 0, io.EOF - } - - if d.length != 0 { - _, err := io.ReadFull(d.reader, d.buffer[:d.length]) - if err != nil { - return 
0, err - } - d.length = 0 - } - - n := bits.BitCount(len(dst)) / dstWidth - if n > d.remain { - n = d.remain - } - dst = dst[:bits.ByteCount(n*dstWidth)] - bits.Fill(dst, dstWidth, binary.LittleEndian.Uint64(d.buffer[:]), d.bitWidth) - d.remain -= n - return int(n), nil -} - -type runLengthRunEncoder struct { - writer io.Writer - bitWidth uint - buffer [8]byte -} - -func (e *runLengthRunEncoder) reset(w io.Writer, bitWidth uint) { - e.writer, e.bitWidth = w, bitWidth -} - -func (e *runLengthRunEncoder) encode(src []byte, srcWidth uint) error { - // At this stage we make the assumption that the source buffer contains a - // sequence of repeated values of the given bit width; we pack the first - // value only into the encoder's buffer to adjust the bit width then write - // it to the underlying io.Writer. - v := uint64(0) - switch srcWidth { - case 8: - v = uint64(src[0]) - case 16: - v = uint64(binary.LittleEndian.Uint16(src)) - case 32: - v = uint64(binary.LittleEndian.Uint32(src)) - case 64: - v = binary.LittleEndian.Uint64(src) - default: - panic("BUG: unsupported source bit-width") - } - v &= (1 << uint64(e.bitWidth)) - 1 - binary.LittleEndian.PutUint64(e.buffer[:], v) - _, err := e.writer.Write(e.buffer[:bits.ByteCount(e.bitWidth)]) - return err -} diff --git a/encoding/rle/runlength_test.go b/encoding/rle/runlength_test.go deleted file mode 100644 index 3cd1b47..0000000 --- a/encoding/rle/runlength_test.go +++ /dev/null @@ -1,56 +0,0 @@ -package rle - -import ( - "bytes" - "io" - "math" - . "math/bits" - "testing" - - "github.com/segmentio/parquet-go/internal/bits" -) - -func TestRunLength(t *testing.T) { - buf := new(bytes.Buffer) - enc := runLengthRunEncoder{} - dec := runLengthRunDecoder{} - data := [1]int16{} - - for value := 0; value < math.MaxInt16; value += 31 { - t.Run("", func(t *testing.T) { - numValues := uint(10) - bitWidth := uint(Len16(uint16(value))) - if bitWidth == 0 { - bitWidth = 1 - } - enc.reset(buf, bitWidth) - dec.reset(buf, bitWidth, numValues) - - data[0] = int16(value) - - if err := enc.encode(bits.Int16ToBytes(data[:]), 16); err != nil { - t.Fatal("encoding:", err) - } - - for i := uint(0); i < numValues; i++ { - data[0] = 0 - n, err := dec.decode(bits.Int16ToBytes(data[:]), 16) - if err != nil { - t.Fatal("decoding:", err) - } - if n != 1 { - t.Fatal("wrong number of values decoded:", n) - } - if data[0] != int16(value) { - t.Fatal("wrong value decoded:", data[0]) - } - } - - if n, err := dec.decode(bits.Int16ToBytes(data[:]), 16); err != io.EOF { - t.Fatal("non-EOF error returned after decoding all the values:", err) - } else if n != 0 { - t.Fatal("non-zero number of values decoded at EOF:", n) - } - }) - } -} diff --git a/encoding/rle/testdata/fuzz/FuzzEncodeBoolean/9772b3f21a6f61810fe38d120bcc9da6d78540f22dc819a4201283608671fdf4 b/encoding/rle/testdata/fuzz/FuzzEncodeBoolean/9772b3f21a6f61810fe38d120bcc9da6d78540f22dc819a4201283608671fdf4 new file mode 100644 index 0000000..38f3266 --- /dev/null +++ b/encoding/rle/testdata/fuzz/FuzzEncodeBoolean/9772b3f21a6f61810fe38d120bcc9da6d78540f22dc819a4201283608671fdf4 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00000001") diff --git a/encoding/rle/testdata/fuzz/FuzzEncodeInt32/06ba4bdb19de593e669c642987e270fe2488d4d58ecd712db136a3e011071253 b/encoding/rle/testdata/fuzz/FuzzEncodeInt32/06ba4bdb19de593e669c642987e270fe2488d4d58ecd712db136a3e011071253 new file mode 100644 index 0000000..7fd7a20 --- /dev/null +++ 
b/encoding/rle/testdata/fuzz/FuzzEncodeInt32/06ba4bdb19de593e669c642987e270fe2488d4d58ecd712db136a3e011071253 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0000") diff --git a/encoding/rle/testdata/fuzz/FuzzEncodeInt32/92533a0626fba1bd5e7adf99644b53cbd11540553335f38848327e4639dea792 b/encoding/rle/testdata/fuzz/FuzzEncodeInt32/92533a0626fba1bd5e7adf99644b53cbd11540553335f38848327e4639dea792 new file mode 100644 index 0000000..d636e17 --- /dev/null +++ b/encoding/rle/testdata/fuzz/FuzzEncodeInt32/92533a0626fba1bd5e7adf99644b53cbd11540553335f38848327e4639dea792 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xaaJ\x9a") diff --git a/encoding/rle/testdata/fuzz/FuzzEncodeInt8/0468684de48f926219bfc47be13ddf085b5a0ed9fbd9c40a005641b253e88d33 b/encoding/rle/testdata/fuzz/FuzzEncodeInt8/0468684de48f926219bfc47be13ddf085b5a0ed9fbd9c40a005641b253e88d33 new file mode 100644 index 0000000..369abe5 --- /dev/null +++ b/encoding/rle/testdata/fuzz/FuzzEncodeInt8/0468684de48f926219bfc47be13ddf085b5a0ed9fbd9c40a005641b253e88d33 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xba\xba\xba\xba0\xba\xba\xba\xba\xba\xba") diff --git a/errors.go b/errors.go index f428d26..9cf8847 100644 --- a/errors.go +++ b/errors.go @@ -29,4 +29,22 @@ var ( // ErrSeekOutOfRange is an error returned when seeking to a row index which // is less than the first row of a page. ErrSeekOutOfRange = errors.New("seek to row index out of page range") + + // ErrUnexpectedDictionaryPage is an error returned when a page reader + // encounters a dictionary page after the first page, or in a column + // which does not use a dictionary encoding. + ErrUnexpectedDictionaryPage = errors.New("unexpected dictionary page") + + // ErrMissingPageHeader is an error returned when a page reader encounters + // a malformed page header which is missing page-type-specific information. + ErrMissingPageHeader = errors.New("missing page header") + + // ErrUnexpectedRepetitionLevels is an error returned when attempting to + // decode repetition levels into a page which is not part of a repeated + // column. + ErrUnexpectedRepetitionLevels = errors.New("unexpected repetition levels") + + // ErrUnexpectedDefinitionLevels is an error returned when attempting to + // decode definition levels into a page which is part of a required column. 
+ ErrUnexpectedDefinitionLevels = errors.New("unexpected definition levels") ) diff --git a/file.go b/file.go index 702bf1c..3f507d1 100644 --- a/file.go +++ b/file.go @@ -2,9 +2,7 @@ package parquet import ( "bufio" - "bytes" "encoding/binary" - "errors" "fmt" "hash/crc32" "io" @@ -396,29 +394,10 @@ func (c *fileColumnChunk) Column() int { func (c *fileColumnChunk) Pages() Pages { r := new(filePages) - c.setPagesOn(r) + r.init(c) return r } -func (c *fileColumnChunk) setPagesOn(r *filePages) { - r.column = c - r.page = filePage{ - column: c.column, - columnType: c.column.Type(), - codec: c.chunk.MetaData.Codec, - } - r.baseOffset = c.chunk.MetaData.DataPageOffset - r.dataOffset = r.baseOffset - if c.chunk.MetaData.DictionaryPageOffset != 0 { - r.baseOffset = c.chunk.MetaData.DictionaryPageOffset - r.dictOffset = r.baseOffset - } - r.section = io.NewSectionReader(c.file, r.baseOffset, c.chunk.MetaData.TotalCompressedSize) - r.rbuf = bufio.NewReaderSize(r.section, defaultReadBufferSize) - r.section.Seek(r.dataOffset-r.baseOffset, io.SeekStart) - r.decoder.Reset(r.protocol.NewReader(r.rbuf)) -} - func (c *fileColumnChunk) ColumnIndex() ColumnIndex { if c.columnIndex == nil { return nil @@ -445,183 +424,161 @@ func (c *fileColumnChunk) NumValues() int64 { } type filePages struct { - column *fileColumnChunk - protocol thrift.CompactProtocol - decoder thrift.Decoder - baseOffset int64 - dictOffset int64 - dataOffset int64 + chunk *fileColumnChunk + dictPage *dictPage + dataPage *dataPage + section *io.SectionReader + rbuf *bufio.Reader - section *io.SectionReader - rbuf *bufio.Reader + protocol thrift.CompactProtocol + decoder thrift.Decoder - page filePage - skip int64 + baseOffset int64 + dataOffset int64 + dictOffset int64 + index int + skip int64 } -func (r *filePages) readPage() (*filePage, error) { - r.page.header = format.PageHeader{} - - /* - h := &r.page.header - h.Type = 0 - h.UncompressedPageSize = 0 - h.CompressedPageSize = 0 - h.CRC = 0 - - if h.DataPageHeader != nil { - *h.DataPageHeader = format.DataPageHeader{} - } - if h.IndexPageHeader != nil { - h.IndexPageHeader = nil - } - if h.DictionaryPageHeader != nil { - h.DictionaryPageHeader = nil - } - if h.DataPageHeaderV2 != nil { - *h.DataPageHeaderV2 = format.DataPageHeaderV2{} - } - */ - - if err := r.decoder.Decode(&r.page.header); err != nil { - if err != io.EOF { - err = fmt.Errorf("decoding page header: %w", err) - } - return nil, err - } - - compressedPageSize := int(r.page.header.CompressedPageSize) - if cap(r.page.data) < compressedPageSize { - r.page.data = make([]byte, compressedPageSize) - } else { - r.page.data = r.page.data[:compressedPageSize] +func (r *filePages) init(c *fileColumnChunk) { + r.dataPage = new(dataPage) + r.chunk = c + r.baseOffset = c.chunk.MetaData.DataPageOffset + r.dataOffset = r.baseOffset + if c.chunk.MetaData.DictionaryPageOffset != 0 { + r.baseOffset = c.chunk.MetaData.DictionaryPageOffset + r.dictOffset = r.baseOffset } + r.section = io.NewSectionReader(c.file, r.baseOffset, c.chunk.MetaData.TotalCompressedSize) + r.rbuf = bufio.NewReaderSize(r.section, defaultReadBufferSize) + r.decoder.Reset(r.protocol.NewReader(r.rbuf)) +} - _, err := io.ReadFull(r.rbuf, r.page.data) - if err != nil { - return nil, fmt.Errorf("reading page %d of column %q", r.page.index, r.page.columnPath()) - } +func (r *filePages) ReadPage() (Page, error) { + for { + header := new(format.PageHeader) + if err := r.decoder.Decode(header); err != nil { + return nil, err + } - if r.page.header.CRC != 0 { - headerChecksum 
:= uint32(r.page.header.CRC) - bufferChecksum := crc32.ChecksumIEEE(r.page.data) - - if headerChecksum != bufferChecksum { - // The parquet specs indicate that corruption errors could be - // handled gracefully by skipping pages, tho this may not always - // be practical. Depending on how the pages are consumed, - // missing rows may cause unpredictable behaviors in algorithms. - // - // For now, we assume these errors to be fatal, but we may - // revisit later and improve error handling to be more resilient - // to data corruption. - return nil, fmt.Errorf("crc32 checksum mismatch in page %d of column %q: 0x%08X != 0x%08X: %w", - r.page.index, - r.page.columnPath(), - headerChecksum, - bufferChecksum, - ErrCorrupted, - ) + if cap(r.dataPage.data) < int(header.CompressedPageSize) { + r.dataPage.data = make([]byte, header.CompressedPageSize) + } else { + r.dataPage.data = r.dataPage.data[:header.CompressedPageSize] } - } - if r.column.columnIndex != nil { - err = r.page.parseColumnIndex(r.column.columnIndex) - } else { - err = r.page.parseStatistics() - } - return &r.page, err -} + if cap(r.dataPage.values) < int(header.UncompressedPageSize) { + r.dataPage.values = make([]byte, 0, header.UncompressedPageSize) + } -func (r *filePages) readDictionary() error { - if _, err := r.section.Seek(r.dictOffset-r.baseOffset, io.SeekStart); err != nil { - return fmt.Errorf("seeking to dictionary page offset: %w", err) - } - r.rbuf.Reset(r.section) - p, err := r.readPage() - if err != nil { - return err - } - return r.readDictionaryPage(p) -} + if _, err := io.ReadFull(r.rbuf, r.dataPage.data); err != nil { + return nil, err + } -func (r *filePages) readDictionaryPage(p *filePage) error { - pageData, err := p.decompress(p.data) - if err != nil { - return fmt.Errorf("decompressing dictionary page of column %q: %w", p.columnPath(), err) - } + if header.CRC != 0 { + headerChecksum := uint32(header.CRC) + bufferChecksum := crc32.ChecksumIEEE(r.dataPage.data) + + if headerChecksum != bufferChecksum { + // The parquet specs indicate that corruption errors could be + // handled gracefully by skipping pages, tho this may not always + // be practical. Depending on how the pages are consumed, + // missing rows may cause unpredictable behaviors in algorithms. + // + // For now, we assume these errors to be fatal, but we may + // revisit later and improve error handling to be more resilient + // to data corruption. 
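+ // (For context: per the parquet format, the CRC is a CRC-32 computed over + // the page bytes as stored on disk, excluding the page header, which is + // why the check above runs on the still-compressed page data.)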
+ return nil, fmt.Errorf("crc32 checksum mismatch in page %d of column %q: 0x%08X != 0x%08X: %w", + r.index, + r.columnPath(), + headerChecksum, + bufferChecksum, + ErrCorrupted, + ) + } + } - enc := p.header.DictionaryPageHeader.Encoding - dec := LookupEncoding(enc).NewDecoder(bytes.NewReader(pageData)) + var column = r.chunk.column + var page Page + var err error - columnIndex := r.column.Column() - numValues := int(p.NumValues()) - dict, err := p.columnType.ReadDictionary(columnIndex, numValues, dec) + switch header.Type { + case format.DataPageV2: + if header.DataPageHeaderV2 == nil { + err = ErrMissingPageHeader + } else { + page, err = column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, r.dataPage) + } - if err != nil { - return fmt.Errorf("reading dictionary of column %q: %w", p.columnPath(), err) - } + case format.DataPage: + if header.DataPageHeader == nil { + err = ErrMissingPageHeader + } else { + page, err = column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, r.dataPage) + } - r.page.dictionary = dict - r.page.columnType = dict.Type() - return nil -} + case format.DictionaryPage: + // Sometimes parquet files do not have the dictionary page offset + // recorded in the column metadata. We account for this by lazily + // checking whether the first page is a dictionary page. + if header.DictionaryPageHeader == nil { + err = ErrMissingPageHeader + } else if r.index > 0 { + err = ErrUnexpectedDictionaryPage + } else { + r.dictPage = new(dictPage) + r.dataPage.dictionary, err = column.decodeDictionary( + DictionaryPageHeader{header.DictionaryPageHeader}, + r.dataPage, + r.dictPage, + ) + } -func (r *filePages) ReadPage() (Page, error) { - if r.page.dictionary == nil && r.dictOffset > 0 { - if err := r.readDictionary(); err != nil { - return nil, err + default: + err = fmt.Errorf("cannot read values of type %s from page", header.Type) } - } - for { - p, err := r.readPage() if err != nil { - return nil, err + return nil, fmt.Errorf("decoding page %d of column %q: %w", r.index, r.columnPath(), err) } - // Sometimes parquet files do not have the dictionary page offset - // recorded in the column metadata. We account for this by lazily - // checking whether the first page is a dictionary page. - if p.index == 0 && p.header.Type == format.DictionaryPage && r.page.dictionary == nil { - offset, err := r.section.Seek(0, io.SeekCurrent) - if err != nil { - return nil, err + if page != nil { + r.index++ + if r.skip == 0 { + return page, nil } - r.dictOffset = r.baseOffset - r.dataOffset = r.baseOffset + offset - if err := r.readDictionaryPage(p); err != nil { - return nil, err - } - continue - } - - p.index++ - if r.skip == 0 { - return p, nil - } - numRows := p.NumRows() - if numRows > r.skip { - seek := r.skip - r.skip = 0 - if seek > 0 { - return p.Buffer().Slice(seek, numRows), nil + // TODO: what about pages that don't embed the number of rows? + // (data page v1 with no offset index in the column chunk). 
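+ // (Data page v1 headers only declare a value count; without an offset + // index the row count is only known after decoding the page, by counting + // values whose repetition level is zero.)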
+ numRows := page.NumRows() + if numRows > r.skip { + seek := r.skip + r.skip = 0 + if seek > 0 { + page = page.Buffer().Slice(seek, numRows) + } + return page, nil } - return p, nil - } - r.skip -= numRows + r.skip -= numRows + } } } +func (r *filePages) columnPath() columnPath { + return columnPath(r.chunk.column.Path()) +} + func (r *filePages) SeekToRow(rowIndex int64) (err error) { - if r.column.offsetIndex == nil { + if r.chunk.offsetIndex == nil { _, err = r.section.Seek(r.dataOffset-r.baseOffset, io.SeekStart) r.skip = rowIndex - r.page.index = 0 + r.index = 0 + if r.dictOffset > 0 { + r.index = 1 + } } else { - pages := r.column.offsetIndex.PageLocations + pages := r.chunk.offsetIndex.PageLocations index := sort.Search(len(pages), func(i int) bool { return pages[i].FirstRowIndex > rowIndex }) - 1 @@ -630,365 +587,8 @@ func (r *filePages) SeekToRow(rowIndex int64) (err error) { } _, err = r.section.Seek(pages[index].Offset-r.baseOffset, io.SeekStart) r.skip = rowIndex - pages[index].FirstRowIndex - r.page.index = index + r.index = index } r.rbuf.Reset(r.section) return err } - -type filePage struct { - column *Column - columnType Type - dictionary Dictionary - - codec format.CompressionCodec - header format.PageHeader - data []byte - buffer []byte - - index int - minValue Value - maxValue Value - hasBounds bool -} - -var ( - errPageIndexExceedsColumnIndexNullPages = errors.New("page index exceeds column index null pages") - errPageIndexExceedsColumnIndexMinValues = errors.New("page index exceeds column index min values") - errPageIndexExceedsColumnIndexMaxValues = errors.New("page index exceeds column index max values") - errPageIndexExceedsColumnIndexNullCounts = errors.New("page index exceeds column index null counts") -) - -func (p *filePage) decompress(pageData []byte) ([]byte, error) { - if p.codec != format.Uncompressed { - var err error - p.buffer, err = LookupCompressionCodec(p.codec).Decode(p.buffer[:0], pageData) - if err != nil { - return nil, err - } - pageData = p.buffer - } - return pageData, nil -} - -func (p *filePage) statistics() *format.Statistics { - switch p.header.Type { - case format.DataPageV2: - return &p.header.DataPageHeaderV2.Statistics - case format.DataPage: - return &p.header.DataPageHeader.Statistics - default: - return nil - } -} - -func (p *filePage) parseColumnIndex(columnIndex *format.ColumnIndex) (err error) { - if p.index >= len(columnIndex.NullPages) { - return p.errColumnIndex(errPageIndexExceedsColumnIndexNullPages) - } - if p.index >= len(columnIndex.MinValues) { - return p.errColumnIndex(errPageIndexExceedsColumnIndexMinValues) - } - if p.index >= len(columnIndex.MaxValues) { - return p.errColumnIndex(errPageIndexExceedsColumnIndexMaxValues) - } - if p.index >= len(columnIndex.NullCounts) { - return p.errColumnIndex(errPageIndexExceedsColumnIndexNullCounts) - } - - minValue := columnIndex.MinValues[p.index] - maxValue := columnIndex.MaxValues[p.index] - - if stats := p.statistics(); stats != nil { - if stats.MinValue == nil { - stats.MinValue = minValue - } - if stats.MaxValue == nil { - stats.MaxValue = maxValue - } - if stats.NullCount == 0 { - stats.NullCount = columnIndex.NullCounts[p.index] - } - } - - if columnIndex.NullPages[p.index] { - p.minValue = Value{} - p.maxValue = Value{} - p.hasBounds = false - } else { - kind := p.columnType.Kind() - p.minValue, err = parseValue(kind, minValue) - if err != nil { - return p.errColumnIndex(err) - } - p.maxValue, err = parseValue(kind, maxValue) - if err != nil { - return 
p.errColumnIndex(err) - } - p.hasBounds = true - } - - return nil -} - -func (p *filePage) parseStatistics() (err error) { - kind := p.columnType.Kind() - stats := p.statistics() - - if stats == nil { - // The column has no index and page has no statistics, - // default to reporting that the min and max are both null. - p.minValue = Value{} - p.maxValue = Value{} - p.hasBounds = false - return nil - } - - if stats.MinValue == nil { - p.minValue = Value{} - } else { - p.minValue, err = parseValue(kind, stats.MinValue) - if err != nil { - return p.errStatistics(err) - } - } - - if stats.MaxValue == nil { - p.maxValue = Value{} - } else { - p.maxValue, err = parseValue(kind, stats.MaxValue) - if err != nil { - return p.errStatistics(err) - } - } - - p.hasBounds = true - return nil -} - -func (p *filePage) errColumnIndex(err error) error { - return fmt.Errorf("reading bounds of page %d from index of column %q: %w", p.index, p.columnPath(), err) -} - -func (p *filePage) errStatistics(err error) error { - return fmt.Errorf("reading bounds of page %d from statistics in column %q: %w", p.index, p.columnPath(), err) -} - -func (p *filePage) columnPath() columnPath { - return columnPath(p.column.Path()) -} - -func (p *filePage) Column() int { - return int(p.column.Index()) -} - -func (p *filePage) Dictionary() Dictionary { - return p.dictionary -} - -func (p *filePage) NumRows() int64 { - switch p.header.Type { - case format.DataPageV2: - return int64(p.header.DataPageHeaderV2.NumRows) - default: - return 0 - } -} - -func (p *filePage) NumValues() int64 { - switch p.header.Type { - case format.DataPageV2: - return int64(p.header.DataPageHeaderV2.NumValues) - case format.DataPage: - return int64(p.header.DataPageHeader.NumValues) - case format.DictionaryPage: - return int64(p.header.DictionaryPageHeader.NumValues) - default: - return 0 - } -} - -func (p *filePage) NumNulls() int64 { - switch p.header.Type { - case format.DataPageV2: - return int64(p.header.DataPageHeaderV2.NumNulls) - case format.DataPage: - return p.header.DataPageHeader.Statistics.NullCount - default: - return 0 - } -} - -func (p *filePage) Bounds() (min, max Value, ok bool) { - return p.minValue, p.maxValue, p.hasBounds -} - -func (p *filePage) Size() int64 { - return int64(p.header.UncompressedPageSize) -} - -func (p *filePage) Values() ValueReader { - v, err := p.values() - if err != nil { - v = &errorValueReader{err} - } - return v -} - -func (p *filePage) values() (ValueReader, error) { - var repetitionLevels []byte - var definitionLevels []byte - var pageEncoding format.Encoding - var pageData = p.data - var numValues int - var err error - - maxRepetitionLevel := p.column.maxRepetitionLevel - maxDefinitionLevel := p.column.maxDefinitionLevel - - switch p.header.Type { - case format.DataPageV2: - header := p.header.DataPageHeaderV2 - repetitionLevels, definitionLevels, pageData, err = readDataPageV2(header, pageData) - if err != nil { - return nil, fmt.Errorf("initializing v2 reader for page of column %q: %w", p.columnPath(), err) - } - if p.codec != format.Uncompressed && (header.IsCompressed == nil || *header.IsCompressed) { - if pageData, err = p.decompress(pageData); err != nil { - return nil, fmt.Errorf("decompressing data page v2 of column %q: %w", p.columnPath(), err) - } - } - pageEncoding = header.Encoding - numValues = int(header.NumValues) - - case format.DataPage: - if pageData, err = p.decompress(pageData); err != nil { - return nil, fmt.Errorf("decompressing data page v1 of column %q: %w", p.columnPath(), err) - 
} - repetitionLevels, definitionLevels, pageData, err = readDataPageV1(maxRepetitionLevel, maxDefinitionLevel, pageData) - if err != nil { - return nil, fmt.Errorf("initializing v1 reader for page of column %q: %w", p.columnPath(), err) - } - header := p.header.DataPageHeader - pageEncoding = header.Encoding - numValues = int(header.NumValues) - - default: - return nil, fmt.Errorf("cannot read values of type %s from page of column %q", p.header.Type, p.columnPath()) - } - - // In some legacy configurations, the PLAIN_DICTIONARY encoding is used on - // data page headers to indicate that the page contains indexes into the - // dictionary page, tho it is still encoded using the RLE encoding in this - // case, so we convert the encoding to RLE_DICTIONARY to simplify. - switch pageEncoding { - case format.PlainDictionary: - pageEncoding = format.RLEDictionary - } - - pageDecoder := LookupEncoding(pageEncoding).NewDecoder(bytes.NewReader(pageData)) - reader := p.columnType.NewColumnReader(int(p.column.index), defaultReadBufferSize) - reader.Reset(numValues, pageDecoder) - - hasLevels := maxRepetitionLevel > 0 || maxDefinitionLevel > 0 - if hasLevels { - repetitions := RLE.NewDecoder(bytes.NewReader(repetitionLevels)) - definitions := RLE.NewDecoder(bytes.NewReader(definitionLevels)) - fileReader := newFileColumnReader(reader, maxRepetitionLevel, maxDefinitionLevel, defaultReadBufferSize) - fileReader.reset(numValues, repetitions, definitions, pageDecoder) - reader = fileReader - } - - return reader, nil -} - -func (p *filePage) Buffer() BufferedPage { - bufferedPage := p.column.Type().NewColumnBuffer(p.Column(), int(p.Size())) - _, err := CopyValues(bufferedPage, p.Values()) - if err != nil { - return &errorPage{err: err, columnIndex: p.Column()} - } - return bufferedPage.Page() -} - -func (p *filePage) PageHeader() PageHeader { - switch p.header.Type { - case format.DataPageV2: - return DataPageHeaderV2{p.header.DataPageHeaderV2} - case format.DataPage: - return DataPageHeaderV1{p.header.DataPageHeader} - case format.DictionaryPage: - return DictionaryPageHeader{p.header.DictionaryPageHeader} - default: - return unknownPageHeader{&p.header} - } -} - -func (p *filePage) PageData() io.Reader { return bytes.NewReader(p.data) } - -func (p *filePage) PageSize() int64 { return int64(p.header.CompressedPageSize) } - -func (p *filePage) CRC() uint32 { return uint32(p.header.CRC) } - -func readDataPageV1(maxRepetitionLevel, maxDefinitionLevel int8, page []byte) (repetitionLevels, definitionLevels, data []byte, err error) { - data = page - if maxRepetitionLevel > 0 { - repetitionLevels, data, err = readDataPageV1Level(data, "repetition") - if err != nil { - return nil, nil, page, err - } - } - if maxDefinitionLevel > 0 { - definitionLevels, data, err = readDataPageV1Level(data, "definition") - if err != nil { - return nil, nil, page, err - } - } - return repetitionLevels, definitionLevels, data, nil -} - -func readDataPageV1Level(page []byte, typ string) (level, data []byte, err error) { - size, page, err := read(page, 4) - if err != nil { - return nil, page, fmt.Errorf("reading %s level: %w", typ, err) - } - return read(page, int(binary.LittleEndian.Uint32(size))) -} - -func readDataPageV2(header *format.DataPageHeaderV2, page []byte) (repetitionLevels, definitionLevels, data []byte, err error) { - repetitionLevelsByteLength := header.RepetitionLevelsByteLength - definitionLevelsByteLength := header.DefinitionLevelsByteLength - data = page - if repetitionLevelsByteLength > 0 { - repetitionLevels, data, 
err = readDataPageV2Level(data, repetitionLevelsByteLength, "repetition") - if err != nil { - return nil, nil, page, err - } - } - if definitionLevelsByteLength > 0 { - definitionLevels, data, err = readDataPageV2Level(data, definitionLevelsByteLength, "definition") - if err != nil { - return nil, nil, page, err - } - } - return repetitionLevels, definitionLevels, data, nil -} - -func readDataPageV2Level(page []byte, size int32, typ string) (level, data []byte, err error) { - level, data, err = read(page, int(size)) - if err != nil { - err = fmt.Errorf("reading %s level: %w", typ, err) - } - return level, data, err -} - -func read(data []byte, size int) (head, tail []byte, err error) { - if len(data) < size { - return nil, data, io.ErrUnexpectedEOF - } - return data[:size], data[size:], nil -} - -var ( - _ CompressedPage = (*filePage)(nil) -) diff --git a/file_test.go b/file_test.go index adcb684..a4c20fc 100644 --- a/file_test.go +++ b/file_test.go @@ -76,7 +76,6 @@ func printColumns(t *testing.T, col *parquet.Column, indent string) { break } - header := p.(parquet.CompressedPage).PageHeader().(parquet.DataPageHeader) values := p.Values() numValues := int64(0) nullCount := int64(0) @@ -102,17 +101,14 @@ func printColumns(t *testing.T, col *parquet.Column, indent string) { } } - if numValues != header.NumValues() { - t.Errorf("page of column %d declared %d values but %d were read", col.Index(), header.NumValues(), numValues) + if numValues != p.NumValues() { + t.Errorf("page of column %d declared %d values but %d were read", col.Index(), p.NumValues(), numValues) return } - // Only the v2 data pages advertise the number of nulls they contain. - if _, isV2 := header.(parquet.DataPageHeaderV2); isV2 { - if nullCount != header.NullCount() { - t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), header.NullCount(), nullCount) - return - } + if nullCount != p.NumNulls() { + t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), p.NumNulls(), nullCount) + return } } diff --git a/internal/bits/io.go b/internal/bits/io.go deleted file mode 100644 index e370645..0000000 --- a/internal/bits/io.go +++ /dev/null @@ -1,112 +0,0 @@ -package bits - -import ( - "encoding/binary" - "io" -) - -type Reader struct { - reader io.Reader - length uint - cache uint64 - buffer [8]byte -} - -func (r *Reader) Reset(rr io.Reader) { - r.reader = rr - r.length = 0 - r.cache = 0 -} - -func (r *Reader) ReadBit() (int, error) { - bits, _, err := r.ReadBits(1) - return int(bits), err -} - -func (r *Reader) ReadBits(count uint) (uint64, uint, error) { - bits, nbits := uint64(0), uint(0) - - for count > 0 { - if r.length == 0 { - byteCount := ByteCount(count) - if byteCount > 8 { - byteCount = 8 - } - n, err := r.reader.Read(r.buffer[:byteCount]) - if err != nil && n == 0 { - if err == io.EOF && nbits != 0 { - err = io.ErrUnexpectedEOF - } - return bits, nbits, err - } - b := [8]byte{} - copy(b[:], r.buffer[:n]) - r.length = 8 * uint(n) - r.cache = binary.LittleEndian.Uint64(b[:]) - } - - n := count - if n > r.length { - n = r.length - } - - bits |= (r.cache & ((1 << n) - 1)) << nbits - nbits += n - count -= n - r.length -= n - r.cache >>= n - } - - return bits, nbits, nil -} - -type Writer struct { - writer io.Writer - length uint - cache uint64 - buffer []byte -} - -func (w *Writer) Buffered() int { - return len(w.buffer) -} - -func (w *Writer) Reset(ww io.Writer) { - w.writer = ww - w.length = 0 - w.buffer = w.buffer[:0] -} - -func (w *Writer) Flush() error { - w.flush() - 
_, err := w.writer.Write(w.buffer) - w.buffer = w.buffer[:0] - return err -} - -func (w *Writer) flush() { - b := [8]byte{} - binary.LittleEndian.PutUint64(b[:], w.cache) - w.buffer = append(w.buffer, b[:ByteCount(w.length)]...) - w.length = 0 - w.cache = 0 -} - -func (w *Writer) WriteBit(bit int) { - w.WriteBits(uint64(bit), 1) -} - -func (w *Writer) WriteBits(bits uint64, count uint) { - for { - w.cache |= (bits & ((1 << count) - 1)) << w.length - n := 64 - w.length - if n >= count { - w.length += count - break - } - w.length += n - bits >>= n - count -= n - w.flush() - } -} diff --git a/internal/bits/io_test.go b/internal/bits/io_test.go deleted file mode 100644 index eeb5847..0000000 --- a/internal/bits/io_test.go +++ /dev/null @@ -1,73 +0,0 @@ -package bits_test - -import ( - "bytes" - "io" - "testing" - - "github.com/segmentio/parquet-go/internal/bits" -) - -func TestReader(t *testing.T) { - want := []byte{ - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - 0b10101010, 0b10101010, 0b10101010, 0b00000010, - } - data := make([]byte, len(want)) - - r := new(bits.Reader) - r.Reset(bytes.NewReader(want)) - - for i := 0; i < 8*len(data); i++ { - j := i / 8 - k := i % 8 - - b, err := r.ReadBit() - if err != nil { - t.Fatal(err) - } - - data[j] |= byte(b) << k - } - - _, err := r.ReadBit() - if err != io.EOF { - t.Errorf("unexpected error returned after reading all the bits: %v", err) - } - - if !bytes.Equal(data, want) { - t.Errorf("data = %08b", data) - t.Errorf("want = %08b", want) - } -} - -func TestWriter(t *testing.T) { - b := new(bytes.Buffer) - w := new(bits.Writer) - w.Reset(b) - - for i := 0; i < 123; i++ { - w.WriteBit(i & 1) - } - - if err := w.Flush(); err != nil { - t.Fatal(err) - } - - data := b.Bytes() - want := []byte{ - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - - 0b10101010, 0b10101010, 0b10101010, 0b10101010, - 0b10101010, 0b10101010, 0b10101010, 0b00000010, - } - - if !bytes.Equal(data, want) { - t.Errorf("data = %08b", data) - t.Errorf("want = %08b", want) - } -} diff --git a/internal/bits/max_go18.go b/internal/bits/max_go18.go index 75e73fe..149926e 100644 --- a/internal/bits/max_go18.go +++ b/internal/bits/max_go18.go @@ -5,7 +5,7 @@ package bits import ( "encoding/binary" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) func maxBool(data []bool) bool { @@ -39,7 +39,7 @@ func max[T ordered](data []T) (max T) { func maxBE128(data []byte) (min []byte) { if len(data) > 0 { - be128 := cast.BytesToSlice[uint128](data) + be128 := unsafecast.BytesToSlice[uint128](data) m := binary.BigEndian.Uint64(be128[0][:8]) j := 0 for i := 1; i < len(be128); i++ { diff --git a/internal/bits/min_go18.go b/internal/bits/min_go18.go index bd5d33b..0bd9796 100644 --- a/internal/bits/min_go18.go +++ b/internal/bits/min_go18.go @@ -5,7 +5,7 @@ package bits import ( "encoding/binary" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) func minBool(data []bool) bool { return boolEqualAll(data, true) } @@ -37,7 +37,7 @@ func min[T ordered](data []T) (min T) { func minBE128(data []byte) (min []byte) { if len(data) > 0 { - be128 := cast.BytesToSlice[uint128](data) + be128 := unsafecast.BytesToSlice[uint128](data) m := binary.BigEndian.Uint64(be128[0][:8]) j := 0 for i := 1; i < 
len(be128); i++ { diff --git a/internal/bits/minmax_go18.go b/internal/bits/minmax_go18.go index e98ba59..4109628 100644 --- a/internal/bits/minmax_go18.go +++ b/internal/bits/minmax_go18.go @@ -5,7 +5,7 @@ package bits import ( "encoding/binary" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) func boolEqualAll(data []bool, value bool) bool { @@ -62,7 +62,7 @@ func minmax[T ordered](data []T) (min, max T) { func minMaxBE128(data []byte) (min, max []byte) { if len(data) > 0 { - be128 := cast.BytesToSlice[uint128](data) + be128 := unsafecast.BytesToSlice[uint128](data) minHi := binary.BigEndian.Uint64(be128[0][:8]) maxHi := minHi minIndex := 0 diff --git a/internal/bits/unsafe_go18.go b/internal/bits/unsafe_go18.go index 2361019..de13fda 100644 --- a/internal/bits/unsafe_go18.go +++ b/internal/bits/unsafe_go18.go @@ -2,63 +2,63 @@ package bits -import "github.com/segmentio/parquet-go/internal/cast" +import "github.com/segmentio/parquet-go/internal/unsafecast" -// TODO: remove these functions and use the internal/cast package instead when +// TODO: remove these functions and use the internal/unsafecast package instead when // we drop support for Go 1.17. -func BoolToBytes(data []bool) []byte { return cast.SliceToBytes(data) } +func BoolToBytes(data []bool) []byte { return unsafecast.SliceToBytes(data) } -func Int8ToBytes(data []int8) []byte { return cast.SliceToBytes(data) } +func Int8ToBytes(data []int8) []byte { return unsafecast.SliceToBytes(data) } -func Int16ToBytes(data []int16) []byte { return cast.SliceToBytes(data) } +func Int16ToBytes(data []int16) []byte { return unsafecast.SliceToBytes(data) } -func Int32ToBytes(data []int32) []byte { return cast.SliceToBytes(data) } +func Int32ToBytes(data []int32) []byte { return unsafecast.SliceToBytes(data) } -func Int64ToBytes(data []int64) []byte { return cast.SliceToBytes(data) } +func Int64ToBytes(data []int64) []byte { return unsafecast.SliceToBytes(data) } -func Float32ToBytes(data []float32) []byte { return cast.SliceToBytes(data) } +func Float32ToBytes(data []float32) []byte { return unsafecast.SliceToBytes(data) } -func Float64ToBytes(data []float64) []byte { return cast.SliceToBytes(data) } +func Float64ToBytes(data []float64) []byte { return unsafecast.SliceToBytes(data) } -func Int16ToUint16(data []int16) []uint16 { return cast.Slice[uint16](data) } +func Int16ToUint16(data []int16) []uint16 { return unsafecast.Slice[uint16](data) } -func Int32ToUint32(data []int32) []uint32 { return cast.Slice[uint32](data) } +func Int32ToUint32(data []int32) []uint32 { return unsafecast.Slice[uint32](data) } -func Int64ToUint64(data []int64) []uint64 { return cast.Slice[uint64](data) } +func Int64ToUint64(data []int64) []uint64 { return unsafecast.Slice[uint64](data) } -func Float32ToUint32(data []float32) []uint32 { return cast.Slice[uint32](data) } +func Float32ToUint32(data []float32) []uint32 { return unsafecast.Slice[uint32](data) } -func Float64ToUint64(data []float64) []uint64 { return cast.Slice[uint64](data) } +func Float64ToUint64(data []float64) []uint64 { return unsafecast.Slice[uint64](data) } -func Uint32ToBytes(data []uint32) []byte { return cast.SliceToBytes(data) } +func Uint32ToBytes(data []uint32) []byte { return unsafecast.SliceToBytes(data) } -func Uint64ToBytes(data []uint64) []byte { return cast.SliceToBytes(data) } +func Uint64ToBytes(data []uint64) []byte { return unsafecast.SliceToBytes(data) } -func Uint128ToBytes(data 
[][16]byte) []byte { return cast.SliceToBytes(data) } +func Uint128ToBytes(data [][16]byte) []byte { return unsafecast.SliceToBytes(data) } -func Uint32ToInt32(data []uint32) []int32 { return cast.Slice[int32](data) } +func Uint32ToInt32(data []uint32) []int32 { return unsafecast.Slice[int32](data) } -func Uint64ToInt64(data []uint64) []int64 { return cast.Slice[int64](data) } +func Uint64ToInt64(data []uint64) []int64 { return unsafecast.Slice[int64](data) } -func BytesToBool(data []byte) []bool { return cast.BytesToSlice[bool](data) } +func BytesToBool(data []byte) []bool { return unsafecast.BytesToSlice[bool](data) } -func BytesToInt8(data []byte) []int8 { return cast.BytesToSlice[int8](data) } +func BytesToInt8(data []byte) []int8 { return unsafecast.BytesToSlice[int8](data) } -func BytesToInt16(data []byte) []int16 { return cast.BytesToSlice[int16](data) } +func BytesToInt16(data []byte) []int16 { return unsafecast.BytesToSlice[int16](data) } -func BytesToInt32(data []byte) []int32 { return cast.BytesToSlice[int32](data) } +func BytesToInt32(data []byte) []int32 { return unsafecast.BytesToSlice[int32](data) } -func BytesToInt64(data []byte) []int64 { return cast.BytesToSlice[int64](data) } +func BytesToInt64(data []byte) []int64 { return unsafecast.BytesToSlice[int64](data) } -func BytesToUint32(data []byte) []uint32 { return cast.BytesToSlice[uint32](data) } +func BytesToUint32(data []byte) []uint32 { return unsafecast.BytesToSlice[uint32](data) } -func BytesToUint64(data []byte) []uint64 { return cast.BytesToSlice[uint64](data) } +func BytesToUint64(data []byte) []uint64 { return unsafecast.BytesToSlice[uint64](data) } -func BytesToUint128(data []byte) [][16]byte { return cast.BytesToSlice[uint128](data) } +func BytesToUint128(data []byte) [][16]byte { return unsafecast.BytesToSlice[uint128](data) } -func BytesToFloat32(data []byte) []float32 { return cast.BytesToSlice[float32](data) } +func BytesToFloat32(data []byte) []float32 { return unsafecast.BytesToSlice[float32](data) } -func BytesToFloat64(data []byte) []float64 { return cast.BytesToSlice[float64](data) } +func BytesToFloat64(data []byte) []float64 { return unsafecast.BytesToSlice[float64](data) } -func BytesToString(data []byte) string { return cast.BytesToString(data) } +func BytesToString(data []byte) string { return unsafecast.BytesToString(data) } diff --git a/internal/cast/cast.go b/internal/cast/cast.go deleted file mode 100644 index ad8c2e0..0000000 --- a/internal/cast/cast.go +++ /dev/null @@ -1,27 +0,0 @@ -//go:build go1.18 - -package cast - -import "unsafe" - -type Int96 = [3]uint32 - -type Uint128 = [16]byte - -func Slice[To, From any](data []From) []To { - var zf From - var zt To - return unsafe.Slice(*(**To)(unsafe.Pointer(&data)), (uintptr(len(data))*unsafe.Sizeof(zf))/unsafe.Sizeof(zt)) -} - -func SliceToBytes[T any](data []T) []byte { - return Slice[byte](data) -} - -func BytesToSlice[T any](data []byte) []T { - return Slice[T](data) -} - -func BytesToString(data []byte) string { - return *(*string)(unsafe.Pointer(&data)) -} diff --git a/internal/unsafecast/unsafecast.go b/internal/unsafecast/unsafecast.go new file mode 100644 index 0000000..dc30b0c --- /dev/null +++ b/internal/unsafecast/unsafecast.go @@ -0,0 +1,67 @@ +//go:build go1.18 + +// Package unsafecast exposes functions to bypass the Go type system and perform +// conversions between types that would otherwise not be possible. 
+//
+// The functions of this package are mostly useful as optimizations to avoid
+// memory copies when converting between compatible memory layouts; for example,
+// casting a [][16]byte to a []byte in order to use functions of the standard
+// bytes package on the slices.
+//
+// With great power comes great responsibility.
+//
+package unsafecast
+
+import "unsafe"
+
+// The slice type represents the memory layout of slices in Go. It is similar to
+// reflect.SliceHeader but uses an unsafe.Pointer instead of a uintptr for the
+// backing array, which allows the garbage collector to track the reference.
+type slice struct {
+	ptr unsafe.Pointer
+	len int
+	cap int
+}
+
+// Slice converts the data slice of type []From to a slice of type []To sharing
+// the same backing array. The length and capacity of the returned slice are
+// scaled according to the size difference between the source and destination
+// types.
+//
+// Note that the function does not perform any checks to ensure that the memory
+// layouts of the types are compatible; it is possible to cause memory
+// corruption if the layouts mismatch (e.g. the pointers in From do not line up
+// with the pointers in To).
+func Slice[To, From any](data []From) []To {
+	// This function could use unsafe.Slice but it would drop the capacity
+	// information, so instead we implement the type conversion.
+	var zf From
+	var zt To
+	var s = (*slice)(unsafe.Pointer(&data))
+	s.len = int((uintptr(s.len) * unsafe.Sizeof(zf)) / unsafe.Sizeof(zt))
+	s.cap = int((uintptr(s.cap) * unsafe.Sizeof(zf)) / unsafe.Sizeof(zt))
+	return *(*[]To)(unsafe.Pointer(s))
+}
+
+// SliceToBytes is a specialization of the Slice function converting any slice
+// to a byte slice.
+func SliceToBytes[T any](data []T) []byte {
+	return Slice[byte](data)
+}
+
+// BytesToSlice is a specialization of the Slice function for converting a byte
+// slice to a slice of a different type.
+func BytesToSlice[T any](data []byte) []T {
+	return Slice[T](data)
+}
+
+// BytesToString converts a byte slice to a string value. The returned string
+// shares the backing array of the byte slice.
+//
+// Programs using this function are responsible for ensuring that the data slice
+// is not modified while the returned string is in use; otherwise the guarantee
+// of immutability of Go string values will be violated, resulting in undefined
+// behavior.
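+//
+// A minimal usage sketch (illustrative only):
+//
+//	b := []byte("hello")
+//	s := BytesToString(b) // s aliases b; no copy is made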
+func BytesToString(data []byte) string {
+	return *(*string)(unsafe.Pointer(&data))
+}
diff --git a/internal/unsafecast/unsafecast_test.go b/internal/unsafecast/unsafecast_test.go
new file mode 100644
index 0000000..ed0cacf
--- /dev/null
+++ b/internal/unsafecast/unsafecast_test.go
@@ -0,0 +1,44 @@
+//go:build go1.18
+
+package unsafecast_test
+
+import (
+	"testing"
+
+	"github.com/segmentio/parquet-go/internal/unsafecast"
+)
+
+func TestUnsafeCastSlice(t *testing.T) {
+	a := make([]uint32, 4, 13)
+	a[0] = 1
+	a[1] = 0
+	a[2] = 2
+	a[3] = 0
+
+	b := unsafecast.Slice[int64](a)
+	if len(b) != 2 { // (4 * sizeof(uint32)) / sizeof(int64)
+		t.Fatalf("length mismatch: want=2 got=%d", len(b))
+	}
+	if cap(b) != 6 { // (13 * sizeof(uint32)) / sizeof(int64)
+		t.Fatalf("capacity mismatch: want=6 got=%d", cap(b))
+	}
+	if b[0] != 1 {
+		t.Errorf("wrong value at index 0: want=1 got=%d", b[0])
+	}
+	if b[1] != 2 {
+		t.Errorf("wrong value at index 1: want=2 got=%d", b[1])
+	}
+
+	c := unsafecast.Slice[uint32](b)
+	if len(c) != 4 {
+		t.Fatalf("length mismatch: want=4 got=%d", len(c))
+	}
+	if cap(c) != 12 {
+		t.Fatalf("capacity mismatch: want=12 got=%d", cap(c))
+	}
+	for i := range c {
+		if c[i] != a[i] {
+			t.Errorf("wrong value at index %d: want=%d got=%d", i, a[i], c[i])
+		}
+	}
+}
diff --git a/limits.go b/limits.go
index 9ca4142..9b88118 100644
--- a/limits.go
+++ b/limits.go
@@ -34,6 +34,11 @@ func makeColumnIndex(i int) int16 {
 	return int16(i)
 }
 
+func makeNumValues(i int) int32 {
+	checkIndexRange("number of values", i, 0, math.MaxInt32)
+	return int32(i)
+}
+
 func checkIndexRange(typ string, i, min, max int) {
 	if i < min || i > max {
 		panic(errIndexOutOfRange(typ, i, min, max))
diff --git a/node.go b/node.go
index 4913f63..875988d 100644
--- a/node.go
+++ b/node.go
@@ -102,7 +102,7 @@ func Encoded(node Node, encoding encoding.Encoding) Node {
 	}
 	if encoding != nil {
 		kind := node.Type().Kind()
-		if !encoding.CanEncode(format.Type(kind)) {
+		if !canEncode(encoding, kind) {
 			panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String())
 		}
 	}
diff --git a/page.go b/page.go
index f500d6a..35a7f28 100644
--- a/page.go
+++ b/page.go
@@ -1,6 +1,7 @@
 package parquet
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 
@@ -79,8 +80,8 @@ type BufferedPage interface {
 	RepetitionLevels() []int8
 	DefinitionLevels() []int8
 
-	// Writes the page to the given encoder.
-	WriteTo(encoding.Encoder) error
+	// Writes the page data to dst with the given encoding.
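+	//
+	// Implementations typically forward to the encoding method matching the
+	// page's value type; for example, the int32 page later in this diff does:
+	//
+	//	func (page *int32Page) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) {
+	//		return enc.EncodeInt32(dst, page.values)
+	//	}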
+ Encode(dst []byte, enc encoding.Encoding) ([]byte, error) } // CompressedPage is an extension of the Page interface implemented by pages @@ -219,20 +220,21 @@ func newErrorPage(columnIndex int, msg string, args ...interface{}) *errorPage { } } -func (page *errorPage) Column() int { return page.columnIndex } -func (page *errorPage) Dictionary() Dictionary { return nil } -func (page *errorPage) NumRows() int64 { return 1 } -func (page *errorPage) NumValues() int64 { return 1 } -func (page *errorPage) NumNulls() int64 { return 0 } -func (page *errorPage) Bounds() (min, max Value, ok bool) { return } -func (page *errorPage) Clone() BufferedPage { return page } -func (page *errorPage) Slice(i, j int64) BufferedPage { return page } -func (page *errorPage) Size() int64 { return 1 } -func (page *errorPage) RepetitionLevels() []int8 { return nil } -func (page *errorPage) DefinitionLevels() []int8 { return nil } -func (page *errorPage) WriteTo(encoding.Encoder) error { return page.err } -func (page *errorPage) Values() ValueReader { return &errorValueReader{err: page.err} } -func (page *errorPage) Buffer() BufferedPage { return page } +func (page *errorPage) Column() int { return page.columnIndex } +func (page *errorPage) Dictionary() Dictionary { return nil } +func (page *errorPage) NumRows() int64 { return 1 } +func (page *errorPage) NumValues() int64 { return 1 } +func (page *errorPage) NumNulls() int64 { return 0 } +func (page *errorPage) Bounds() (min, max Value, ok bool) { return } +func (page *errorPage) Clone() BufferedPage { return page } +func (page *errorPage) Slice(i, j int64) BufferedPage { return page } +func (page *errorPage) Size() int64 { return 1 } +func (page *errorPage) RepetitionLevels() []int8 { return nil } +func (page *errorPage) DefinitionLevels() []int8 { return nil } +func (page *errorPage) Values() ValueReader { return &errorValueReader{err: page.err} } +func (page *errorPage) Buffer() BufferedPage { return page } +func (page *errorPage) Encode(dst []byte, _ encoding.Encoding) ([]byte, error) { return dst, page.err } +func (page *errorPage) Decode(_, _ []int8, _ []byte, _ encoding.Encoding) error { return page.err } func errPageBoundsOutOfRange(i, j, n int64) error { return fmt.Errorf("page bounds out of range [%d:%d]: with length %d", i, j, n) @@ -334,10 +336,6 @@ func (page *optionalPage) DefinitionLevels() []int8 { return page.definitionLevels } -func (page *optionalPage) WriteTo(e encoding.Encoder) error { - return page.base.WriteTo(e) -} - func (page *optionalPage) Values() ValueReader { return &optionalPageReader{page: page} } @@ -346,6 +344,10 @@ func (page *optionalPage) Buffer() BufferedPage { return page } +func (page *optionalPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return page.base.Encode(dst, enc) +} + type optionalPageReader struct { page *optionalPage values ValueReader @@ -386,6 +388,7 @@ func (r *optionalPageReader) ReadValues(values []Value) (n int, err error) { if err != nil && err != io.EOF { return n, err } + err = nil } } @@ -502,8 +505,8 @@ func (page *repeatedPage) DefinitionLevels() []int8 { return page.definitionLevels } -func (page *repeatedPage) WriteTo(e encoding.Encoder) error { - return page.base.WriteTo(e) +func (page *repeatedPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return page.base.Encode(dst, enc) } func (page *repeatedPage) Values() ValueReader { @@ -555,6 +558,7 @@ func (r *repeatedPageReader) ReadValues(values []Value) (n int, err error) { if err != nil && err != io.EOF { return 
n, err } + err = nil } } @@ -565,27 +569,59 @@ func (r *repeatedPageReader) ReadValues(values []Value) (n int, err error) { } type byteArrayPage struct { - values encoding.ByteArrayList + offsets []uint32 + values []byte columnIndex int16 } +func makeByteArrayOffsets(numValues int32, values []byte) []uint32 { + offsets := make([]uint32, 0, numValues) + lastOffset := uint32(0) + plain.RangeByteArrays(values, func(value []byte) error { + offsets = append(offsets, lastOffset) + lastOffset += 4 + uint32(len(value)) + return nil + }) + return offsets +} + +func newByteArrayPage(columnIndex int16, numValues int32, values []byte) *byteArrayPage { + return &byteArrayPage{ + offsets: makeByteArrayOffsets(numValues, values), + values: values, + columnIndex: ^columnIndex, + } +} + func (page *byteArrayPage) Column() int { return int(^page.columnIndex) } func (page *byteArrayPage) Dictionary() Dictionary { return nil } -func (page *byteArrayPage) NumRows() int64 { return int64(page.values.Len()) } +func (page *byteArrayPage) NumRows() int64 { return int64(len(page.offsets)) } -func (page *byteArrayPage) NumValues() int64 { return int64(page.values.Len()) } +func (page *byteArrayPage) NumValues() int64 { return int64(len(page.offsets)) } func (page *byteArrayPage) NumNulls() int64 { return 0 } +func (page *byteArrayPage) append(value []byte) { + page.offsets = append(page.offsets, uint32(len(page.values))) + page.values = plain.AppendByteArray(page.values, value) +} + +func (page *byteArrayPage) valueAt(offset uint32) []byte { + length := binary.LittleEndian.Uint32(page.values[offset:]) + j := 4 + offset + k := 4 + offset + length + return page.values[j:k:k] +} + func (page *byteArrayPage) min() (min []byte) { - if page.values.Len() > 0 { - min = page.values.Index(0) - for i := 1; i < page.values.Len(); i++ { - v := page.values.Index(i) - if string(v) < string(min) { - min = v + if len(page.offsets) > 0 { + min = page.valueAt(page.offsets[0]) + + for _, offset := range page.offsets[1:] { + if value := page.valueAt(offset); string(value) < string(min) { + min = value } } } @@ -593,12 +629,12 @@ func (page *byteArrayPage) min() (min []byte) { } func (page *byteArrayPage) max() (max []byte) { - if page.values.Len() > 0 { - max = page.values.Index(0) - for i := 1; i < page.values.Len(); i++ { - v := page.values.Index(i) - if string(v) > string(max) { - max = v + if len(page.offsets) > 0 { + max = page.valueAt(page.offsets[0]) + + for _, offset := range page.offsets[1:] { + if value := page.valueAt(offset); string(value) > string(max) { + max = value } } } @@ -606,17 +642,17 @@ func (page *byteArrayPage) max() (max []byte) { } func (page *byteArrayPage) bounds() (min, max []byte) { - if page.values.Len() > 0 { - min = page.values.Index(0) + if len(page.offsets) > 0 { + min = page.valueAt(page.offsets[0]) max = min - for i := 1; i < page.values.Len(); i++ { - v := page.values.Index(i) + for _, offset := range page.offsets[1:] { + value := page.valueAt(offset) switch { - case string(v) < string(min): - min = v - case string(v) > string(max): - max = v + case string(value) < string(min): + min = value + case string(value) > string(max): + max = value } } } @@ -624,7 +660,7 @@ func (page *byteArrayPage) bounds() (min, max []byte) { } func (page *byteArrayPage) Bounds() (min, max Value, ok bool) { - if ok = page.values.Len() > 0; ok { + if ok = len(page.offsets) > 0; ok { minBytes, maxBytes := page.bounds() min = makeValueBytes(ByteArray, minBytes) max = makeValueBytes(ByteArray, maxBytes) @@ -632,32 +668,68 @@ 
func (page *byteArrayPage) Bounds() (min, max Value, ok bool) { return min, max, ok } +func (page *byteArrayPage) cloneOffsets() []uint32 { + offsets := make([]uint32, len(page.offsets)) + copy(offsets, page.offsets) + return offsets +} + +func (page *byteArrayPage) cloneValues() []byte { + values := make([]byte, len(page.values)) + copy(values, page.values) + return values +} + func (page *byteArrayPage) Clone() BufferedPage { return &byteArrayPage{ - values: page.values.Clone(), + offsets: page.cloneOffsets(), + values: page.cloneValues(), columnIndex: page.columnIndex, } } func (page *byteArrayPage) Slice(i, j int64) BufferedPage { return &byteArrayPage{ - values: page.values.Slice(int(i), int(j)), + offsets: page.offsets[i:j], + values: page.values, columnIndex: page.columnIndex, } } -func (page *byteArrayPage) Size() int64 { return page.values.Size() } +func (page *byteArrayPage) Size() int64 { return int64(len(page.values)) } func (page *byteArrayPage) RepetitionLevels() []int8 { return nil } func (page *byteArrayPage) DefinitionLevels() []int8 { return nil } -func (page *byteArrayPage) WriteTo(e encoding.Encoder) error { return e.EncodeByteArray(page.values) } - func (page *byteArrayPage) Values() ValueReader { return &byteArrayPageReader{page: page} } func (page *byteArrayPage) Buffer() BufferedPage { return page } +func (page *byteArrayPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + values := page.values + + switch { + case len(page.offsets) == 0: + values = nil + + case bits.OrderOfUint32(page.offsets) < 1: // unordered? + values = make([]byte, 0, len(values)) // TODO: pool this buffer? + + for _, offset := range page.offsets { + values = plain.AppendByteArray(values, page.valueAt(offset)) + } + + default: + i := page.offsets[0] + j := page.offsets[len(page.offsets)-1] + j += 4 + binary.LittleEndian.Uint32(values[j:]) + values = values[i:j:j] + } + + return enc.EncodeByteArray(dst, values) +} + type byteArrayPageReader struct { page *byteArrayPage offset int @@ -678,8 +750,8 @@ func (r *byteArrayPageReader) ReadByteArrays(values []byte) (int, error) { } func (r *byteArrayPageReader) readByteArrays(values []byte) (c, n int, err error) { - for r.offset < r.page.values.Len() { - b := r.page.values.Index(r.offset) + for r.offset < len(r.page.offsets) { + b := r.page.valueAt(r.page.offsets[r.offset]) k := plain.ByteArrayLengthSize + len(b) if k > (len(values) - n) { break @@ -690,7 +762,7 @@ func (r *byteArrayPageReader) readByteArrays(values []byte) (c, n int, err error r.offset++ c++ } - if r.offset == r.page.values.Len() { + if r.offset == len(r.page.offsets) { err = io.EOF } else if n == 0 && len(values) > 0 { err = io.ErrShortBuffer @@ -699,24 +771,38 @@ func (r *byteArrayPageReader) readByteArrays(values []byte) (c, n int, err error } func (r *byteArrayPageReader) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < r.page.values.Len() { - values[n] = makeValueBytes(ByteArray, r.page.values.Index(r.offset)) + for n < len(values) && r.offset < len(r.page.offsets) { + values[n] = makeValueBytes(ByteArray, r.page.valueAt(r.page.offsets[r.offset])) values[n].columnIndex = r.page.columnIndex r.offset++ n++ } - if r.offset == r.page.values.Len() { + if r.offset == len(r.page.offsets) { err = io.EOF } return n, err } type fixedLenByteArrayPage struct { - size int data []byte + size int columnIndex int16 } +func newFixedLenByteArrayPage(columnIndex int16, numValues int32, data []byte, size int) *fixedLenByteArrayPage { + if (len(data) % 
size) != 0 {
+		panic("cannot create fixed-length byte array page from input which is not a multiple of the type size")
+	}
+	if int(numValues) != len(data)/size {
+		panic(fmt.Errorf("number of values mismatch in numValues and data arguments: %d != %d", numValues, len(data)/size))
+	}
+	return &fixedLenByteArrayPage{
+		data:        data,
+		size:        size,
+		columnIndex: ^columnIndex,
+	}
+}
+
 func (page *fixedLenByteArrayPage) Column() int { return int(^page.columnIndex) }
 
 func (page *fixedLenByteArrayPage) Dictionary() Dictionary { return nil }
@@ -750,16 +836,16 @@ func (page *fixedLenByteArrayPage) Bounds() (min, max Value, ok bool) {
 
 func (page *fixedLenByteArrayPage) Clone() BufferedPage {
 	return &fixedLenByteArrayPage{
-		size:        page.size,
 		data:        append([]byte{}, page.data...),
+		size:        page.size,
 		columnIndex: page.columnIndex,
 	}
 }
 
 func (page *fixedLenByteArrayPage) Slice(i, j int64) BufferedPage {
 	return &fixedLenByteArrayPage{
-		size:        page.size,
 		data:        page.data[i*int64(page.size) : j*int64(page.size)],
+		size:        page.size,
 		columnIndex: page.columnIndex,
 	}
 }
@@ -770,16 +856,16 @@ func (page *fixedLenByteArrayPage) RepetitionLevels() []int8 { return nil }
 
 func (page *fixedLenByteArrayPage) DefinitionLevels() []int8 { return nil }
 
-func (page *fixedLenByteArrayPage) WriteTo(e encoding.Encoder) error {
-	return e.EncodeFixedLenByteArray(page.size, page.data)
-}
-
 func (page *fixedLenByteArrayPage) Values() ValueReader {
 	return &fixedLenByteArrayPageReader{page: page}
 }
 
 func (page *fixedLenByteArrayPage) Buffer() BufferedPage { return page }
 
+func (page *fixedLenByteArrayPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) {
+	return enc.EncodeFixedLenByteArray(dst, page.data, page.size)
+}
+
 type fixedLenByteArrayPageReader struct {
 	page   *fixedLenByteArrayPage
 	offset int
@@ -817,3 +903,54 @@ func (r *fixedLenByteArrayPageReader) ReadValues(values []Value) (n int, err err
 	}
 	return n, err
 }
+
+type nullPage struct {
+	column int
+	count  int
+}
+
+func newNullPage(columnIndex int16, numValues int32) *nullPage {
+	return &nullPage{
+		column: int(columnIndex),
+		count:  int(numValues),
+	}
+}
+
+func (p *nullPage) Column() int                       { return p.column }
+func (p *nullPage) Dictionary() Dictionary            { return nil }
+func (p *nullPage) NumRows() int64                    { return int64(p.count) }
+func (p *nullPage) NumValues() int64                  { return int64(p.count) }
+func (p *nullPage) NumNulls() int64                   { return int64(p.count) }
+func (p *nullPage) Bounds() (min, max Value, ok bool) { return }
+func (p *nullPage) Size() int64                       { return 1 }
+func (p *nullPage) Values() ValueReader {
+	return &nullPageReader{column: p.column, remain: p.count}
+}
+func (p *nullPage) Buffer() BufferedPage { return p }
+func (p *nullPage) Clone() BufferedPage  { return p }
+func (p *nullPage) Slice(i, j int64) BufferedPage {
+	return &nullPage{column: p.column, count: int(j - i)}
+}
+func (p *nullPage) RepetitionLevels() []int8 { return nil }
+func (p *nullPage) DefinitionLevels() []int8 { return nil }
+func (p *nullPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) {
+	return dst[:0], nil
+}
+
+type nullPageReader struct {
+	column int
+	remain int
+}
+
+func (r *nullPageReader) ReadValues(values []Value) (n int, err error) {
+	columnIndex := ^int16(r.column)
+	values = values[:min(r.remain, len(values))]
+	for i := range values {
+		values[i] = Value{columnIndex: columnIndex}
+	}
+	r.remain -= len(values)
+	if r.remain == 0 {
+		err = io.EOF
+	}
+	return len(values), err
+}
diff --git a/page_default.go b/page_default.go
index 
bfb8ae2..fec2eba 100644 --- a/page_default.go +++ b/page_default.go @@ -15,6 +15,20 @@ type booleanPage struct { columnIndex int16 } +func newBooleanPage(columnIndex int16, numValues int32, data []byte) *booleanPage { + values := bits.BytesToBool(data) + for len(values) < int(numValues) { + values = append(values, false) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &booleanPage{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *booleanPage) Column() int { return int(^page.columnIndex) } func (page *booleanPage) Dictionary() Dictionary { return nil } @@ -95,12 +109,14 @@ func (page *booleanPage) RepetitionLevels() []int8 { return nil } func (page *booleanPage) DefinitionLevels() []int8 { return nil } -func (page *booleanPage) WriteTo(e encoding.Encoder) error { return e.EncodeBoolean(page.values) } - func (page *booleanPage) Values() ValueReader { return &booleanPageReader{page: page} } func (page *booleanPage) Buffer() BufferedPage { return page } +func (page *booleanPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeBoolean(dst, page.values) +} + type booleanPageReader struct { page *booleanPage offset int @@ -137,6 +153,20 @@ type int32Page struct { columnIndex int16 } +func newInt32Page(columnIndex int16, numValues int32, data []byte) *int32Page { + values := bits.BytesToInt32(data) + for len(values) < int(numValues) { + values = append(values, 0) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &int32Page{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *int32Page) Column() int { return int(^page.columnIndex) } func (page *int32Page) Dictionary() Dictionary { return nil } @@ -182,12 +212,14 @@ func (page *int32Page) RepetitionLevels() []int8 { return nil } func (page *int32Page) DefinitionLevels() []int8 { return nil } -func (page *int32Page) WriteTo(e encoding.Encoder) error { return e.EncodeInt32(page.values) } - func (page *int32Page) Values() ValueReader { return &int32PageReader{page: page} } func (page *int32Page) Buffer() BufferedPage { return page } +func (page *int32Page) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeInt32(dst, page.values) +} + type int32PageReader struct { page *int32Page offset int @@ -225,6 +257,20 @@ type int64Page struct { columnIndex int16 } +func newInt64Page(columnIndex int16, numValues int32, data []byte) *int64Page { + values := bits.BytesToInt64(data) + for len(values) < int(numValues) { + values = append(values, 0) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &int64Page{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *int64Page) Column() int { return int(^page.columnIndex) } func (page *int64Page) Dictionary() Dictionary { return nil } @@ -270,12 +316,14 @@ func (page *int64Page) RepetitionLevels() []int8 { return nil } func (page *int64Page) DefinitionLevels() []int8 { return nil } -func (page *int64Page) WriteTo(e encoding.Encoder) error { return e.EncodeInt64(page.values) } - func (page *int64Page) Values() ValueReader { return &int64PageReader{page: page} } func (page *int64Page) Buffer() BufferedPage { return page } +func (page *int64Page) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeInt64(dst, page.values) +} + type int64PageReader struct { page *int64Page offset int @@ -313,6 +361,20 @@ type int96Page struct { columnIndex int16 } +func newInt96Page(columnIndex int16, numValues 
int32, data []byte) *int96Page { + values := deprecated.BytesToInt96(data) + for len(values) < int(numValues) { + values = append(values, deprecated.Int96{}) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &int96Page{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *int96Page) Column() int { return int(^page.columnIndex) } func (page *int96Page) Dictionary() Dictionary { return nil } @@ -360,12 +422,14 @@ func (page *int96Page) RepetitionLevels() []int8 { return nil } func (page *int96Page) DefinitionLevels() []int8 { return nil } -func (page *int96Page) WriteTo(e encoding.Encoder) error { return e.EncodeInt96(page.values) } - func (page *int96Page) Values() ValueReader { return &int96PageReader{page: page} } func (page *int96Page) Buffer() BufferedPage { return page } +func (page *int96Page) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeInt96(dst, page.values) +} + type int96PageReader struct { page *int96Page offset int @@ -403,6 +467,20 @@ type floatPage struct { columnIndex int16 } +func newFloatPage(columnIndex int16, numValues int32, data []byte) *floatPage { + values := bits.BytesToFloat32(data) + for len(values) < int(numValues) { + values = append(values, 0) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &floatPage{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *floatPage) Column() int { return int(^page.columnIndex) } func (page *floatPage) Dictionary() Dictionary { return nil } @@ -448,12 +526,14 @@ func (page *floatPage) RepetitionLevels() []int8 { return nil } func (page *floatPage) DefinitionLevels() []int8 { return nil } -func (page *floatPage) WriteTo(e encoding.Encoder) error { return e.EncodeFloat(page.values) } - func (page *floatPage) Values() ValueReader { return &floatPageReader{page: page} } func (page *floatPage) Buffer() BufferedPage { return page } +func (page *floatPage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeFloat(dst, page.values) +} + type floatPageReader struct { page *floatPage offset int @@ -491,6 +571,20 @@ type doublePage struct { columnIndex int16 } +func newDoublePage(columnIndex int16, numValues int32, data []byte) *doublePage { + values := bits.BytesToFloat64(data) + for len(values) < int(numValues) { + values = append(values, 0) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &doublePage{ + values: values, + columnIndex: ^columnIndex, + } +} + func (page *doublePage) Column() int { return int(^page.columnIndex) } func (page *doublePage) Dictionary() Dictionary { return nil } @@ -536,12 +630,14 @@ func (page *doublePage) RepetitionLevels() []int8 { return nil } func (page *doublePage) DefinitionLevels() []int8 { return nil } -func (page *doublePage) WriteTo(e encoding.Encoder) error { return e.EncodeDouble(page.values) } - func (page *doublePage) Values() ValueReader { return &doublePageReader{page: page} } func (page *doublePage) Buffer() BufferedPage { return page } +func (page *doublePage) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return enc.EncodeDouble(dst, page.values) +} + type doublePageReader struct { page *doublePage offset int @@ -579,6 +675,10 @@ func (r *doublePageReader) ReadValues(values []Value) (n int, err error) { type uint32Page struct{ *int32Page } +func newUint32Page(columnIndex int16, numValues int32, data []byte) uint32Page { + return uint32Page{newInt32Page(columnIndex, numValues, data)} +} + 
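+// The embedded int32Page provides the storage; the methods below reinterpret
+// the same bit patterns as unsigned so that comparisons use unsigned ordering.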
func (page uint32Page) min() uint32 { return bits.MinUint32(bits.Int32ToUint32(page.values)) } func (page uint32Page) max() uint32 { return bits.MaxUint32(bits.Int32ToUint32(page.values)) } @@ -608,6 +708,10 @@ func (page uint32Page) Buffer() BufferedPage { return page } type uint64Page struct{ *int64Page } +func newUint64Page(columnIndex int16, numValues int32, data []byte) uint64Page { + return uint64Page{newInt64Page(columnIndex, numValues, data)} +} + func (page uint64Page) min() uint64 { return bits.MinUint64(bits.Int64ToUint64(page.values)) } func (page uint64Page) max() uint64 { return bits.MaxUint64(bits.Int64ToUint64(page.values)) } diff --git a/page_go18.go b/page_go18.go index 4b1cf08..c473942 100644 --- a/page_go18.go +++ b/page_go18.go @@ -6,7 +6,7 @@ import ( "io" "github.com/segmentio/parquet-go/encoding" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) type page[T primitive] struct { @@ -15,6 +15,22 @@ type page[T primitive] struct { columnIndex int16 } +func newPage[T primitive](columnIndex int16, numValues int32, data []byte, class *class[T]) *page[T] { + var values = unsafecast.Slice[T](data) + var zero T + for len(values) < int(numValues) { + values = append(values, zero) + } + if len(values) > int(numValues) { + values = values[:numValues] + } + return &page[T]{ + class: class, + values: values, + columnIndex: ^columnIndex, + } +} + func (p *page[T]) Column() int { return int(^p.columnIndex) } func (p *page[T]) Dictionary() Dictionary { return nil } @@ -62,23 +78,25 @@ func (p *page[T]) RepetitionLevels() []int8 { return nil } func (p *page[T]) DefinitionLevels() []int8 { return nil } -func (p *page[T]) WriteTo(e encoding.Encoder) error { return p.class.encode(e, p.values) } - -func (p *page[T]) Values() ValueReader { return &pageReader[T]{page: p} } +func (p *page[T]) Values() ValueReader { return &pageValueReader[T]{page: p} } func (p *page[T]) Buffer() BufferedPage { return p } -type pageReader[T primitive] struct { +func (p *page[T]) Encode(dst []byte, enc encoding.Encoding) ([]byte, error) { + return p.class.encode(enc, dst, p.values) +} + +type pageValueReader[T primitive] struct { page *page[T] offset int } -func (r *pageReader[T]) Read(b []byte) (n int, err error) { - n, err = r.ReadRequired(cast.BytesToSlice[T](b)) +func (r *pageValueReader[T]) Read(b []byte) (n int, err error) { + n, err = r.ReadRequired(unsafecast.BytesToSlice[T](b)) return sizeof[T]() * n, err } -func (r *pageReader[T]) ReadRequired(values []T) (n int, err error) { +func (r *pageValueReader[T]) ReadRequired(values []T) (n int, err error) { n = copy(values, r.page.values[r.offset:]) r.offset += n if r.offset == len(r.page.values) { @@ -87,7 +105,7 @@ func (r *pageReader[T]) ReadRequired(values []T) (n int, err error) { return n, err } -func (r *pageReader[T]) ReadValues(values []Value) (n int, err error) { +func (r *pageValueReader[T]) ReadValues(values []Value) (n int, err error) { makeValue := r.page.class.makeValue pageValues := r.page.values columnIndex := r.page.columnIndex @@ -104,5 +122,5 @@ func (r *pageReader[T]) ReadValues(values []Value) (n int, err error) { } var ( - _ RequiredReader[bool] = (*pageReader[bool])(nil) + _ RequiredReader[bool] = (*pageValueReader[bool])(nil) ) diff --git a/page_go18_test.go b/page_go18_test.go index 5088b39..0be0ffb 100644 --- a/page_go18_test.go +++ b/page_go18_test.go @@ -13,7 +13,7 @@ import ( "github.com/segmentio/parquet-go" 
"github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding/plain" - "github.com/segmentio/parquet-go/internal/cast" + "github.com/segmentio/parquet-go/internal/unsafecast" ) func TestPage(t *testing.T) { @@ -63,12 +63,12 @@ func testPageOf[T plain.Type](t *testing.T) { 0: randValue[T](r), 1: randValue[T](r), } - n, err := w.(io.Writer).Write(cast.SliceToBytes(values)) + n, err := w.(io.Writer).Write(unsafecast.SliceToBytes(values)) return values[:n/sizeof], err }, read: func(r parquet.ValueReader) ([]T, error) { values := make([]T, 2) - n, err := r.(io.Reader).Read(cast.SliceToBytes(values)) + n, err := r.(io.Reader).Read(unsafecast.SliceToBytes(values)) return values[:n/sizeof], err }, }) diff --git a/page_header.go b/page_header.go index 73646ca..11afafe 100644 --- a/page_header.go +++ b/page_header.go @@ -180,12 +180,17 @@ func (v2 DataPageHeaderV2) MaxValue() []byte { return v2.header.Statistics.MaxValue } +func (v2 DataPageHeaderV2) IsCompressed() bool { + return v2.header.IsCompressed == nil || *v2.header.IsCompressed +} + func (v2 DataPageHeaderV2) String() string { - return fmt.Sprintf("DATA_PAGE_HEADER_V2{NumValues=%d,NumNulls=%d,NumRows=%d,Encoding=%s}", + return fmt.Sprintf("DATA_PAGE_HEADER_V2{NumValues=%d,NumNulls=%d,NumRows=%d,Encoding=%s,IsCompressed=%t}", v2.header.NumValues, v2.header.NumNulls, v2.header.NumRows, - v2.header.Encoding) + v2.header.Encoding, + v2.IsCompressed()) } type unknownPageHeader struct { diff --git a/testdata/alltypes_dictionary.parquet b/testdata/alltypes_dictionary.parquet old mode 100755 new mode 100644 diff --git a/testdata/alltypes_plain.parquet b/testdata/alltypes_plain.parquet old mode 100755 new mode 100644 diff --git a/testdata/alltypes_plain.snappy.parquet b/testdata/alltypes_plain.snappy.parquet old mode 100755 new mode 100644 diff --git a/type.go b/type.go index 74c66ec..611a5b6 100644 --- a/type.go +++ b/type.go @@ -9,7 +9,6 @@ import ( "github.com/google/uuid" "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" "github.com/segmentio/parquet-go/internal/bits" ) @@ -118,11 +117,6 @@ type Type interface { // The method panics if it is called on a group type. NewColumnIndexer(sizeLimit int) ColumnIndexer - // Creates a dictionary holding values of this type. - // - // The method panics if it is called on a group type. - NewDictionary(columnIndex, bufferSize int) Dictionary - // Creates a row group buffer column for values of this type. // // Column buffers are created using the index of the column they are @@ -142,29 +136,33 @@ type Type interface { // The method panics if it is called on a group type. NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer - // Creates a reader for columns of this type. + // Creates a dictionary holding values of this type. // - // Column readers are created using the index of the column they are reading - // values from (relative to the parent schema). The column index will be set - // on values read from the reader. + // If the length of data is not zero, it must contain PLAIN encoded values + // of the dictionary. // - // The buffer size is given in bytes, because we want to control memory - // consumption of the application, which is simpler to achieve with buffer - // size expressed in bytes rather than number of elements. + // The dictionary retains the data buffer, it does not make a copy of it. 
+	// If the application needs to share ownership of the memory buffer, it must
+	// ensure that it will not be modified while the dictionary is in use, or it
+	// must make a copy of it prior to creating the dictionary.
 	//
-	// The returned reader may implement extensions that can be tested via type
-	// assertions. For example, on a INT32 type, the reader could implement the
-	// parquet.Int32Reader interface to allow programs to more efficiently read
-	// columns of INT32 values.
-	NewColumnReader(columnIndex, bufferSize int) ColumnReader
-
-	// Reads a dictionary with values of this type from the decoder passed as
-	// argument.
+	// The method panics if it is called on a group type.
+	NewDictionary(columnIndex, numValues int, data []byte) Dictionary
+
+	// Creates a page belonging to a column at the given index, backed by the
+	// data buffer.
+	//
+	// If the length of data is not zero, it must contain PLAIN encoded values
+	// of the page.
+	//
+	// The page retains the data buffer; it does not make a copy of it. If the
+	// application needs to share ownership of the memory buffer, it must ensure
+	// that it will not be modified while the page is in use, or it must make a
+	// copy of it prior to creating the page.
 	//
-	// The number of values is a hint to optimize the allocation of memory
-	// buffers for the dictionary. Callers that don't know how many values will
-	// be decoded should pass zero for numValues.
-	ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error)
+	// The method panics if the data is not a valid PLAIN encoded representation
+	// of the page values.
+	NewPage(columnIndex, numValues int, data []byte) Page
 }
 
 // In the current parquet version supported by this library, only type-defined
@@ -391,20 +389,16 @@ func (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
 	return newByteArrayColumnIndexer(sizeLimit)
 }
 
-func (t *stringType) NewDictionary(columnIndex, bufferSize int) Dictionary {
-	return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize)
+func (t *stringType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
+	return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
 }
 
 func (t *stringType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer {
 	return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize)
 }
 
-func (t *stringType) NewColumnReader(columnIndex, bufferSize int) ColumnReader {
-	return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize)
-}
-
-func (t *stringType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) {
-	return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder)
+func (t *stringType) NewPage(columnIndex, numValues int, data []byte) Page {
+	return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data)
}
 
 func (t *stringType) GoType() reflect.Type {
@@ -446,20 +440,16 @@ func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer {
 	return newFixedLenByteArrayColumnIndexer(16, sizeLimit)
 }
 
-func (t *uuidType) NewDictionary(columnIndex, bufferSize int) Dictionary {
-	return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize)
+func (t *uuidType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary {
+	return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
 }
 
 func (t *uuidType) NewColumnBuffer(columnIndex, bufferSize
int) ColumnBuffer { return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *uuidType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newFixedLenByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) -} - -func (t *uuidType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *uuidType) NewPage(columnIndex, numValues int, data []byte) Page { + return newFixedLenByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, 16) } func (t *uuidType) GoType() reflect.Type { @@ -503,20 +493,16 @@ func (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newByteArrayColumnIndexer(sizeLimit) } -func (t *enumType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) +func (t *enumType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } func (t *enumType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *enumType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) -} - -func (t *enumType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *enumType) NewPage(columnIndex, numValues int, data []byte) Page { + return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } func (t *enumType) GoType() reflect.Type { @@ -560,20 +546,16 @@ func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newByteArrayColumnIndexer(sizeLimit) } -func (t *jsonType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) +func (t *jsonType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } func (t *jsonType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *jsonType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) -} - -func (t *jsonType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *jsonType) NewPage(columnIndex, numValues int, data []byte) Page { + return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } // BSON constructs a leaf node of BSON logical type. 
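
The byte-array leaf types in this file (string, enum, json, and bson below) all
route the new NewPage method to newByteArrayPage, so a page is constructed
directly from a PLAIN-encoded buffer. A minimal sketch of the call shape,
assuming typ is the Type() of such a leaf node and encoding/plain is imported
(the values are illustrative):

	data := plain.AppendByteArray(nil, []byte("hello"))
	data = plain.AppendByteArray(data, []byte("world"))
	page := typ.NewPage(0, 2, data) // the page retains data; no copy is made
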
@@ -613,20 +595,16 @@ func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newByteArrayColumnIndexer(sizeLimit) } -func (t *bsonType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) +func (t *bsonType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } func (t *bsonType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *bsonType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) -} - -func (t *bsonType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *bsonType) NewPage(columnIndex, numValues int, data []byte) Page { + return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } // Date constructs a leaf node of DATE logical type. @@ -839,7 +817,7 @@ func (t *listType) NewColumnIndexer(int) ColumnIndexer { panic("create create column indexer from parquet LIST type") } -func (t *listType) NewDictionary(int, int) Dictionary { +func (t *listType) NewDictionary(int, int, []byte) Dictionary { panic("cannot create dictionary from parquet LIST type") } @@ -847,12 +825,8 @@ func (t *listType) NewColumnBuffer(int, int) ColumnBuffer { panic("cannot create column buffer from parquet LIST type") } -func (t *listType) NewColumnReader(int, int) ColumnReader { - panic("cannot create column reader from parquet LIST type") -} - -func (t *listType) ReadDictionary(int, int, encoding.Decoder) (Dictionary, error) { - panic("cannot read dictionary from parquet LIST type") +func (t *listType) NewPage(int, int, []byte) Page { + panic("cannot create page from parquet LIST type") } // Map constructs a node of MAP logical type. 
@@ -897,7 +871,7 @@ func (t *mapType) NewColumnIndexer(int) ColumnIndexer { panic("create create column indexer from parquet MAP type") } -func (t *mapType) NewDictionary(int, int) Dictionary { +func (t *mapType) NewDictionary(int, int, []byte) Dictionary { panic("cannot create dictionary from parquet MAP type") } @@ -905,12 +879,8 @@ func (t *mapType) NewColumnBuffer(int, int) ColumnBuffer { panic("cannot create column buffer from parquet MAP type") } -func (t *mapType) NewColumnReader(int, int) ColumnReader { - panic("cannot create column reader from parquet MAP type") -} - -func (t *mapType) ReadDictionary(int, int, encoding.Decoder) (Dictionary, error) { - panic("cannot read dictionary from parquet MAP type") +func (t *mapType) NewPage(int, int, []byte) Page { + panic("cannot create page from parquet MAP type") } type nullType format.NullType @@ -937,7 +907,7 @@ func (t *nullType) NewColumnIndexer(int) ColumnIndexer { panic("create create column indexer from parquet NULL type") } -func (t *nullType) NewDictionary(int, int) Dictionary { +func (t *nullType) NewDictionary(int, int, []byte) Dictionary { panic("cannot create dictionary from parquet NULL type") } @@ -945,12 +915,8 @@ func (t *nullType) NewColumnBuffer(int, int) ColumnBuffer { panic("cannot create column buffer from parquet NULL type") } -func (t *nullType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newNullColumnReader(t, makeColumnIndex(columnIndex)) -} - -func (t *nullType) ReadDictionary(int, int, encoding.Decoder) (Dictionary, error) { - panic("cannot read dictionary from parquet NULL type") +func (t *nullType) NewPage(columnIndex, numValues int, _ []byte) Page { + return newNullPage(makeColumnIndex(columnIndex), makeNumValues(numValues)) } type groupType struct{} @@ -969,7 +935,7 @@ func (groupType) NewColumnIndexer(int) ColumnIndexer { panic("cannot create column indexer from parquet group") } -func (groupType) NewDictionary(int, int) Dictionary { +func (groupType) NewDictionary(int, int, []byte) Dictionary { panic("cannot create dictionary from parquet group") } @@ -977,12 +943,8 @@ func (t groupType) NewColumnBuffer(int, int) ColumnBuffer { panic("cannot create column buffer from parquet group") } -func (t groupType) NewColumnReader(int, int) ColumnReader { - panic("cannot create column reader from parquet group") -} - -func (t groupType) ReadDictionary(int, int, encoding.Decoder) (Dictionary, error) { - panic("cannot read dictionary from parquet group") +func (t groupType) NewPage(int, int, []byte) Page { + panic("cannot create page from parquet group") } func (groupType) Length() int { return 0 } diff --git a/type_default.go b/type_default.go index 0961cd1..00ad289 100644 --- a/type_default.go +++ b/type_default.go @@ -7,7 +7,6 @@ import ( "fmt" "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" ) @@ -49,20 +48,16 @@ func (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newBooleanColumnIndexer() } -func (t booleanType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newBooleanDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t booleanType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t booleanType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newBooleanColumnReader(t, makeColumnIndex(columnIndex), bufferSize) 
+func (t booleanType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t booleanType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readBooleanDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t booleanType) NewPage(columnIndex, numValues int, data []byte) Page { + return newBooleanPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type int32Type struct{ primitiveType } @@ -85,20 +80,16 @@ func (t int32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newInt32ColumnIndexer() } -func (t int32Type) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t int32Type) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t int32Type) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newInt32ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t int32Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t int32Type) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readInt32Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t int32Type) NewPage(columnIndex, numValues int, data []byte) Page { + return newInt32Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type int64Type struct{ primitiveType } @@ -121,20 +112,16 @@ func (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newInt64ColumnIndexer() } -func (t int64Type) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t int64Type) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t int64Type) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newInt64ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t int64Type) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t int64Type) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readInt64Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t int64Type) NewPage(columnIndex, numValues int, data []byte) Page { + return newInt64Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type int96Type struct{ primitiveType } @@ -157,20 +144,16 @@ func (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newInt96ColumnIndexer() } -func (t int96Type) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newInt96Dictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t int96Type) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t int96Type) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newInt96ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t int96Type) NewDictionary(columnIndex, numValues 
int, data []byte) Dictionary { + return newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t int96Type) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readInt96Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t int96Type) NewPage(columnIndex, numValues int, data []byte) Page { + return newInt96Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type floatType struct{ primitiveType } @@ -193,20 +176,16 @@ func (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newFloatColumnIndexer() } -func (t floatType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newFloatDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t floatType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newFloatColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t floatType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newFloatColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t floatType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t floatType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readFloatDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t floatType) NewPage(columnIndex, numValues int, data []byte) Page { + return newFloatPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type doubleType struct{ primitiveType } @@ -227,20 +206,16 @@ func (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newDoubleColumnIndexer() } -func (t doubleType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newDoubleDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t doubleType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t doubleType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newDoubleColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t doubleType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t doubleType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readDoubleDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t doubleType) NewPage(columnIndex, numValues int, data []byte) Page { + return newDoublePage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type byteArrayType struct{ primitiveType } @@ -263,20 +238,16 @@ func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newByteArrayColumnIndexer(sizeLimit) } -func (t byteArrayType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t byteArrayType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t byteArrayType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t byteArrayType) NewDictionary(columnIndex, numValues int, 
data []byte) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t byteArrayType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t byteArrayType) NewPage(columnIndex, numValues int, data []byte) Page { + return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type fixedLenByteArrayType struct { @@ -304,20 +275,16 @@ func (t *fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit) } -func (t *fixedLenByteArrayType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t *fixedLenByteArrayType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *fixedLenByteArrayType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newFixedLenByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t *fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t *fixedLenByteArrayType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *fixedLenByteArrayType) NewPage(columnIndex, numValues int, data []byte) Page { + return newFixedLenByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, t.Length()) } // FixedLenByteArrayType constructs a type for fixed-length values of the given @@ -342,58 +309,50 @@ func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { } } -func (t *intType) NewDictionary(columnIndex, bufferSize int) Dictionary { +func (t *intType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { if t.IsSigned { if t.BitWidth == 64 { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), bufferSize) + return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } else { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), bufferSize) + return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } } else { if t.BitWidth == 64 { - return newUint64Dictionary(t, makeColumnIndex(columnIndex), bufferSize) + return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } else { - return newUint32Dictionary(t, makeColumnIndex(columnIndex), bufferSize) + return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } } } -func (t *intType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { +func (t *intType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { if t.IsSigned { if t.BitWidth == 64 { - return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) + return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) + return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } else { if t.BitWidth == 64 { - return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) + return
newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) + return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } } -func (t *intType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - if t.BitWidth == 64 { - return newInt64ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) - } else { - return newInt32ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) - } -} - -func (t *intType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { +func (t *intType) NewPage(columnIndex, numValues int, data []byte) Page { if t.IsSigned { if t.BitWidth == 64 { - return readInt64Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newInt64Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return readInt32Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newInt32Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } else { if t.BitWidth == 64 { - return readUint64Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newUint64Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return readUint32Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newUint32Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } } @@ -402,20 +361,16 @@ func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newInt32ColumnIndexer() } -func (t *dateType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t *dateType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *dateType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newInt32ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t *dateType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t *dateType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readInt32Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *dateType) NewPage(columnIndex, numValues int, data []byte) Page { + return newInt32Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { @@ -426,14 +381,6 @@ func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { } } -func (t *timeType) NewDictionary(columnIndex, bufferSize int) Dictionary { - if t.Unit.Millis != nil { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), bufferSize) - } else { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), bufferSize) - } -} - func (t *timeType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { if t.Unit.Millis != nil { return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) @@ -442,19 +389,19 @@ func (t *timeType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { } } -func (t *timeType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { +func (t *timeType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { if t.Unit.Millis != nil { - return newInt32ColumnReader(t, 
makeColumnIndex(columnIndex), bufferSize) + return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return newInt64ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) + return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } -func (t *timeType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { +func (t *timeType) NewPage(columnIndex, numValues int, data []byte) Page { if t.Unit.Millis != nil { - return readInt32Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newInt32Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } else { - return readInt64Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) + return newInt64Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } } @@ -462,18 +409,14 @@ func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newInt64ColumnIndexer() } -func (t *timestampType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t *timestampType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *timestampType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newInt64ColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t *timestampType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t *timestampType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readInt64Dictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *timestampType) NewPage(columnIndex, numValues int, data []byte) Page { + return newInt64Page(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } diff --git a/type_go18.go b/type_go18.go index b017517..00021c1 100644 --- a/type_go18.go +++ b/type_go18.go @@ -7,7 +7,6 @@ import ( "fmt" "github.com/segmentio/parquet-go/deprecated" - "github.com/segmentio/parquet-go/encoding" "github.com/segmentio/parquet-go/format" ) @@ -45,20 +44,16 @@ func (t primitiveType[T]) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newColumnIndexer(t.class) } -func (t primitiveType[T]) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, t.class) -} - func (t primitiveType[T]) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize, t.class) } -func (t primitiveType[T]) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, t.class) +func (t primitiveType[T]) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, t.class) } -func (t primitiveType[T]) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, t.class) +func (t primitiveType[T]) NewPage(columnIndex, numValues int, data []byte) Page { + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, t.class) } type byteArrayType struct{} 
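On the Go 1.18 build, the same migration applies to the generic primitiveType[T], whose constructors thread a class descriptor (t.class, and &int32Class and friends further down) instead of duplicating per-type code paths. The INT hunks that follow select the class from the type's bit width and signedness; a condensed, self-contained sketch of that dispatch rule (the function name is invented for illustration):

// pickIntClass mirrors the branching used by intType below: 64-bit integers
// map to the 64-bit classes, any other width falls back to the 32-bit ones.
func pickIntClass(bitWidth int, isSigned bool) string {
	switch {
	case isSigned && bitWidth == 64:
		return "int64Class"
	case isSigned:
		return "int32Class"
	case bitWidth == 64:
		return "uint64Class"
	default:
		return "uint32Class"
	}
}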
@@ -87,20 +82,16 @@ func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newByteArrayColumnIndexer(sizeLimit) } -func (t byteArrayType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t byteArrayType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t byteArrayType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t byteArrayType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t byteArrayType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t byteArrayType) NewPage(columnIndex, numValues int, data []byte) Page { + return newByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data) } type fixedLenByteArrayType struct{ length int } @@ -129,20 +120,16 @@ func (t *fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit) } -func (t *fixedLenByteArrayType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), bufferSize) -} - func (t *fixedLenByteArrayType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize) } -func (t *fixedLenByteArrayType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newFixedLenByteArrayColumnReader(t, makeColumnIndex(columnIndex), bufferSize) +func (t *fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } -func (t *fixedLenByteArrayType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), numValues, decoder) +func (t *fixedLenByteArrayType) NewPage(columnIndex, numValues int, data []byte) Page { + return newFixedLenByteArrayPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, t.Length()) } // FixedLenByteArrayType constructs a type for fixed-length values of the given @@ -165,22 +152,6 @@ func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { } } -func (t *intType) NewDictionary(columnIndex, bufferSize int) Dictionary { - if t.IsSigned { - if t.BitWidth == 64 { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) - } else { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) - } - } else { - if t.BitWidth == 64 { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &uint64Class) - } else { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &uint32Class) - } - } -} - func (t *intType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { if t.IsSigned { if t.BitWidth == 64 { @@ -197,34 +168,34 @@ func (t *intType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { } } -func (t *intType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { +func
(t *intType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { if t.IsSigned { if t.BitWidth == 64 { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } else { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } } else { if t.BitWidth == 64 { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &uint64Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &uint64Class) } else { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &uint32Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &uint32Class) } } } -func (t *intType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { +func (t *intType) NewPage(columnIndex, numValues int, data []byte) Page { if t.IsSigned { if t.BitWidth == 64 { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int64Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } else { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int32Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } } else { if t.BitWidth == 64 { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &uint64Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &uint64Class) } else { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &uint32Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &uint32Class) } } } @@ -233,20 +204,16 @@ func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newColumnIndexer(&int32Class) } -func (t *dateType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) -} - func (t *dateType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) } -func (t *dateType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) +func (t *dateType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } -func (t *dateType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int32Class) +func (t *dateType) NewPage(columnIndex, numValues int, data []byte) Page { + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { @@ -257,14 +224,6 @@ func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { } } -func (t *timeType) NewDictionary(columnIndex, bufferSize int) Dictionary { - if t.Unit.Millis != nil { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) - } else { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) - } -} - func 
(t *timeType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { if t.Unit.Millis != nil { return newColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) @@ -273,19 +232,19 @@ func (t *timeType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { } } -func (t *timeType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { +func (t *timeType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { if t.Unit.Millis != nil { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int32Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } else { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } } -func (t *timeType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { +func (t *timeType) NewPage(columnIndex, numValues int, data []byte) Page { if t.Unit.Millis != nil { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int32Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int32Class) } else { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int64Class) + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } } @@ -293,18 +252,14 @@ func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { return newColumnIndexer(&int64Class) } -func (t *timestampType) NewDictionary(columnIndex, bufferSize int) Dictionary { - return newDictionary(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) -} - func (t *timestampType) NewColumnBuffer(columnIndex, bufferSize int) ColumnBuffer { return newColumnBuffer(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) } -func (t *timestampType) NewColumnReader(columnIndex, bufferSize int) ColumnReader { - return newColumnReader(t, makeColumnIndex(columnIndex), bufferSize, &int64Class) +func (t *timestampType) NewDictionary(columnIndex, numValues int, data []byte) Dictionary { + return newDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } -func (t *timestampType) ReadDictionary(columnIndex, numValues int, decoder encoding.Decoder) (Dictionary, error) { - return readDictionary(t, makeColumnIndex(columnIndex), numValues, decoder, &int64Class) +func (t *timestampType) NewPage(columnIndex, numValues int, data []byte) Page { + return newPage(makeColumnIndex(columnIndex), makeNumValues(numValues), data, &int64Class) } diff --git a/writer.go b/writer.go index 60673be..452991d 100644 --- a/writer.go +++ b/writer.go @@ -215,13 +215,6 @@ type writer struct { sortingColumns []format.SortingColumn } -type writerBuffers struct { - compressed []byte - header bytes.Buffer - page bytes.Buffer - reader bytes.Reader -} - func newWriter(output io.Writer, config *WriterConfig) *writer { w := new(writer) if config.WriteBufferSize <= 0 { @@ -302,7 +295,7 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { } if isDictionaryEncoding(encoding) { - dictionary = columnType.NewDictionary(columnIndex, defaultDictBufferSize) + dictionary = columnType.NewDictionary(columnIndex, 0, make([]byte, 0, defaultDictBufferSize)) columnType = dictionary.Type() } @@ -327,7 +320,7 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { // compressed, the data pages are encoded with the hybrid // 
RLE/Bit-Pack encoding which doesn't benefit from an extra // compression layer. - isCompressed: compression.CompressionCodec() != format.Uncompressed && (dataPageType != format.DataPageV2 || dictionary == nil), + isCompressed: isCompressed(compression) && (dataPageType != format.DataPageV2 || dictionary == nil), } c.header.encoder.Reset(c.header.protocol.NewWriter(&buffers.header)) @@ -342,7 +335,6 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { } if leaf.maxDefinitionLevel > 0 { - c.levels.encoder = RLE.NewEncoder(nil) c.encodings = addEncoding(c.encodings, format.RLE) } @@ -350,9 +342,8 @@ c.encodings = addEncoding(c.encodings, format.Plain) } - c.page.encoder = encoding.NewEncoder(nil) - c.page.encoding = encoding.Encoding() - c.encodings = addEncoding(c.encodings, c.page.encoding) + c.page.encoding = encoding + c.encodings = addEncoding(c.encodings, c.page.encoding.Encoding()) sortPageEncodings(c.encodings) w.columns = append(w.columns, c) @@ -454,7 +445,7 @@ func (w *writer) writeFileHeader() error { func (w *writer) configureBloomFilters(columnChunks []ColumnChunk) { for i, c := range w.columns { if c.columnFilter != nil { - c.page.filter = c.newBloomFilterEncoder(columnChunks[i].NumValues()) + c.resizeBloomFilter(columnChunks[i].NumValues()) } } } @@ -556,7 +547,7 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] fileOffset := w.writer.offset for _, c := range w.columns { - if c.page.filter != nil { + if len(c.filter.bits) > 0 { c.columnChunk.MetaData.BloomFilterOffset = w.writer.offset if err := c.writeBloomFilter(&w.writer); err != nil { return 0, err @@ -664,11 +655,102 @@ func (w *writer) WritePage(page Page) (int64, error) { return w.columns[page.Column()].WritePage(page) } +// One writerBuffers is used by each writer instance; the memory buffers here +// are shared by all columns of the writer because serialization is not done +// concurrently, which helps keep memory utilization low, both in the total +// footprint and GC cost. +// +// The type also exposes helper methods to facilitate the generation of parquet +// pages. A scratch space is used when serialization requires combining multiple +// buffers or compressing the page data, with a double-buffering technique +// employed by swapping the scratch and page buffers to minimize memory copies.
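+// For example, compress (below) encodes the page buffer into the scratch +// buffer and then swaps the two, so the page buffer always refers to the +// latest version of the page data while the previous buffer is recycled as +// scratch space for the next operation.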
+type writerBuffers struct { + header bytes.Buffer // buffer where page headers are encoded + repetitions []byte // buffer used to encode repetition levels + definitions []byte // buffer used to encode definition levels + page []byte // page buffer holding the page data + scratch []byte // scratch space used for compression +} + +func (wb *writerBuffers) crc32() (checksum uint32) { + checksum = crc32.Update(checksum, crc32.IEEETable, wb.repetitions) + checksum = crc32.Update(checksum, crc32.IEEETable, wb.definitions) + checksum = crc32.Update(checksum, crc32.IEEETable, wb.page) + return checksum +} + +func (wb *writerBuffers) size() int { + return len(wb.repetitions) + len(wb.definitions) + len(wb.page) +} + +func (wb *writerBuffers) reset() { + wb.repetitions = wb.repetitions[:0] + wb.definitions = wb.definitions[:0] + wb.page = wb.page[:0] +} + +func (wb *writerBuffers) encodeRepetitionLevels(page BufferedPage, maxRepetitionLevel int8) (err error) { + bitWidth := bits.Len8(maxRepetitionLevel) + encoding := &levelEncodings[bitWidth-1] + wb.repetitions, err = encoding.EncodeInt8(wb.repetitions[:0], page.RepetitionLevels()) + return err +} + +func (wb *writerBuffers) encodeDefinitionLevels(page BufferedPage, maxDefinitionLevel int8) (err error) { + bitWidth := bits.Len8(maxDefinitionLevel) + encoding := &levelEncodings[bitWidth-1] + wb.definitions, err = encoding.EncodeInt8(wb.definitions[:0], page.DefinitionLevels()) + return err +} + +func (wb *writerBuffers) prependLevelsToDataPageV1(maxRepetitionLevel, maxDefinitionLevel int8) { + hasRepetitionLevels := maxRepetitionLevel > 0 + hasDefinitionLevels := maxDefinitionLevel > 0 + + if hasRepetitionLevels || hasDefinitionLevels { + wb.scratch = wb.scratch[:0] + // In data pages v1, the repetition and definition levels are prefixed + // with the 4 bytes length of the sections. While the parquet-format + // documentation indicates that the length prefix is part of the hybrid + // RLE/Bit-Pack encoding, this is the only condition where it is used + // so we treat it as a special case rather than implementing it in the + // encoding. + // + // Reference https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 + if hasRepetitionLevels { + wb.scratch = plain.AppendInt32(wb.scratch, int32(len(wb.repetitions))) + wb.scratch = append(wb.scratch, wb.repetitions...) + wb.repetitions = wb.repetitions[:0] + } + if hasDefinitionLevels { + wb.scratch = plain.AppendInt32(wb.scratch, int32(len(wb.definitions))) + wb.scratch = append(wb.scratch, wb.definitions...) + wb.definitions = wb.definitions[:0] + } + wb.scratch = append(wb.scratch, wb.page...) 
+ wb.swapPageAndScratchBuffers() + } +} + +func (wb *writerBuffers) encode(page BufferedPage, enc encoding.Encoding) (err error) { + wb.page, err = page.Encode(wb.page[:0], enc) + return err +} + +func (wb *writerBuffers) compress(codec compress.Codec) (err error) { + wb.scratch, err = codec.Encode(wb.scratch[:0], wb.page) + wb.swapPageAndScratchBuffers() + return err +} + +func (wb *writerBuffers) swapPageAndScratchBuffers() { + wb.page, wb.scratch = wb.scratch, wb.page[:0] +} + type writerColumn struct { insert func(*writerColumn, []Value) error commit func(*writerColumn) error values []Value - filter []BufferedPage pool PageBufferPool pages []io.ReadWriter @@ -687,23 +769,18 @@ type writerColumn struct { buffers *writerBuffers - levels struct { - encoder encoding.Encoder - } - header struct { protocol thrift.CompactProtocol encoder thrift.Encoder } page struct { - filter *bloomFilterEncoder - encoding format.Encoding - encoder encoding.Encoder + encoding encoding.Encoding } - dict struct { - encoder plain.Encoder + filter struct { + bits []byte + pages []BufferedPage } numRows int64 @@ -732,14 +809,17 @@ func (c *writerColumn) reset() { for _, page := range c.pages { c.pool.PutPageBuffer(page) } - for i := range c.filter { - c.filter[i] = nil - } for i := range c.pages { c.pages[i] = nil } - c.filter = c.filter[:0] + for i := range c.filter.pages { + c.filter.pages[i] = nil + } c.pages = c.pages[:0] + // Bloom filters may change in size between row groups, but we retain the + // buffer to avoid reallocating large memory blocks. + c.filter.bits = c.filter.bits[:0] + c.filter.pages = c.filter.pages[:0] c.numRows = 0 c.numValues = 0 // Reset the fields of column chunks that change between row groups, @@ -756,11 +836,6 @@ func (c *writerColumn) reset() { // the number of pages should be roughly the same between row groups written // by the writer. c.offsetIndex.PageLocations = make([]format.PageLocation, 0, cap(c.offsetIndex.PageLocations)) - // Bloom filters may change in size between row groups; we may want to - // optimize this by retaining the filter and reusing it if needed, but - // for now we take the simpler approach of freeing it and having the - // write path lazily reallocate it if the writer is reused. - c.page.filter = nil } func (c *writerColumn) totalRowCount() int64 { @@ -784,30 +859,46 @@ func (c *writerColumn) flush() (err error) { return err } -func (c *writerColumn) flushFilterPages() error { +func (c *writerColumn) flushFilterPages() (err error) { if c.columnFilter != nil { - numValues := int64(0) - for _, page := range c.filter { - numValues += page.NumValues() - } - if c.page.filter == nil { - c.page.filter = c.newBloomFilterEncoder(numValues) - } - - // If there is a dictionary, we need to only write the dictionary. + // If there is a dictionary, it contains all the values that we need to + // write to the filter. 
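+ // (Data pages of a dictionary-encoded column only store indexes into the + // dictionary, so hashing the dictionary values covers every value present + // in the column chunk.)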
if dict := c.dictionary; dict != nil { - return dict.Page().WriteTo(c.page.filter) + if c.filter.bits == nil { + c.resizeBloomFilter(int64(dict.Len())) + } + return c.writePageToFilter(dict.Page()) } - for _, page := range c.filter { - if err := page.WriteTo(c.page.filter); err != nil { - return err + if len(c.filter.pages) > 0 { + numValues := int64(0) + for _, page := range c.filter.pages { + numValues += page.NumValues() + } + c.resizeBloomFilter(numValues) + for _, page := range c.filter.pages { + if err := c.writePageToFilter(page); err != nil { + return err + } + } } } return nil } +func (c *writerColumn) resizeBloomFilter(numValues int64) { + const bitsPerValue = 10 // TODO: make this configurable + filterSize := c.columnFilter.Size(numValues, bitsPerValue) + if cap(c.filter.bits) < filterSize { + c.filter.bits = make([]byte, filterSize) + } else { + c.filter.bits = c.filter.bits[:filterSize] + for i := range c.filter.bits { + c.filter.bits[i] = 0 + } + } +} + func (c *writerColumn) insertRepeated(row []Value) error { c.values = append(c.values, row...) return nil @@ -832,14 +923,6 @@ func (c *writerColumn) newColumnBuffer() ColumnBuffer { return column } -func (c *writerColumn) newBloomFilterEncoder(numRows int64) *bloomFilterEncoder { - const bitsPerValue = 10 // TODO: make this configurable - return newBloomFilterEncoder( - c.columnFilter.NewFilter(numRows, bitsPerValue), - c.columnFilter.Hash(), - ) -} - func (c *writerColumn) writeRow(row []Value) error { if c.columnBuffer == nil { // Lazily create the row group column so we don't need to allocate it if @@ -936,12 +1019,11 @@ func (c *writerColumn) writePageValues(page ValueReader) (numValues int64, err e func (c *writerColumn) writeBloomFilter(w io.Writer) error { e := thrift.NewEncoder(c.header.protocol.NewWriter(w)) h := bloomFilterHeader(c.columnFilter) - b := c.page.filter.Bytes() - h.NumBytes = int32(len(b)) + h.NumBytes = int32(len(c.filter.bits)) if err := e.Encode(&h); err != nil { return err } - _, err := w.Write(b) + _, err := w.Write(c.filter.bits) return err } @@ -951,61 +1033,46 @@ func (c *writerColumn) writeBufferedPage(page BufferedPage) (int64, error) { return 0, nil } - buffer := &c.buffers.page - buffer.Reset() - repetitionLevelsByteLength := 0 - definitionLevelsByteLength := 0 + buf := c.buffers + buf.reset() - switch c.dataPageType { - case format.DataPageV2: - if c.maxRepetitionLevel > 0 { - c.levels.encoder.Reset(buffer) - c.levels.encoder.SetBitWidth(bits.Len8(c.maxRepetitionLevel)) - c.levels.encoder.EncodeInt8(page.RepetitionLevels()) - repetitionLevelsByteLength = buffer.Len() - } - if c.maxDefinitionLevel > 0 { - c.levels.encoder.Reset(buffer) - c.levels.encoder.SetBitWidth(bits.Len8(c.maxDefinitionLevel)) - c.levels.encoder.EncodeInt8(page.DefinitionLevels()) - definitionLevelsByteLength = buffer.Len() - repetitionLevelsByteLength - } + if c.maxRepetitionLevel > 0 { + buf.encodeRepetitionLevels(page, c.maxRepetitionLevel) + } + if c.maxDefinitionLevel > 0 { + buf.encodeDefinitionLevels(page, c.maxDefinitionLevel) + } + if err := buf.encode(page, c.page.encoding); err != nil { + return 0, fmt.Errorf("encoding parquet data page: %w", err) + } + if c.dataPageType == format.DataPage { + buf.prependLevelsToDataPageV1(c.maxRepetitionLevel, c.maxDefinitionLevel) + } - case format.DataPage: - // In data pages v1, the repetition and definition levels are prefixed - // with the 4 bytes length of the sections.
While the parquet-format - // documentation indicates that the length prefix is part of the hybrid - // RLE/Bit-Pack encoding, this is the only condition where it is used - // so we treat it as a special case rather than implementing it in the - // encoding. - // - // Reference https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 - lengthPlaceholder := make([]byte, 4) - if c.maxRepetitionLevel > 0 { - buffer.Write(lengthPlaceholder) - offset := buffer.Len() - c.levels.encoder.Reset(buffer) - c.levels.encoder.SetBitWidth(bits.Len8(c.maxRepetitionLevel)) - c.levels.encoder.EncodeInt8(page.RepetitionLevels()) - binary.LittleEndian.PutUint32(buffer.Bytes()[offset-4:], uint32(buffer.Len()-offset)) - } - if c.maxDefinitionLevel > 0 { - buffer.Write(lengthPlaceholder) - offset := buffer.Len() - c.levels.encoder.Reset(buffer) - c.levels.encoder.SetBitWidth(bits.Len8(c.maxDefinitionLevel)) - c.levels.encoder.EncodeInt8(page.DefinitionLevels()) - binary.LittleEndian.PutUint32(buffer.Bytes()[offset-4:], uint32(buffer.Len()-offset)) + uncompressedPageSize := buf.size() + if c.isCompressed { + if err := buf.compress(c.compression); err != nil { + return 0, fmt.Errorf("compressing parquet data page: %w", err) } } switch { - case c.page.filter != nil: - if err := page.WriteTo(c.page.filter); err != nil { + case len(c.filter.bits) > 0: + // When the writer knows the number of values in advance (e.g. when + // writing a full row group), the filter encoding is set and the page + // can be directly applied to the filter, which minimizes memory usage + // since there is no need to buffer the values in order to determine + // the size of the filter. + if err := c.writePageToFilter(page); err != nil { return 0, err } - case c.columnFilter != nil: - c.filter = append(c.filter, page.Clone()) + case c.columnFilter != nil && c.dictionary == nil: + // If the column uses a dictionary encoding, all possible values exist + // in the dictionary and there is no need to buffer the pages, but if + // the column is supposed to generate a filter and the number of values + // wasn't known, we must buffer all the pages in order to properly size + // the filter. + c.filter.pages = append(c.filter.pages, page.Clone()) } statistics := format.Statistics{} @@ -1013,34 +1080,11 @@ func (c *writerColumn) writeBufferedPage(page BufferedPage) (int64, error) { statistics = c.makePageStatistics(page) } - c.page.encoder.Reset(buffer) - if err := page.WriteTo(c.page.encoder); err != nil { - return 0, err - } - - uncompressedPageSize := buffer.Len() - pageData := buffer.Bytes() - if c.isCompressed { - offset := repetitionLevelsByteLength + definitionLevelsByteLength - b, err := c.compress(pageData[offset:]) - if err != nil { - return 0, fmt.Errorf("compressing parquet data page: %w", err) - } - if offset == 0 { - pageData = b - } else { - // TODO: can this copy be optimized away? 
- buffer.Truncate(offset) - buffer.Write(b) - pageData = buffer.Bytes() - } - } - pageHeader := &format.PageHeader{ Type: c.dataPageType, UncompressedPageSize: int32(uncompressedPageSize), - CompressedPageSize: int32(len(pageData)), - CRC: int32(crc32.ChecksumIEEE(pageData)), + CompressedPageSize: int32(buf.size()), + CRC: int32(buf.crc32()), } numRows := page.NumRows() @@ -1049,7 +1093,7 @@ func (c *writerColumn) writeBufferedPage(page BufferedPage) (int64, error) { case format.DataPage: pageHeader.DataPageHeader = &format.DataPageHeader{ NumValues: int32(numValues), - Encoding: c.page.encoding, + Encoding: c.page.encoding.Encoding(), DefinitionLevelEncoding: format.RLE, RepetitionLevelEncoding: format.RLE, Statistics: statistics, @@ -1059,36 +1103,50 @@ func (c *writerColumn) writeBufferedPage(page BufferedPage) (int64, error) { NumValues: int32(numValues), NumNulls: int32(numNulls), NumRows: int32(numRows), - Encoding: c.page.encoding, - DefinitionLevelsByteLength: int32(definitionLevelsByteLength), - RepetitionLevelsByteLength: int32(repetitionLevelsByteLength), + Encoding: c.page.encoding.Encoding(), + DefinitionLevelsByteLength: int32(len(buf.definitions)), + RepetitionLevelsByteLength: int32(len(buf.repetitions)), IsCompressed: &c.isCompressed, Statistics: statistics, } } - header := &c.buffers.header - header.Reset() + buf.header.Reset() if err := c.header.encoder.Encode(pageHeader); err != nil { return 0, err } - headerSize := int32(header.Len()) - compressedSize := int64(headerSize) + int64(len(pageData)) - reader := &c.buffers.reader - reader.Reset(pageData) - - if err := c.writePage(compressedSize, header, reader); err != nil { + size := int64(buf.header.Len()) + + int64(len(buf.repetitions)) + + int64(len(buf.definitions)) + + int64(len(buf.page)) + + err := c.writePage(size, func(output io.Writer) (written int64, err error) { + for _, data := range [...][]byte{ + buf.header.Bytes(), + buf.repetitions, + buf.definitions, + buf.page, + } { + wn, err := output.Write(data) + written += int64(wn) + if err != nil { + return written, err + } + } + return written, nil + }) + if err != nil { return 0, err } - c.recordPageStats(headerSize, pageHeader, page) + c.recordPageStats(int32(buf.header.Len()), pageHeader, page) return numValues, nil } func (c *writerColumn) writeCompressedPage(page CompressedPage) (int64, error) { switch { - case c.page.filter != nil: + case len(c.filter.bits) > 0: // TODO: modify the Buffer method to accept some kind of buffer pool as // argument so we can use a pre-allocated page buffer to load the page // and reduce the memory footprint. @@ -1098,14 +1156,14 @@ func (c *writerColumn) writeCompressedPage(page CompressedPage) (int64, error) { // most of the compute cost (compression algorithms are usually designed // to make decompressing much cheaper than compressing since it happens // more often). - if err := bufferedPage.WriteTo(c.page.filter); err != nil { + if err := c.writePageToFilter(bufferedPage); err != nil { return 0, err } - case c.columnFilter != nil: + case c.columnFilter != nil && c.dictionary == nil: // When a column filter is configured but no page filter was allocated, // we need to buffer the page in order to have access to the number of // values and properly size the bloom filter when writing the row group. 
- c.filter = append(c.filter, page.Buffer()) + c.filter.pages = append(c.filter.pages, page.Buffer()) } pageHeader := &format.PageHeader{ @@ -1131,58 +1189,41 @@ func (c *writerColumn) writeCompressedPage(page CompressedPage) (int64, error) { headerSize := int32(header.Len()) compressedSize := int64(headerSize + pageHeader.CompressedPageSize) - if err := c.writePage(compressedSize, header, page.PageData()); err != nil { + err := c.writePage(compressedSize, func(output io.Writer) (int64, error) { + headerSize, err := header.WriteTo(output) + if err != nil { + return headerSize, err + } + dataSize, err := io.Copy(output, page.PageData()) + return headerSize + dataSize, err + }) + if err != nil { return 0, err } - c.recordPageStats(headerSize, pageHeader, page) return page.NumValues(), nil } -func (c *writerColumn) writePage(size int64, header, data io.Reader) error { - buffer := c.pool.GetPageBuffer() - defer func() { - if buffer != nil { - c.pool.PutPageBuffer(buffer) - } - }() - headerSize, err := io.Copy(buffer, header) - if err != nil { - return err - } - dataSize, err := io.Copy(buffer, data) - if err != nil { - return err - } - written := headerSize + dataSize - if size != written { - return fmt.Errorf("writing parquet colum page expected %dB but got %dB: %w", size, written, io.ErrShortWrite) - } - c.pages = append(c.pages, buffer) - buffer = nil - return nil -} - -func (c *writerColumn) writeDictionaryPage(output io.Writer, dict Dictionary) error { - buffer := &c.buffers.page - buffer.Reset() - c.dict.encoder.Reset(buffer) +func (c *writerColumn) writeDictionaryPage(output io.Writer, dict Dictionary) (err error) { + buf := c.buffers + buf.reset() - if err := dict.Page().WriteTo(&c.dict.encoder); err != nil { + if err := buf.encode(dict.Page(), &Plain); err != nil { return fmt.Errorf("writing parquet dictionary page: %w", err) } - uncompressedPageSize := buffer.Len() - pageData, err := c.compress(buffer.Bytes()) - if err != nil { - return fmt.Errorf("compressing parquet dictionary page: %w", err) + uncompressedPageSize := buf.size() + if isCompressed(c.compression) { + if err := buf.compress(c.compression); err != nil { + return fmt.Errorf("compressing parquet dictionary page: %w", err) + } } pageHeader := &format.PageHeader{ Type: format.DictionaryPage, UncompressedPageSize: int32(uncompressedPageSize), - CompressedPageSize: int32(len(pageData)), - CRC: int32(crc32.ChecksumIEEE(pageData)), + CompressedPageSize: int32(buf.size()), + CRC: int32(buf.crc32()), DictionaryPageHeader: &format.DictionaryPageHeader{ NumValues: int32(dict.Len()), Encoding: format.Plain, @@ -1198,23 +1239,34 @@ func (c *writerColumn) writeDictionaryPage(output io.Writer, dict Dictionary) er if _, err := output.Write(header.Bytes()); err != nil { return err } - if _, err := output.Write(pageData); err != nil { + if _, err := output.Write(buf.page); err != nil { return err } c.recordPageStats(int32(header.Len()), pageHeader, nil) return nil } -func (c *writerColumn) compress(pageData []byte) ([]byte, error) { - if c.compression.CompressionCodec() != format.Uncompressed { - b, err := c.compression.Encode(c.buffers.compressed[:0], pageData) - c.buffers.compressed = b - if err != nil { - return nil, err +func (c *writerColumn) writePageToFilter(page BufferedPage) (err error) { + c.filter.bits, err = page.Encode(c.filter.bits, c.columnFilter.Encoding()) + return err +} + +func (c *writerColumn) writePage(size int64, writeTo func(io.Writer) (int64, error)) error { + buffer := c.pool.GetPageBuffer() + defer func() {
if buffer != nil { + c.pool.PutPageBuffer(buffer) } - pageData = b + }() + written, err := writeTo(buffer) + if err != nil { + return err } - return pageData, nil + if written != size { + return fmt.Errorf("writing parquet column page expected %dB but got %dB: %w", size, written, io.ErrShortWrite) + } + c.pages, buffer = append(c.pages, buffer), nil + return nil } func (c *writerColumn) makePageStatistics(page Page) format.Statistics { diff --git a/writer_test.go b/writer_test.go index eb3a692..2e7313f 100644 --- a/writer_test.go +++ b/writer_test.go @@ -95,18 +95,17 @@ var writerTests = []struct { }, dump: `row group 0 -------------------------------------------------------------------------------- -first_name: BINARY ZSTD DO:4 FPO:55 SZ:90/72/0.80 VC:3 ENC:PLAIN,RLE_DICTIONARY [more]... -last_name: BINARY ZSTD DO:0 FPO:94 SZ:148/121/0.82 VC:3 ENC:DELTA_BYTE_ARRAY [more]... +first_name: BINARY ZSTD DO:4 FPO:55 SZ:90/72/0.80 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column] +last_name: BINARY ZSTD DO:0 FPO:94 SZ:115/97/0.84 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column] first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY [more]... SZ:7 + page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:7 VC:3 last_name TV=3 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... SZ:14 - page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... SZ:19 - page 2: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... SZ:19 + page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:32 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1 BINARY first_name -------------------------------------------------------------------------------- @@ -134,18 +133,17 @@ value 3: R:0 D:0 V:Skywalker }, dump: `row group 0 -------------------------------------------------------------------------------- -first_name: BINARY ZSTD DO:4 FPO:55 SZ:86/77/0.90 VC:3 ENC:RLE_DICTIONARY,PLAIN [more]... -last_name: BINARY ZSTD DO:0 FPO:90 SZ:163/136/0.83 VC:3 ENC:DELTA_BYTE_ARRAY [more]... +first_name: BINARY ZSTD DO:4 FPO:55 SZ:86/77/0.90 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[no stats for this column] +last_name: BINARY ZSTD DO:0 FPO:90 SZ:125/107/0.86 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column] first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY [more]... VC:3 + page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:7 VC:3 last_name TV=3 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... VC:1 - page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... VC:1 - page 2: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY [more]... 
VC:1 + page 0: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:32 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1 BINARY first_name -------------------------------------------------------------------------------- @@ -181,30 +179,30 @@ value 3: R:0 D:0 V:Skywalker }, dump: `row group 0 -------------------------------------------------------------------------------- -name: BINARY GZIP DO:4 FPO:70 SZ:126/101/0.80 VC:10 ENC:PLAIN,RL [more]... -timestamp: INT64 GZIP DO:0 FPO:130 SZ:403/278/0.69 VC:10 ENC:DELTA_BI [more]... -value: DOUBLE GZIP DO:0 FPO:533 SZ:344/219/0.64 VC:10 ENC:PLAIN S [more]... +name: BINARY GZIP DO:4 FPO:70 SZ:126/101/0.80 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column] +timestamp: INT64 GZIP DO:0 FPO:130 SZ:334/209/0.63 VC:10 ENC:DELTA_BINARY_PACKED ST:[no stats for this column] +value: DOUBLE GZIP DO:0 FPO:464 SZ:344/219/0.64 VC:10 ENC:PLAIN ST:[no stats for this column] name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[n [more]... VC:5 - page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[n [more]... VC:5 + page 0: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5 + page 1: DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5 timestamp TV=10 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED [more]... VC:2 - page 1: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED [more]... VC:2 - page 2: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED [more]... VC:2 - page 3: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED [more]... VC:2 - page 4: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED [more]... VC:2 + page 0: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:14 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:14 VC:2 + page 2: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:14 VC:2 + page 3: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:14 VC:2 + page 4: DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:14 VC:2 value TV=10 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats f [more]... VC:2 - page 1: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats f [more]... VC:2 - page 2: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats f [more]... VC:2 - page 3: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats f [more]... VC:2 - page 4: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats f [more]... VC:2 + page 0: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:16 VC:2 + page 1: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:16 VC:2 + page 2: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:16 VC:2 + page 3: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:16 VC:2 + page 4: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:16 VC:2 BINARY name -------------------------------------------------------------------------------- @@ -278,31 +276,30 @@ value 10: R:0 D:0 V:10.0 dump: `row group 0 -------------------------------------------------------------------------------- -owner: BINARY ZSTD DO:0 FPO:4 SZ:98/80/0.82 VC:2 ENC:DELT [more]... -ownerPhoneNumbers: BINARY GZIP DO:0 FPO:102 SZ:166/116/0.70 VC:3 ENC: [more]... 
+owner: BINARY ZSTD DO:0 FPO:4 SZ:66/57/0.86 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] +ownerPhoneNumbers: BINARY GZIP DO:0 FPO:70 SZ:162/112/0.69 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] contacts: -.name: BINARY UNCOMPRESSED DO:0 FPO:268 SZ:120/120/1.00 VC:3 [more]... -.phoneNumber: BINARY ZSTD DO:0 FPO:388 SZ:114/96/0.84 VC:3 ENC:D [more]... +.name: BINARY UNCOMPRESSED DO:0 FPO:232 SZ:116/116/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] +.phoneNumber: BINARY ZSTD DO:0 FPO:348 SZ:113/95/0.84 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] owner TV=2 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:18 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:16 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:34 VC:2 ownerPhoneNumbers TV=3 RL=1 DL=1 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:52 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:17 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:48 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 contacts.name TV=3 RL=1 DL=1 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:57 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:17 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:53 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1 contacts.phoneNumber TV=3 RL=1 DL=2 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:33 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... SZ:17 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1 BINARY owner -------------------------------------------------------------------------------- @@ -361,31 +358,30 @@ value 3: R:0 D:0 V: dump: `row group 0 -------------------------------------------------------------------------------- -owner: BINARY ZSTD DO:0 FPO:4 SZ:108/90/0.83 VC:2 ENC:DEL [more]... -ownerPhoneNumbers: BINARY GZIP DO:0 FPO:112 SZ:159/109/0.69 VC:3 ENC: [more]... +owner: BINARY ZSTD DO:0 FPO:4 SZ:71/62/0.87 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] +ownerPhoneNumbers: BINARY GZIP DO:0 FPO:75 SZ:156/106/0.68 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] contacts: -.name: BINARY UNCOMPRESSED DO:0 FPO:271 SZ:114/114/1.00 VC:3 [more]... -.phoneNumber: BINARY ZSTD DO:0 FPO:385 SZ:108/90/0.83 VC:3 ENC:D [more]... 
+.name: BINARY UNCOMPRESSED DO:0 FPO:231 SZ:110/110/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] +.phoneNumber: BINARY ZSTD DO:0 FPO:341 SZ:108/90/0.83 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column] owner TV=2 RL=0 DL=0 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:1 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:1 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:34 VC:2 ownerPhoneNumbers TV=3 RL=1 DL=1 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:2 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:1 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:40 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 contacts.name TV=3 RL=1 DL=1 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:2 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:1 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:45 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 contacts.phoneNumber TV=3 RL=1 DL=2 ---------------------------------------------------------------------------- - page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:2 - page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats [more]... VC:1 + page 0: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2 + page 1: DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1 BINARY owner -------------------------------------------------------------------------------- @@ -455,7 +451,7 @@ func hasParquetTools() bool { } func parquetTools(cmd, path string) ([]byte, error) { - p := exec.Command("parquet-tools", cmd, "--debug", path) + p := exec.Command("parquet-tools", cmd, "--debug", "--disable-crop", path) output, err := p.CombinedOutput() if err != nil {