forked from fraugster/parquet-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompress.go
157 lines (125 loc) · 4.28 KB
/
compress.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
package goparquet
import (
"bytes"
"compress/gzip"
"errors"
"fmt"
"io"
"io/ioutil"
"sync"
"github.com/fraugster/parquet-go/parquet"
"github.com/golang/snappy"
)
// compressors is the package-wide registry mapping a parquet compression
// codec to the BlockCompressor that implements it. Access it only while
// holding compressorLock.
var compressors = map[parquet.CompressionCodec]BlockCompressor{}

// compressorLock guards compressors: lookups take the read lock and
// registration takes the write lock.
var compressorLock sync.RWMutex
// BlockCompressor describes a block compressor that is used to compress and
// decompress the content of parquet files.
type BlockCompressor interface {
	// CompressBlock returns the compressed form of the given block.
	CompressBlock([]byte) ([]byte, error)
	// DecompressBlock returns the decompressed form of the given block.
	DecompressBlock([]byte) ([]byte, error)
}

// plainCompressor is the stateless receiver for the pass-through
// (no compression) methods.
type plainCompressor struct{}

// snappyCompressor is the stateless receiver for the snappy-based methods.
type snappyCompressor struct{}

// gzipCompressor is the stateless receiver for the gzip-based methods.
type gzipCompressor struct{}
// CompressBlock implements the pass-through case: it returns the very slice
// it was given (no copy is made) and never reports an error.
func (plainCompressor) CompressBlock(block []byte) ([]byte, error) {
	return block, nil
}
// DecompressBlock implements the pass-through case: it returns the very slice
// it was given (no copy is made) and never reports an error.
func (plainCompressor) DecompressBlock(block []byte) ([]byte, error) {
	return block, nil
}
// CompressBlock encodes block with snappy.Encode and returns the encoded
// bytes. A nil destination is passed, so the output buffer is supplied by the
// snappy package. The returned error is always nil.
func (snappyCompressor) CompressBlock(block []byte) ([]byte, error) {
	encoded := snappy.Encode(nil, block)
	return encoded, nil
}
// DecompressBlock decodes a snappy-encoded block with snappy.Decode and
// returns that call's result and error unchanged. A nil destination is
// passed, so the output buffer is supplied by the snappy package.
func (snappyCompressor) DecompressBlock(block []byte) ([]byte, error) {
	decoded, err := snappy.Decode(nil, block)
	return decoded, err
}
// CompressBlock gzips block into an in-memory buffer using a writer created
// by gzip.NewWriter and returns the resulting bytes. Any error from writing
// the data or from closing the writer is returned with a nil slice.
func (gzipCompressor) CompressBlock(block []byte) ([]byte, error) {
	var out bytes.Buffer
	zw := gzip.NewWriter(&out)

	_, err := zw.Write(block)
	if err != nil {
		return nil, err
	}

	// Close must run before the buffer is read: it flushes pending data and
	// writes the gzip footer.
	err = zw.Close()
	if err != nil {
		return nil, err
	}

	return out.Bytes(), nil
}
// DecompressBlock inflates a gzip-compressed block held in memory and returns
// the decompressed bytes.
//
// A failure to create the gzip reader or to read the stream yields a nil
// slice and the error. If only closing the reader fails, the bytes read so
// far are returned together with that error.
func (gzipCompressor) DecompressBlock(block []byte) ([]byte, error) {
	zr, err := gzip.NewReader(bytes.NewReader(block))
	if err != nil {
		return nil, err
	}

	data, err := ioutil.ReadAll(zr)
	if err != nil {
		return nil, err
	}

	if closeErr := zr.Close(); closeErr != nil {
		return data, closeErr
	}
	return data, nil
}
// compressBlock compresses block with the BlockCompressor registered for
// method.
//
// It returns an error if no compressor is registered for method; otherwise it
// returns whatever the compressor's CompressBlock returns.
func compressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) {
	// Hold the registry lock only for the lookup. Keeping the read lock
	// across the (potentially slow, user-supplied) compression would make a
	// concurrent RegisterBlockCompressor wait for it, which in turn blocks
	// every new reader, and would deadlock a compressor that touches the
	// registry from inside CompressBlock.
	compressorLock.RLock()
	c, ok := compressors[method]
	compressorLock.RUnlock()

	if !ok {
		return nil, fmt.Errorf("method %q is not supported", method.String())
	}
	return c.CompressBlock(block)
}
// decompressBlock decompresses block with the BlockCompressor registered for
// method.
//
// It returns an error if no compressor is registered for method; otherwise it
// returns whatever the compressor's DecompressBlock returns.
func decompressBlock(block []byte, method parquet.CompressionCodec) ([]byte, error) {
	// Hold the registry lock only for the lookup. Keeping the read lock
	// across the (potentially slow, user-supplied) decompression would make a
	// concurrent RegisterBlockCompressor wait for it, which in turn blocks
	// every new reader, and would deadlock a compressor that touches the
	// registry from inside DecompressBlock.
	compressorLock.RLock()
	c, ok := compressors[method]
	compressorLock.RUnlock()

	if !ok {
		return nil, fmt.Errorf("method %q is not supported", method.String())
	}
	return c.DecompressBlock(block)
}
// newBlockReader decompresses buf with the given codec and returns a reader
// over the decompressed bytes.
//
// compressedSize and uncompressedSize are the sizes the caller expects: both
// must be non-negative, len(buf) must equal compressedSize, and the
// decompressed data must be exactly uncompressedSize bytes long; otherwise an
// error is returned. A decompression failure is wrapped and returned.
//
// alloc.test is called with the expected uncompressed size before
// decompressing, and alloc.register with the decompressed slice afterwards
// (before its length is validated). What those calls do is defined by
// allocTracker, which is declared outside this block.
func newBlockReader(buf []byte, codec parquet.CompressionCodec, compressedSize int32, uncompressedSize int32, alloc *allocTracker) (io.Reader, error) {
	switch {
	case compressedSize < 0 || uncompressedSize < 0:
		return nil, errors.New("invalid page data size")
	case len(buf) != int(compressedSize):
		return nil, fmt.Errorf("compressed data must be %d byte but its %d byte", compressedSize, len(buf))
	}

	alloc.test(uint64(uncompressedSize))

	decoded, err := decompressBlock(buf, codec)
	if err != nil {
		return nil, fmt.Errorf("decompression failed: %w", err)
	}
	alloc.register(decoded, uint64(len(decoded)))

	if got := len(decoded); got != int(uncompressedSize) {
		return nil, fmt.Errorf("decompressed data must be %d byte but its %d byte", uncompressedSize, got)
	}

	return bytes.NewReader(decoded), nil
}
// RegisterBlockCompressor registers an additional block compressor with the
// package, replacing any compressor previously registered for the same
// method.
//
// By default, only UNCOMPRESSED, GZIP and SNAPPY are supported as parquet
// compression algorithms. The parquet file format supports more compression
// algorithms, such as LZO, BROTLI, LZ4 and ZSTD; to limit the number of
// external dependencies, the set of built-in algorithms was reduced to a core
// set. To use any of the other algorithms, provide your own implementation
// that satisfies the BlockCompressor interface and register it from your code
// using this function.
func RegisterBlockCompressor(method parquet.CompressionCodec, compressor BlockCompressor) {
	// The critical section is a single map write, so the lock is released
	// explicitly right after it.
	compressorLock.Lock()
	compressors[method] = compressor
	compressorLock.Unlock()
}
// GetRegisteredBlockCompressors returns a map of compression codecs to block
// compressors that are currently registered.
//
// The returned map is a copy of the registry taken at the time of the call;
// modifying it does not affect the registered compressors.
func GetRegisteredBlockCompressors() map[parquet.CompressionCodec]BlockCompressor {
	// Copying the registry only reads it, so the shared read lock is
	// sufficient; the exclusive lock would needlessly serialise this call
	// against every concurrent lookup.
	compressorLock.RLock()
	defer compressorLock.RUnlock()

	// len is read under the lock so the copy can be sized exactly.
	result := make(map[parquet.CompressionCodec]BlockCompressor, len(compressors))
	for k, v := range compressors {
		result[k] = v
	}
	return result
}
func init() {
RegisterBlockCompressor(parquet.CompressionCodec_UNCOMPRESSED, plainCompressor{})
RegisterBlockCompressor(parquet.CompressionCodec_GZIP, gzipCompressor{})
RegisterBlockCompressor(parquet.CompressionCodec_SNAPPY, snappyCompressor{})
}