Skip to content

Commit

Permalink
fix memory pressure in cat command with large skip number
Browse files Browse the repository at this point in the history
  • Loading branch information
hangxie committed Jul 3, 2023
1 parent 461ee2f commit 32ac6bf
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 5 deletions.
33 changes: 29 additions & 4 deletions cmd/cat.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ var delimiter = map[string]struct {
"tsv": {"", "\n", '\t', ""},
}

// here are performan number from a MacBook for different settings:
// page_size max_memory_usage time_taken
// 1K 1.9G 25s
// 10K 1.8G 15s
// 100K 2.4G 12s
// 1M 7.1G 15s
// 10M 52.1G 1m14s
const pageSize int64 = 100_000

// Run does actual cat job
func (c CatCmd) Run() error {
if c.PageSize < 1 {
Expand Down Expand Up @@ -92,6 +101,23 @@ func (c CatCmd) outputHeader(fileReader *reader.ParquetReader, schemaRoot *inter
return fieldList, nil
}

func (c CatCmd) skipRows(fileReader *reader.ParquetReader) error {
// Do not abort if c.Skip is greater than total number of rows
// This gives users flexibility to handle this scenario by themselves

// use pagination to avoid excessive memory usage, see https://github.com/xitongsys/parquet-go/issues/545
rowsToSkip := int64(c.Skip)
for ; rowsToSkip > pageSize; rowsToSkip -= pageSize {
if err := fileReader.SkipRows(pageSize); err != nil {
return fmt.Errorf("failed to skip %d rows: %s", c.Skip, err)
}
}
if err := fileReader.SkipRows(rowsToSkip); err != nil {
return fmt.Errorf("failed to skip %d rows: %s", c.Skip, err)
}
return nil
}

func (c CatCmd) outputRows(fileReader *reader.ParquetReader) error {
schemaRoot := internal.NewSchemaTree(fileReader)

Expand All @@ -104,10 +130,9 @@ func (c CatCmd) outputRows(fileReader *reader.ParquetReader) error {
// retrieve schema for better formatting
reinterpretFields := schemaRoot.GetReinterpretFields("", true)

// Do not abort if c.Skip is greater than total number of rows
// This gives users flexibility to handle this scenario by themselves
if err := fileReader.SkipRows(int64(c.Skip)); err != nil {
return fmt.Errorf("failed to skip %d rows: %s", c.Skip, err)
// skip rows
if err != c.skipRows(fileReader) {
return err
}

// Output rows one by one to avoid running out of memory with a jumbo list
Expand Down
2 changes: 1 addition & 1 deletion cmd/cat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func Test_CatCmd_Run_good_skip(t *testing.T) {

func Test_CatCmd_Run_good_all_skip(t *testing.T) {
cmd := &CatCmd{}
cmd.Skip = 12
cmd.Skip = 100_002
cmd.Limit = 10
cmd.PageSize = 10
cmd.SampleRatio = 1.0
Expand Down

0 comments on commit 32ac6bf

Please sign in to comment.