Skip to content

Commit

Permalink
Fix memory growth bug in read_csv
Browse files Browse the repository at this point in the history
The edge case where we hit powers of 2
every time during allocation can be painful.

Closes gh-24805.

xref gh-23527.
  • Loading branch information
gfyoung committed Jan 19, 2019
1 parent f4458c1 commit e241796
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
* just because a recent chunk did not have as many words.
*/
if (self->words_len + nbytes < self->max_words_cap) {
- length = self->max_words_cap - nbytes;
+ length = self->max_words_cap - nbytes - 1;
} else {
length = self->words_len;
}
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
tm.assert_frame_equal(result, df)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Smoke test only: write a single-column file, read it back in
    # chunks, and consume every chunk. Nothing is asserted about the
    # parsed values; the test passes as long as iteration finishes
    # without crashing.
    row_count = 1000

    with tm.ensure_clean() as path:
        with open(path, "w") as handle:
            # One integer per line: "0\n", "1\n", ..., "999\n".
            handle.writelines(str(num) + "\n" for num in range(row_count))

        reader = all_parsers.read_csv(path, chunksize=20)

        # Drain the chunk iterator; the chunks themselves are discarded.
        for _chunk in reader:
            pass


def test_read_table_deprecated(all_parsers):
# see gh-21948
parser = all_parsers
Expand Down

0 comments on commit e241796

Please sign in to comment.