Fix memory growth bug in read_csv (pandas-dev#24837)
* Fix memory growth bug in read_csv

The edge case where we hit powers of 2
on every allocation makes the buffer double
on each chunk, so memory grows without bound.

Closes gh-24805.

xref gh-23527.

* TST: Add ASV benchmark for issue
gfyoung authored and Pingviinituutti committed Feb 28, 2019
1 parent c981d4b commit 6205a62
Showing 3 changed files with 38 additions and 1 deletion.
19 changes: 19 additions & 0 deletions asv_bench/benchmarks/io/csv.py
@@ -214,4 +214,23 @@ def time_baseline(self):
                  names=list(string.digits[:9]))
 
 
+class ReadCSVMemoryGrowth(BaseIO):
+
+    chunksize = 20
+    num_rows = 1000
+    fname = "__test__.csv"
+
+    def setup(self):
+        with open(self.fname, "w") as f:
+            for i in range(self.num_rows):
+                f.write("{i}\n".format(i=i))
+
+    def mem_parser_chunks(self):
+        # see gh-24805.
+        result = read_csv(self.fname, chunksize=self.chunksize)
+
+        for _ in result:
+            pass
+
+
 from ..pandas_vb_common import setup  # noqa: F401
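To exercise this benchmark locally, something like asv run --bench ReadCSVMemoryGrowth from the asv_bench directory should work (assuming a configured asv environment; asv discovers methods prefixed with mem_ as memory benchmarks, and --bench takes a regular expression matched against benchmark names).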
2 changes: 1 addition & 1 deletion pandas/_libs/src/parser/tokenizer.c
@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
      * just because a recent chunk did not have as many words.
      */
     if (self->words_len + nbytes < self->max_words_cap) {
-        length = self->max_words_cap - nbytes;
+        length = self->max_words_cap - nbytes - 1;
     } else {
         length = self->words_len;
     }
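The one-character change above is the entire fix. Here is a minimal sketch of the mechanism as a standalone C program; grow_cap is a made-up stand-in for the doubling loop in the parser's grow_buffer(), and the 1024/512 values are illustrative, not taken from pandas:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the parser's doubling growth rule: keep doubling the
 * capacity while the requested end point reaches it (length + space
 * >= cap), mirroring grow_buffer() in tokenizer.c. */
static uint64_t grow_cap(uint64_t length, uint64_t space, uint64_t cap) {
    while (length + space >= cap) {
        cap <<= 1;  /* jump to the next power of two */
    }
    return cap;
}

int main(void) {
    uint64_t cap = 1024;   /* pretend max_words_cap (illustrative) */
    uint64_t nbytes = 512; /* pretend chunk size (illustrative) */

    /* Old computation: length + nbytes == cap exactly, so the loop
     * fires and the capacity doubles on every single chunk. */
    printf("buggy: %llu\n",
           (unsigned long long)grow_cap(cap - nbytes, nbytes, cap));

    /* Fixed computation: length + nbytes == cap - 1 < cap, so a
     * buffer that already has room is left alone. */
    printf("fixed: %llu\n",
           (unsigned long long)grow_cap(cap - nbytes - 1, nbytes, cap));
    return 0;
}

Before the fix, every chunk requested exactly the current capacity, so the power-of-2 growth loop fired on each chunk and the words buffer kept doubling; subtracting 1 keeps the request strictly below the capacity, leaving a buffer that already has room untouched.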
18 changes: 18 additions & 0 deletions pandas/tests/io/parser/test_common.py
@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
     tm.assert_frame_equal(result, df)
 
 
+def test_read_csv_memory_growth_chunksize(all_parsers):
+    # see gh-24805
+    #
+    # Let's just make sure that we don't crash
+    # as we iteratively process all chunks.
+    parser = all_parsers
+
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            for i in range(1000):
+                f.write(str(i) + "\n")
+
+        result = parser.read_csv(path, chunksize=20)
+
+        for _ in result:
+            pass
+
+
 def test_read_table_deprecated(all_parsers):
     # see gh-21948
     parser = all_parsers
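The new regression test can be run on its own with pytest, e.g. pytest pandas/tests/io/parser/test_common.py -k memory_growth against a development build of pandas (-k filters by test-name substring).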
