Skip to content

Commit

Permalink
Return empty dataframe when reading an empty table (#34)
Browse files Browse the repository at this point in the history
* fix loading `.cbox` files

* add test and empty check directly on TextBuffer

---------

Co-authored-by: alisterburt <[email protected]>
  • Loading branch information
quantumjot and alisterburt authored Apr 11, 2023
1 parent 79893cb commit 9be107a
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 15 deletions.
23 changes: 15 additions & 8 deletions starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def _parse_data_block(self):
self._parse_loop_block()
return

elif line.startswith('data_') or self.crawler.current_line_number == self.n_lines:
elif line.startswith(
'data_') or self.crawler.current_line_number == self.n_lines:
self._parse_simple_block_from_buffer()
return

Expand All @@ -74,6 +75,8 @@ def _parse_loop_block(self):
self.crawler.increment_line_number()
header = self._parse_loop_header()
df = self._parse_loop_data()
if df is None:
df = pd.DataFrame({h: None for h in header}, index=[0])
df.columns = header
df.name = self._current_data_block_name
self._add_dataframe(df)
Expand Down Expand Up @@ -157,7 +160,7 @@ def _parse_loop_header(self) -> List[str]:

return self.text_buffer.buffer

def _parse_loop_data(self) -> pd.DataFrame:
def _parse_loop_data(self) -> Union[pd.DataFrame, None]:
self.text_buffer.clear()

while self.crawler.current_line_number <= self.n_lines:
Expand All @@ -167,8 +170,16 @@ def _parse_loop_data(self) -> pd.DataFrame:
self.text_buffer.add_line(current_line)
self.crawler.increment_line_number()

df = pd.read_csv(StringIO(self.text_buffer.as_str()), delim_whitespace=True, header=None,
comment='#')
# check whether the buffer is empty
if self.text_buffer.is_empty:
return None

df = pd.read_csv(
StringIO(self.text_buffer.as_str()),
delim_whitespace=True,
header=None,
comment='#'
)
return df

def dataframes_to_numeric(self):
Expand Down Expand Up @@ -204,7 +215,3 @@ def dataframe_at_index(self, idx: int):

def dataframes_as_list(self):
    """Return the values stored in ``self.dataframes`` as a new list."""
    frames = self.dataframes
    return [*frames.values()]




9 changes: 9 additions & 0 deletions starfile/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@ class TextBuffer:
def __init__(self):
self.buffer = deque()

@property
def is_empty(self) -> bool:
    """Whether the buffer holds no non-blank text.

    Returns True when the buffer has no lines, or when every buffered
    line is empty or whitespace-only; returns False otherwise.
    """
    # The generator lets `all` stop at the first line that has content, so
    # a large populated buffer is rejected after a single check and no
    # size cut-off is needed to keep this cheap.
    return all(not line.strip() for line in self.buffer)

def clear(self):
self.buffer = deque()

Expand Down
1 change: 1 addition & 0 deletions tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
non_existant_file = test_data_directory / 'non_existant_file.star'
two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star'
two_basic_blocks = test_data_directory / 'two_basic_blocks.star'
empty_loop = test_data_directory / 'empty_loop.star'

# Example DataFrame for testing
cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'],
Expand Down
6 changes: 6 additions & 0 deletions tests/data/empty_loop.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

data_

loop_
_rlnCoordinateX #1

34 changes: 27 additions & 7 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,23 @@
import pytest

from starfile.parser import StarParser
from .constants import loop_simple, postprocess, pipeline, rln31_style, optimiser_2d, optimiser_3d, \
sampling_2d, \
sampling_3d, single_line_middle_of_multiblock, single_line_end_of_multiblock, non_existant_file, \
loop_simple_columns, two_single_line_loop_blocks, two_basic_blocks
from .constants import (
loop_simple,
postprocess,
pipeline,
rln31_style,
optimiser_2d,
optimiser_3d,
sampling_2d,
sampling_3d,
single_line_middle_of_multiblock,
single_line_end_of_multiblock,
non_existant_file,
loop_simple_columns,
two_single_line_loop_blocks,
two_basic_blocks,
empty_loop,
)
from .utils import generate_large_star_file, remove_large_star_file, million_row_file


Expand Down Expand Up @@ -59,9 +72,10 @@ def test_read_multiblock_file():
assert isinstance(df, pd.DataFrame)

assert s.dataframes['general'].shape == (1, 6)
assert all(['rlnFinalResolution', 'rlnBfactorUsedForSharpening', 'rlnUnfilteredMapHalf1',
'rlnUnfilteredMapHalf2', 'rlnMaskName', 'rlnRandomiseFrom']
== s.dataframes['general'].columns)
assert all(
['rlnFinalResolution', 'rlnBfactorUsedForSharpening', 'rlnUnfilteredMapHalf1',
'rlnUnfilteredMapHalf2', 'rlnMaskName', 'rlnRandomiseFrom']
== s.dataframes['general'].columns)
assert s.dataframes['fsc'].shape == (49, 7)
assert s.dataframes['guinier'].shape == (49, 3)

Expand Down Expand Up @@ -197,3 +211,9 @@ def test_two_basic_blocks():
assert len(parser.dataframes) == 2
for df in parser.dataframes.values():
assert df.shape == (1, 3)


def test_empty_loop_block():
    """A loop block with a header but no data rows still yields one dataframe.

    NOTE(review): the parser appears to substitute a one-row placeholder
    frame for a loop without data, so the result is not asserted to have
    zero rows here -- confirm the intended shape before tightening this.
    """
    parser = StarParser(empty_loop)
    assert len(parser.dataframes) == 1
    # Same type check the multiblock test applies to every parsed block.
    for df in parser.dataframes.values():
        assert isinstance(df, pd.DataFrame)

0 comments on commit 9be107a

Please sign in to comment.