Skip to content

Commit

Permalink
pipeline-commit
Browse files Browse the repository at this point in the history
* pipeline - to wrap state handling
* imputer - fillna automatically
* countna - small feature to count missing values across the entire dataset
* to_records - used for more standard json input/output
  • Loading branch information
xdssio committed Apr 28, 2021
1 parent 9c98b78 commit 9d04565
Show file tree
Hide file tree
Showing 5 changed files with 847 additions and 31 deletions.
25 changes: 25 additions & 0 deletions packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2686,6 +2686,31 @@ def f(i1, i2):
previous_chunk = previous.result()
yield previous_l1, previous_l2, previous_chunk

def countna(self, columns=None):
    """Count the missing values over one or more columns.

    Each requested column is looked up with ``self[name]`` and its own
    ``countna()`` result is added to the total.

    :param columns: a column name, or an iterable of column names, to inspect;
        when None, the names returned by ``self.get_column_names()`` are used
    :return: the sum of the per-column ``countna()`` results (0 for no columns)
    """
    if columns is None:
        columns = self.get_column_names()
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character,
        # looking up one "column" per letter.
        columns = [columns]
    return sum(self[name].countna() for name in columns)

@docsubst
def to_records(self, item=None, selection=None, column_names=None, strings=True, virtual=True, parallel=True, chunk_size=None, array_type='python'):
    """Return the data as records: one dict per row, mapping column name to value.

    The columns are obtained from ``self.to_dict(...)`` and then transposed
    into per-row dicts.

    :param item: None to export all rows, or an integer row index to export
        only that row
    :param selection: forwarded to ``to_dict``
    :param column_names: forwarded to ``to_dict``
    :param strings: forwarded to ``to_dict``
    :param virtual: forwarded to ``to_dict``
    :param parallel: forwarded to ``to_dict``
    :param chunk_size: when not None (and item is None), forwarded to
        ``to_dict`` and the result is produced lazily, one list per chunk
    :param array_type: forwarded to ``to_dict``
    :return: a single dict when ``item`` is an integer; a list of dicts when
        ``item`` and ``chunk_size`` are both None; otherwise a generator that
        yields one list of dicts per chunk
    :raises RuntimeError: if ``item`` is neither None nor an integer
    """
    # NOTE(review): this method is decorated with @docsubst, which may run the
    # docstring through str.format - the docstring above deliberately contains
    # no brace characters. Confirm before adding any.
    import numbers

    def as_records(columns):
        # Transpose a mapping of column name to values into a list of row dicts.
        names = list(columns)
        return [dict(zip(names, row)) for row in zip(*columns.values())]

    export_kwargs = dict(selection=selection, column_names=column_names, strings=strings,
                         virtual=virtual, parallel=parallel, array_type=array_type)

    # numbers.Integral also accepts NumPy integer scalars, which a plain
    # isinstance(item, int) check would reject.
    if isinstance(item, numbers.Integral):
        # Export a one-row slice and unwrap the single value of every column.
        single_row = self[item:item + 1].to_dict(**export_kwargs)
        return {name: values[0] for name, values in single_row.items()}
    if item is not None:
        raise RuntimeError(f"item can be None or an int - {type(item)} provided")

    if chunk_size is None:
        return as_records(self.to_dict(**export_kwargs))

    def iterator():
        # With chunk_size set, to_dict yields 3-tuples; only the last element
        # (the chunk's column mapping) is used here.
        for _, _, chunk in self.to_dict(chunk_size=chunk_size, **export_kwargs):
            yield as_records(chunk)

    return iterator()

@docsubst
def to_items(self, column_names=None, selection=None, strings=True, virtual=True, parallel=True, chunk_size=None, array_type=None):
"""Return a list of [(column_name, ndarray), ...)] pairs where the ndarray corresponds to the evaluated data
Expand Down
Loading

0 comments on commit 9d04565

Please sign in to comment.