-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #94 from OpenCOMPES/90-loader-plugin-api
90 loader plugin api
- Loading branch information
Showing
29 changed files
with
560 additions
and
334 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
core: | ||
loader: dask | ||
|
||
binning: | ||
hist_mode: "numba" | ||
mode: fast | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# base loader |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
"""The abstract class off of which to implement loaders.""" | ||
from abc import ABC | ||
from abc import abstractmethod | ||
from typing import List | ||
from typing import Sequence | ||
from typing import Tuple | ||
|
||
import dask.dataframe as ddf | ||
|
||
|
||
class BaseLoader(ABC):
    """Abstract base class that concrete loaders must subclass.

    A loader's folder name serves as its identifier: for this
    BaseLoader, living in base/loader.py, the ID is 'base'.
    """

    # pylint: disable=too-few-public-methods

    __name__ = "BaseLoader"

    # File extensions a concrete loader knows how to read.
    supported_file_types: List[str] = []

    @abstractmethod
    def __init__(
        self,
        config: dict = None,
    ):
        # Never store a shared mutable default; fall back to a fresh dict.
        self._config = {} if config is None else config
        self.files: List[str] = []

    @abstractmethod
    def read_dataframe(
        self,
        files: Sequence[str] = None,
        folder: str = None,
        ftype: str = None,
        **kwds,
    ) -> Tuple[ddf.DataFrame, dict]:
        """Read data from the given files or folder.

        Returns a tuple of a dask dataframe and a metadata dictionary.
        Concrete subclasses provide the actual implementation.
        """
        return None, None


LOADER = BaseLoader
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
""" | ||
module sed.loader.mpes, code for loading hdf5 files delayed into a dask dataframe. | ||
Mostly ported from https://github.com/mpes-kit/mpes. | ||
@author: L. Rettig | ||
""" | ||
import os | ||
from typing import Any | ||
from typing import Dict | ||
from typing import List | ||
from typing import Sequence | ||
from typing import Tuple | ||
|
||
import dask.dataframe as ddf | ||
|
||
from sed.loader.base.loader import BaseLoader | ||
from sed.loader.utils import gather_files | ||
|
||
|
||
class GenericLoader(BaseLoader):  # pylint: disable=too-few-public-methods
    """Generic dask-based loader.

    Reads various file types into a dask dataframe using the readers
    shipped with ``dask.dataframe``.
    """

    __name__ = "dask"

    supported_file_types = ["parquet", "csv", "json"]

    def __init__(
        self,
        config: dict = None,
    ):
        # Delegate config defaulting and file-list initialization to the
        # base class instead of duplicating its __init__ body here.
        super().__init__(config=config)

    def read_dataframe(
        self,
        files: Sequence[str] = None,
        folder: str = None,
        ftype: str = "parquet",
        **kwds,
    ) -> Tuple[ddf.DataFrame, dict]:
        """Read stored files from a folder into a dataframe.

        Parameters:
            files: List/tuple of file paths. Ignored if ``folder`` is given.
            folder: Folder path containing the files. Takes priority over
                ``files`` when both are specified.
            ftype: File type to read ('parquet', 'json', 'csv', etc).
                If a folder path is given, all files of the specified type
                are read into the dataframe in the reading order.
            **kwds: Keyword arguments forwarded to the specific file parser
                in the ``dask.dataframe`` module.

        Returns:
            Tuple of the dask dataframe read from the specified files and
            a dictionary with metadata.

        Raises:
            ValueError: If neither ``files`` nor ``folder`` is provided.
            FileNotFoundError: If no valid files were found.
            Exception: If the file format cannot be understood.
        """
        metadata: Dict[Any, Any] = {}
        # pylint: disable=duplicate-code
        if folder is not None:
            # Folder takes priority: gather all matching files from it.
            folder = os.path.realpath(folder)
            files = gather_files(
                folder=folder,
                extension=ftype,
                file_sorting=True,
                **kwds,
            )
        else:
            if files is None:
                raise ValueError(
                    "Either the folder or file path should be provided!",
                )
            files = [os.path.realpath(file) for file in files]

        self.files = files

        if not files:
            raise FileNotFoundError("No valid files found!")

        # Dispatch the known file types to their dedicated dask readers.
        reader = {
            "parquet": ddf.read_parquet,
            "json": ddf.read_json,
            "csv": ddf.read_csv,
        }.get(ftype)
        if reader is not None:
            return (reader(files, **kwds), metadata)

        # Fall back to dask's generic table reader for other extensions.
        try:
            return (ddf.read_table(files, **kwds), metadata)
        except (TypeError, ValueError, NotImplementedError) as exc:
            raise Exception(
                "The file format cannot be understood!",
            ) from exc


LOADER = GenericLoader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
"""Interface to select a specified loader | ||
""" | ||
import glob | ||
import importlib.util | ||
import os | ||
from typing import List | ||
|
||
from sed.loader.base.loader import BaseLoader | ||
|
||
|
||
def get_loader(loader_name: str, config: dict = None) -> BaseLoader:
    """Instantiate the loader registered under the given name.

    Parameters:
        loader_name: Name of the loader (its folder name under the
            ``sed.loader`` package).
        config: Config dictionary handed to the loader; defaults to an
            empty dict.

    Returns:
        The instantiated loader object exported as ``LOADER`` by the
        loader module.

    Raises:
        ValueError: If no loader with that name exists.
    """
    if config is None:
        config = {}

    path_prefix = (
        f"{os.path.dirname(__file__)}{os.sep}"
        if os.path.dirname(__file__)
        else ""
    )
    path = os.path.join(path_prefix, loader_name, "loader.py")
    if not os.path.exists(path):
        # Join with ", " so the message doesn't end in a dangling comma
        # (the old accumulation loop produced "[a, b, ].").
        available = ", ".join(get_names_of_all_loaders())
        raise ValueError(
            f"Invalid loader {loader_name}. "
            f"Available loaders are: [{available}].",
        )

    spec = importlib.util.spec_from_file_location("loader.py", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module.LOADER(config=config)  # type: ignore[attr-defined]
|
||
|
||
def get_names_of_all_loaders() -> List[str]:
    """Return the names of all available loaders.

    A loader is any ``<name>/loader.py`` module in this package, except
    the abstract ``base`` loader; the loader's name is its folder name.
    """
    path_prefix = (
        f"{os.path.dirname(__file__)}{os.sep}"
        if os.path.dirname(__file__)
        else ""
    )
    files = glob.glob(os.path.join(path_prefix, "*", "loader.py"))
    all_loaders = []
    for file in files:
        if f"{os.sep}base{os.sep}" not in file:
            # The loader's identifier is the folder containing loader.py.
            # basename(dirname(...)) is robust even when that folder name
            # itself contains the substring "loader" — the old
            # rindex(f"loader{os.sep}")-based slicing returned an empty
            # string for names like "xloader".
            all_loaders.append(os.path.basename(os.path.dirname(file)))
    return all_loaders
Empty file.
Oops, something went wrong.