Skip to content

Commit

Permalink
made suggested updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Anu-Ra-g committed Jul 8, 2024
1 parent eca50f2 commit d37bd56
Showing 1 changed file with 12 additions and 15 deletions.
27 changes: 12 additions & 15 deletions kerchunk/grib2.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,14 +589,13 @@ def correct_hrrr_subhf_step(group: Dict) -> Dict:
def parse_grib_idx(
basename: str,
suffix: str = "idx",
storage_options: Dict = {},
tstamp: Optional["pd.Timestamp"] = None,
storage_options: Optional[Dict] = None,
validate: bool = False,
) -> "pd.DataFrame":
"""
Standalone method used to extract metadata from a grib2 idx file(text) from NODD.
Parses per-message metadata from a grib2.idx file (text-type) to a dataframe of attributes
The function takes idx file, extracts the metadata known as attrs (variables with
The function uses the idx file, extracts the metadata known as attrs (variables with
level and forecast time) from each idx entry and converts it into pandas
DataFrame. The dataframe is later used to build the one-to-one mapping to the grib file metadata.
Expand All @@ -608,8 +607,6 @@ def parse_grib_idx(
The suffix is the ending for the idx file.
storage_options: dict
For accessing the data, passed to filesystem
tstamp : Optional[pd.Timestamp]
The timestamp to use for when the data was indexed
validate : bool
If True, check whether the metadata table has duplicate attrs.
Expand All @@ -619,28 +616,28 @@ def parse_grib_idx(
"""
import pandas as pd

fs, _ = fsspec.core.url_to_fs(basename, **storage_options)
fs, _ = fsspec.core.url_to_fs(basename, **(storage_options or {}))

fname = f"{basename}.{suffix}"

baseinfo = fs.info(basename)

result = pd.read_csv(fs.open(fname), header=None, names=["raw_data"])
result[["idx", "offset", "date", "attrs"]] = result["raw_data"].str.split(
":", expand=True, n=3
)
result["offset"] = result["offset"].astype(int)
with fs.open(fname) as f:
result = pd.read_csv(f, header=None, names=["raw_data"])
result[["idx", "offset", "date", "attrs"]] = result["raw_data"].str.split(
":", expand=True, n=3
)
result["offset"] = result["offset"].astype(int)

# dropping the original single "raw_data" column before the formatting
result.drop(columns=["raw_data"], inplace=True)
# dropping the original single "raw_data" column after formatting
result.drop(columns=["raw_data"], inplace=True)

result = result.assign(
length=(
result.offset.shift(periods=-1, fill_value=baseinfo["size"]) - result.offset
),
idx_uri=fname,
grib_uri=basename,
indexed_at=tstamp if tstamp else pd.Timestamp.now(),
)

if validate and not result["attrs"].is_unique:
Expand Down

0 comments on commit d37bd56

Please sign in to comment.