diff --git a/RELEASES.md b/RELEASES.md index 90c07fb25951..c31a0f09331c 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -37,6 +37,7 @@ Released on TBD (UTC). - Fixed Databento bars decoding (was incorrectly applying display factor) - Fixed `Binance` bar (kline) to use `close_time` for `ts_event` was `opentime` (#1591), thanks for reporting @OnlyC - Fixed `AccountMarginExceeded` error condition (margin must actually be exceeded now, and can be zero) +- Fixed `ParquetDataCatalog` path globbing, which was including every path that contained a specified instrument ID as a substring --- diff --git a/nautilus_trader/persistence/catalog/parquet.py b/nautilus_trader/persistence/catalog/parquet.py index 79d5e4203e3e..b4dcaf143f8a 100644 --- a/nautilus_trader/persistence/catalog/parquet.py +++ b/nautilus_trader/persistence/catalog/parquet.py @@ -412,15 +412,22 @@ def backend_session( file_prefix = class_to_filename(data_cls) glob_path = f"{self.path}/data/{file_prefix}/**/*" - dirs = self.fs.glob(glob_path) + dirs: list[str] = self.fs.glob(glob_path) if self.show_query_paths: print(dirs) for idx, path in enumerate(dirs): assert self.fs.exists(path) - if instrument_ids and not any(urisafe_instrument_id(x) in path for x in instrument_ids): + # Parse the parent directory which *should* be the instrument ID, + # this prevents us matching all instrument ID substrings. + parent_dir = path.split("/")[-2] + if instrument_ids and not any( + parent_dir.startswith(urisafe_instrument_id(x)) for x in instrument_ids + ): continue - if bar_types and not any(urisafe_instrument_id(x) in path for x in bar_types): + if bar_types and not any( + parent_dir.startswith(urisafe_instrument_id(x)) for x in bar_types + ): continue table = f"{file_prefix}_{idx}" query = self._build_query(