Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate fsspec to enable accessing WFDB files from cloud URIs #523

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,24 @@ jobs:
- name: Check source code format
run: uv run --extra dev black --check --diff .

test-deb10-i386:
test-deb11-i386:
runs-on: ubuntu-latest
container: i386/debian:10
container: i386/debian:11
steps:
- name: Install dependencies
run: |
apt-get update
apt-get install -y --no-install-recommends \
python3-fsspec \
python3-matplotlib \
python3-numpy \
python3-pandas \
python3-pip \
python3-requests \
python3-scipy \
python3-soundfile \
python3-pytest \
git


# Note: "actions/checkout@v2" requires libstdc++6:amd64 to be
# installed in the container. To keep things simple, use
# "actions/checkout@v1" instead.
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [
"soundfile >= 0.10.0",
"matplotlib >= 3.2.2",
"requests >= 2.8.1",
"fsspec >= 2023.10.0",
"aiohttp >= 3.10.11",
]
dynamic = ["version"]

Expand Down
9 changes: 6 additions & 3 deletions wfdb/io/_coreio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import posixpath

import fsspec

from wfdb.io import _url
from wfdb.io.download import config

Expand Down Expand Up @@ -28,8 +30,9 @@ def _open_file(
The PhysioNet database directory where the file is stored, or None
if file_name is a local path.
file_name : str
The name of the file, either as a local filesystem path (if
`pn_dir` is None) or a URL path (if `pn_dir` is a string.)
The name of the file, either as a local filesystem path or cloud
URL (if `pn_dir` is None) or a PhysioNet URL path
(if `pn_dir` is a string.)
mode : str, optional
The standard I/O mode for the file ("r" by default). If `pn_dir`
is not None, this must be "r", "rt", or "rb".
Expand All @@ -47,7 +50,7 @@ def _open_file(

"""
if pn_dir is None:
return open(
return fsspec.open(
file_name,
mode,
buffering=buffering,
Expand Down
40 changes: 27 additions & 13 deletions wfdb/io/_signal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import math
import os
import posixpath
import sys

import fsspec
import numpy as np

from wfdb.io import download, _coreio, util
Expand Down Expand Up @@ -1643,10 +1645,10 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
The name of the dat file.
dir_name : str
The full directory where the dat file(s) are located, if the dat
file(s) are local.
file(s) are local or in the cloud.
pn_dir : str
The PhysioNet directory where the dat file(s) are located, if
the dat file(s) are remote.
the dat file(s) are on a PhysioNet server.
fmt : str
The format of the dat file.
start_byte : int
Expand Down Expand Up @@ -1686,14 +1688,15 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
element_count = n_samp
byte_count = n_samp * BYTES_PER_SAMPLE[fmt]

# Local dat file
# Local or cloud dat file
if pn_dir is None:
with open(os.path.join(dir_name, file_name), "rb") as fp:
with fsspec.open(os.path.join(dir_name, file_name), "rb") as fp:
fp.seek(start_byte)
sig_data = np.fromfile(
sig_data = util.fromfile(
fp, dtype=np.dtype(DATA_LOAD_TYPES[fmt]), count=element_count
)
# Stream dat file from Physionet

# Stream dat file from PhysioNet
else:
dtype_in = np.dtype(DATA_LOAD_TYPES[fmt])
sig_data = download._stream_dat(
Expand Down Expand Up @@ -1840,8 +1843,9 @@ def _rd_compressed_file(
file_name : str
The name of the signal file.
dir_name : str
The full directory where the signal file is located, if local.
This argument is ignored if `pn_dir` is not None.
The full directory where the signal file is located, if this
is a local or cloud path. This argument is ignored if `pn_dir`
is not None.
pn_dir : str or None
The PhysioNet database directory where the signal file is located.
fmt : str
Expand Down Expand Up @@ -2585,10 +2589,10 @@ def _infer_sig_len(
The byte offset of the dat file. None is equivalent to zero.
dir_name : str
The full directory where the dat file(s) are located, if the dat
file(s) are local.
file(s) are local or on the cloud.
pn_dir : str, optional
The PhysioNet directory where the dat file(s) are located, if
the dat file(s) are remote.
the dat file(s) are on a PhysioNet server.

Returns
-------
Expand All @@ -2600,13 +2604,23 @@ def _infer_sig_len(
sig_len * tsamps_per_frame * bytes_per_sample == file_size

"""
if pn_dir is None:
file_size = os.path.getsize(os.path.join(dir_name, file_name))
else:
from wfdb.io.record import CLOUD_PROTOCOLS

# If this is a cloud path, use posixpath to construct the path and fsspec to open file
if any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
with fsspec.open(posixpath.join(dir_name, file_name), mode="rb") as f:
file_size = f.seek(0, os.SEEK_END)

# If the PhysioNet database path is provided, construct the download path using the database version
elif pn_dir is not None:
file_size = download._remote_file_size(
file_name=file_name, pn_dir=pn_dir
)

# If it isn't a cloud path or a PhysioNet path, we treat as a local file
else:
file_size = os.path.getsize(os.path.join(dir_name, file_name))

if byte_offset is None:
byte_offset = 0
data_size = file_size - byte_offset
Expand Down
40 changes: 26 additions & 14 deletions wfdb/io/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import re

import fsspec
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -155,6 +156,9 @@
"vtip": "mV",
}

# Cloud protocols
CLOUD_PROTOCOLS = ["az://", "azureml://", "s3://", "gs://"]


class BaseRecord(object):
"""
Expand Down Expand Up @@ -1824,27 +1828,33 @@ def rdheader(record_name, pn_dir=None, rd_segments=False):

"""
dir_name, base_record_name = os.path.split(record_name)
dir_name = os.path.abspath(dir_name)
file_name = f"{base_record_name}.hea"

# Construct the download path using the database version
if (pn_dir is not None) and ("." not in pn_dir):
dir_list = pn_dir.split("/")
pn_dir = posixpath.join(
dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
)
# If this is a cloud path, use posixpath to construct the path and fsspec to open file
if any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
with fsspec.open(posixpath.join(dir_name, file_name), mode="r") as f:
header_content = f.read()

# Read the local or remote header file.
file_name = f"{base_record_name}.hea"
if pn_dir is None:
with open(
# If the PhysioNet database path is provided, construct the download path using the database version
elif pn_dir is not None:
if "." not in pn_dir:
dir_list = pn_dir.split("/")
pn_dir = posixpath.join(
dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
)

header_content = download._stream_header(file_name, pn_dir)

# If it isn't a cloud path or a PhysioNet path, we treat as a local file
else:
dir_name = os.path.abspath(dir_name)
with fsspec.open(
os.path.join(dir_name, file_name),
"r",
encoding="ascii",
errors="ignore",
) as f:
header_content = f.read()
else:
header_content = download._stream_header(file_name, pn_dir)

# Separate comment and non-comment lines
header_lines, comment_lines = header.parse_header_content(header_content)
Expand Down Expand Up @@ -2017,7 +2027,9 @@ def rdrecord(

"""
dir_name, base_record_name = os.path.split(record_name)
dir_name = os.path.abspath(dir_name)
# Update the dir_name using abspath unless it is a cloud path
if not any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
dir_name = os.path.abspath(dir_name)

# Read the header fields
if (pn_dir is not None) and ("." not in pn_dir):
Expand Down
27 changes: 27 additions & 0 deletions wfdb/io/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
A module for general utility functions
"""

import io
import math
import os

import numpy as np

from typing import List, Sequence, Tuple


Expand Down Expand Up @@ -121,3 +124,27 @@ def overlapping_ranges(
for second in ranges_2
if max(first[0], second[0]) < min(first[1], second[1])
]


def fromfile(fileobj, dtype, count=-1):
    """
    Read binary data from a file-like object into a numpy array.

    If `fileobj` is a real local file (an `io.FileIO`, possibly wrapped in a
    buffered reader), delegate to `numpy.fromfile`, which is fastest. For
    other file-like objects (e.g. fsspec cloud files), which `numpy.fromfile`
    cannot handle, read the bytes manually into a pre-allocated array.

    Parameters
    ----------
    fileobj : file-like object
        Source to read from, positioned at the desired start offset. Must be
        seekable if `count` is negative.
    dtype : data-type
        Numpy dtype (or anything accepted by `numpy.dtype`) of the elements.
    count : int, optional
        Number of elements to read. A negative value (default) means "read
        to the end of the file".

    Returns
    -------
    numpy.ndarray
        1-D array of at most `count` elements; shorter if the file ends
        before `count` elements are available.

    """
    if isinstance(fileobj, io.FileIO) or (
        isinstance(fileobj, (io.BufferedReader, io.BufferedRandom))
        and isinstance(fileobj.raw, io.FileIO)
    ):
        # Plain local file: numpy's C-level reader handles count/EOF itself.
        return np.fromfile(fileobj, dtype=dtype, count=count)

    dtype = np.dtype(dtype)
    if count < 0:
        # Infer the element count from the bytes remaining after the
        # current position (partial trailing elements are dropped).
        start = fileobj.tell()
        fileobj.seek(0, os.SEEK_END)
        end = fileobj.tell()
        fileobj.seek(start, os.SEEK_SET)
        count = (end - start) // dtype.itemsize

    array = np.empty(count, dtype)
    buffer = array.view(np.uint8)
    # A single readinto() call may return fewer bytes than requested
    # (short reads are allowed for raw/stream/network-backed files), so
    # keep reading until the buffer is full or EOF is reached.
    total_read = 0
    while total_read < buffer.nbytes:
        n_read = fileobj.readinto(buffer[total_read:])
        if not n_read:
            break
        total_read += n_read
    if total_read < buffer.nbytes:
        # File ended early: return only the complete elements read.
        array = array[: total_read // dtype.itemsize]
    return array