Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update to noisepy-io v0.2.0 #327

Merged
merged 7 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Taxonomy of the NoisePy variables.
* ``step`` is the window that gets skipped when sliding windows, in seconds
* ``smooth_N`` number of points for smoothing the time or frequency domain discrete arrays.
* ``maxlag`` maximum length in seconds saved in files in each side of the correlation (save on storage)
* ``substack,substack_len`` boolean, window length over which to substack the correlation (to save storage or do monitoring), it has to be a multiple of ``cc_len``.
* ``substack, substack_windows`` boolean, number of windows over which to substack the correlation (to save storage or do monitoring).
* ``time_chunk, nchunk`` refers to the time unit that defines a single job. For instance, ``cc_len`` is the correlation length (e.g., 1 hour, 30 min), and the overall duration of the experiment is the total length (1 month, 1 year, ...). The time chunk could be 1 day: the code would loop through each cc_len window in a for loop. But each day will be sent as a thread.

## Acknowledgements
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dependencies = [
"PyYAML==6.0",
"pydantic-yaml==1.0",
"psutil>=5.9.5,<6.0.0",
"noisepy-seis-io>=0.1.16",
"noisepy-seis-io>=0.2.0",
"scipy==1.12.0"
]

Expand Down
25 changes: 20 additions & 5 deletions src/noisepy/seis/correlate.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,20 +502,35 @@ def _filter_channel_data(
logging.warning(f"No data available with sampling rate >= {sampling_rate}")
return []
if single_freq:
closest_freq = min(
frequencies,
key=lambda f: max(f - sampling_rate, 0),
)
logger.info(f"Picked {closest_freq} as the closest sampling rate to {sampling_rate}. ")
closest_freq = _get_closest_freq(frequencies, sampling_rate)
logger.info(f"Picked {closest_freq} as the closest sampling_rate to {sampling_rate}. ")
filtered_tuples = list(filter(lambda tup: tup[1].sampling_rate == closest_freq, tuples))
logger.info(f"Filtered to {len(filtered_tuples)}/{len(tuples)} channels with sampling rate == {closest_freq}")
else:
filtered_tuples = list(filter(lambda tup: tup[1].sampling_rate >= sampling_rate, tuples))
# for each station, pick the closest >= to sampling_rate
tmp = list(
map(
lambda s: [t for t in filtered_tuples if t[0].station == s],
set([t[0].station for t in filtered_tuples]),
)
)
filtered_tuples = sum(list(map(lambda t: _filt_single_station(t, sampling_rate), tmp)), [])
logger.info(f"Filtered to {len(filtered_tuples)}/{len(tuples)} channels with sampling rate >= {sampling_rate}")

return filtered_tuples


def _get_closest_freq(frequencies, sampling_rate: int):
return min(frequencies, key=lambda f: max(f - sampling_rate, 0))


def _filt_single_station(tuples: List[Tuple[Channel, ChannelData]], sampling_rate: int):
    """Keep only one station's channels whose sampling rate best matches the target.

    Given ``(Channel, ChannelData)`` pairs that all belong to a single station,
    find the sampling rate closest to ``sampling_rate`` from above (overshoot
    ``max(rate - sampling_rate, 0)`` minimized) and return only the pairs
    recorded at that rate.

    NOTE(review): assumes the pairs were pre-filtered to rates >= sampling_rate
    (the visible caller does this) — a lower rate would otherwise win the
    zero-overshoot tie; confirm if reused elsewhere.
    """
    best_rate = min(
        {data.sampling_rate for _, data in tuples},
        key=lambda rate: max(rate - sampling_rate, 0),
    )
    return [pair for pair in tuples if pair[1].sampling_rate == best_rate]


def check_memory(params: ConfigParameters, nsta: int) -> int:
# maximum memory allowed
# TODO: Is this needed? Should it be configurable?
Expand Down
33 changes: 22 additions & 11 deletions tests/test_cross_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,32 @@
def test_read_channels():
CLOSEST_FREQ = 60
sampling_rate = 40
freqs = [10, 39, CLOSEST_FREQ, 100]
ch_data = []
for f in freqs:
ch_data1 = []
N = 5
for f in [10, 39, CLOSEST_FREQ, 100]:
cd = ChannelData.empty()
cd.sampling_rate = f
ch_data.append(cd)
N = 5
tuples = [(Channel("foo", Station("CI", "bar")), cd) for cd in ch_data] * N
ch_data1.append(cd)

cd = ChannelData.empty()
cd.sampling_rate = 100
ch_data2 = [cd]

tuples = [(Channel("foo", Station("CI", "FOO")), cd) for cd in ch_data1] * N
tuples += [(Channel("bar", Station("CI", "BAR")), cd) for cd in ch_data2] * N

# we should pick the closest frequency that is >= to the target freq, 60 in this case
# we should pick the closest frequency that is >= to the target sampling_rate
# 60 Hz in this case, for both stations
# CI.FOO returns 60 Hz
# CI.BAR returns nothing
filtered = _filter_channel_data(tuples, sampling_rate, single_freq=True)
assert N == len(filtered)
assert [t[1].sampling_rate for t in filtered] == [CLOSEST_FREQ] * N

# we should get all data at >= 40 Hz (60 and 100)
# we should pick the closest frequency that is >= to the target sampling_rate
# but might be different for each station
# CI.FOO returns 60 Hz
# CI.BAR returns 100 Hz
filtered = _filter_channel_data(tuples, sampling_rate, single_freq=False)
assert N * 2 == len(filtered)
assert all([t[1].sampling_rate >= sampling_rate for t in filtered])
Expand Down Expand Up @@ -79,11 +90,11 @@ def get_inventory(self, timespan: DateTimeRange, station: Station) -> obspy.Inve
@pytest.mark.parametrize("rm_resp", [RmResp.NO, RmResp.INV]) # add tests for SPECTRUM, RESP and POLES_ZEROS
@pytest.mark.parametrize("cc_method", [CCMethod.XCORR, CCMethod.COHERENCY, CCMethod.DECONV])
@pytest.mark.parametrize("substack", [True, False])
@pytest.mark.parametrize("substack_len", [1, 2])
@pytest.mark.parametrize("substack_windows", [1, 2])
@pytest.mark.parametrize("inc_hours", [0, 24])
@pytest.mark.parametrize("dpath", ["./data/cc", "./data/acc"])
def test_cross_correlation(
rm_resp: RmResp, cc_method: CCMethod, substack: bool, substack_len: int, inc_hours: int, dpath: str
rm_resp: RmResp, cc_method: CCMethod, substack: bool, substack_windows: int, inc_hours: int, dpath: str
):
config = ConfigParameters()
config.sampling_rate = 1.0
Expand All @@ -92,7 +103,7 @@ def test_cross_correlation(
config.inc_hours = inc_hours
if substack:
config.substack = substack
config.substack_len = substack_len * config.cc_len
config.substack_windows = substack_windows
path = os.path.join(os.path.dirname(__file__), dpath)

raw_store = SCEDCS3DataStore(path, MockCatalogMock())
Expand Down
2 changes: 1 addition & 1 deletion tutorials/cloud/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ stationxml: false
step: 450.0
storage_options: {}
substack: false
substack_len: 1800
substack_windows: 1
time_norm: no
xcorr_only: true
2 changes: 1 addition & 1 deletion tutorials/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cc_method: xcorr
smooth_N: 10
smoothspect_N: 10
substack: true
substack_len: 3600
substack_windows: 1
maxlag: 200
inc_hours: 12
max_over_std: 10
Expand Down
34 changes: 17 additions & 17 deletions tutorials/get_started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"from noisepy.seis.io import plotting_modules\n",
"from noisepy.seis.io.asdfstore import ASDFRawDataStore, ASDFCCStore, ASDFStackStore\n",
"from noisepy.seis.io.datatypes import CCMethod, ConfigParameters, FreqNorm, RmResp, TimeNorm\n",
"from dateutil.parser import isoparse\n",
"from datetime import datetime, timezone\n",
"from datetimerange import DateTimeRange\n",
"import os\n",
"import shutil\n",
Expand Down Expand Up @@ -95,8 +95,8 @@
"source": [
"config = ConfigParameters() # default config parameters which can be customized\n",
"config.inc_hours = 12\n",
"config.sampling_rate= 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.cc_len= 3600 # (float) basic unit of data length for fft (sec)\n",
"config.sampling_rate = 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.cc_len = 3600 # (float) basic unit of data length for fft (sec)\n",
" # criteria for data selection\n",
"config.ncomp = 3 # 1 or 3 component data (needed to decide whether do rotation)\n",
"\n",
Expand All @@ -111,32 +111,32 @@
"config.lomax = -115 # max longitude\n",
"\n",
"# pre-processing parameters\n",
"config.step= 1800.0 # (float) overlapping between each cc_len (sec)\n",
"config.stationxml= False # station.XML file used to remove instrument response for SAC/miniseed data\n",
"config.rm_resp= RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"config.step = 1800.0 # (float) overlapping between each cc_len (sec)\n",
"config.stationxml = False # station.XML file used to remove instrument response for SAC/miniseed data\n",
"config.rm_resp = RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"config.freqmin = 0.05\n",
"config.freqmax = 2.0\n",
"config.max_over_std = 10 # threshold to remove window of bad signals: set it to 10*9 if prefer not to remove them\n",
"\n",
"# TEMPORAL and SPECTRAL NORMALISATION\n",
"config.freq_norm= FreqNorm.RMA # choose between \"rma\" for a soft whitenning or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.freq_norm = FreqNorm.RMA # choose between \"rma\" for a soft whitenning or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.smoothspect_N = 10 # moving window length to smooth spectrum amplitude (points)\n",
" # here, choose smoothspect_N for the case of a strict whitening (e.g., phase_only)\n",
"\n",
"config.time_norm = TimeNorm.NO # 'no' for no normalization, or 'rma', 'one_bit' for normalization in time domain,\n",
" # TODO: change time_norm option from \"no\" to \"None\"\n",
"config.smooth_N= 10 # moving window length for time domain normalization if selected (points)\n",
"config.smooth_N = 10 # moving window length for time domain normalization if selected (points)\n",
"\n",
"config.cc_method= CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
"config.cc_method = CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
" # FOR \"COHERENCY\" PLEASE set freq_norm to \"rma\", time_norm to \"no\" and cc_method to \"xcorr\"\n",
"\n",
"# OUTPUTS:\n",
"config.substack = True # True = smaller stacks within the time chunk. False: it will stack over inc_hours\n",
"config.substack_len = config.cc_len # how long to stack over (for monitoring purpose): need to be multiples of cc_len\n",
" # if substack=True, substack_len=2*cc_len, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_len=cc_len means that you keep ALL of the correlations\n",
"config.substack_windows = 1 # number of correlation windows to stack over (for monitoring purposes)\n",
" # if substack=True, substack_windows=2, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_windows=1 means that you keep ALL of the correlations\n",
"\n",
"config.maxlag= 200 # lags of cross-correlation to save (sec)\n",
"config.maxlag = 200 # lags of cross-correlation to save (sec)\n",
"config.substack = True"
]
},
Expand Down Expand Up @@ -173,9 +173,9 @@
"source": [
"config.networks = [\"*\"]\n",
"config.stations = [\"A*\"]\n",
"config.channels = [\"BHE\",\"BHN\",\"BHZ\"]\n",
"config.start_date = isoparse(\"2019-02-01T00:00:00Z\")\n",
"config.end_date = isoparse(\"2019-02-02T00:00:00Z\")\n",
"config.channels = [\"BH?\"]\n",
"config.start_date = datetime(2019, 2, 1, tzinfo=timezone.utc)\n",
"config.end_date = datetime(2019, 2, 2, tzinfo=timezone.utc)\n",
"timerange = DateTimeRange(config.start_date, config.end_date)\n",
"\n",
"# Download data locally. Enters raw data path, channel types, stations, config, and fdsn server.\n",
Expand Down Expand Up @@ -218,7 +218,7 @@
"file = os.path.join(raw_data_path, \"2019_02_01_00_00_00T2019_02_01_12_00_00.h5\")\n",
"raw_store = ASDFRawDataStore(raw_data_path) # Store for reading raw data\n",
"timespans = raw_store.get_timespans()\n",
"plotting_modules.plot_waveform(raw_store, timespans[0], 'CI','ADO',0.01,0.4) # this function takes for input: filename, network, station, freqmin, freqmax for a bandpass filter"
"plotting_modules.plot_waveform(raw_store, timespans[0], 'CI','ADO', 0.01, 0.4) # this function takes for input: filename, network, station, freqmin, freqmax for a bandpass filter"
]
},
{
Expand Down
32 changes: 16 additions & 16 deletions tutorials/tutorial_compositestore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -169,48 +169,48 @@
" # being a day of data.\n",
"\n",
"# pre-processing parameters\n",
"config.sampling_rate= 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.sampling_rate = 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.single_freq = False\n",
"config.cc_len= 3600 # (float) basic unit of data length for fft (sec)\n",
"config.step= 1800.0 # (float) overlapping between each cc_len (sec)\n",
"config.cc_len = 3600 # (float) basic unit of data length for fft (sec)\n",
"config.step = 1800.0 # (float) overlapping between each cc_len (sec)\n",
"\n",
"config.ncomp = 3 # 1 or 3 component data (needed to decide whether do rotation)\n",
"\n",
"config.stationxml= False # station.XML file used to remove instrument response for SAC/miniseed data\n",
"config.stationxml = False # station.XML file used to remove instrument response for SAC/miniseed data\n",
" # If True, the stationXML file is assumed to be provided.\n",
"config.rm_resp= RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"config.rm_resp = RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"\n",
"############## NOISE PRE-PROCESSING ##################\n",
"config.freqmin,config.freqmax = 0.05,2.0 # broad band filtering of the data before cross correlation\n",
"config.freqmin, config.freqmax = 0.05, 2.0 # broad band filtering of the data before cross correlation\n",
"config.max_over_std = 10 # threshold to remove window of bad signals: set it to 10*9 if prefer not to remove them\n",
"\n",
"################### SPECTRAL NORMALIZATION ############\n",
"config.freq_norm= FreqNorm.RMA # choose between \"rma\" for a soft whitening or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.freq_norm = FreqNorm.RMA # choose between \"rma\" for a soft whitening or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.smoothspect_N = 10 # moving window length to smooth spectrum amplitude (points)\n",
" # here, choose smoothspect_N for the case of a strict whitening (e.g., phase_only)\n",
"\n",
"#################### TEMPORAL NORMALIZATION ##########\n",
"config.time_norm = TimeNorm.ONE_BIT # 'no' for no normalization, or 'rma', 'one_bit' for normalization in time domain,\n",
"config.smooth_N= 10 # moving window length for time domain normalization if selected (points)\n",
"config.smooth_N = 10 # moving window length for time domain normalization if selected (points)\n",
"\n",
"############ cross correlation ##############\n",
"config.cc_method= CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
"config.cc_method = CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
" # FOR \"COHERENCY\" PLEASE set freq_norm to \"rma\", time_norm to \"no\" and cc_method to \"xcorr\"\n",
"\n",
"# OUTPUTS:\n",
"config.substack = True # True = smaller stacks within the time chunk. False: it will stack over inc_hours\n",
"config.substack_len = config.cc_len # how long to stack over (for monitoring purpose): need to be multiples of cc_len\n",
" # if substack=True, substack_len=2*cc_len, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_len=cc_len means that you keep ALL of the correlations\n",
"config.substack_windows = 1 # how long to stack over (for monitoring purpose)\n",
" # if substack=True, substack_windows=2, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_windows=1 means that you keep ALL of the correlations\n",
" # if substack=False, the cross correlation will be stacked over the inc_hour window\n",
"\n",
"### For monitoring applications ####\n",
"## we recommend substacking ever 2-4 cross correlations and storing the substacks\n",
"# e.g. \n",
"# config.substack = True \n",
"# config.substack_len = 4* config.cc_len\n",
"# config.substack_windows = 4\n",
"\n",
"config.maxlag= 200 # lags of cross-correlation to save (sec)\n",
"config.maxlag = 200 # lags of cross-correlation to save (sec)\n",
"\n",
"# the network list that are used here\n",
"config.networks = ['CI', 'NC']"
Expand Down Expand Up @@ -260,11 +260,11 @@
" storage_options=S3_STORAGE_OPTIONS)\n",
"\n",
"scedc_store = SCEDCS3DataStore(SCEDC_DATA, scedc_catalog, \n",
" channel_filter([\"CI\"], [\"RPV\", \"SVD\", \"BBR\"], [\"BH?\", \"HH?\"]), \n",
" channel_filter([\"CI\"], [\"VES\", \"SVD\", \"BBR\"], [\"BH?\", \"HH?\"]), \n",
" timerange, storage_options=S3_STORAGE_OPTIONS)\n",
"\n",
"ncedc_store = NCEDCS3DataStore(NCEDC_DATA, ncedc_catalog, \n",
" channel_filter([\"NC\"], [\"KCT\", \"KRP\", \"KHMB\"], [\"BH?\", \"HH?\"]), \n",
" channel_filter([\"NC\"], [\"BBGB\", \"AFD\", \"GDXB\"], [\"BH?\", \"HH?\"]), \n",
" timerange, storage_options=S3_STORAGE_OPTIONS)\n",
"\n",
"raw_store = CompositeRawStore({\"CI\": scedc_store, \n",
Expand Down
Loading
Loading