diff --git a/poetry.lock b/poetry.lock index 132c2925..6561d021 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "alabaster" @@ -2555,6 +2555,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -3736,6 +3746,48 @@ files = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +[[package]] +name = "pyarrow" +version = "13.0.0" +description = "Python library for Apache Arrow" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-13.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:1afcc2c33f31f6fb25c92d50a86b7a9f076d38acbcb6f9e74349636109550148"}, + {file = "pyarrow-13.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:70fa38cdc66b2fc1349a082987f2b499d51d072faaa6b600f71931150de2e0e3"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cd57b13a6466822498238877892a9b287b0a58c2e81e4bdb0b596dbb151cbb73"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ce69f7bf01de2e2764e14df45b8404fc6f1a5ed9871e8e08a12169f87b7a26"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:588f0d2da6cf1b1680974d63be09a6530fd1bd825dc87f76e162404779a157dc"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6241afd72b628787b4abea39e238e3ff9f34165273fad306c7acf780dd850956"}, + {file = "pyarrow-13.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:fda7857e35993673fcda603c07d43889fca60a5b254052a462653f8656c64f44"}, + {file = "pyarrow-13.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:aac0ae0146a9bfa5e12d87dda89d9ef7c57a96210b899459fc2f785303dcbb67"}, + {file = "pyarrow-13.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d7759994217c86c161c6a8060509cfdf782b952163569606bb373828afdd82e8"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:868a073fd0ff6468ae7d869b5fc1f54de5c4255b37f44fb890385eb68b68f95d"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51be67e29f3cfcde263a113c28e96aa04362ed8229cb7c6e5f5c719003659d33"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d1b4e7176443d12610874bb84d0060bf080f000ea9ed7c84b2801df851320295"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:69b6f9a089d116a82c3ed819eea8fe67dae6105f0d81eaf0fdd5e60d0c6e0944"}, + {file = "pyarrow-13.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ab1268db81aeb241200e321e220e7cd769762f386f92f61b898352dd27e402ce"}, + {file = "pyarrow-13.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:ee7490f0f3f16a6c38f8c680949551053c8194e68de5046e6c288e396dccee80"}, + {file = "pyarrow-13.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3ad79455c197a36eefbd90ad4aa832bece7f830a64396c15c61a0985e337287"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68fcd2dc1b7d9310b29a15949cdd0cb9bc34b6de767aff979ebf546020bf0ba0"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc6fd330fd574c51d10638e63c0d00ab456498fc804c9d01f2a61b9264f2c5b2"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:e66442e084979a97bb66939e18f7b8709e4ac5f887e636aba29486ffbf373763"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:0f6eff839a9e40e9c5610d3ff8c5bdd2f10303408312caf4c8003285d0b49565"}, + {file = "pyarrow-13.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b30a27f1cddf5c6efcb67e598d7823a1e253d743d92ac32ec1eb4b6a1417867"}, + {file = "pyarrow-13.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:09552dad5cf3de2dc0aba1c7c4b470754c69bd821f5faafc3d774bedc3b04bb7"}, + {file = "pyarrow-13.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3896ae6c205d73ad192d2fc1489cd0edfab9f12867c85b4c277af4d37383c18c"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6647444b21cb5e68b593b970b2a9a07748dd74ea457c7dadaa15fd469c48ada1"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47663efc9c395e31d09c6aacfa860f4473815ad6804311c5433f7085415d62a7"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = 
"sha256:b9ba6b6d34bd2563345488cf444510588ea42ad5613df3b3509f48eb80250afd"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:d00d374a5625beeb448a7fa23060df79adb596074beb3ddc1838adb647b6ef09"}, + {file = "pyarrow-13.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:c51afd87c35c8331b56f796eff954b9c7f8d4b7fef5903daf4e05fcf017d23a8"}, + {file = "pyarrow-13.0.0.tar.gz", hash = "sha256:83333726e83ed44b0ac94d8d7a21bbdee4a05029c3b1e8db58a863eec8fd8a33"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pycifrw" version = "4.4.5" @@ -4215,6 +4267,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -4222,8 +4275,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -4240,6 +4300,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -4247,6 +4308,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4736,6 +4798,11 @@ files = [ {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"}, {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"}, {file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"}, + {file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"}, + {file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"}, + {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"}, + {file = 
"scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"}, + {file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"}, {file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"}, @@ -5778,10 +5845,10 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -docs = ["sphinx", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "tomlkit"] -notebook = ["ipykernel", "jupyter"] +docs = ["sphinx", "sphinx-rtd-theme", "tomlkit", "sphinx-autodoc-typehints"] +notebook = ["jupyter", "ipykernel"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "d0ffb3199374ca1e88d3f03cc425b5a0cfcc60abac23b1dcd4fdd4f66267492a" +content-hash = "0886d7c6beeb6d8052154aa45cff0bac51bf4dd7f182e70603758a256f5f819e" diff --git a/pyproject.toml b/pyproject.toml index 45b9b400..78581d11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ tifffile = ">=2022.2.9, <2023.0.0" tqdm = "^4.62.3" xarray = "^0.20.2" joblib = "^1.2.0" +pyarrow = "^13.0.0" jupyter = {version = "^1.0.0", extras = ["notebook"], optional = true} ipykernel = {version = "^6.9.1", extras = ["notebook"], optional = true} sphinx = {version = ">4.4.0", extras = ["docs"], optional = true} diff --git a/sed/config/flash_example_config.yaml b/sed/config/flash_example_config.yaml index 9a614975..2579b6c2 100644 --- a/sed/config/flash_example_config.yaml +++ b/sed/config/flash_example_config.yaml @@ -21,7 +21,8 @@ core: dataframe: # The offset correction to the pulseId ubid_offset: 5 - + # the number of iterations to fill the pulseId forward. + forward_fill_iterations: 2 # The name of the DAQ system to use. Necessary to resolve the filenames/paths. daq: fl1user3 diff --git a/sed/core/dfops.py b/sed/core/dfops.py index 4bc7c386..ecef954f 100644 --- a/sed/core/dfops.py +++ b/sed/core/dfops.py @@ -8,6 +8,7 @@ from typing import Union import dask.dataframe +from dask.diagnostics import ProgressBar import numpy as np import pandas as pd @@ -138,3 +139,57 @@ def map_columns_2d( ) return df + + +def forward_fill_lazy( + df: dask.dataframe.DataFrame, + channels: Sequence[str], + before: Union[str, int] = 'max', + compute_lengths: bool = False, + iterations: int = 2, +) -> dask.dataframe.DataFrame: + """Forward fill the specified columns multiple times in a dask dataframe. + + Allows forward filling between partitions. This is useful for dataframes + that have sparse data, such as those with many NaNs. + Runnin the forward filling multiple times can fix the issue of having + entire partitions consisting of NaNs. By default we run this twice, which + is enough to fix the issue for dataframes with no consecutive partitions of NaNs. 
+
+    Args:
+        df (dask.dataframe.DataFrame): The dataframe to forward fill.
+        channels (Sequence[str]): The columns to forward fill.
+        before (int, str, optional): The number of rows to include from the end of the
+            previous partition. If 'max', the overlap is the size of the smallest
+            partition in the dataframe. Defaults to 'max'.
+        compute_lengths (bool, optional): Whether to compute the partition lengths up
+            front instead of leaving them lazy. Defaults to False.
+        iterations (int, optional): The number of times to forward fill the dataframe.
+            Defaults to 2.
+
+    Returns:
+        dask.dataframe.DataFrame: The dataframe with the specified columns forward filled.
+
+    Raises:
+        TypeError: If ``before`` is neither an integer nor 'max'.
+    """
+    # Define a custom function to forward fill specified columns
+    def forward_fill_partition(df):
+        df[channels] = df[channels].ffill()
+        return df
+
+    # Calculate the number of rows in each partition and use the smallest as overlap
+    if before == 'max':
+        nrows = df.map_partitions(len)
+        if compute_lengths:
+            with ProgressBar():
+                print("Computing dataframe shape...")
+                nrows = nrows.compute()
+        before = min(nrows)
+    elif not isinstance(before, int):
+        raise TypeError('before must be an integer or "max"')
+    # Use map_overlap to apply forward_fill_partition
+    for _ in range(iterations):
+        df = df.map_overlap(
+            forward_fill_partition,
+            before=before,
+            after=0,
+        )
+    return df
diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index f5f5c803..c1622c87 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -7,8 +7,8 @@ This can then be saved as a parquet for out-of-sed
 processing and reread back to access other sed funtionality.
 """
+import time
 from functools import reduce
-from itertools import compress
 from pathlib import Path
 from typing import List
 from typing import Sequence
@@ -18,6 +18,7 @@

 import dask.dataframe as dd
 import h5py
 import numpy as np
+import pyarrow.parquet as pq
 from joblib import delayed
 from joblib import Parallel
 from natsort import natsorted
@@ -25,6 +26,7 @@
 from pandas import MultiIndex
 from pandas import Series

+from sed.core import dfops
 from sed.loader.base.loader import BaseLoader
 from sed.loader.flash.metadata import MetadataRetriever
 from sed.loader.utils import parse_h5_keys
@@ -676,60 +678,6 @@ def buffer_file_handler(self, data_parquet_dir: Path, detector: str):

         return h5_filenames, parquet_filenames

-    def fill_na(
-        self,
-        dataframes: List[dd.DataFrame],
-    ) -> dd.DataFrame:
-        """
-        Fill NaN values in the given dataframes using intrafile forward filling.
-
-        Args:
-            dataframes (List[dd.DataFrame]): List of dataframes to fill NaN values.
-
-        Returns:
-            dd.DataFrame: Concatenated dataframe with filled NaN values.
-
-        Notes:
-            This method is specific to the flash data structure and is used to fill NaN values in
-            certain channels that only store information at a lower frequency. The low frequency
-            channels are exploded to match the dimensions of higher frequency channels, but they
-            may contain NaNs in the other columns. This method fills the NaNs for the specific
-            channels (per_pulse and per_train).
-
-        """
-        # Channels to fill NaN values
-        channels: List[str] = self.get_channels_by_format(["per_pulse", "per_train"])
-
-        # Fill NaN values within each dataframe
-        for i, _ in enumerate(dataframes):
-            dataframes[i][channels] = dataframes[i][channels].fillna(
-                method="ffill",
-            )
-
-        # Forward fill between consecutive dataframes
-        for i in range(1, len(dataframes)):
-            # Select pulse channels from current dataframe
-            subset = dataframes[i][channels]
-            # Find columns with NaN values in the first row
-            is_null = subset.loc[0].isnull().values.compute()
-            # Execute if there are NaN values in the first row
-            if is_null.sum() > 0:
-                # Select channel names with only NaNs
-                channels_to_overwrite = list(compress(channels, is_null[0]))
-                # Get values for those channels from the previous dataframe
-                values = dataframes[i - 1][channels].tail(1).values[0]
-                # Create a dictionary to fill NaN values
-                fill_dict = dict(zip(channels, values))
-                fill_dict = {k: v for k, v in fill_dict.items() if k in channels_to_overwrite}
-                # Fill NaN values with the corresponding values from the
-                # previous dataframe
-                dataframes[i][channels_to_overwrite] = subset[channels_to_overwrite].fillna(
-                    fill_dict,
-                )
-
-        # Concatenate the filled dataframes
-        return dd.concat(dataframes)
-
     def parquet_handler(
         self,
         data_parquet_dir: Path,
@@ -785,16 +733,24 @@ def parquet_handler(
                 data_parquet_dir,
                 detector,
             )
-
-        # Read all parquet files using dask and concatenate into one dataframe after filling
-        dataframe = self.fill_na(
-            [dd.read_parquet(file) for file in parquet_filenames],
+        # Read all parquet files into one dataframe using dask
+        dataframe = dd.read_parquet(parquet_filenames, calculate_divisions=True)
+        print("Filling NaN values...")
+        # Channels to fill NaN values
+        channels: List[str] = self.get_channels_by_format(["per_pulse", "per_train"])
+
+        # Use the row count of the smallest file as overlap, so the fill can bridge files
+        overlap = min(pq.read_metadata(prq).num_rows for prq in parquet_filenames)
+
+        dataframe = dfops.forward_fill_lazy(
+            df=dataframe,
+            channels=channels,
+            before=overlap,
+            iterations=self._config["dataframe"].get("forward_fill_iterations", 2),
         )
-
+        # Remove the NaNs from per_electron channels
         dataframe = dataframe.dropna(
             subset=self.get_channels_by_format(["per_electron"]),
        )
-
         # Save the dataframe as parquet if requested
         if save_parquet:
             dataframe.compute().reset_index(drop=True).to_parquet(parquet_path)
@@ -859,6 +815,7 @@ def read_dataframe(
             ValueError: If neither 'runs' nor 'files'/'data_raw_dir' is provided.
             FileNotFoundError: If the conversion fails for some files or no data is available.
""" + t0 = time.time() data_raw_dir, data_parquet_dir = self.initialize_paths() @@ -891,6 +848,7 @@ def read_dataframe( dataframe = self.parquet_handler(data_parquet_dir, **kwds) metadata = self.parse_metadata() if collect_metadata else {} + print(f"loading complete in {time.time() - t0:.2f} s") return dataframe, metadata diff --git a/tests/test_dfops.py b/tests/test_dfops.py index e412806c..fabb9236 100644 --- a/tests/test_dfops.py +++ b/tests/test_dfops.py @@ -2,13 +2,18 @@ """ import numpy as np import pandas as pd +import dask.dataframe as ddf +import pytest from sed.core.dfops import apply_filter from sed.core.dfops import apply_jitter from sed.core.dfops import drop_column from sed.core.dfops import map_columns_2d +from sed.core.dfops import forward_fill_lazy + N_PTS = 100 +N_PARTITIONS = 10 cols = ["posx", "posy", "energy"] df = pd.DataFrame(np.random.randn(N_PTS, len(cols)), columns=cols) @@ -69,3 +74,65 @@ def swap(x, y): ) assert np.all(df[x_column] == df_swapped[new_x_column]) assert np.all(df[y_column] == df_swapped[new_y_column]) + + +def test_forward_fill_lazy_sparse_nans(): + """ test that a lazy forward fill works as expected with sparse nans""" + t_df = df.copy() + t_df['energy'][::2] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + t_dask_df = forward_fill_lazy(t_dask_df, 'energy', before='max') + t_df = t_df.ffill() + pd.testing.assert_frame_equal(t_df, t_dask_df.compute()) + + +def test_forward_fill_lazy_full_partition_nans(): + """ test that a lazy forward fill works as expected with a full partition of nans""" + t_df = df.copy() + t_df['energy'][5:25] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + t_dask_df = forward_fill_lazy(t_dask_df, 'energy', before='max') + t_df = t_df.ffill() + pd.testing.assert_frame_equal(t_df, t_dask_df.compute()) + + +def test_forward_fill_lazy_consecutive_full_partition_nans(): + """ test that a lazy forward fill fails as expected on two consecutive partitions + full of nans + """ + t_df = df.copy() + t_df['energy'][5:35] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + t_dask_df = forward_fill_lazy(t_dask_df, 'energy', before='max') + t_df = t_df.ffill() + assert not t_df.equals(t_dask_df.compute()) + + +def test_forward_fill_lazy_wrong_parameters(): + """ test that a lazy forward fill fails as expected on wrong parameters""" + t_df = df.copy() + t_df['energy'][5:35] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + with pytest.raises(TypeError): + t_dask_df = forward_fill_lazy(t_dask_df, 'energy', before='wrong parameter') + + +def test_forward_fill_lazy_compute(): + """ test that a lazy forward fill works as expected with compute=True""" + t_df = df.copy() + t_df['energy'][5:35] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + t_dask_df_comp = forward_fill_lazy(t_dask_df, 'energy', before='max', compute_lengths=True) + t_dask_df_nocomp = forward_fill_lazy(t_dask_df, 'energy', before='max', compute_lengths=False) + pd.testing.assert_frame_equal(t_dask_df_comp.compute(), t_dask_df_nocomp.compute()) + + +def test_forward_fill_lazy_keep_head_nans(): + """ test that a lazy forward fill works as expected with missing values at the + beginning of the dataframe""" + t_df = df.copy() + t_df['energy'][:5] = np.nan + t_dask_df = ddf.from_pandas(t_df, npartitions=N_PARTITIONS) + t_df = forward_fill_lazy(t_dask_df, 'energy', before='max').compute() + assert np.all(np.isnan(t_df['energy'][:5])) + assert 
np.all(np.isfinite(t_df['energy'][5:]))
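
For reference, a minimal usage sketch of the new forward_fill_lazy helper introduced in sed/core/dfops.py. The column name, partition count, and NaN pattern below are illustrative only and not part of this change:

import dask.dataframe as dd
import numpy as np
import pandas as pd

from sed.core.dfops import forward_fill_lazy

# A sparse column: 'energy' is only recorded on every 10th row, the rest is NaN.
n = 100
energy = np.full(n, np.nan)
energy[::10] = np.random.rand(10)
pdf = pd.DataFrame({"pulseId": np.arange(n), "energy": energy})

# Ten partitions of ten rows each; most partitions hold a single recorded value.
ddf = dd.from_pandas(pdf, npartitions=10)

# before='max' overlaps each partition with as many rows as the smallest
# partition holds; two iterations suffice unless two adjacent partitions
# are entirely NaN (see test_forward_fill_lazy_consecutive_full_partition_nans).
ddf = forward_fill_lazy(ddf, channels=["energy"], before="max", iterations=2)

# Only NaNs before the first recorded value would survive; row 0 is recorded here.
assert ddf.compute()["energy"].isna().sum() == 0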