diff --git a/.gitignore b/.gitignore index 1c02e5e..5f833dd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.jl.mem Manifest.toml /docs/build/ +/talk/data/ +/talk/build/ diff --git a/README.md b/README.md index ec8d544..bee995e 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,10 @@ [![docs](https://img.shields.io/badge/docs-dev-blue.svg)](https://beacon-biosignals.github.io/OndaBatches.jl/dev) Local and distributed batch loading for Onda formatted datasets. + +### JuliaCon 2023 + +Watch our [JuliaCon2023 talk on +OndaBatches.jl](https://www.youtube.com/live/FIeO1yenQ6Y?feature=share&t=23190)! +[Slides](https://beacon-biosignals.github.io/OndaBatches.jl/juliacon2023/) +(and [source + demo](https://github.com/beacon-biosignals/OndaBatches.jl/tree/main/talk/)) diff --git a/docs/src/index.md b/docs/src/index.md index 5e8c21f..62cb9d7 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,6 +2,11 @@ CurrentModule = OndaBatches ``` +Watch our [JuliaCon2023 talk on +OndaBatches.jl](https://www.youtube.com/live/FIeO1yenQ6Y?feature=share&t=23190)! 
+[Slides](https://beacon-biosignals.github.io/OndaBatches.jl/juliacon2023/) +(and [source + demo](https://github.com/beacon-biosignals/OndaBatches.jl/tree/main/talk/)) + # Public API ## Labeled signals diff --git a/talk/Manifest.toml b/talk/Manifest.toml new file mode 100644 index 0000000..5bd9017 --- /dev/null +++ b/talk/Manifest.toml @@ -0,0 +1,702 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.9.2" +manifest_format = "2.0" +project_hash = "5b516b946bb060765d644b7a03464ec363779760" + +[[deps.AWS]] +deps = ["Base64", "Compat", "Dates", "Downloads", "GitHub", "HTTP", "IniFile", "JSON", "MbedTLS", "Mocking", "OrderedCollections", "Random", "SHA", "Sockets", "URIs", "UUIDs", "XMLDict"] +git-tree-sha1 = "f3386c719e0096a61c7da0cb64a6b7f03cc3549f" +uuid = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" +version = "1.90.0" + +[[deps.AWSS3]] +deps = ["AWS", "ArrowTypes", "Base64", "Compat", "Dates", "EzXML", "FilePathsBase", "HTTP", "MbedTLS", "Mocking", "OrderedCollections", "Retry", "SymDict", "URIs", "UUIDs", "XMLDict"] +git-tree-sha1 = "d87804d72660de156ceb3f675e5c6bbdc9bee607" +uuid = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" +version = "0.11.2" + +[[deps.AlignedSpans]] +deps = ["ArrowTypes", "Dates", "Intervals", "Onda", "StructTypes", "TimeSpans"] +git-tree-sha1 = "4241b9c701634ee4856625379d10805c7f42771a" +uuid = "72438786-fd5d-49ef-8843-650acbdfe662" +version = "0.2.5" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" + +[[deps.Arrow]] +deps = ["ArrowTypes", "BitIntegers", "CodecLz4", "CodecZstd", "ConcurrentUtilities", "DataAPI", "Dates", "EnumX", "LoggingExtras", "Mmap", "PooledArrays", "SentinelArrays", "Tables", "TimeZones", "TranscodingStreams", "UUIDs"] +git-tree-sha1 = "954666e252835c4cf8819ce4ffaf31073c1b7233" +uuid = "69666777-d1a9-59fb-9406-91d4454c9d45" +version = "2.6.2" + +[[deps.ArrowTypes]] +deps = ["Sockets", "UUIDs"] +git-tree-sha1 = 
"8c37bfdf1b689c6677bbfc8986968fe641f6a299" +uuid = "31f734f8-188a-4ce0-8406-c8a06bd891cd" +version = "2.2.2" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.BitFlags]] +git-tree-sha1 = "43b1a4a8f797c1cddadf60499a8a077d4af2cd2d" +uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35" +version = "0.1.7" + +[[deps.BitIntegers]] +deps = ["Random"] +git-tree-sha1 = "abb894fb55122b4604af0d460d3018e687a60963" +uuid = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" +version = "0.3.0" + +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.2" + +[[deps.CodecLz4]] +deps = ["Lz4_jll", "TranscodingStreams"] +git-tree-sha1 = "59fe0cb37784288d6b9f1baebddbf75457395d40" +uuid = "5ba52731-8f18-5e0d-9241-30f10d1ec561" +version = "0.4.0" + +[[deps.CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "02aa26a4cf76381be7f66e020a3eddeb27b0a092" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.2" + +[[deps.CodecZstd]] +deps = ["CEnum", "TranscodingStreams", "Zstd_jll"] +git-tree-sha1 = "849470b337d0fa8449c21061de922386f32949d9" +uuid = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +version = "0.7.2" + +[[deps.Compat]] +deps = ["UUIDs"] +git-tree-sha1 = "5ce999a19f4ca23ea484e92a1774a61b8ca4cf8e" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "4.8.0" +weakdeps = ["Dates", "LinearAlgebra"] + + [deps.Compat.extensions] + CompatLinearAlgebraExt = "LinearAlgebra" + +[[deps.CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" + +[[deps.ConcurrentUtilities]] +deps = ["Serialization", "Sockets"] +git-tree-sha1 = "5372dbbf8f0bdb8c700db5367132925c0771ef7e" +uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb" +version = "2.2.1" + +[[deps.ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = 
"fe2838a593b5f776e1597e086dcd47560d94e816" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.5.3" + + [deps.ConstructionBase.extensions] + ConstructionBaseIntervalSetsExt = "IntervalSets" + ConstructionBaseStaticArraysExt = "StaticArrays" + + [deps.ConstructionBase.weakdeps] + IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + +[[deps.DataAPI]] +git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.15.0" + +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + +[[deps.DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "cf25ccb972fec4e4817764d01c82386ae94f77b4" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.14" + +[[deps.DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.DefaultApplication]] +deps = ["InteractiveUtils"] +git-tree-sha1 = "c0dfa5a35710a193d83f03124356eef3386688fc" +uuid = "3f0dd361-4fe0-5fc6-8523-80b14ec94d85" +version = "1.1.0" + +[[deps.Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[deps.DocStringExtensions]] +deps = 
["LibGit2"] +git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.6" + +[[deps.Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "fb1ff838470573adc15c71ba79f8d31328f035da" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.25.2" + +[[deps.DocumenterMarkdown]] +deps = ["Documenter"] +git-tree-sha1 = "9af057a98652336e30586d8092fac06f8b28ecdc" +uuid = "997ab1e6-3595-5248-9280-8efb232c3433" +version = "0.2.2" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[deps.EnumX]] +git-tree-sha1 = "bdb1942cd4c45e3c678fd11569d5cccd80976237" +uuid = "4e289a0a-7415-4d19-859d-a7e5c4648b56" +version = "1.0.4" + +[[deps.ExceptionUnwrapping]] +deps = ["Test"] +git-tree-sha1 = "e90caa41f5a86296e014e148ee061bd6c3edec96" +uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4" +version = "0.1.9" + +[[deps.ExprTools]] +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.9" + +[[deps.EzXML]] +deps = ["Printf", "XML2_jll"] +git-tree-sha1 = "0fa3b52a04a4e210aeb1626def9c90df3ae65268" +uuid = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +version = "1.1.0" + +[[deps.FilePathsBase]] +deps = ["Compat", "Dates", "Mmap", "Printf", "Test", "UUIDs"] +git-tree-sha1 = "e27c4ebe80e8699540f2d6c805cc12203b614f12" +uuid = "48062228-2e41-5def-b9a4-89aafe57970f" +version = "0.9.20" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[deps.GitHub]] +deps = ["Base64", "Dates", "HTTP", "JSON", "MbedTLS", "Sockets", "SodiumSeal", "URIs"] +git-tree-sha1 = "5688002de970b9eee14b7af7bbbd1fdac10c9bbe" +uuid = "bc5e4493-9b4d-5f90-b8aa-2b2bcaad7a26" +version 
= "5.8.2" + +[[deps.Glob]] +git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496" +uuid = "c27321d9-0574-5035-807b-f59d2c89b15c" +version = "1.3.1" + +[[deps.HTTP]] +deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] +git-tree-sha1 = "cb56ccdd481c0dd7f975ad2b3b62d9eda088f7e2" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "1.9.14" + +[[deps.IOCapture]] +deps = ["Logging", "Random"] +git-tree-sha1 = "d75853a0bdbfb1ac815478bacd89cd27b550ace6" +uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" +version = "0.2.3" + +[[deps.IniFile]] +git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.1" + +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.Intervals]] +deps = ["Dates", "Printf", "RecipesBase", "Serialization", "TimeZones"] +git-tree-sha1 = "ac0aaa807ed5eaf13f67afe188ebc07e828ff640" +uuid = "d8418881-c3e1-53bb-8760-2df7ec849ed5" +version = "1.10.0" + +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.3.0" + +[[deps.IrrationalConstants]] +git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" +uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" +version = "0.2.2" + +[[deps.IterTools]] +git-tree-sha1 = "4ced6667f9974fc5c5943fa5e2ef1ca43ea9e450" +uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +version = "1.8.0" + +[[deps.IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + +[[deps.JLLWrappers]] +deps = 
["Preferences"] +git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.4.1" + +[[deps.JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.4" + +[[deps.LaTeXStrings]] +git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.3.0" + +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.Legolas]] +deps = ["Arrow", "ConstructionBase", "Tables", "UUIDs"] +git-tree-sha1 = "2af55bc396c8ee085d0953e48308791f247064f5" +uuid = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" +version = "0.5.14" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "7.84.0+0" + +[[deps.LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" + +[[deps.Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c7cb1f5d892775ba13767a87c7ada0b980ea0a71" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.1+2" + +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[deps.Literate]] +deps = ["Base64", "IOCapture", "JSON", "REPL"] +git-tree-sha1 = "1c4418beaa6664041e0f9b48f0710f57bff2fcbe" +uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +version = "2.14.0" + +[[deps.LogExpFunctions]] +deps = 
["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] +git-tree-sha1 = "c3ce8e7420b3a6e071e0fe4745f5d4300e37b13f" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.3.24" + + [deps.LogExpFunctions.extensions] + LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" + LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" + LogExpFunctionsInverseFunctionsExt = "InverseFunctions" + + [deps.LogExpFunctions.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.LoggingExtras]] +deps = ["Dates", "Logging"] +git-tree-sha1 = "cedb76b37bc5a6c702ade66be44f831fa23c681e" +uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" +version = "1.0.0" + +[[deps.Lz4_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "5d494bc6e85c4c9b626ee0cab05daa4085486ab1" +uuid = "5ced341a-0733-55b8-9ab6-a4889d929147" +version = "1.9.3+0" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "Random", "Sockets"] +git-tree-sha1 = "03a9b9718f5682ecb107ac9f7308991db4ce395b" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.1.7" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" + +[[deps.Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "1.1.0" + +[[deps.Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[deps.Mocking]] +deps = ["Compat", "ExprTools"] +git-tree-sha1 = "4cc0c5a83933648b615c36c2b956d94fda70641e" +uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" +version = "0.7.7" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = 
"2022.10.11" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.Onda]] +deps = ["Arrow", "CodecZstd", "Compat", "Dates", "Legolas", "Mmap", "Random", "Tables", "TimeSpans", "TranscodingStreams", "UUIDs"] +git-tree-sha1 = "e9fc7b4bbb68332ecac36f0d82bdf7ed88394b39" +uuid = "e853f5be-6863-11e9-128d-476edb89bfb5" +version = "0.15.1" + +[[deps.OndaBatches]] +deps = ["AWS", "AWSS3", "AlignedSpans", "DataFrames", "Dates", "Distributed", "Legolas", "Onda", "StatsBase", "Tables", "TimeSpans", "UUIDs"] +path = ".." +uuid = "181bd894-5b11-491a-bec3-9b1779d96000" +version = "0.4.7" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.OpenSSL]] +deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] +git-tree-sha1 = "51901a49222b09e3743c65b8847687ae5fc78eb2" +uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" +version = "1.4.1" + +[[deps.OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "cae3153c7f6cf3f069a853883fd1919a6e5bab5b" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "3.0.9+0" + +[[deps.OrderedCollections]] +git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.6.2" + +[[deps.Parsers]] +deps = ["Dates", "PrecompileTools", "UUIDs"] +git-tree-sha1 = "4b2e829ee66d4218e0cef22c0a64ee37cf258c29" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.7.1" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.9.2" + +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" 
+version = "1.4.2" + +[[deps.PrecompileTools]] +deps = ["Preferences"] +git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" +uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +version = "1.1.2" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.4.0" + +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.RecipesBase]] +deps = ["PrecompileTools"] +git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "1.3.4" + +[[deps.Reexport]] +git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.2.2" + +[[deps.Remark]] +deps = ["CodecZlib", "DefaultApplication", "Documenter", "DocumenterMarkdown", "Downloads", "Glob", "JSON", "Literate", "Random", "Tar"] +git-tree-sha1 = "4f5790d5e66a5337216052ee99936e792f7a81c3" +uuid = "79b45036-8e38-5d04-8f49-b9fb23ff5a0d" +version = "0.3.3" + +[[deps.Retry]] +git-tree-sha1 = "41ac127cd281bb33e42aba46a9d3b25cd35fc6d5" +uuid = "20febd7b-183b-5ae2-ac4a-720e7ce64774" +version = "0.4.1" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" + +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = 
"04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.SimpleBufferStream]] +git-tree-sha1 = "874e8867b33a00e784c8a7e4b60afe9e037b74e1" +uuid = "777ac1f9-54b0-4bf8-805c-2214025038e7" +version = "1.1.0" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.SodiumSeal]] +deps = ["Base64", "Libdl", "libsodium_jll"] +git-tree-sha1 = "80cef67d2953e33935b41c6ab0a178b9987b1c99" +uuid = "2133526b-2bfb-4018-ac12-889fb3908a75" +version = "0.1.1" + +[[deps.SortingAlgorithms]] +deps = ["DataStructures"] +git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "1.1.1" + +[[deps.SparseArrays]] +deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[deps.StableRNGs]] +deps = ["Random", "Test"] +git-tree-sha1 = "3be7d49667040add7ee151fefaf1f8c04c8c8276" +uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" +version = "1.0.0" + +[[deps.Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +version = "1.9.0" + +[[deps.StatsAPI]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "45a7769a04a3cf80da1c1c7c60caf932e6f4c9f7" +uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" +version = "1.6.0" + +[[deps.StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] +git-tree-sha1 = "d1bf48bfcc554a3761a133fe3a9bb01488e06916" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.33.21" + +[[deps.StringManipulation]] +git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.0" + +[[deps.StructTypes]] +deps = ["Dates", "UUIDs"] +git-tree-sha1 = 
"ca4bccb03acf9faaf4137a9abc1881ed1841aa70" +uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +version = "1.10.0" + +[[deps.SuiteSparse_jll]] +deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" +version = "5.10.1+6" + +[[deps.SymDict]] +deps = ["Test"] +git-tree-sha1 = "0108ccdaea3ef69d9680eeafc8d5ad198b896ec8" +uuid = "2da68c74-98d7-5633-99d6-8493888d7b1e" +version = "0.3.0" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.1" + +[[deps.Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] +git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.10.1" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" + +[[deps.Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.TimeSpans]] +deps = ["Compat", "Dates", "Statistics"] +git-tree-sha1 = "c922b37ba4b71c3a59c46de46a8ba0def1820c69" +uuid = "bb34ddd2-327f-4c4a-bfb0-c98fc494ece1" +version = "0.3.8" + +[[deps.TimeZones]] +deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Scratch", "Unicode"] +git-tree-sha1 = "5b347464bdac31eccfdbe1504d9484c31645cafc" +uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" +version = "1.11.0" + +[[deps.TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "9a6ae7ed916312b41236fcef7e0af564ef934769" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.13" + +[[deps.URIs]] +git-tree-sha1 = "074f993b0ca030848b897beff716d93aca60f06a" +uuid = 
"5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +version = "1.4.2" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "93c41695bc1c08c46c5899f4fe06d6ead504bb73" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.10.3+0" + +[[deps.XMLDict]] +deps = ["EzXML", "IterTools", "OrderedCollections"] +git-tree-sha1 = "d9a3faf078210e477b291c79117676fca54da9dd" +uuid = "228000da-037f-5747-90a9-8195ccbf91a5" +version = "0.4.1" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[deps.Zstd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "49ce682769cd5de6c72dcf1b94ed7790cd08974c" +uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" +version = "1.5.5+0" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" + +[[deps.libsodium_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "848ab3d00fe39d6fbc2a8641048f8f272af1c51e" +uuid = "a9144af2-ca23-56d9-984f-0d03f7b5ccf8" +version = "1.0.20+0" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.48.0+0" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/talk/Project.toml b/talk/Project.toml new file mode 100644 index 0000000..6e78b64 --- /dev/null +++ b/talk/Project.toml @@ -0,0 +1,8 @@ +[deps] +AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Legolas = "741b9549-f6ed-4911-9fbf-4a1c0c97f0cd" +Onda = "e853f5be-6863-11e9-128d-476edb89bfb5" +OndaBatches = "181bd894-5b11-491a-bec3-9b1779d96000" +Remark = "79b45036-8e38-5d04-8f49-b9fb23ff5a0d" 
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" diff --git a/talk/demo.jl b/talk/demo.jl new file mode 100644 index 0000000..359101f --- /dev/null +++ b/talk/demo.jl @@ -0,0 +1,209 @@ +using Distributed +# one manager and 3 batch loaders +addprocs(4) + +@everywhere begin + using DataFrames + using Dates + using Legolas + using Onda + using OndaBatches + using StableRNGs +end + +##### +##### setup +##### + +include("local_data.jl") +const VALID_STAGES = ("wake", "nrem1", "nrem2", "nrem3", "rem", "no_stage") +const SLEEP_STAGE_INDEX = Dict(s => UInt8(i) + for (i, s) + in enumerate(VALID_STAGES)) + +##### +##### basic functionality +##### + +signals, labels = load_tables(; strip_refs=true); + +# input signal data (X) +signals +describe(signals, :eltype, :first) +samples = Onda.load(first(signals)) + +# input label data (Y) +describe(labels, :eltype, :first) + +labeled_signals = label_signals(signals, labels, + labels_column=:stage, + encoding=SLEEP_STAGE_INDEX, + epoch=Second(30)) +describe(labeled_signals, :eltype) +labeled_signals.labels[1] + +batches = RandomBatches(; labeled_signals, + # uniform weighting of signals + labels + signal_weights=nothing, + label_weights=nothing, + n_channels=2, + batch_size=3, + batch_duration=Minute(5)) + +state0 = StableRNG(1) +batch, state = iterate_batch(batches, deepcopy(state0)) +describe(batch, :eltype, :first) + +x, y = materialize_batch(batch); + +# signal tensor: +x + +# labels tensor: +y + +##### +##### persist label sets +##### + +labeled_signals_stored = store_labels(labeled_signals, + joinpath(@__DIR__, "data", "labels")) + +describe(labeled_signals_stored, :eltype, :first) +first(labeled_signals_stored.labels) + +batches = RandomBatches(; labeled_signals=labeled_signals_stored, + # uniform weighting of signals + labels + signal_weights=nothing, + label_weights=nothing, + n_channels=2, + batch_size=3, + batch_duration=Minute(5)) + +state0 = StableRNG(1) + +batch, state = iterate_batch(batches, deepcopy(state0)) 
+describe(batch, :eltype, :first) + +x, y = materialize_batch(batch); + +x +y + +##### +##### zero missing channels +##### + +struct ZeroMissingChannels + channels::Vector{String} +end +function OndaBatches.get_channel_data(samples::Samples, channels::ZeroMissingChannels) + out = zeros(eltype(samples.data), length(channels.channels), size(samples.data, 2)) + for (i, c) in enumerate(channels.channels) + if c ∈ samples.info.channels + @views out[i:i, :] .= samples[c, :].data + end + end + return out +end + +channels = ZeroMissingChannels(["c3", "c4", "o1", "o2", "not-a-real-channel"]) + +OndaBatches.get_channel_data(samples, channels) + +# normally we'd make our batch iterator set this field for us but for demo +# purposes we'll do it manually +batch.batch_channels .= Ref(channels); +x, y = materialize_batch(batch); + +batch +x + +##### +##### compartments +##### + +struct EvenOdds + n_channels::Int +end +function OndaBatches.get_channel_data(samples::Samples, channels::EvenOdds) + n = channels.n_channels + chans = samples.info.channels + odds = @view(samples[chans[1:2:n], :]).data + evens = @view(samples[chans[2:2:n], :]).data + return cat(evens, odds; dims=3) +end + +channels = EvenOdds(4) + +labeled_signals_four_channels = filter(:channels => >=(4) ∘ length, + labeled_signals) +batches_eo = RandomBatches(; labeled_signals=labeled_signals_four_channels, + # uniform weighting of signals + labels + signal_weights=nothing, + label_weights=nothing, + n_channels=2, + batch_size=3, + batch_duration=Minute(5)) + +batch, _ = iterate_batch(batches_eo, copy(state0)) +batch.batch_channels .= Ref(channels) +x, y = materialize_batch(batch) + +x + +batch_flat = deepcopy(batch) +batch_flat.batch_channels .= Ref(1:4) +x_flat, _ = materialize_batch(batch_flat) +x_flat + +##### +##### batch service +##### + +using Distributed +# one manager and 3 batch loaders +addprocs(4) + +@everywhere begin + using DataFrames + using Dates + using Legolas + using Onda + using OndaBatches + using 
StableRNGs +end + +batcher = Batcher(workers(), batches; start=false) + +get_status(batcher) + +start!(batcher, copy(state0)); +get_status(batcher) + +state = copy(state0) +(x, y), state = take!(batcher, state) +(x, y), state = take!(batcher, state) + +# taking from an out-of-sync state will restart the batcher +(x, y), state = take!(batcher, copy(state0)) + +stop!(batcher) +get_status(batcher) + +# errors get propagated to consumer +bad_batches = deepcopy(batches) +bad_batches.labeled_signals.file_path .= "blah blah not a path" +bad_batcher = Batcher(workers(), bad_batches; start=true, state=copy(state0)) + +take!(bad_batcher, state0) + +# turn on debug logging if you want to see the gory details for debugging +@everywhere batcher.manager ENV["JULIA_DEBUG"] = "OndaBatches" +start!(batcher, copy(state0)) + +# turn on debug logging on the loaders for the _really gory_ details +stop!(batcher) +@everywhere ENV["JULIA_DEBUG"] = "OndaBatches" +start!(batcher, copy(state0)) + diff --git a/talk/local_data.jl b/talk/local_data.jl new file mode 100644 index 0000000..a1c9df7 --- /dev/null +++ b/talk/local_data.jl @@ -0,0 +1,47 @@ +using AWSS3 +using Legolas: @schema, @version +using Onda +using OndaBatches + +include(joinpath(@__DIR__, "../test/testdataset.jl")) +local_root = joinpath(@__DIR__, "data") + +local_signals_path = joinpath(local_root, "signals.arrow") +if !isfile(local_signals_path) + signals = DataFrame(Legolas.read(uncompressed_signals_path); copycols=true) + + local_signals = transform(signals, + :file_path => ByRow() do path + local_path = joinpath(local_root, "samples", + basename(path)) + cp(path, Path(local_path)) + @info string(path, '→', local_path) + return local_path + end => :file_path) + + Onda.load(first(local_signals)) + + Legolas.write(local_signals_path, local_signals, SignalV2SchemaVersion()) +end + +local_stages_path = joinpath(local_root, "stages.arrow") +if !isfile(local_stages_path) + cp(stages_path, Path(local_stages_path)) + stages = 
DataFrame(Legolas.read(local_stages_path); copycols=true) + stages = OndaBatches.sort_and_trim_spans(stages, :recording; epoch=Second(30)) + Legolas.write(local_stages_path, stages, SleepStageV1SchemaVersion()) +end + +function load_tables(; strip_refs=true) + signals = DataFrame(Legolas.read(local_signals_path); copycols=true) + if strip_refs + transform!(signals, + :channels => ByRow() do channels + [string(first(split(c, "-"; limit=2))) for c in channels] + end => :channels) + end + + stages = DataFrame(Legolas.read(local_stages_path); copycols=true) + return signals, stages +end + diff --git a/talk/make.jl b/talk/make.jl new file mode 100644 index 0000000..8c9092b --- /dev/null +++ b/talk/make.jl @@ -0,0 +1,9 @@ +using Remark, FileWatching + +while true + Remark.slideshow(@__DIR__; + options = Dict("ratio" => "16:9"), + title = "OndaBatches.jl") + @info "Rebuilt" + FileWatching.watch_file(joinpath(@__DIR__, "src", "index.md")) +end diff --git a/talk/src/index.md b/talk/src/index.md new file mode 100644 index 0000000..b5bce2b --- /dev/null +++ b/talk/src/index.md @@ -0,0 +1,350 @@ +class: middle + +.slide-title[ + +# OndaBatches.jl: Continuous, repeatable, and distributed batching + +## Dave Kleinschmidt — Beacon Biosignals + +### JuliaCon 2023 — [slide source + demo](https://github.com/beacon-biosignals/OndaBatches.jl/tree/main/talk/) +] + +--- + +# Who am I? + +Research Software Engineer at Beacon Biosignals + +Our team builds tools for internal users at Beacon doing machine learning and +other quantitative/computational work + +--- + +# Who are we? + +Beacon Biosignals + +> From its founding in 2019, Beacon Biosignals has engineered a machine learning +> platform designed to interrogate large EEG datasets at unprecedented speed and +> scale. + +--- + +# Why did we make this? 
+ +Support common need to _build batches from annotated time series data_ across +multiple ML efforts at Beacon: + +-- + +Multi-channel, regularly sampled time series data (i.e., EEG recordings) + +Task is "image segmentation": output dense, regularly sampled labels (i.e., +every 30s span gets a label) + +-- + +Input data is Onda-formatted `Samples` + annotations (time span + label) + +Models require _numerical tensors_ for training/evaluation/inference + +--- + +# Who is this for? + +This might be interesting to you if you are + +1. an ML engineer looking to model large time-series datasets and want to + actually _use_ OndaBatches to build your batches. +2. developing similar tools and are interested in how we build re-usable + tools like this at Beacon. + +-- + +## Why might you care? + +1. We actually use this at Beacon! +2. It's a potentially useful example (cautionary tale?) for how to wrangle + inconveniently large data and the nuances of distributed computing in a + restricted domain + +??? + +gonna be honest, mostly focusing on the second group here! + +this is pretty specific to beacon's tooling and needs! and there's a fair +amount of path dependence in how we got to this state... 
+ +--- + +# Outline + +Part 1: Design, philosophy, and basic functionality + +Part 2: Making a distributed batch loading system that doesn't require expertise +in distributed systems to use + +--- + +# Design: Goals + +Distributed (integrate with our distributed ML pipelines, throw more resources +at it to make sure data movement is not the bottleneck) + +Scalable (handle out-of-core datasets, both for signal data and labels) + +Deterministic + reproducible (pseudo-random) + +Resumable + +Flexible and extensible via normal Julia mechanisms of multiple dispatch + +--- + +# Design: Philosophy + +Separate the _cheap_ parts where _order matters_ ("batch specification") from +_expensive parts_ which can be done _asynchronously_ ("batch materialization") + +-- + +Build on standard tooling (at Beacon), using +[Legolas.jl](https://github.com/beacon-biosignals/Legolas.jl) to define +interface schemas which extend +[Onda.jl](https://github.com/beacon-biosignals/Onda.jl) schemas. + +-- + +Use _iterator patterns_ to generate pseudorandom sequence of batch specs. + +-- + +Be flexible enough that it can be broadly useful across different ML efforts at +Beacon (and beyond??) + +-- + +Use function calls we control to provide hooks for users to customize certain +behaviors via multiple dispatch (e.g., how to materialize `Samples` data into +batch tensor) + +--- + +# How does it work? + +```julia +signals, labels = load_tables() +labeled_signals = label_signals(signals, labels, + labels_column=:stage, + encoding=SLEEP_STAGE_INDEX, + epoch=Second(30)) + +batches = RandomBatches(; labeled_signals, + # uniform weighting of signals + labels + signal_weights=nothing, + label_weights=nothing, + n_channels=1, + batch_size=2, + batch_duration=Minute(5)) + +state0 = StableRNG(1) + +batch, state = iterate_batch(batches, deepcopy(state0)) +x, y = materialize_batch(batch) +``` + +??? + +live demo here... 
+ +--- + +# Extensibility + +Some models require a specific set of channels to function (a "montage"), but +recordings don't always have all the required channels. + +Here's a "channel selector" to fill in the missing channels with zeros: + +```julia +struct ZeroMissingChannels + channels::Vector{String} +end + +function OndaBatches.get_channel_data(samples::Samples, channels::ZeroMissingChannels) + out = zeros(eltype(samples.data), length(channels.channels), size(samples.data, 2)) + for (i, c) in enumerate(channels.channels) + if c ∈ samples.info.channels + @views out[i:i, :] .= samples[c, :].data + end + end + return out +end +``` + +--- + +# Extensibility + +A very silly kind of "featurization": separate even and odd channels into +separate "compartments" (so they're processed independently in the model) + +```julia +struct EvenOdds end + +function OndaBatches.get_channel_data(samples::Samples, channels::EvenOdds) + chans = samples.info.channels + odds = @view(samples[chans[1:2:end], :]).data + evens = @view(samples[chans[2:2:end], :]).data + return cat(evens, odds; dims=3) +end +``` + +--- + +# Distributed batch loading: Why + +different models have different demands on batch loading (data size, +amount of preprocessing required, etc.) + +batch loading should _never_ be the bottleneck in our pipeline (GPU time is +expensive) + +distributing batch loading means we can always "throw more compute" at it + +??? + +(case study in lifting a serial workload into a distributed/async workload) + +another thing: working around flakiness of multithreading and unacceptably low +throughput for S3 reads. 
worker-to-worker communication has good enough +throughput + +--- + +# Distributed batch loading: How + +step 1: `return` → `RemoteChannel` + +```julia +function start_batching(channel, batches, state) + try + while true + batch, state = iterate_batch(batches, state) + xy = materialize_batch(batch) + put!(channel, (xy, copy(state))) + end + catch e + if is_channel_closed(e) + @info "channel closed, stopping batcher..." + return :closed + else + rethrow() + end + end +end + +init_state = StableRNG(1) +# need a buffered channel in order for producers to stay ahead +channel = RemoteChannel(() -> Channel{Any}(10)) +batch_worker = only(addprocs(1)) +future = remotecall(start_batching, batch_worker, channel, batches, init_state) +# now consumer can `take!(channel)` to retrieve batches when they're ready +``` + +??? + +the basic idea is that instead of calling a function `materialize_batch ∘ +iterate_batch`, we will instead make a _service_ that feeds materialized batches +and the corresponding batcher states onto a `Distributed.RemoteChannel` where a +consumer can retrieve them. + +of course, this still loads batches in serial, one at a time. if we didn't care +about the order of the batches or reproducibility, we could just start multiple +independent feeder processes to feed the channel. + +--- + +# Distributed batch loading: How + +Step 2: Load multiple batches at the same time + +Need to be careful to make sure the _order of batches_ is the same regardless of +the number of workers etc. + +This is where the separation between batch _specification_ and batch +_materialization_ pays off: the specifications are small and cheap to +produce/serialize, so we can do them sequentially on the "manager" process. 
+ +```julia +function pmap_batches!(channel::RemoteChannel, spec, state, workers) + futures_states = map(workers) do worker + batch, state = iterate_batch(spec, state) + batch_future = remotecall(materialize_batch, worker, batch) + return batch_future, copy(state) + end + + for (future, s) in futures_states + xy = fetch(future) + put!(channel, (xy, s)) + end + + return state +end +``` + +(Note this doesn't quite work when you have a _finite_ series of batches) + +??? + +cycle through the workers one at a time, feeding them a batch spec. + +--- + +# Distributed batch loading: How + +Step 2: Load multiple batches at the same time + +```julia +function pmap_batches!(channel::RemoteChannel, spec, state, workers) + # ... +end + +function start_batching(channel::RemoteChannel, spec, state, workers) + try + while true + state = pmap_batches!(channel, spec, state, workers) + end + catch e + if is_closed_ex(e) + @info "batch channel closed, batching stopped" + return :closed + else + rethrow(e) + end + end +end +``` + +--- + +# Batching service + +Lots of bookkeeping required for this! +- `Future` returned by `remotecall(start_batching, ...)` +- `RemoteChannel` for serving the batches +- batch iterator itself + +What happens when things go wrong?? it's very tricky to get errors to surface +properly and avoid bad states like silent deadlocks + +We provide a `Batcher` struct that +- does the bookkeeping +- provides a limited API surface to reduce complexity for users... +- ...and manages complexity for developers/maintainers + +--- + +# thanks! 
+ + diff --git a/talk/src/style.css b/talk/src/style.css new file mode 100644 index 0000000..703c197 --- /dev/null +++ b/talk/src/style.css @@ -0,0 +1,77 @@ +/* Lora used for body */ +@font-face{ + font-family: 'Lora'; + src: url('fonts/Lora/Lora-Regular.ttf'); +} +@font-face{ + font-family: 'Lora'; + src: url('fonts/Lora/Lora-Bold.ttf'); + font-weight: bold; +} +@font-face{ + font-family: 'Lora'; + src: url('fonts/Lora/Lora-Italic.ttf'); + font-style: italic; +} +@font-face{ + font-family: 'Lora'; + src: url('fonts/Lora/Lora-BoldItalic.ttf'); + font-weight: bold; + font-style: italic; +} + +/* Yanone Kaffeesatz used for h1, h2, h3 */ +@font-face{ + font-family: 'Yanone Kaffeesatz'; + src: url('fonts/Yanone_Kaffeesatz/YanoneKaffeesatz-Regular.ttf'); +} +@font-face{ + font-family: 'Yanone Kaffeesatz'; + src: url('fonts/Yanone_Kaffeesatz/YanoneKaffeesatz-Bold.ttf'); + font-weight: bold; +} + +/* Ubuntu Mono used for code, do we need Italic for code ? */ +@font-face{ + font-family: 'Ubuntu Mono'; + src: url('fonts/Ubuntu_Mono/UbuntuMono-Regular.ttf'); +} +@font-face{ + font-family: 'Ubuntu Mono'; + src: url('fonts/Ubuntu_Mono/UbuntuMono-Bold.ttf'); + font-weight: bold; +} +@font-face{ + font-family: 'Ubuntu Mono'; + src: url('fonts/Ubuntu_Mono/UbuntuMono-Italic.ttf'); + font-style: italic; +} +@font-face{ + font-family: 'Ubuntu Mono'; + src: url('fonts/Ubuntu_Mono/UbuntuMono-BoldItalic.ttf'); + font-weight: bold; + font-style: italic; +} + +body { font-family: 'Lora'; } +h1, h2, h3 { + font-family: 'Yanone Kaffeesatz'; + font-weight: normal; +} +.remark-code, .remark-inline-code { font-family: 'Ubuntu Mono'; } + +.slide-title h1 { + font-size: 80px; +} + +.slide-title h3 { + font-size: 48px; + color: #888; +} + +.slide-title h2 { + font-size: 48px; + color: #666; +} + +.slide-title.contact { font-size: 60%; } diff --git a/test/OndaBatchesTests.jl b/test/OndaBatchesTests.jl index c2a55b8..fa835bd 100644 --- a/test/OndaBatchesTests.jl +++ b/test/OndaBatchesTests.jl @@ 
-82,14 +82,10 @@ struct ZeroMissingChannels channels::Vector{String} end function OndaBatches.get_channel_data(samples::Samples, channels::ZeroMissingChannels) - out = zeros(eltype(samples.data), - length(channels.channels), - size(samples.data, 2)) + out = zeros(eltype(samples.data), length(channels.channels), size(samples.data, 2)) for (i, c) in enumerate(channels.channels) if c ∈ samples.info.channels - # XXX: this is extraordinarly inefficient and makes lots of copies, - # it's just here for demonstration - out[i, :] .= samples[c, :].data[1, :] + @views out[i:i, :] .= samples[c, :].data end end return out