Skip to content

Commit

Permalink
Merge pull request #3 from pat-alt/1-polish-data-related-things
Browse files Browse the repository at this point in the history
1 polish data related things
  • Loading branch information
pat-alt authored Jan 15, 2024
2 parents 46e94f8 + 16c333e commit aa3dd5f
Show file tree
Hide file tree
Showing 25 changed files with 749 additions and 97 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ jobs:
fail-fast: false
matrix:
version:
- '1.7'
- '1.8'
- '1.9'
- '1.10'
- '1.6'
- 'nightly'
os:
- ubuntu-latest
Expand Down
6 changes: 3 additions & 3 deletions Artifacts.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[clean_data]
git-tree-sha1 = "c8e8fea9e66a8e6decae92a0241b9939751820bb"
git-tree-sha1 = "a3e9fd02627aea04ff5a7c15c6a6280d534ae025"

[[clean_data.download]]
sha256 = "01b6c33c2af63a34da13ca3c524c869d0dd61a2ea5f17bc1a8a9eedf3f93d974"
url = "https://github.com/pat-alt/TrillionDollarWords.jl/releases/download/artifacts-latest/c8e8fea9e66a8e6decae92a0241b9939751820bb.tar.gz"
sha256 = "6a52e25e8f74a3ed757f5ac06eaa1a187ada7685f09b4c7456af1d035d321adc"
url = "https://github.com/pat-alt/TrillionDollarWords.jl/releases/download/artifacts-latest/a3e9fd02627aea04ff5a7c15c6a6280d534ae025.tar.gz"
2 changes: 1 addition & 1 deletion Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

julia_version = "1.10.0"
manifest_format = "2.0"
project_hash = "30f2cbd08dc7ab95e5dd5eb4b26c054d944c0210"
project_hash = "dcbadeacd4a31e84fbc1a0961c936ceaddb437fc"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
Expand Down
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"

[compat]
julia = "1.6"
CSV = "0.10"
DataFrames = "1.6"
julia = "1.7, 1.8, 1.9, 1.10"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
181 changes: 180 additions & 1 deletion dev/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,28 @@

julia_version = "1.10.0"
manifest_format = "2.0"
project_hash = "6513938bfd5cfb8417557e3be387b14d094804cf"
project_hash = "06d46a2007196684bb080acefde8a4980709f33f"

[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"
weakdeps = ["ChainRulesCore", "Test"]

[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.7.2"
weakdeps = ["StaticArrays"]

[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"

[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
Expand All @@ -17,6 +38,11 @@ version = "0.2.4"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.BSON]]
git-tree-sha1 = "2208958832d6e1b59e49f53697483a84ca8d664e"
uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
version = "0.3.7"

[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

Expand All @@ -31,6 +57,34 @@ git-tree-sha1 = "679e69c611fff422038e9e21e270c4197d49d918"
uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
version = "0.10.12"

[[deps.CategoricalArrays]]
deps = ["DataAPI", "Future", "Missings", "Printf", "Requires", "Statistics", "Unicode"]
git-tree-sha1 = "1568b28f91293458345dabba6a5ea3f183250a61"
uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
version = "0.10.8"

[deps.CategoricalArrays.extensions]
CategoricalArraysJSONExt = "JSON"
CategoricalArraysRecipesBaseExt = "RecipesBase"
CategoricalArraysSentinelArraysExt = "SentinelArrays"
CategoricalArraysStructTypesExt = "StructTypes"

[deps.CategoricalArrays.weakdeps]
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"

[[deps.ChainRulesCore]]
deps = ["Compat", "LinearAlgebra"]
git-tree-sha1 = "c1deebd76f7a443d527fc0430d5758b8b2112ed8"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.19.1"
weakdeps = ["SparseArrays"]

[deps.ChainRulesCore.extensions]
ChainRulesCoreSparseArraysExt = "SparseArrays"

[[deps.CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "cd67fc487743b2f0fd4380d4cbd3a24660d0eec8"
Expand Down Expand Up @@ -58,6 +112,12 @@ git-tree-sha1 = "8cfa272e8bdedfa88b6aefbbca7c19f1befac519"
uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb"
version = "2.3.0"

[[deps.CovarianceEstimation]]
deps = ["LinearAlgebra", "Statistics", "StatsBase", "TSVD", "WoodburyMatrices"]
git-tree-sha1 = "8755768c0bae7dfa5c10f16bb8d04bfc98df2aaa"
uuid = "587fd27a-f159-11e8-2dae-1979310e6154"
version = "0.2.12"

[[deps.Crayons]]
git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
Expand All @@ -68,6 +128,12 @@ git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c"
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
version = "1.15.0"

[[deps.DataDeps]]
deps = ["HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"]
git-tree-sha1 = "6e8d74545d34528c30ccd3fa0f3c00f8ed49584c"
uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
version = "0.7.11"

[[deps.DataFrames]]
deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
Expand All @@ -89,6 +155,23 @@ version = "1.0.0"
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.Distances]]
deps = ["LinearAlgebra", "Statistics", "StatsAPI"]
git-tree-sha1 = "66c4c81f259586e8f002eacebc177e1fb06363b0"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.10.11"
weakdeps = ["ChainRulesCore", "SparseArrays"]

[deps.Distances.extensions]
DistancesChainRulesCoreExt = "ChainRulesCore"
DistancesSparseArraysExt = "SparseArrays"

[[deps.DocStringExtensions]]
deps = ["LibGit2"]
git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
version = "0.9.3"

[[deps.Downloads]]
deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
Expand Down Expand Up @@ -143,6 +226,12 @@ git-tree-sha1 = "abbbb9ec3afd783a7cbd82ef01dcd088ea051398"
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
version = "1.10.1"

[[deps.Impute]]
deps = ["BSON", "CSV", "DataDeps", "Distances", "IterTools", "LinearAlgebra", "Missings", "NamedDims", "NearestNeighbors", "Random", "Statistics", "StatsBase", "TableOperations", "Tables"]
git-tree-sha1 = "27da3d215f3ac8bd5dc2b0bb144b4a4367652462"
uuid = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
version = "0.6.11"

[[deps.InlineStrings]]
deps = ["Parsers"]
git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
Expand All @@ -158,6 +247,16 @@ git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
version = "1.3.0"

[[deps.IrrationalConstants]]
git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
version = "0.2.2"

[[deps.IterTools]]
git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
version = "1.10.0"

[[deps.IteratorInterfaceExtensions]]
git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
uuid = "82899510-4779-5014-852e-03e436cf321d"
Expand Down Expand Up @@ -211,6 +310,22 @@ version = "1.17.0+0"
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
git-tree-sha1 = "7d6dd4e9212aebaeed356de34ccf262a3cd415aa"
uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
version = "0.3.26"

[deps.LogExpFunctions.extensions]
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
LogExpFunctionsInverseFunctionsExt = "InverseFunctions"

[deps.LogExpFunctions.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

Expand Down Expand Up @@ -248,6 +363,18 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2023.1.10"

[[deps.NamedDims]]
deps = ["AbstractFFTs", "ChainRulesCore", "CovarianceEstimation", "LinearAlgebra", "Pkg", "Requires", "Statistics"]
git-tree-sha1 = "dc9144f80a79b302b48c282ad29b1dc2f10a9792"
uuid = "356022a1-0364-5f58-8944-0da4b18d706f"
version = "1.2.1"

[[deps.NearestNeighbors]]
deps = ["Distances", "StaticArrays"]
git-tree-sha1 = "ded64ff6d4fdd1cb68dfcbb818c69e144a5b2e4c"
uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
version = "0.4.16"

[[deps.NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"
Expand Down Expand Up @@ -342,6 +469,12 @@ git-tree-sha1 = "a844fabeb735fcc08b36e1ef7f6e216fd54e786d"
uuid = "27faeba3-bc54-5829-b163-df8cb949fe88"
version = "0.1.0"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
Expand Down Expand Up @@ -374,11 +507,39 @@ deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
version = "1.10.0"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
git-tree-sha1 = "f68dd04d131d9a8a8eb836173ee8f105c360b0c5"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.9.1"
weakdeps = ["ChainRulesCore", "Statistics"]

[deps.StaticArrays.extensions]
StaticArraysChainRulesCoreExt = "ChainRulesCore"
StaticArraysStatisticsExt = "Statistics"

[[deps.StaticArraysCore]]
git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
version = "1.4.2"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.10.0"

[[deps.StatsAPI]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed"
uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
version = "1.7.0"

[[deps.StatsBase]]
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
git-tree-sha1 = "1d77abd07f617c4868c33d4f5b9e1dbb2643c9cf"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.34.2"

[[deps.StringManipulation]]
deps = ["PrecompileTools"]
git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
Expand All @@ -395,6 +556,18 @@ deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.TSVD]]
deps = ["Adapt", "LinearAlgebra"]
git-tree-sha1 = "61cd1ce64b4ffb69e2d156ff7166a8eb796d699a"
uuid = "9449cd9e-2762-5aa3-a617-5413e99d722e"
version = "0.4.3"

[[deps.TableOperations]]
deps = ["SentinelArrays", "Tables", "Test"]
git-tree-sha1 = "e383c87cf2a1dc41fa30c093b2a19877c83e1bc1"
uuid = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
version = "1.2.0"

[[deps.TableTraits]]
deps = ["IteratorInterfaceExtensions"]
git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
Expand Down Expand Up @@ -443,6 +616,12 @@ git-tree-sha1 = "b1be2855ed9ed8eac54e5caff2afcdb442d52c23"
uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
version = "1.4.2"

[[deps.WoodburyMatrices]]
deps = ["LinearAlgebra", "SparseArrays"]
git-tree-sha1 = "c1a7aa6219628fcd757dede0ca95e245c5cd9511"
uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6"
version = "1.0.0"

[[deps.WorkerUtilities]]
git-tree-sha1 = "cd1659ba0d57b71a464a29e64dbc67cfe83d54e7"
uuid = "76eceee3-57b5-4d4a-8e66-0e911cebbf60"
Expand Down
2 changes: 2 additions & 0 deletions dev/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce"
Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
RegularExpressions = "27faeba3-bc54-5829-b163-df8cb949fe88"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
6 changes: 4 additions & 2 deletions dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ The code in this folder can be used to:
2. Pre-process the data.
3. Store data as package artifacts to make them easily accessible and version-controlled.

> [!NOTE]
> The project in this folder depends on this dev [version](https://github.com/pat-alt/ArtifactUtils.jl) of `ArtifactUtils.jl`. To upload artifacts, the `GITHUB_TOKEN` needs to be available from the shell environment.
### Rerun Step 2 and 3

To simply rerun steps 2 and 3, just run `julia --project=dev dev/prepare_all_artifacts.jl`.
To simply rerun steps 2 and 3, just run `julia --project=dev dev/src/prepare_all_artifacts.jl`.

### Rerun All Steps

To rerun all steps, just run `julia --project=dev dev/prepare_all_artifacts.jl -- overwrite`.
To rerun all steps, just run `julia --project=dev dev/src/prepare_all_artifacts.jl -- overwrite`.
16 changes: 0 additions & 16 deletions dev/full_data.jl

This file was deleted.

Loading

0 comments on commit aa3dd5f

Please sign in to comment.