Merge branch 'main' into test_unitful_cellarea

rafaqz authored Dec 13, 2024
2 parents 3360f34 + bd958bb · commit 5a14cbe
Showing 31 changed files with 377 additions and 225 deletions.
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Rasters"
uuid = "a3a2b9e3-a471-40c9-b274-f788e487c689"
authors = ["Rafael Schouten <[email protected]>"]
version = "0.12.0"
version = "0.12.1"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -25,6 +25,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[weakdeps]
ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3"
@@ -58,7 +59,7 @@ CommonDataModel = "0.2.3, 0.3"
ConstructionBase = "1"
CoordinateTransformations = "0.6.2"
DataFrames = "1"
DimensionalData = "0.28.2"
DimensionalData = "0.29.4"
DiskArrays = "0.3, 0.4"
Extents = "0.1"
FillArrays = "0.12, 0.13, 1"
7 changes: 7 additions & 0 deletions docs/Project.toml
@@ -12,9 +12,16 @@ GBIF2 = "dedd4f52-e074-43bf-924d-d6bce14ad628"
GeoInterface = "cf35fbd7-0cd7-5166-be24-54bfbe79505f"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
Maxnet = "81f79f80-22f2-4e41-ab86-00c11cf0f26f"
NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
RasterDataSources = "3cb90ccd-e1b6-4867-9617-4276c8b2ca36"
Rasters = "a3a2b9e3-a471-40c9-b274-f788e487c689"
Shapefile = "8e980c4a-a4fe-5da2-b3a7-4b4b0353a2f4"
SpeciesDistributionModels = "3ef73bbf-0321-4d3b-9a2e-5fbebc8e35da"

[sources]
SpeciesDistributionModels = {url = "https://github.com/tiemvanderdeure/SpeciesDistributionModels.jl/"}
66 changes: 34 additions & 32 deletions docs/src/.vitepress/config.mts
@@ -44,27 +44,30 @@ export default defineConfig({
{ text: 'Overview', link: '/methods' },
{ text: 'Array Operations', link: '/array_operations' },
]
},
{ text: 'Data Sources',
items: [
{ text: 'Overview', link: '/data_sources' },
{ text: 'GBIF', link: '/gbif_wflow' }
]
},
{ text: 'Data Sources', link: '/data_sources' },
{ text: 'Plots',
items: [
{ text: 'Plots.jl', link: '/plotting' },
{ text: 'Makie.jl', link: '/plot_makie' },
]
},
{ text: 'Ecosystem',
items: [
{ text: 'DimensionalData.jl', link: 'https://rafaqz.github.io/DimensionalData.jl/dev/' },
{ text: 'NCDatasets.jl', link: 'https://alexander-barth.github.io/NCDatasets.jl/stable/' },
{ text: 'ArchGDAL.jl', link: 'https://yeesian.com/ArchGDAL.jl/stable/' },
{ text: 'HDF5.jl', link: 'https://juliaio.github.io/HDF5.jl/stable/' },
]
},
items: [
{ text: 'Plots.jl', link: '/plotting' },
{ text: 'Makie.jl', link: '/plot_makie' },
]
},
{ text: 'Examples',
items: [
{ text: 'Species Distribution Modelling', link: '/gbif_wflow' },
]
},
{ text: 'Ecosystem',
items: [
{ text: 'DimensionalData.jl', link: 'https://rafaqz.github.io/DimensionalData.jl' },
{ text: 'DiskArrays.jl', link: 'https://github.com/JuliaIO/DiskArrays.jl' },
{ text: 'GeoInterface.jl', link: 'https://github.com/JuliaGeo/GeoInterface.jl' },
{ text: 'NCDatasets.jl', link: 'https://alexander-barth.github.io/NCDatasets.jl/stable/' },
{ text: 'ArchGDAL.jl', link: 'https://github.com/yeesian/ArchGDAL.jl' },
{ text: 'GRIBDatasets.jl', link: 'https://github.com/JuliaGeo/GRIBDatasets.jl' },
{ text: 'ZarrDatasets.jl', link: 'https://github.com/JuliaGeo/ZarrDatasets.jl' },
]
},
{ text: 'API', link: '/api' }
],

@@ -75,20 +78,19 @@ export default defineConfig({
{ text: 'Overview', link: '/methods' },
{ text: 'Array Operations', link: '/array_operations' },
]
},
{ text: 'Data Sources',
items: [
{ text: 'Overview', link: '/data_sources' },
{ text: 'GBIF', link: '/gbif_wflow' }
]
},
{ text: 'Data Sources', link: '/data_sources' },
{ text: 'Plots',
items: [
{ text: 'Plots.jl', link: '/plotting' },
{ text: 'Makie.jl', link: '/plot_makie' },
]
},

items: [
{ text: 'Plots.jl', link: '/plotting' },
{ text: 'Makie.jl', link: '/plot_makie' },
]
},
{ text: 'Examples',
items: [
{ text: 'Species Distribution Modelling', link: '/gbif_wflow' },
]
},
{ text: 'API', link: '/api' }
],
editLink: {
@@ -102,4 +104,4 @@
copyright: `© Copyright ${new Date().getUTCFullYear()}. Released under the MIT License.`
}
}
})
})
48 changes: 23 additions & 25 deletions docs/src/data_sources.md
@@ -3,13 +3,17 @@
Rasters.jl uses a number of backends to load raster data. `Raster`, `RasterStack`
and `RasterSeries` will detect which backend to use for you, automatically.

## GRD
## GDAL

R GRD files can be loaded natively, using Julia's `Mmap` - which means they are very fast, but are not compressed. They are always 3 dimensional, and have `Y`, `X` and [`Band`](@ref) dimensions.
All files GDAL can access, such as `.tiff` and `.asc` files, can be loaded,
using [ArchGDAL.jl](https://github.com/yeesian/ArchGDAL.jl). These are
generally best loaded as `Raster("filename.tif")`, but can be loaded as
`RasterStack("filename.tif"; layersfrom=Band)`, taking layers from the `Band`
dimension, which is also the default.
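As a quick sketch of both call styles (the file name here is hypothetical):

```julia
using Rasters, ArchGDAL

# A single Raster, keeping Band as a dimension
rast = Raster("myraster.tif")

# A RasterStack with one layer per band
st = RasterStack("myraster.tif"; layersfrom=Band)
```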

## NetCDF

NetCDF `.nc` files are loaded using
NetCDF `.nc` and some HDF5 `.h5` files can be loaded using
[NCDatasets.jl](https://github.com/Alexander-Barth/NCDatasets.jl). Layers from
files can be loaded as `Raster("filename.nc"; name=:layername)`. Without `name`
the first layer is used. `RasterStack("filename.nc")` will use all netcdf variables
@@ -19,26 +23,21 @@ NetCDF layers can have arbitrary dimensions. Known, common dimension names are
converted to `X`, `Y` `Z`, and `Ti`, otherwise `Dim{:layername}` is used. Layers
in the same file may also have different dimensions.
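A minimal sketch of these calls (the file name and `:tos` layer name are hypothetical):

```julia
using Rasters, NCDatasets

# Load a single named variable; without `name` the first variable is used
rast = Raster("myfile.nc"; name=:tos)

# Load all netcdf variables as layers of a RasterStack
st = RasterStack("myfile.nc")
```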

NetCDF files still have issues loading directly from disk for some operations.
Using `read(ncstack)` may help.
## Zarr

## GDAL
Zarr files can be loaded with the [ZarrDatasets.jl](https://github.com/JuliaGeo/ZarrDatasets.jl)
backend. `Raster(filename; source=Zarrsource())` may be needed where the file type can't be detected
from the filename. `write` does not yet work for Zarr, but will in the future.
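For example (the store path is hypothetical):

```julia
using Rasters, ZarrDatasets

# Force the Zarr backend when the path has no extension to detect
rast = Raster("path/to/my_zarr_store"; source=Zarrsource())
```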

All files GDAL can access, such as `.tiff` and `.asc` files, can be loaded,
using [ArchGDAL.jl](https://github.com/yeesian/ArchGDAL.jl/issues). These are
generally best loaded as `Raster("filename.tif")`, but can be loaded as
`RasterStack("filename.tif"; layersfrom=Band)`, taking layers from the `Band`
dimension, which is also the default.
## GRIB

## SMAP
GRIB files can be loaded with the [GRIBDatasets.jl](https://github.com/JuliaGeo/GRIBDatasets.jl) backend.
`write` is not implemented for GRIB.

The [Soil Moisture Active-Passive](https://smap.jpl.nasa.gov/) dataset provides
global layers of soil moisture, temperature and other related data, in a custom
HDF5 format. Layers are always 2 dimensional, with `Y` and `X` dimensions.
## GRD

These can be loaded as multi-layered `RasterStack("filename.h5")`. Individual
layers can be loaded as `Raster("filename.h5"; name=:layername)`, without `name`
the first layer is used.
R GRD files can be loaded natively, using Julia's `Mmap` - which means they are very fast, but are not compressed.
They are always 3 dimensional, and have `Y`, `X` and [`Band`](@ref) dimensions.

````@example data_sources
using Rasters
@@ -50,13 +49,12 @@ smapseries

## Writing file formats to disk

Files can be written to disk in all formats other than SMAP HDF5 using
`write("filename.ext", A)`. See the docs for [`write`](@ref). They can (with
some caveats) be written to different formats than they were loaded in as,
providing file-type conversion for spatial data.
Files can be written to disk with ArchGDAL.jl and NCDatasets.jl backends using
`write("filename.ext", raster)`. See the docs for [`write`](@ref).

Some metadata may be lost in formats that store little metadata, or where
metadata conversion has not been completely implemented.
They can (with some caveats) be written to different formats than they were loaded in as,
providing file-type conversion for spatial data. Some metadata may be lost in formats that
store little metadata, or where metadata conversion has not been completely implemented.
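A sketch of one such conversion, assuming hypothetical file names and that both backends are loaded:

```julia
using Rasters, ArchGDAL, NCDatasets

# Read a GeoTIFF fully into memory, then write it back out as NetCDF
rast = read(Raster("myfile.tif"))
write("myfile.nc", rast)
```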

## RasterDataSources.jl integration

@@ -75,4 +73,4 @@ Makie.plot(A)

See the docs for [`Raster`](@ref), [`RasterStack`](@ref) and [`RasterSeries`](@ref),
and the docs for `RasterDataSources.getraster` for syntax to specify various
data sources.
data sources.
115 changes: 90 additions & 25 deletions docs/src/gbif_wflow.md
@@ -1,54 +1,119 @@
Load occurrences for the Mountain Pygmy Possum using GBIF.jl
# Species distribution modelling workflow

## Load GBIF
This example shows a full species distribution modelling workflow, from loading data, to cleaning it, to fitting an ensemble and generating predictions.

````@example gbif
using Rasters, GBIF2
using RasterDataSources
const RS = Rasters
````
It uses GBIF and WorldClim data, which are common datasets in ecology.

## Load Rasters, ArchGDAL, RasterDataSources and GBIF
The GBIF2 library is used to download occurrence data, and RasterDataSources to conveniently access Bioclim data. ArchGDAL is needed to load the Bioclim data.

````@example gbif
records = GBIF2.occurrence_search("Burramys parvus"; limit=300)
using Rasters, GBIF2
using RasterDataSources, ArchGDAL
````

## Extract coordinates

Extract the longitude/latitude value to a `Vector` of points
(a `Tuple` counts as a `(x, y)` point in GeoInterface.jl):
Load occurrences for the Mountain Pygmy Possum using GBIF.jl

````@example gbif
coords = [(r.decimalLongitude, r.decimalLatitude) for r in records if !ismissing(r.decimalLatitude)]
records = GBIF2.occurrence_search("Burramys parvus"; limit=300)
````

## Get layer / Band
Get BioClim layers and subset to south-east Australia
## Get Bioclimatic variables
Get BioClim layers and subset to south-east Australia.
The first time this is run, this will automatically download and save the files.

````@example gbif
A = RasterStack(WorldClim{BioClim}, (1, 3, 7, 12))
se_aus = A[X(138 .. 155), Y(-40 .. -25), RS.Band(1)]
se_aus = A[X(138 .. 155), Y(-40 .. -25), Band(1)]
````
Plot BioClim predictors and scatter occurrence points on all subplots

````@example gbif
using Plots
p = plot(se_aus);
kw = (legend=:none, opacity=0.5, markershape=:cross, markercolor=:black)
foreach(i -> scatter!(p, coords; subplot=i, kw...), 1:4)
# The coordinates from the gbif table
coords = collect(skipmissing(records.geometry))
using CairoMakie
p = Rasters.rplot(se_aus);
for ax in p.content
if ax isa Axis
scatter!(ax, coords; alpha=0.5, marker='+', color=:black, markersize = 20)
end
end
p
````

Then extract predictor variables and write to CSV.
## Extract bioclim variables at occurrence points
Then extract predictor variables and write to CSV. Use the `skipmissing` keyword to exclude both missing coordinates and coordinates with missing values in the RasterStack.

````@example gbif
using CSV
predictors = collect(extract(se_aus, coords))
CSV.write("burramys_parvus_predictors.csv", predictors)
presences = extract(se_aus, coords, skipmissing = true)
CSV.write("burramys_parvus_predictors.csv", presences)
````

Or convert them to a `DataFrame`:

````@example gbif
using DataFrames
df = DataFrame(predictors)
df = DataFrame(presences)
df[1:5,:]
````
````

## Sample background points
Next, sample random background points in the Raster. Rasters has a StatsBase extension that makes this very straightforward. The syntax and output of `Rasters.sample` are very similar to those of `extract`.

````@example gbif
using StatsBase
background = Rasters.sample(se_aus, 500, skipmissing = true)
````

## Fit a statistical ensemble
In this example, we will use [SpeciesDistributionModels.jl](https://github.com/tiemvanderdeure/SpeciesDistributionModels.jl) to fit a statistical ensemble to the occurrence and background data.

First we need to load the models. SDM.jl integrates with MLJ - see the [model browser](https://juliaai.github.io/MLJ.jl/dev/model_browser/#Classification) for what models are available.

````@example gbif
import Maxnet: MaxnetBinaryClassifier
import MLJGLMInterface: LinearBinaryClassifier
# define the models in the ensemble
models = (
maxnet = MaxnetBinaryClassifier(),
maxnet2 = MaxnetBinaryClassifier(features = "lq"),
glm = LinearBinaryClassifier()
)
````

Next, format the data using `sdmdata`. To test how rigorous our models are, we will use 3-fold cross-validation.

````@example gbif
using SpeciesDistributionModels
const SDM = SpeciesDistributionModels
data = sdmdata(presences, background; resampler = CV(; nfolds = 3))
````

Now, fit the ensemble, passing the data object and the `NamedTuple` of models!

````@example gbif
ensemble = sdm(data, models)
````

Use SDM.jl's evaluate function to see how this ensemble performs.

````@example gbif
SDM.evaluate(ensemble)
````

Not too bad!

## Make predictions of climatic suitability
Use the ensemble to predict climatic suitability across south-east Australia, taking the mean of the ensemble members' predictions.

````@example gbif
suitability = SDM.predict(ensemble, se_aus, reducer = mean)
````

And let's see what that looks like:

````@example gbif
plot(suitability, colorrange = (0,1))
````