Skip to content

Commit

Permalink
Introduce linfa-datasets (#72)
Browse files Browse the repository at this point in the history
This commit puts datasets into a separate crate called `linfa-datasets`
  • Loading branch information
bytesnake authored Dec 16, 2020
1 parent a09e0f9 commit 3cec12b
Show file tree
Hide file tree
Showing 20 changed files with 164 additions and 168 deletions.
36 changes: 21 additions & 15 deletions CONTRIBUTE.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,6 @@ MyAlg::params()
.fit(&dataset)?;
```

## Use a specific backend for testing

When you're implementing tests, which are relying on `ndarray-linalg`, you have to add the `openblas-src` crate. This will instruct cargo to compile the backend, in order to find the required symbols. The `linfa` framework uses the OpenBLAS system library by default, but an additional feature can be used to build the OpenBLAS library while compiling.
```
[features]
default = ["tests-openblas-system"]
tests-openblas-system = ["openblas-src/system"]
tests-openblas-build = ["openblas-src/cblas", "openblas-src/lapacke"]
[dev-dependencies]
...
openblas-src = "0.9"
```
and you have to add an `extern crate openblas_src` to your the `tests` module.

## Generic float types

Every algorithm should be implemented for `f32` and `f64` floating points. This can be achieved with the `linfa::Float` trait, which is basically just a combination of `ndarray::NdFloat` and `num_traits::Float`. You can look up most of the constants (like zero, one, PI) in the `num_traits` documentation. Here is a small example for a function, generic over `Float`:
Expand Down Expand Up @@ -118,3 +103,24 @@ pub struct HyperParams {
}
```

## Add a dataset

When you want to add a dataset to the `linfa-datasets` crate, you have to do the following:
* compress your dataset into a gzipped, comma-separated CSV file (the delimiter the loader expects) and move it to `linfa-datasets/data/?.csv.gz`
* add a feature with the name of your dataset to `linfa-datasets`
* create a new function in `linfa-datasets/src/lib.rs` carrying the name of your dataset and loading it as a binary file

For the last step you can look at similar implementations, for example the Iris plant dataset. The idea here is to put the dataset into the produced library directly and parse it from memory. This is obviously only feasible for small datasets.

After adding it to the `linfa-datasets` crate, you can include it with the corresponding feature in your `Cargo.toml` file
```
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["winequality"] }
```
and then use it in your example or tests as
```
fn main() {
let (train, valid) = linfa_datasets::winequality()
.split_with_ratio(0.8);
// ...
}
```
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ members = [
"linfa-svm",
"linfa-hierarchical",
"linfa-ica",
"datasets",
]

[profile.release]
Expand Down
18 changes: 18 additions & 0 deletions datasets/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "linfa-datasets"
version = "0.2.1"
authors = ["Lorenz Schmidt <[email protected]>"]
edition = "2018"

[dependencies]
linfa = { version = "0.2.1", path = ".." }
ndarray = { version = "0.13", default-features = false }
ndarray-csv = "0.4"
csv = "1.1"
flate2 = "1.0"

[features]
default = []
diabetes = []
iris = []
winequality = []
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added datasets/data/winequality-red.csv.gz
Binary file not shown.
58 changes: 58 additions & 0 deletions datasets/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa::Dataset;
use ndarray::prelude::*;
use ndarray_csv::Array2Reader;

#[cfg(any(feature = "iris", feature = "diabetes", feature = "winequality"))]
fn array_from_buf(buf: &[u8]) -> Array2<f64> {
// unzip file
let file = GzDecoder::new(buf);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b',')
.from_reader(file);

// extract ndarray
reader.deserialize_array2_dynamic().unwrap()
}

#[cfg(feature = "iris")]
/// Load the Iris plant dataset bundled with this crate.
///
/// The parsed `.csv` data is two dimensional: `Axis(0)` indexes the samples
/// (rows) and `Axis(1)` the columns. Columns `0..4` become the records and
/// column 4 becomes the class target, converted to `usize`.
pub fn iris() -> Dataset<Array2<f64>, Vec<usize>> {
    let bytes = include_bytes!("../data/iris.csv.gz");
    let table = array_from_buf(&bytes[..]);

    let records = table.slice(s![.., 0..4]).to_owned();
    let labels = table.column(4).to_owned();

    Dataset::new(records, labels).map_targets(|label| *label as usize)
}

#[cfg(feature = "diabetes")]
/// Load the Diabetes regression dataset bundled with this crate.
///
/// Features and targets ship as two separate gzipped CSV files; the first
/// column of the target file is used as the real-valued regression target.
pub fn diabetes() -> Dataset<Array2<f64>, Array1<f64>> {
    let feature_bytes = include_bytes!("../data/diabetes_data.csv.gz");
    let target_bytes = include_bytes!("../data/diabetes_target.csv.gz");

    let records = array_from_buf(&feature_bytes[..]);
    let targets = array_from_buf(&target_bytes[..]).column(0).to_owned();

    Dataset::new(records, targets)
}

#[cfg(feature = "winequality")]
/// Load the red wine quality dataset bundled with this crate.
///
/// Columns `0..11` of the CSV become the records; column 11 holds the
/// quality score, converted to a `usize` target.
pub fn winequality() -> Dataset<Array2<f64>, Vec<usize>> {
    let bytes = include_bytes!("../data/winequality-red.csv.gz");
    let table = array_from_buf(&bytes[..]);

    let records = table.slice(s![.., 0..11]).to_owned();
    let quality = table.column(11).to_owned();

    Dataset::new(records, quality).map_targets(|score| *score as usize)
}
Binary file removed datasets/winequality-red.csv.gz
Binary file not shown.
4 changes: 1 addition & 3 deletions linfa-hierarchical/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,4 @@ linfa-kernel = { version = "0.2.1", path = "../linfa-kernel" }
[dev-dependencies]
rand = "0.7"
ndarray-rand = "0.11"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["iris"] }
35 changes: 8 additions & 27 deletions linfa-hierarchical/examples/irisflower.rs
Original file line number Diff line number Diff line change
@@ -1,45 +1,26 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use ndarray::{Array2, Axis};
use ndarray_csv::Array2Reader;

use linfa::traits::Transformer;
use linfa_hierarchical::HierarchicalCluster;
use linfa_kernel::{Kernel, KernelMethod};

/// Extract a gziped CSV file and return as dataset
fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
// unzip file
let file = GzDecoder::new(File::open(path)?);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b',')
.from_reader(file);

// extract ndarray
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
// Read in the iris-flower dataset from dataset path
// The `.csv` data is two dimensional: Axis(0) denotes y-axis (rows), Axis(1) denotes x-axis (columns)
let dataset = read_array("../datasets/iris.csv.gz")?;
let (dataset, targets) = dataset.view().split_at(Axis(1), 4);
// load Iris plant dataset
let dataset = linfa_datasets::iris();

let kernel = Kernel::params()
.method(KernelMethod::Gaussian(1.0))
.transform(dataset);
.transform(dataset.records());

let kernel = HierarchicalCluster::default()
.num_clusters(3)
.transform(kernel);

for (id, target) in kernel.targets().into_iter().zip(targets.into_iter()) {
for (id, target) in kernel
.targets()
.into_iter()
.zip(dataset.targets().into_iter())
{
let name = match *target as usize {
0 => "setosa",
1 => "versicolor",
Expand Down
4 changes: 1 addition & 3 deletions linfa-linear/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,5 @@ serde = { version = "1.0", default-features = false, features = ["derive"] }
linfa = { version = "0.2.1", path = ".." }

[dev-dependencies]
csv = "1.1"
ndarray-csv = "0.4"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["diabetes"] }
approx = "0.3.2"
flate2 = "1.0"
22 changes: 3 additions & 19 deletions linfa-linear/examples/diabetes.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,11 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa::traits::Fit;
use linfa_linear::LinearRegression;
use ndarray::Array2;
use ndarray_csv::Array2Reader;

use linfa::{traits::Fit, Dataset};

fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
let file = GzDecoder::new(File::open(path)?);
let mut reader = ReaderBuilder::new().has_headers(false).from_reader(file);
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
let data = read_array("../datasets/diabetes_data.csv.gz")?;
let target = read_array("../datasets/diabetes_target.csv.gz")?;
let target = target.column(0);

let dataset = Dataset::new(data, target);
// load Diabetes dataset
let dataset = linfa_datasets::diabetes();

let lin_reg = LinearRegression::new();
let model = lin_reg.fit(&dataset)?;
Expand Down
25 changes: 8 additions & 17 deletions linfa-linear/examples/glm.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa_linear::TweedieRegressor;
use ndarray::Array2;
use ndarray_csv::Array2Reader;
use std::error::Error;
use std::fs::File;

fn main() -> Result<(), Box<dyn Error>> {
let data = read_array("../datasets/diabetes_data.csv.gz")?;
let target = read_array("../datasets/diabetes_target.csv.gz")?;
let target = target.column(0).to_owned();
// load the Diabetes dataset
let dataset = linfa_datasets::diabetes();

let data = dataset.records();
let targets = dataset.targets();

// Here the power and alpha is set to 0
// Setting the power to 0 makes it a Normal Regression
// Setting the alpha to 0 removes any regularization
// In total this is the regular old Linear Regression
let lin_reg = TweedieRegressor::new().power(0.).alpha(0.);
let model = lin_reg.fit(&data, &target)?;
let model = lin_reg.fit(&data, &targets)?;

// We print the learnt parameters
//
Expand All @@ -29,15 +26,9 @@ fn main() -> Result<(), Box<dyn Error>> {
//
// Some(43.27739632065444)
let ypred = model.predict(&data);
let loss = (target - ypred).mapv(|x| x.abs()).mean();
let loss = (targets - &ypred).mapv(|x| x.abs()).mean();

println!("{:?}", loss);

Ok(())
}

fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
let file = GzDecoder::new(File::open(path)?);
let mut reader = ReaderBuilder::new().has_headers(false).from_reader(file);
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}
4 changes: 1 addition & 3 deletions linfa-svm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,5 @@ linfa = { version = "0.2.1", path = ".." }
linfa-kernel = { version = "0.2.1", path = "../linfa-kernel" }

[dev-dependencies]
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["winequality"] }
rand_isaac = "0.2"
45 changes: 5 additions & 40 deletions linfa-svm/examples/winequality.rs
Original file line number Diff line number Diff line change
@@ -1,45 +1,12 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use ndarray::{Array1, Array2, Axis};
use ndarray_csv::Array2Reader;

use linfa::dataset::Dataset;
use linfa::dataset::Records;
use linfa::metrics::ToConfusionMatrix;
use linfa::traits::*;
use linfa::prelude::*;
use linfa_kernel::{Kernel, KernelMethod};
use linfa_svm::Svm;

/// Extract a gziped CSV file and return as dataset
fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
// unzip file
let file = GzDecoder::new(File::open(path)?);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b';')
.from_reader(file);
// extract ndarray
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
// Read in the wine-quality dataset from dataset path
// The `.csv` data is two dimensional: Axis(0) denotes y-axis (rows), Axis(1) denotes x-axis (columns)
let dataset = read_array("../datasets/winequality-red.csv.gz")?;
// The first 11 columns are features used in training and the last columns are targets
let (data, targets) = dataset.view().split_at(Axis(1), 11);
let targets = targets.into_iter().collect::<Array1<_>>();

fn main() {
// everything above 6.5 is considered a good wine
let dataset = Dataset::new(data, targets).map_targets(|x| **x > 6.5);

// split into training and validation dataset
let (train, valid) = dataset.split_with_ratio(0.1);
let (train, valid) = linfa_datasets::winequality()
.map_targets(|x| *x > 6)
.split_with_ratio(0.9);

// transform with RBF kernel
let train_kernel = Kernel::params()
Expand Down Expand Up @@ -84,6 +51,4 @@ fn main() -> Result<(), Box<dyn Error>> {
// Calculate the accuracy and Matthew Correlation Coefficient (cross-correlation between
// predicted and targets)
println!("accuracy {}, MCC {}", cm.accuracy(), cm.mcc());

Ok(())
}
7 changes: 3 additions & 4 deletions linfa-trees/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,14 @@ features = ["std", "derive"]
ndarray = { version = "0.13" , features = ["rayon", "approx"]}
ndarray-rand = "0.11"

linfa = { path = ".." }
linfa = { version = "0.2.1", path = ".." }

[dev-dependencies]
rand_isaac = "0.2.0"
criterion = "0.3"
approx = "0.3"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"

linfa-datasets = { version = "0.2.1", path = "../datasets/", features = ["iris"] }

[[bench]]
name = "decision_tree"
Expand Down
Loading

0 comments on commit 3cec12b

Please sign in to comment.