Skip to content

Commit

Permalink
Introduce linfa-datasets (#72)
Browse files Browse the repository at this point in the history
This commit puts datasets into a separate crate called `linfa-datasets`
  • Loading branch information
bytesnake authored Dec 16, 2020
1 parent a09e0f9 commit 3cec12b
Show file tree
Hide file tree
Showing 20 changed files with 164 additions and 168 deletions.
36 changes: 21 additions & 15 deletions CONTRIBUTE.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,6 @@ MyAlg::params()
.fit(&dataset)?;
```

## Use a specific backend for testing

When you're implementing tests, which are relying on `ndarray-linalg`, you have to add the `openblas-src` crate. This will instruct cargo to compile the backend, in order to find the required symbols. The `linfa` framework uses the OpenBLAS system library by default, but an additional feature can be used to build the OpenBLAS library while compiling.
```
[features]
default = ["tests-openblas-system"]
tests-openblas-system = ["openblas-src/system"]
tests-openblas-build = ["openblas-src/cblas", "openblas-src/lapacke"]
[dev-dependencies]
...
openblas-src = "0.9"
```
and you have to add an `extern crate openblas_src` to your the `tests` module.

## Generic float types

Every algorithm should be implemented for `f32` and `f64` floating points. This can be achieved with the `linfa::Float` trait, which is basically just a combination of `ndarray::NdFloat` and `num_traits::Float`. You can look up most of the constants (like zero, one, PI) in the `num_traits` documentation. Here is a small example for a function, generic over `Float`:
Expand Down Expand Up @@ -118,3 +103,24 @@ pub struct HyperParams {
}
```

## Add a dataset

When you want to add a dataset to the `linfa-datasets` crate, you have to do the following:
* compress your dataset into a gzipped, comma-separated CSV file (the delimiter the loader expects) and move it to `linfa-datasets/data/?.csv.gz`
* add a feature with the name of your dataset to `linfa-datasets`
* create a new function in `linfa-datasets/src/lib.rs` carrying the name of your dataset and loading it as a binary file

For the last step you can look at similar implementations, for example the Iris plant dataset. The idea here is to put the dataset into the produced library directly and parse it from memory. This is obviously only feasible for small datasets.

After adding it to the `linfa-datasets` crate, you can include it with the corresponding feature in your `Cargo.toml` file
```
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["winequality"] }
```
and then use it in your example or tests as
```
fn main() {
let (train, valid) = linfa_datasets::winequality()
.split_with_ratio(0.8);
// ...
}
```
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ members = [
"linfa-svm",
"linfa-hierarchical",
"linfa-ica",
"datasets",
]

[profile.release]
Expand Down
18 changes: 18 additions & 0 deletions datasets/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "linfa-datasets"
version = "0.2.1"
authors = ["Lorenz Schmidt <[email protected]>"]
edition = "2018"

[dependencies]
linfa = { version = "0.2.1", path = ".." }
ndarray = { version = "0.13", default-features = false }
ndarray-csv = "0.4"
csv = "1.1"
flate2 = "1.0"

[features]
default = []
diabetes = []
iris = []
winequality = []
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added datasets/data/winequality-red.csv.gz
Binary file not shown.
58 changes: 58 additions & 0 deletions datasets/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa::Dataset;
use ndarray::prelude::*;
use ndarray_csv::Array2Reader;

#[cfg(any(feature = "iris", feature = "diabetes", feature = "winequality"))]
fn array_from_buf(buf: &[u8]) -> Array2<f64> {
// unzip file
let file = GzDecoder::new(buf);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b',')
.from_reader(file);

// extract ndarray
reader.deserialize_array2_dynamic().unwrap()
}

#[cfg(feature = "iris")]
/// Load the Iris plant dataset bundled with this crate.
///
/// The parsed `.csv` data is two dimensional: `Axis(0)` indexes the samples
/// (rows) and `Axis(1)` the columns. Columns `0..4` become the records and
/// column 4 becomes the class target, converted to `usize`.
pub fn iris() -> Dataset<Array2<f64>, Vec<usize>> {
    let bytes = include_bytes!("../data/iris.csv.gz");
    let table = array_from_buf(&bytes[..]);

    let records = table.slice(s![.., 0..4]).to_owned();
    let labels = table.column(4).to_owned();

    Dataset::new(records, labels).map_targets(|label| *label as usize)
}

#[cfg(feature = "diabetes")]
/// Load the Diabetes regression dataset bundled with this crate.
///
/// Features and targets ship as two separate gzipped CSV files; the first
/// column of the target file is used as the real-valued regression target.
pub fn diabetes() -> Dataset<Array2<f64>, Array1<f64>> {
    let feature_bytes = include_bytes!("../data/diabetes_data.csv.gz");
    let target_bytes = include_bytes!("../data/diabetes_target.csv.gz");

    let records = array_from_buf(&feature_bytes[..]);
    let targets = array_from_buf(&target_bytes[..]).column(0).to_owned();

    Dataset::new(records, targets)
}

#[cfg(feature = "winequality")]
/// Load the red wine quality dataset bundled with this crate.
///
/// Columns `0..11` of the CSV become the records; column 11 holds the
/// quality score, converted to a `usize` target.
pub fn winequality() -> Dataset<Array2<f64>, Vec<usize>> {
    let bytes = include_bytes!("../data/winequality-red.csv.gz");
    let table = array_from_buf(&bytes[..]);

    let records = table.slice(s![.., 0..11]).to_owned();
    let quality = table.column(11).to_owned();

    Dataset::new(records, quality).map_targets(|score| *score as usize)
}
Binary file removed datasets/winequality-red.csv.gz
Binary file not shown.
4 changes: 1 addition & 3 deletions linfa-hierarchical/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,4 @@ linfa-kernel = { version = "0.2.1", path = "../linfa-kernel" }
[dev-dependencies]
rand = "0.7"
ndarray-rand = "0.11"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["iris"] }
35 changes: 8 additions & 27 deletions linfa-hierarchical/examples/irisflower.rs
Original file line number Diff line number Diff line change
@@ -1,45 +1,26 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use ndarray::{Array2, Axis};
use ndarray_csv::Array2Reader;

use linfa::traits::Transformer;
use linfa_hierarchical::HierarchicalCluster;
use linfa_kernel::{Kernel, KernelMethod};

/// Extract a gziped CSV file and return as dataset
fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
// unzip file
let file = GzDecoder::new(File::open(path)?);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b',')
.from_reader(file);

// extract ndarray
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
// Read in the iris-flower dataset from dataset path
// The `.csv` data is two dimensional: Axis(0) denotes y-axis (rows), Axis(1) denotes x-axis (columns)
let dataset = read_array("../datasets/iris.csv.gz")?;
let (dataset, targets) = dataset.view().split_at(Axis(1), 4);
// load Iris plant dataset
let dataset = linfa_datasets::iris();

let kernel = Kernel::params()
.method(KernelMethod::Gaussian(1.0))
.transform(dataset);
.transform(dataset.records());

let kernel = HierarchicalCluster::default()
.num_clusters(3)
.transform(kernel);

for (id, target) in kernel.targets().into_iter().zip(targets.into_iter()) {
for (id, target) in kernel
.targets()
.into_iter()
.zip(dataset.targets().into_iter())
{
let name = match *target as usize {
0 => "setosa",
1 => "versicolor",
Expand Down
4 changes: 1 addition & 3 deletions linfa-linear/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,5 @@ serde = { version = "1.0", default-features = false, features = ["derive"] }
linfa = { version = "0.2.1", path = ".." }

[dev-dependencies]
csv = "1.1"
ndarray-csv = "0.4"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["diabetes"] }
approx = "0.3.2"
flate2 = "1.0"
22 changes: 3 additions & 19 deletions linfa-linear/examples/diabetes.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,11 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa::traits::Fit;
use linfa_linear::LinearRegression;
use ndarray::Array2;
use ndarray_csv::Array2Reader;

use linfa::{traits::Fit, Dataset};

fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
let file = GzDecoder::new(File::open(path)?);
let mut reader = ReaderBuilder::new().has_headers(false).from_reader(file);
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
let data = read_array("../datasets/diabetes_data.csv.gz")?;
let target = read_array("../datasets/diabetes_target.csv.gz")?;
let target = target.column(0);

let dataset = Dataset::new(data, target);
// load Diabetes dataset
let dataset = linfa_datasets::diabetes();

let lin_reg = LinearRegression::new();
let model = lin_reg.fit(&dataset)?;
Expand Down
25 changes: 8 additions & 17 deletions linfa-linear/examples/glm.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use linfa_linear::TweedieRegressor;
use ndarray::Array2;
use ndarray_csv::Array2Reader;
use std::error::Error;
use std::fs::File;

fn main() -> Result<(), Box<dyn Error>> {
let data = read_array("../datasets/diabetes_data.csv.gz")?;
let target = read_array("../datasets/diabetes_target.csv.gz")?;
let target = target.column(0).to_owned();
// load the Diabetes dataset
let dataset = linfa_datasets::diabetes();

let data = dataset.records();
let targets = dataset.targets();

// Here the power and alpha is set to 0
// Setting the power to 0 makes it a Normal Regression
// Setting the alpha to 0 removes any regularization
// In total this is the regular old Linear Regression
let lin_reg = TweedieRegressor::new().power(0.).alpha(0.);
let model = lin_reg.fit(&data, &target)?;
let model = lin_reg.fit(&data, &targets)?;

// We print the learnt parameters
//
Expand All @@ -29,15 +26,9 @@ fn main() -> Result<(), Box<dyn Error>> {
//
// Some(43.27739632065444)
let ypred = model.predict(&data);
let loss = (target - ypred).mapv(|x| x.abs()).mean();
let loss = (targets - &ypred).mapv(|x| x.abs()).mean();

println!("{:?}", loss);

Ok(())
}

fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
let file = GzDecoder::new(File::open(path)?);
let mut reader = ReaderBuilder::new().has_headers(false).from_reader(file);
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}
4 changes: 1 addition & 3 deletions linfa-svm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,5 @@ linfa = { version = "0.2.1", path = ".." }
linfa-kernel = { version = "0.2.1", path = "../linfa-kernel" }

[dev-dependencies]
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"
linfa-datasets = { version = "0.2.1", path = "../datasets", features = ["winequality"] }
rand_isaac = "0.2"
45 changes: 5 additions & 40 deletions linfa-svm/examples/winequality.rs
Original file line number Diff line number Diff line change
@@ -1,45 +1,12 @@
use std::error::Error;
use std::fs::File;

use csv::ReaderBuilder;
use flate2::read::GzDecoder;
use ndarray::{Array1, Array2, Axis};
use ndarray_csv::Array2Reader;

use linfa::dataset::Dataset;
use linfa::dataset::Records;
use linfa::metrics::ToConfusionMatrix;
use linfa::traits::*;
use linfa::prelude::*;
use linfa_kernel::{Kernel, KernelMethod};
use linfa_svm::Svm;

/// Extract a gziped CSV file and return as dataset
fn read_array(path: &str) -> Result<Array2<f64>, Box<dyn Error>> {
// unzip file
let file = GzDecoder::new(File::open(path)?);
// create a CSV reader with headers and `;` as delimiter
let mut reader = ReaderBuilder::new()
.has_headers(true)
.delimiter(b';')
.from_reader(file);
// extract ndarray
let array = reader.deserialize_array2_dynamic()?;
Ok(array)
}

fn main() -> Result<(), Box<dyn Error>> {
// Read in the wine-quality dataset from dataset path
// The `.csv` data is two dimensional: Axis(0) denotes y-axis (rows), Axis(1) denotes x-axis (columns)
let dataset = read_array("../datasets/winequality-red.csv.gz")?;
// The first 11 columns are features used in training and the last columns are targets
let (data, targets) = dataset.view().split_at(Axis(1), 11);
let targets = targets.into_iter().collect::<Array1<_>>();

fn main() {
// everything above 6.5 is considered a good wine
let dataset = Dataset::new(data, targets).map_targets(|x| **x > 6.5);

// split into training and validation dataset
let (train, valid) = dataset.split_with_ratio(0.1);
let (train, valid) = linfa_datasets::winequality()
.map_targets(|x| *x > 6)
.split_with_ratio(0.9);

// transform with RBF kernel
let train_kernel = Kernel::params()
Expand Down Expand Up @@ -84,6 +51,4 @@ fn main() -> Result<(), Box<dyn Error>> {
// Calculate the accuracy and Matthew Correlation Coefficient (cross-correlation between
// predicted and targets)
println!("accuracy {}, MCC {}", cm.accuracy(), cm.mcc());

Ok(())
}
7 changes: 3 additions & 4 deletions linfa-trees/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,14 @@ features = ["std", "derive"]
ndarray = { version = "0.13" , features = ["rayon", "approx"]}
ndarray-rand = "0.11"

linfa = { path = ".." }
linfa = { version = "0.2.1", path = ".." }

[dev-dependencies]
rand_isaac = "0.2.0"
criterion = "0.3"
approx = "0.3"
csv = "1.1"
ndarray-csv = "0.4"
flate2 = "1.0"

linfa-datasets = { version = "0.2.1", path = "../datasets/", features = ["iris"] }

[[bench]]
name = "decision_tree"
Expand Down
Loading

0 comments on commit 3cec12b

Please sign in to comment.