
Feature/skmeans #99

Merged 11 commits on Jun 14, 2022
2 changes: 1 addition & 1 deletion include/algorithms/public/KMeans.hpp
@@ -118,7 +118,7 @@ class KMeans
out = _impl::asFluid(D);
}

private:
protected:
double distance(Eigen::ArrayXd v1, Eigen::ArrayXd v2) const
Member

Suggested change
double distance(Eigen::ArrayXd v1, Eigen::ArrayXd v2) const
double distance(const Eigen::ArrayXd& v1, const Eigen::ArrayXd& v2) const

{
return (v1 - v2).matrix().norm();
42 changes: 31 additions & 11 deletions include/algorithms/public/PCA.hpp
@@ -25,6 +25,7 @@ class PCA
public:
using MatrixXd = Eigen::MatrixXd;
using VectorXd = Eigen::VectorXd;
using ArrayXd = Eigen::ArrayXd;

void init(RealMatrixView in)
{
@@ -35,47 +36,66 @@
MatrixXd X = (input.rowwise() - mMean.transpose());
BDCSVD<MatrixXd> svd(X.matrix(), ComputeThinV | ComputeThinU);
mBases = svd.matrixV();
mValues = svd.singularValues();
mExplainedVariance = svd.singularValues().array().square() / (X.rows() - 1);
mInitialized = true;
}

void init(RealMatrixView bases, RealVectorView values, RealVectorView mean)
void init(RealMatrixView bases, RealVectorView explainedVariance,
RealVectorView mean)
{
mBases = _impl::asEigen<Eigen::Matrix>(bases);
mValues = _impl::asEigen<Eigen::Matrix>(values);
mExplainedVariance = _impl::asEigen<Eigen::Matrix>(explainedVariance);
mMean = _impl::asEigen<Eigen::Matrix>(mean);
mInitialized = true;
}

void processFrame(const RealVectorView in, RealVectorView out, index k) const
void processFrame(const RealVectorView in, RealVectorView out, index k,
bool whiten = false) const
{
using namespace Eigen;
using namespace _impl;
if (k > mBases.cols()) return;
VectorXd input = asEigen<Matrix>(in);
input = input - mMean;
VectorXd result = input.transpose() * mBases.block(0, 0, mBases.rows(), k);
if (whiten)
{
ArrayXd norm = mExplainedVariance.segment(0, k).max(epsilon).rsqrt();
result.array() *= norm;
}
out = _impl::asFluid(result);
}

double process(const RealMatrixView in, RealMatrixView out, index k) const
double process(const RealMatrixView in, RealMatrixView out, index k,
bool whiten = false) const
{
using namespace Eigen;
using namespace _impl;
if (k > mBases.cols()) return 0;
MatrixXd input = asEigen<Matrix>(in);
MatrixXd result = (input.rowwise() - mMean.transpose()) *
mBases.block(0, 0, mBases.rows(), k);
if (whiten)
{
ArrayXd norm = mExplainedVariance.segment(0, k).max(epsilon).rsqrt();
result = result.array().rowwise() * norm.transpose();
}
double variance = 0;
double total = mValues.sum();
for (index i = 0; i < k; i++) variance += mValues[i];
double total = mExplainedVariance.sum();
for (index i = 0; i < k; i++) variance += mExplainedVariance[i];
out = _impl::asFluid(result);
return variance / total;
}

bool initialized() const { return mInitialized; }
void getBases(RealMatrixView out) const { out = _impl::asFluid(mBases); }
void getValues(RealVectorView out) const { out = _impl::asFluid(mValues); }
bool initialized() const { return mInitialized; }

void getBases(RealMatrixView out) const { out = _impl::asFluid(mBases); }

void getExplainedVariance(RealVectorView out) const
{
out = _impl::asFluid(mExplainedVariance);
}

void getMean(RealVectorView out) const { out = _impl::asFluid(mMean); }
index dims() const { return mBases.rows(); }
index size() const { return mBases.cols(); }
@@ -87,7 +107,7 @@ class PCA
}

MatrixXd mBases;
VectorXd mValues;
ArrayXd mExplainedVariance;
VectorXd mMean;
bool mInitialized{false};
};
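As an aside for readers of this diff: whitening rescales each retained component by the reciprocal square root of its explained variance, so the projected dimensions come out with roughly unit variance. A minimal self-contained sketch of the idea (illustrative only; names like `centred` and the local `epsilon` are not from this PR):

```cpp
// Standalone sketch of the whitening path added above (assumes Eigen 3).
#include <Eigen/Core>
#include <Eigen/SVD>
#include <iostream>

int main()
{
  using namespace Eigen;
  MatrixXd X(4, 2); // toy data: 4 points, 2 dimensions
  X << 1, 2,
       3, 4,
       5, 6,
       7, 9;
  RowVectorXd mean = X.colwise().mean();
  MatrixXd centred = X.rowwise() - mean;
  BDCSVD<MatrixXd> svd(centred, ComputeThinU | ComputeThinV);
  // explained variance per component: sigma_i^2 / (n - 1)
  ArrayXd explainedVariance =
      svd.singularValues().array().square() / (X.rows() - 1);
  MatrixXd projected = centred * svd.matrixV();
  // whiten: scale each component by 1/sqrt(variance), guarding tiny values
  const double epsilon = 1e-10; // stand-in for the class's guard constant
  ArrayXd scale = explainedVariance.max(epsilon).rsqrt();
  MatrixXd whitened =
      (projected.array().rowwise() * scale.transpose()).matrix();
  // per-component variance of the output is now ~1
  std::cout << whitened.array().square().colwise().sum() / (X.rows() - 1)
            << "\n";
}
```

The `epsilon` guard mirrors the `max(epsilon)` calls in `processFrame` and `process` above and keeps near-zero variances from blowing up the scaling.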
121 changes: 121 additions & 0 deletions include/algorithms/public/SKMeans.hpp
@@ -0,0 +1,121 @@
/*
Part of the Fluid Corpus Manipulation Project (http://www.flucoma.org/)
Copyright 2017-2019 University of Huddersfield.
Licensed under the BSD-3 License.
See license.md file in the project root for full license information.
This project has received funding from the European Research Council (ERC)
under the European Union’s Horizon 2020 research and innovation programme
(grant agreement No 725899).
*/

#pragma once

#include "../public/KMeans.hpp"
#include "../util/FluidEigenMappings.hpp"
#include "../../data/FluidDataSet.hpp"
#include "../../data/FluidIndex.hpp"
#include "../../data/FluidTensor.hpp"
#include "../../data/TensorTypes.hpp"
#include <Eigen/Core>
#include <queue>
#include <string>

namespace fluid {
namespace algorithm {

class SKMeans : public KMeans
{

public:
void train(const FluidDataSet<std::string, double, 1>& dataset, index k,
index maxIter)
{
using namespace Eigen;
using namespace _impl;
assert(!mTrained || (dataset.pointSize() == mDims && mK == k));
MatrixXd dataPoints = asEigen<Matrix>(dataset.getData());
MatrixXd dataPointsT = dataPoints.transpose();
if (mTrained)
{
mEmbedding = mMeans.matrix() * dataPointsT;
mAssignments = assignClusters(mEmbedding);
}
else
{
mK = k;
mDims = dataset.pointSize();
initMeans(dataPoints);
}

while (maxIter-- > 0)
{
mEmbedding = mMeans.matrix() * dataPointsT;
auto assignments = assignClusters(mEmbedding);
if (!changed(assignments)) { break; }
else
mAssignments = assignments;
updateEmbedding();
computeMeans(dataPoints);
}
mTrained = true;
}


void transform(RealMatrixView data, RealMatrixView out,
double alpha = 0.25) const
{
using namespace Eigen;
MatrixXd points = _impl::asEigen<Matrix>(data).transpose();
MatrixXd embedding = (mMeans.matrix() * points).array() - alpha;
Member

Are we sure about baking in the encoding scheme from Coates and Ng here?

I guess the argument in favour is that it enables recreating their feature learning scheme with the fewest objects. The arguments against would be that it's not strictly part of skmeans, but was a separate step used by C&N specifically in the feature learning setting, and they do discuss alternatives.

Obviously having it here doesn't preclude using an alternative scheme, because alpha can be set to 0. However, the other question is whether this encoder could be useful for other things if it were factored out, e.g. using NMF for feature learning?

Member Author

The soft-thresholding function is similar to neural-network activation functions, so NNFuncs is where it would fit best, but I don't think these functions deserve their own client, so in practice I would still see this as part of the SKMeans client; factoring it out would not help with code duplication. An interesting idea (maybe for the future?) would be a feature-learning client that could use different learning techniques and encodings. For the moment, maybe it can be introduced as an option for skmeans. We could also use an MLP as an autoencoder for feature learning.

Member

Ok, let's roll with what we have and see how we get on. I like the idea of some future feature learning object that could make it easy to explore options and manage some of the complexity / fiddliness.

embedding = (embedding.array() > 0).select(embedding, 0).transpose();
out = _impl::asFluid(embedding);
Member

Suggested change
out = _impl::asFluid(embedding);
out <<= _impl::asFluid(embedding);

New copy assignment operator for FluidTensorView

}

private:

void initMeans(Eigen::MatrixXd& dataPoints)
{
using namespace Eigen;
mMeans = ArrayXXd::Zero(mK, mDims);
mAssignments =
((0.5 + (0.5 * ArrayXd::Random(dataPoints.rows()))) * (mK - 1))
.round()
.cast<int>();
mEmbedding = MatrixXd::Zero(mK, dataPoints.rows());
for (index i = 0; i < dataPoints.rows(); i++)
mEmbedding(mAssignments(i), i) = 1;
computeMeans(dataPoints);
}

void updateEmbedding()
{
for (index i = 0; i < mAssignments.size(); i++)
{
double val = mEmbedding(mAssignments(i), i);
mEmbedding.col(i).setZero();
mEmbedding(mAssignments(i), i) = val;
}
}


Eigen::VectorXi assignClusters(const Eigen::MatrixXd& embedding) const
{
Eigen::VectorXi assignments = Eigen::VectorXi::Zero(embedding.cols());
for (index i = 0; i < embedding.cols(); i++)
{
Eigen::VectorXd::Index maxIndex;
embedding.col(i).maxCoeff(&maxIndex);
assignments(i) = static_cast<int>(maxIndex);
}
return assignments;
}


void computeMeans(const Eigen::MatrixXd& dataPoints)
{
mMeans = mEmbedding * dataPoints;
mMeans.matrix().rowwise().normalize();
}


private:
Eigen::MatrixXd mEmbedding;
};
} // namespace algorithm
} // namespace fluid
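To make the review discussion above concrete, here is a minimal sketch of the Coates & Ng soft-threshold encoding that `transform` implements. `encode` is an illustrative free function, not part of this PR, and it assumes unit-norm centroid rows, which spherical k-means produces:

```cpp
#include <Eigen/Core>
#include <iostream>

// Coates & Ng soft-threshold encoding: f_k(x) = max(0, c_k . x - alpha),
// where the c_k are unit-norm centroids (rows of means, K x D).
Eigen::VectorXd encode(const Eigen::MatrixXd& means, const Eigen::VectorXd& x,
                       double alpha = 0.25)
{
  Eigen::ArrayXd similarity = (means * x).array() - alpha;
  return similarity.max(0.0).matrix(); // rectify: keep only strong matches
}

int main()
{
  Eigen::MatrixXd means(2, 2);
  means << 1, 0,
           0, 1; // two orthonormal centroids
  Eigen::VectorXd x(2);
  x << 0.9, 0.1;
  // prints "0.65 0": only the first centroid clears the threshold
  std::cout << encode(means, x).transpose() << "\n";
}
```

Setting alpha to 0 degenerates to a plain rectified dot product, which is the escape hatch mentioned in the thread.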
16 changes: 9 additions & 7 deletions include/clients/nrt/PCAClient.hpp
@@ -19,15 +19,16 @@ namespace pca {

constexpr auto PCAParams = defineParameters(
StringParam<Fixed<true>>("name", "Name"),
LongParam("numDimensions", "Target Number of Dimensions", 2, Min(1)));
LongParam("numDimensions", "Target Number of Dimensions", 2, Min(1)),
EnumParam("whiten", "Whiten data", 0, "No", "Yes"));

class PCAClient : public FluidBaseClient,
OfflineIn,
OfflineOut,
ModelObject,
public DataClient<algorithm::PCA>
{
enum { kName, kNumDimensions };
enum { kName, kNumDimensions, kWhiten };

public:
using string = std::string;
@@ -54,7 +55,7 @@
template <typename T>
Result process(FluidContext&)
{
return{};
return {};
}

MessageResult<void> fit(DataSetClientRef datasetClient)
@@ -98,7 +99,7 @@

StringVector ids{srcDataSet.getIds()};
RealMatrix output(srcDataSet.size(), k);
result = mAlgorithm.process(srcDataSet.getData(), output, k);
result = mAlgorithm.process(srcDataSet.getData(), output, k, get<kWhiten>() == 1);
FluidDataSet<string, double, 1> result(ids, output);
destPtr->setDataSet(result);
}
@@ -124,7 +125,7 @@
FluidTensor<double, 1> src(mAlgorithm.dims());
FluidTensor<double, 1> dest(k);
src = BufferAdaptor::ReadAccess(in.get()).samps(0, mAlgorithm.dims(), 0);
mAlgorithm.processFrame(src, dest, k);
mAlgorithm.processFrame(src, dest, k, get<kWhiten>() == 1);
outBuf.samps(0, k, 0) = dest;
return OK();
}
@@ -151,12 +152,13 @@ using PCARef = SharedClientRef<PCAClient>;
constexpr auto PCAQueryParams = defineParameters(
PCARef::makeParam("model", "Source Model"),
LongParam("numDimensions", "Target Number of Dimensions", 2, Min(1)),
EnumParam("whiten", "Whiten data", 0, "No", "Yes"),
BufferParam("inputPointBuffer", "Input Point Buffer"),
BufferParam("predictionBuffer", "Prediction Buffer"));

class PCAQuery : public FluidBaseClient, ControlIn, ControlOut
{
enum { kModel, kNumDimensions, kInputBuffer, kOutputBuffer };
enum { kModel, kNumDimensions, kWhiten, kInputBuffer, kOutputBuffer };

public:
using ParamDescType = decltype(PCAQueryParams);
@@ -207,7 +209,7 @@ class PCAQuery : public FluidBaseClient, ControlIn, ControlOut
RealVector dest(k);
src = BufferAdaptor::ReadAccess(get<kInputBuffer>().get())
.samps(0, algorithm.dims(), 0);
algorithm.processFrame(src, dest, k);
algorithm.processFrame(src, dest, k, get<kWhiten>() == 1);
outBuf.samps(0, k, 0) = dest;
}
}
39 changes: 33 additions & 6 deletions include/data/FluidJSON.hpp
@@ -2,6 +2,7 @@

#include <algorithms/public/KDTree.hpp>
#include <algorithms/public/KMeans.hpp>
#include <algorithms/public/SKMeans.hpp>
#include <algorithms/public/Normalization.hpp>
#include <algorithms/public/RobustScaling.hpp>
#include <algorithms/public/PCA.hpp>
@@ -186,6 +187,31 @@ void from_json(const nlohmann::json &j, KMeans &kmeans) {
kmeans.setMeans(means);
}

// SKMeans
void to_json(nlohmann::json &j, const SKMeans &skmeans) {
RealMatrix means(skmeans.getK(), skmeans.dims());
skmeans.getMeans(means);
j["means"] = RealMatrixView(means);
j["rows"] = means.rows();
j["cols"] = means.cols();
}

bool check_json(const nlohmann::json &j, const SKMeans &) {
return fluid::check_json(j,
{"rows", "cols", "means"},
{JSONTypes::NUMBER, JSONTypes::NUMBER, JSONTypes::ARRAY}
);
}

void from_json(const nlohmann::json &j, SKMeans &skmeans) {
index rows = j.at("rows");
index cols = j.at("cols");
Member

Suggested change
index rows = j.at("rows");
index cols = j.at("cols");
index rows = j.at("rows").get<index>();
index cols = j.at("cols").get<index>();

Implicit conversion via at() is no longer recommended by the JSON library's author, especially in from_json: https://nicedoc.io/nlohmann/json#implicit-conversions

RealMatrix means(rows, cols);
j.at("means").get_to(means);
skmeans.setMeans(means);
}


// Normalize
void to_json(nlohmann::json &j, const Normalization &normalization) {
RealVector dataMin(normalization.dims());
@@ -296,20 +322,21 @@ void to_json(nlohmann::json &j, const PCA &pca) {
index cols = pca.size();
RealMatrix bases(rows, cols);
RealVector values(cols);
RealVector explainedVariance(cols);
RealVector mean(rows);
pca.getBases(bases);
pca.getValues(values);
pca.getExplainedVariance(explainedVariance);
pca.getMean(mean);
j["bases"] = RealMatrixView(bases);
j["values"] = RealVectorView(values);
j["explainedvariance"] = RealVectorView(explainedVariance);
j["mean"] = RealVectorView(mean);
j["rows"] = rows;
j["cols"] = cols;
}

bool check_json(const nlohmann::json &j, const PCA &) {
return fluid::check_json(j,
{"rows","cols", "bases", "values", "mean"},
{"rows","cols", "bases", "explainedvariance", "mean"},
{JSONTypes::NUMBER, JSONTypes::NUMBER, JSONTypes::ARRAY, JSONTypes::ARRAY, JSONTypes::ARRAY}
);
}
@@ -319,11 +346,11 @@ void from_json(const nlohmann::json &j, PCA &pca) {
index cols = j.at("cols");
RealMatrix bases(rows, cols);
RealVector mean(rows);
RealVector values(cols);
RealVector explainedVariance(cols);
j.at("mean").get_to(mean);
j.at("values").get_to(values);
j.at("explainedvariance").get_to(explainedVariance);
j.at("bases").get_to(bases);
pca.init(bases, values, mean);
pca.init(bases, explainedVariance, mean);
}


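For orientation, a PCA model serialized by the code above would look roughly like this (values invented; the key set matches the check_json list, while the exact nesting of "bases" follows RealMatrixView serialization):

```json
{
  "rows": 3,
  "cols": 2,
  "bases": [[0.58, -0.41], [0.58, 0.82], [0.58, -0.41]],
  "explainedvariance": [4.2, 0.9],
  "mean": [1.0, 2.0, 3.0]
}
```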