From 2d82d91637158e7da62ececccdee018f4c600114 Mon Sep 17 00:00:00 2001 From: Oghenemano Utomudo Omogha Date: Wed, 18 Sep 2024 00:37:27 +0200 Subject: [PATCH] [feat] add kmeans --- CMakeLists.txt | 6 ++ examples/kmeans_functions_example.cpp | 34 +++++++ include/algorithms/kmeans.h | 46 +++++++++ include/algorithms/logistic_regression.h | 44 +++++++++ src/algorithms/kmeans.cpp | 116 +++++++++++++++++++++++ 5 files changed, 246 insertions(+) create mode 100644 examples/kmeans_functions_example.cpp create mode 100644 include/algorithms/kmeans.h create mode 100644 include/algorithms/logistic_regression.h create mode 100644 src/algorithms/kmeans.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 18a064b..c395b82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,9 @@ include_directories(include) add_library(InfernoML STATIC src/algorithms/linear_regression.cpp ) +add_library(Kmeans STATIC + src/algorithms/kmeans.cpp +) add_library(Activations STATIC src/activation/activation_functions.cpp @@ -19,12 +22,15 @@ add_library(Activations STATIC # Define executables add_executable(linear_regression_example examples/linear_regression_example.cpp) +add_executable(kmeans_functions_example examples/kmeans_functions_example.cpp) add_executable(activation_functions_example examples/activation_functions_example.cpp) # Link libraries to executables target_link_libraries(linear_regression_example PRIVATE InfernoML) target_link_libraries(activation_functions_example PRIVATE Activations) +target_link_libraries(kmeans_functions_example PRIVATE Kmeans) # Ensure that include directories are added for the specific targets if necessary target_include_directories(linear_regression_example PRIVATE ${PROJECT_SOURCE_DIR}/include) +target_include_directories(kmeans_functions_example PRIVATE ${PROJECT_SOURCE_DIR}/include) target_include_directories(activation_functions_example PRIVATE ${PROJECT_SOURCE_DIR}/include) diff --git a/examples/kmeans_functions_example.cpp b/examples/kmeans_functions_example.cpp new file mode 100644 index 0000000..f894843 --- /dev/null +++ b/examples/kmeans_functions_example.cpp @@ -0,0 +1,34 @@ +#include +#include +#include "algorithms/kmeans.h" + +int main() { + // Define data points (each point is 1D in this example) + std::vector> data = { + {0.5}, {-0.3}, {0.8}, {-1.2}, {0.0}, {2.5}, {3.0}, {2.8}, {-2.0}, {1.5} + }; + + // Instantiate the KMeans class with 2 clusters + algorithms::KMeans kmeans(2); + + // Fit the KMeans algorithm to the data + kmeans.fit(data); + + // Retrieve cluster labels and centroids + std::vector labels = kmeans.getLabels(); + std::vector> centroids = kmeans.getCentroids(); + + // Print cluster labels for each data point + std::cout << "KMeans Labels:" << std::endl; + for (size_t i = 0; i < labels.size(); i++) { + std::cout << "Data Point " << i << " is in Cluster " << labels[i] << std::endl; + } + + // Print the centroids of each cluster + std::cout << "Cluster Centroids:" << std::endl; + for (size_t i = 0; i < centroids.size(); i++) { + std::cout << "Centroid " << i << ": " << centroids[i][0] << std::endl; + } + + return 0; +} diff --git a/include/algorithms/kmeans.h b/include/algorithms/kmeans.h new file mode 100644 index 0000000..5e8cd67 --- /dev/null +++ b/include/algorithms/kmeans.h @@ -0,0 +1,46 @@ +#ifndef KMEANS_H +#define KMEANS_H + +#include + +namespace algorithms{ + +class KMeans { +public: + // Constructor + KMeans(int k, int maxIterations = 100); + + // Fit the model to the data + void fit(const std::vector>& data); + + // Get cluster labels + const std::vector& getLabels() const; + + // Get cluster centers + const std::vector>& getCentroids() const; + +private: + int k; // Number of clusters + int maxIterations; // Maximum number of iterations + std::vector> centroids; // Cluster centroids + std::vector labels; // Cluster labels for each point + + // Randomly initialize centroids from the data points + void initializeCentroids(const std::vector>& data, int dimensions); + + // Assign each point to the nearest centroid + void assignClusters(const std::vector>& data); + + // Update centroids based on the mean of the assigned points + void updateCentroids(const std::vector>& data, int dimensions); + + // Calculate Euclidean distance between two points + double euclideanDistance(const std::vector& p1, const std::vector& p2) const; + + // Check if the centroids have converged (optional, currently fixed iterations) + bool converged() const; +}; + +} + +#endif diff --git a/include/algorithms/logistic_regression.h b/include/algorithms/logistic_regression.h new file mode 100644 index 0000000..8548ee3 --- /dev/null +++ b/include/algorithms/logistic_regression.h @@ -0,0 +1,44 @@ +#ifndef LINEAR_REGRESSION_H +#define LINEAR_REGRESSION_H + +#include +#include + +namespace algorithms { + +class LinearRegression { +public: + // Constructor + LinearRegression() : m_slope(0.0), m_intercept(0.0), m_learning_rate(0.01), m_iterations(1000) {} + + // Fit the model to the training data using gradient descent + void fit(const std::vector& x, const std::vector& y); + + // Predict the output for a given input + double predict(double x) const; + + // Getters for the parameters + double getSlope() const { return m_slope; } + double getIntercept() const { return m_intercept; } + + // Set learning rate and number of iterations + void setLearningRate(double lr) { m_learning_rate = lr; } + void setIterations(int it) { m_iterations = it; } + +private: + double m_slope; + double m_intercept; + double m_learning_rate; + int m_iterations; + + // Helper function to compute the mean of a vector + double mean(const std::vector& v) const; + + // Helper functions for gradient descent + double computeCost(const std::vector& x, const std::vector& y) const; + void gradientDescent(const std::vector& x, const std::vector& y); +}; + +} // namespace algorithms + +#endif // LINEAR_REGRESSION_H diff --git a/src/algorithms/kmeans.cpp b/src/algorithms/kmeans.cpp new file mode 100644 index 0000000..9899562 --- /dev/null +++ b/src/algorithms/kmeans.cpp @@ -0,0 +1,116 @@ +#include "algorithms/kmeans.h" +#include +#include +#include +#include + + +namespace algorithms{ + +// Constructor +KMeans::KMeans(int k, int maxIterations) : k(k), maxIterations(maxIterations) {} + +// Fit the model to the data +void KMeans::fit(const std::vector>& data) { + int numPoints = data.size(); + int dimensions = data[0].size(); + + // Randomly initialize centroids + initializeCentroids(data, dimensions); + + // Run the algorithm for a fixed number of iterations + for (int iteration = 0; iteration < maxIterations; iteration++) { + // Step 1: Assign points to the closest centroid + assignClusters(data); + + // Step 2: Update centroids based on the points assigned to them + updateCentroids(data, dimensions); + + // Check for convergence (not implemented for simplicity, fixed iterations) + if (converged()) { + break; + } + } +} + +// Get cluster labels +const std::vector& KMeans::getLabels() const { + return labels; +} + +// Get cluster centers +const std::vector>& KMeans::getCentroids() const { + return centroids; +} + +// Randomly initialize centroids from the data points +void KMeans::initializeCentroids(const std::vector>& data, int dimensions) { + centroids.resize(k, std::vector(dimensions)); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, data.size() - 1); + + for (int i = 0; i < k; i++) { + centroids[i] = data[dis(gen)]; + } +} + +// Assign each point to the nearest centroid +void KMeans::assignClusters(const std::vector>& data) { + labels.resize(data.size()); + for (size_t i = 0; i < data.size(); i++) { + double minDist = std::numeric_limits::max(); + int closestCentroid = 0; + for (int j = 0; j < k; j++) { + double dist = euclideanDistance(data[i], centroids[j]); + if (dist < minDist) { + minDist = dist; + closestCentroid = j; + } + } + labels[i] = closestCentroid; + } +} + +// Update centroids based on the mean of the assigned points +void KMeans::updateCentroids(const std::vector>& data, int dimensions) { + std::vector> newCentroids(k, std::vector(dimensions, 0.0)); + std::vector pointsPerCentroid(k, 0); + + // Sum the points assigned to each centroid + for (size_t i = 0; i < data.size(); i++) { + int centroidIndex = labels[i]; + pointsPerCentroid[centroidIndex]++; + for (int d = 0; d < dimensions; d++) { + newCentroids[centroidIndex][d] += data[i][d]; + } + } + + // Update the centroids by computing the average + for (int j = 0; j < k; j++) { + if (pointsPerCentroid[j] > 0) { + for (int d = 0; d < dimensions; d++) { + newCentroids[j][d] /= pointsPerCentroid[j]; + } + } + } + + centroids = newCentroids; +} + +// Calculate Euclidean distance between two points +double KMeans::euclideanDistance(const std::vector& p1, const std::vector& p2) const { + double sum = 0.0; + for (size_t i = 0; i < p1.size(); i++) { + sum += std::pow(p1[i] - p2[i], 2); + } + return std::sqrt(sum); +} + +// Check if the centroids have converged (optional, currently fixed iterations) +bool KMeans::converged() const { + // This can be implemented with a threshold to check if centroids have stopped moving + return false; +} + +} \ No newline at end of file