diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..f857814
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "iosfwd": "cpp",
+        "vector": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/include/activation/activation_functions.h b/include/activation/activation_functions.h
new file mode 100644
index 0000000..6f306ea
--- /dev/null
+++ b/include/activation/activation_functions.h
@@ -0,0 +1,35 @@
+#ifndef ACTIVATION_FUNCTIONS_H
+#define ACTIVATION_FUNCTIONS_H
+
+#include <vector>
+#include <cmath>
+
+namespace activation {
+
+    // Sigmoid activation function
+    double sigmoid(double x);
+
+    // Derivative of the sigmoid function
+    double sigmoid_derivative(double x);
+
+    // Tanh activation function
+    double tanh(double x);
+
+    // Derivative of the tanh function
+    double tanh_derivative(double x);
+
+    // ReLU activation function
+    double relu(double x);
+
+    // Derivative of the ReLU function
+    double relu_derivative(double x);
+
+    // Apply an activation function element-wise to a vector.
+    // Defined in activation_functions.cpp and explicitly instantiated there
+    // for plain function pointers only; other callable types will not link.
+    template <typename Func>
+    std::vector<double> apply(const std::vector<double>& inputs, Func func);
+
+} // namespace activation
+
+#endif // ACTIVATION_FUNCTIONS_H
diff --git a/include/algorithms/linear_regression.h b/include/algorithms/linear_regression.h
index 5caebf7..8548ee3 100644
--- a/include/algorithms/linear_regression.h
+++ b/include/algorithms/linear_regression.h
@@ -9,9 +9,9 @@ namespace algorithms {
 class LinearRegression {
 public:
     // Constructor
-    LinearRegression() : m_slope(0.0), m_intercept(0.0) {}
+    LinearRegression() : m_slope(0.0), m_intercept(0.0), m_learning_rate(0.01), m_iterations(1000) {}
 
-    // Fit the model to the training data
+    // Fit the model to the training data using gradient descent
     void fit(const std::vector<double>& x, const std::vector<double>& y);
 
     // Predict the output for a given input
@@ -21,12 +21,22 @@ class LinearRegression {
     double getSlope() const { return m_slope; }
     double getIntercept() const { return m_intercept; }
 
+    // Set learning rate and number of iterations
+    void setLearningRate(double lr) { m_learning_rate = lr; }
+    void setIterations(int it) { m_iterations = it; }
+
 private:
     double m_slope;
     double m_intercept;
+    double m_learning_rate;
+    int m_iterations;
 
     // Helper function to compute the mean of a vector
     double mean(const std::vector<double>& v) const;
+
+    // Helper functions for gradient descent
+    double computeCost(const std::vector<double>& x, const std::vector<double>& y) const;
+    void gradientDescent(const std::vector<double>& x, const std::vector<double>& y);
 };
 
 } // namespace algorithms
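
Reviewer note (not part of the patch): a minimal sketch of how the reworked
API is meant to be driven. The sample data and hyperparameter values below
are illustrative assumptions, not values taken from this change.

    #include <iostream>
    #include <vector>
    #include "algorithms/linear_regression.h"

    int main() {
        // Points on y = 2x + 1; gradient descent should approximately
        // recover slope 2 and intercept 1.
        std::vector<double> x = {1.0, 2.0, 3.0, 4.0, 5.0};
        std::vector<double> y = {3.0, 5.0, 7.0, 9.0, 11.0};

        algorithms::LinearRegression model;
        model.setLearningRate(0.01); // matches the new default
        model.setIterations(5000);   // more than the default 1000
        model.fit(x, y);

        std::cout << "slope: " << model.getSlope()
                  << " intercept: " << model.getIntercept()
                  << " predict(6): " << model.predict(6.0) << std::endl;
        return 0;
    }

Unlike the closed-form least-squares fit this replaces, gradient descent only
approaches the optimum, so the recovered coefficients depend on the learning
rate and iteration count.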
diff --git a/src/activation/activation_functions.cpp b/src/activation/activation_functions.cpp
new file mode 100644
index 0000000..319c4aa
--- /dev/null
+++ b/src/activation/activation_functions.cpp
@@ -0,0 +1,54 @@
+#include "activation/activation_functions.h"
+
+namespace activation {
+
+// Sigmoid activation function
+double sigmoid(double x) {
+    return 1.0 / (1.0 + std::exp(-x));
+}
+
+// Derivative of sigmoid function
+double sigmoid_derivative(double x) {
+    double sig = sigmoid(x);
+    return sig * (1.0 - sig);
+}
+
+// Tanh activation function
+double tanh(double x) {
+    return std::tanh(x);
+}
+
+// Derivative of tanh function
+double tanh_derivative(double x) {
+    double tanh_x = tanh(x);
+    return 1.0 - tanh_x * tanh_x;
+}
+
+// ReLU activation function
+double relu(double x) {
+    return std::max(0.0, x);
+}
+
+// Derivative of ReLU function
+double relu_derivative(double x) {
+    return (x > 0) ? 1.0 : 0.0;
+}
+
+// Apply an activation function to a vector
+template <typename Func>
+std::vector<double> apply(const std::vector<double>& inputs, Func func) {
+    std::vector<double> result;
+    result.reserve(inputs.size());
+    for (double input : inputs) {
+        result.push_back(func(input));
+    }
+    return result;
+}
+
+// Explicit instantiation for plain function pointers, e.g. apply(v, sigmoid).
+// Callers passing lambdas or other functor types would need additional
+// instantiations here, or the template definition moved into the header.
+template std::vector<double> apply(const std::vector<double>& inputs, double (*func)(double));
+
+} // namespace activation
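
Reviewer note (not part of the patch): because apply() is defined in this .cpp
and explicitly instantiated only for double (*)(double), callers must pass the
activations as plain function pointers. A minimal caller sketch:

    #include <iostream>
    #include <vector>
    #include "activation/activation_functions.h"

    int main() {
        std::vector<double> v = {-2.0, -0.5, 0.0, 0.5, 2.0};

        // Links: relu decays to double (*)(double), matching the explicit
        // instantiation in activation_functions.cpp.
        std::vector<double> out = activation::apply(v, activation::relu);
        for (double d : out) std::cout << d << ' ';
        std::cout << '\n';

        // Would NOT link with this patch: a lambda is a distinct closure
        // type, and no instantiation of apply() exists for it.
        // auto squared = activation::apply(v, [](double d) { return d * d; });
        return 0;
    }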
diff --git a/src/algorithms/linear_regression.cpp b/src/algorithms/linear_regression.cpp
index 017d359..3119dc0 100644
--- a/src/algorithms/linear_regression.cpp
+++ b/src/algorithms/linear_regression.cpp
@@ -1,38 +1,64 @@
 #include "algorithms/linear_regression.h"
-#include <numeric> // for std::accumulate
+#include <numeric>
+#include <cmath>
 
 namespace algorithms {
 
-void LinearRegression::fit(const std::vector<double>& x, const std::vector<double>& y) {
-    if (x.size() != y.size() || x.empty()) {
-        throw std::invalid_argument("Input vectors must be of the same size and non-empty.");
+// Compute the mean of a vector
+double LinearRegression::mean(const std::vector<double>& v) const {
+    return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
+}
+
+// Compute the cost (Mean Squared Error)
+double LinearRegression::computeCost(const std::vector<double>& x, const std::vector<double>& y) const {
+    double total_error = 0.0;
+    size_t n = x.size();
+    for (size_t i = 0; i < n; ++i) {
+        double prediction = m_slope * x[i] + m_intercept;
+        double error = prediction - y[i];
+        total_error += error * error;
     }
+    return total_error / (2.0 * n); // half of the MSE; the 1/2 cancels in the gradient
+}
 
-    double x_mean = mean(x);
-    double y_mean = mean(y);
+// Perform gradient descent to optimize slope and intercept
+void LinearRegression::gradientDescent(const std::vector<double>& x, const std::vector<double>& y) {
+    size_t n = x.size();
+    for (int i = 0; i < m_iterations; ++i) {
+        double slope_gradient = 0.0;
+        double intercept_gradient = 0.0;
+        for (size_t j = 0; j < n; ++j) {
+            double prediction = m_slope * x[j] + m_intercept;
+            double error = prediction - y[j];
+            slope_gradient += error * x[j];
+            intercept_gradient += error;
+        }
+        slope_gradient /= n;
+        intercept_gradient /= n;
 
-    double numerator = 0.0;
-    double denominator = 0.0;
+        m_slope -= m_learning_rate * slope_gradient;
+        m_intercept -= m_learning_rate * intercept_gradient;
 
-    for (size_t i = 0; i < x.size(); ++i) {
-        numerator += (x[i] - x_mean) * (y[i] - y_mean);
-        denominator += (x[i] - x_mean) * (x[i] - x_mean);
+        // Optional: print the cost every 100 iterations to monitor convergence.
+        // Disabled by default; add #include <iostream> before enabling.
+        // if (i % 100 == 0) {
+        //     std::cout << "Iteration " << i << ": Cost "
+        //               << computeCost(x, y) << std::endl;
+        // }
     }
+}
 
-    if (denominator == 0.0) {
-        throw std::runtime_error("Denominator in slope calculation is zero.");
+// Fit the model using gradient descent
+void LinearRegression::fit(const std::vector<double>& x, const std::vector<double>& y) {
+    if (x.size() != y.size() || x.empty()) {
+        throw std::invalid_argument("Input vectors must be non-empty and of the same size.");
     }
-
-    m_slope = numerator / denominator;
-    m_intercept = y_mean - m_slope * x_mean;
+    gradientDescent(x, y);
 }
 
+// Predict the output for a given input
 double LinearRegression::predict(double x) const {
     return m_slope * x + m_intercept;
 }
 
-double LinearRegression::mean(const std::vector<double>& v) const {
-    return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
-}
-
 } // namespace algorithms
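
Reviewer note (not part of the patch): for reference, gradientDescent() is
plain batch gradient descent on the half-MSE cost computed by computeCost(),

    J(m, b) = (1 / 2n) * sum_i (m * x_i + b - y_i)^2

whose partial derivatives are

    dJ/dm = (1 / n) * sum_i (m * x_i + b - y_i) * x_i
    dJ/db = (1 / n) * sum_i (m * x_i + b - y_i)

which is exactly what slope_gradient and intercept_gradient accumulate before
the learning-rate step. The 1/2 in the cost exists only to cancel the 2 that
falls out of differentiating the square.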