From 78741990f3706ac0d159bff1c32fb3bbc3a6870f Mon Sep 17 00:00:00 2001
From: Sermet Pekin
Date: Sun, 8 Dec 2024 23:00:32 +0300
Subject: [PATCH] ch1: fix shared_ptr<Value> mismatch in cross-entropy; add DataFrame train/eval

---
 Makefile                  |   1 +
 easy_df.cpp               |  42 +++++++++
 include/console_utils.hpp |  13 +--
 include/data_utils.hpp    |  13 +++
 include/dataframe.hpp     |  94 ++++++++++++++++++--
 include/datasetType.hpp   |  38 ++++++++
 include/loss.hpp          |  31 ++++---
 include/micrograd.hpp     |   1 +
 include/train_eval.hpp    | 181 ++++++++++++++++++++++++++++++++++++++
 include/types.hpp         |   1 +
 include/value.hpp         |  18 +++-
 main.cpp                  |  15 ++++
 tests/test_cross_ent.cpp  |  41 +++++++++
 13 files changed, 466 insertions(+), 23 deletions(-)
 create mode 100644 easy_df.cpp
 create mode 100644 include/train_eval.hpp
 create mode 100644 tests/test_cross_ent.cpp

diff --git a/Makefile b/Makefile
index d3fa511..bf010b3 100644
--- a/Makefile
+++ b/Makefile
@@ -40,6 +40,7 @@ clean:
 	rm -f $(OBJS) $(TARGET) $(TEST_TARGET)
 
 test: clean test_only
+pytest: clean test_only
 
 .PHONY: clean run test

diff --git a/easy_df.cpp b/easy_df.cpp
new file mode 100644
index 0000000..27bd692
--- /dev/null
+++ b/easy_df.cpp
@@ -0,0 +1,42 @@
+#include "micrograd.hpp"
+using namespace microgradCpp;
+
+int main()
+{
+    // DatasetType dataset = get_iris();
+
+    DataFrame df;
+    df.from_csv("./data/iris.csv");
+
+    df.encode_column("variety");
+
+    df.print();
+
+    // shuffle(dataset);
+    double TRAIN_SIZE{0.8};
+
+    // Create the MLP model
+    // Input: 4 features, hidden layers: [10, 10], output: 3 classes
+    MLP model(4, {10, 10, 3});
+
+    // Hyperparameters
+    double learning_rate = 0.01;
+    int epochs = 2;
+
+    // Train and evaluate the model
+    train_eval(df, TRAIN_SIZE, model, learning_rate, epochs);
+
+    return 0;
+}
+
+/*
+Notes
+-----------
+
+g++ -std=c++17 -Iinclude -O2 -o main easy_df.cpp
+
+// or
+make run
+
+*/
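Note: encode_column("variety") is assumed to label-encode the class column in place. The sketch below shows that assumed behavior as a free-standing hypothetical helper (label_encode is not part of this patch, and the order of code assignment is an assumption):

#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical sketch: map each distinct label to the next unused double code,
// in order of first appearance (e.g. Setosa -> 0, Versicolor -> 1, Virginica -> 2).
std::vector<double> label_encode(const std::vector<std::string> &labels)
{
    std::unordered_map<std::string, double> codes;
    std::vector<double> out;
    out.reserve(labels.size());
    for (const auto &s : labels)
    {
        // emplace keeps the existing code if the label was seen before
        auto [it, inserted] = codes.emplace(s, static_cast<double>(codes.size()));
        out.push_back(it->second);
    }
    return out;
}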
šŸ”„šŸ”„šŸ”„\n" << std::endl; - - throw std::out_of_range(reason) ; - } + throw std::out_of_range(reason); + } + inline void stop(const std::string &reason = "...") + { + epic_out_of_range(reason); + } // Function to format shapes for display inline std::string format_shape(size_t rows, size_t cols) diff --git a/include/data_utils.hpp b/include/data_utils.hpp index 1f21fae..b639c82 100644 --- a/include/data_utils.hpp +++ b/include/data_utils.hpp @@ -14,9 +14,22 @@ #include #include "mlp.hpp" #include "console_utils.hpp" +#include "types.hpp" +using namespace microgradCpp; using vv_string = std::vector>; using vv_double = std::vector>; + +static inline v_shared_Value one_hot_encode(int class_index, int num_classes) +{ + + v_shared_Value target(num_classes, std::make_shared(0.0)); + + target[class_index] = std::make_shared(1.0); + + return target; +} + inline void log_model_info(const std::vector &layer_sizes, size_t input_features, size_t output_targets, diff --git a/include/dataframe.hpp b/include/dataframe.hpp index a62e420..2815e01 100644 --- a/include/dataframe.hpp +++ b/include/dataframe.hpp @@ -54,6 +54,7 @@ #include "header.hpp" #include "range.hpp" #include "console_utils.hpp" +#include "types.hpp" namespace microgradCpp @@ -78,6 +79,12 @@ namespace microgradCpp static inline bool DEFAULT_INPLACE = true; + int size() const + { + + return get_all_row_indices().size(); + } + DataFrame operator()(const std::initializer_list &row_indices, const std::vector &col_names) { return this->slice(std::vector(row_indices.begin(), row_indices.end()), col_names, DEFAULT_INPLACE); @@ -96,6 +103,82 @@ namespace microgradCpp return this->slice(numbers, col_names, DEFAULT_INPLACE); } + DataFrame rows(const Range &range) + { + + auto numbers = range.to_vector(); + + return this->slice(numbers, column_order, DEFAULT_INPLACE); + } + + v_string v(const Range &column_range) + { + v_string items; + for (size_t i = 0; i < column_order.size(); ++i) + { + if (column_range.includes(i)) + { + items.push_back(column_order[i]); + } + } + return items; + } + + vv_double to_vv_double() const + { + vv_double result; + + if (columns.empty()) + return result; + + // Determine the number of rows based on the first column + size_t num_rows = columns.begin()->second.size(); + + // Iterate through each row + for (size_t i = 0; i < num_rows; ++i) + { + std::vector row; + for (const auto &col_name : column_order) + { + const auto &col = columns.at(col_name); + if (i < col.size()) + { + const auto &cell = col[i]; + if (std::holds_alternative(cell)) + { + row.push_back(std::get(cell)); + } + else + { + row.push_back(0.0); + } + } + } + result.push_back(row); + } + + return result; + } + + // vv_string v(const Range &colum_range){ + + // vv_string items ; + // for(int i =0 ; i< column_order.size() ; i++ ){ + // if( colum_range.includes( i )) + // items.push_back( column_order[ i ]) ; + + // } + // return items ; + + // } + DataFrame subset(const Range &range, const Range &colum_range) + { + + auto numbers = range.to_vector(); + + return this->slice(numbers, column_order, DEFAULT_INPLACE); + } + DataFrame operator()(const Range &range) { @@ -115,6 +198,11 @@ namespace microgradCpp return this->slice(get_all_row_indices(), column_order, DEFAULT_INPLACE); } + // DataFrame operator()(const Range &range) + // { + // return this->slice(range.to_vector(), column_order, DEFAULT_INPLACE); + // } + DataFrame operator()(const std::vector &row_indices) { return this->slice(row_indices, column_order, DEFAULT_INPLACE); @@ -131,12 +219,6 @@ 
diff --git a/include/dataframe.hpp b/include/dataframe.hpp
index a62e420..2815e01 100644
--- a/include/dataframe.hpp
+++ b/include/dataframe.hpp
@@ -54,6 +54,7 @@
 #include "header.hpp"
 #include "range.hpp"
 #include "console_utils.hpp"
+#include "types.hpp"
 
 namespace microgradCpp
 
@@ -78,6 +79,12 @@ namespace microgradCpp
 
         static inline bool DEFAULT_INPLACE = true;
 
+        int size() const
+        {
+            return static_cast<int>(get_all_row_indices().size());
+        }
+
         DataFrame operator()(const std::initializer_list<size_t> &row_indices, const std::vector<std::string> &col_names)
         {
             return this->slice(std::vector<size_t>(row_indices.begin(), row_indices.end()), col_names, DEFAULT_INPLACE);
         }
@@ -96,6 +103,82 @@ namespace microgradCpp
             return this->slice(numbers, col_names, DEFAULT_INPLACE);
         }
 
+        DataFrame rows(const Range &range)
+        {
+            auto numbers = range.to_vector();
+            return this->slice(numbers, column_order, DEFAULT_INPLACE);
+        }
+
+        v_string v(const Range &column_range)
+        {
+            v_string items;
+            for (size_t i = 0; i < column_order.size(); ++i)
+            {
+                if (column_range.includes(i))
+                {
+                    items.push_back(column_order[i]);
+                }
+            }
+            return items;
+        }
+
+        vv_double to_vv_double() const
+        {
+            vv_double result;
+
+            if (columns.empty())
+                return result;
+
+            // Determine the number of rows based on the first column
+            size_t num_rows = columns.begin()->second.size();
+
+            // Iterate through each row
+            for (size_t i = 0; i < num_rows; ++i)
+            {
+                std::vector<double> row;
+                for (const auto &col_name : column_order)
+                {
+                    const auto &col = columns.at(col_name);
+                    if (i < col.size())
+                    {
+                        const auto &cell = col[i];
+                        if (std::holds_alternative<double>(cell))
+                        {
+                            row.push_back(std::get<double>(cell));
+                        }
+                        else
+                        {
+                            row.push_back(0.0); // non-numeric cells default to 0.0
+                        }
+                    }
+                }
+                result.push_back(row);
+            }
+
+            return result;
+        }
+
+        DataFrame subset(const Range &range, const Range &column_range)
+        {
+            auto numbers = range.to_vector();
+            // Slice the requested columns, not the whole column_order
+            return this->slice(numbers, v(column_range), DEFAULT_INPLACE);
+        }
+
         DataFrame operator()(const Range &range)
         {
             return this->slice(range.to_vector(), column_order, DEFAULT_INPLACE);
         }
 
         DataFrame operator()(const std::vector<size_t> &row_indices)
         {
             return this->slice(row_indices, column_order, DEFAULT_INPLACE);
         }
@@ -131,12 +219,6 @@ namespace microgradCpp
             return this->slice(get_all_row_indices(), col_names, inplace);
         }
-
-
-
-
-
-
         DataFrame slice(const std::vector<size_t> &row_indices, const std::vector<std::string> &col_names, bool inplace = DEFAULT_INPLACE)
         {

diff --git a/include/datasetType.hpp b/include/datasetType.hpp
index 71a8d35..8d2e6a7 100644
--- a/include/datasetType.hpp
+++ b/include/datasetType.hpp
@@ -6,6 +6,7 @@
 #include <vector>
 #include "value.hpp"
 #include "types.hpp"
+#include "dataframe.hpp"
 
 using namespace microgradCpp;
 
 inline DatasetType convert_to_dataset(const vv_double &data, int target_column = -1)
@@ -43,6 +44,43 @@ inline DatasetType convert_to_dataset(const vv_double &data, int target_column =
     return dataset;
 }
 
+inline DatasetType convert_to_dataset(const DataFrame &df, int target_column = -1)
+{
+    DatasetType dataset;
+    vv_double data = df.to_vv_double();
+
+    for (const auto &row : data)
+    {
+        if (row.empty())
+        {
+            continue; // Skip empty rows
+        }
+
+        // Determine the target column index (-1 means the last column)
+        int target_idx = (target_column == -1) ? static_cast<int>(row.size()) - 1 : target_column;
+
+        // Split the row into inputs and targets
+        std::vector<std::shared_ptr<Value>> inputs;
+        std::vector<std::shared_ptr<Value>> targets;
+        for (size_t i = 0; i < row.size(); ++i)
+        {
+            if (static_cast<int>(i) == target_idx)
+            {
+                targets.push_back(std::make_shared<Value>(row[i]));
+            }
+            else
+            {
+                inputs.push_back(std::make_shared<Value>(row[i]));
+            }
+        }
+
+        // Add the pair to the dataset
+        dataset.emplace_back(inputs, targets);
+    }
+
+    return dataset;
+}
+
 #include <iostream>
 #include <memory>
 #include <string>

diff --git a/include/loss.hpp b/include/loss.hpp
index 69e1fb3..5087d55 100644
--- a/include/loss.hpp
+++ b/include/loss.hpp
@@ -24,31 +24,40 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
-#include "value.hpp"
 #include <vector>
 #include <memory>
 #include <cmath>
 #include <iostream>
+#include "value.hpp"
+#include "console_utils.hpp"
+#include "data_utils.hpp"
+using namespace microgradCpp;
 
-class Loss {
+class Loss
+{
 public:
     static std::shared_ptr<Value> cross_entropy(
-        const std::vector<std::shared_ptr<Value>>& predictions,
-        const std::vector<std::shared_ptr<Value>>& targets
-    ) {
+        const std::vector<std::shared_ptr<Value>> &predictions,
+        const std::vector<std::shared_ptr<Value>> &targets)
+    {
         // Assumes:
         // 1. predictions are already probabilities (from softmax in MLP forward)
         // 2. targets are one-hot encoded: exactly one element is 1, others are 0
         // cross entropy = -sum_i t_i * log(p_i)
 
+        if (predictions.size() != targets.size() || predictions.empty())
+        {
+            std::cout << predictions.size() << " predictions <== ==> targets " << targets.size() << std::endl;
+            stop("cross_entropy: predictions and targets must have the same non-zero size");
+        }
+
         auto loss = std::make_shared<Value>(0.0);
-        for (size_t i = 0; i < predictions.size(); ++i) {
+
+        for (size_t i = 0; i < predictions.size(); ++i)
+        {
             // log(p_i)
             auto logp = predictions[i]->log();
             // accumulate t_i * log(p_i)
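Note: the formula in the comment, evaluated on the same numbers the test at the bottom of this patch uses. Plain doubles, no autograd, just to verify the arithmetic:

#include <cmath>
#include <cstdio>

int main()
{
    // cross entropy = -sum_i t_i * log(p_i)
    double p[3] = {0.7, 0.2, 0.1}; // softmax probabilities
    double t[3] = {0.0, 1.0, 0.0}; // one-hot target: class 1
    double ce = 0.0;
    for (int i = 0; i < 3; ++i)
        ce -= t[i] * std::log(p[i]);
    std::printf("%.4f\n", ce); // prints 1.6094, i.e. -log(0.2)
    return 0;
}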
#include "dataframe.hpp"
 #include "dataframe_utils.hpp"
 #include "sp_testing_utils.hpp"
+#include "train_eval.hpp"
 #include "value.hpp"
 #include "iris.hpp"

diff --git a/include/train_eval.hpp b/include/train_eval.hpp
new file mode 100644
index 0000000..37100cf
--- /dev/null
+++ b/include/train_eval.hpp
@@ -0,0 +1,181 @@
+#pragma once
+
+#include <iostream>
+#include <chrono>
+#include <algorithm>
+#include "dataframe.hpp"
+#include "mlp.hpp"
+#include "sgd.hpp"
+#include "loss.hpp"
+#include "types.hpp"
+#include "datasetType.hpp"
+
+using namespace microgradCpp;
+
+// Split the DataFrame into train and test sets.
+// Rows keep their file order; no shuffling is done here.
+inline void train_test_split(
+    const DataFrame &df,
+    double TRAIN_SIZE,
+    ColRows &train_inputs,
+    ColRows &train_targets,
+    ColRows &test_inputs,
+    ColRows &test_targets)
+{
+    DatasetType dataset = convert_to_dataset(df);
+    size_t train_size = static_cast<size_t>(dataset.size() * TRAIN_SIZE);
+
+    for (size_t i = 0; i < train_size; ++i)
+    {
+        train_inputs.push_back(dataset[i].first);
+        train_targets.push_back(dataset[i].second);
+    }
+    for (size_t i = train_size; i < dataset.size(); ++i)
+    {
+        test_inputs.push_back(dataset[i].first);
+        test_targets.push_back(dataset[i].second);
+    }
+}
+
+inline void print(const ColRows &colrows)
+{
+    for (size_t i = 0; i < colrows.size(); ++i)
+    {
+        std::cout << "Row " << i << ": ";
+        for (const auto &cell : colrows[i])
+        {
+            if (cell)
+            {
+                std::cout << cell->data << " ";
+            }
+            else
+            {
+                std::cout << "NaN ";
+            }
+        }
+        std::cout << "\n";
+    }
+}
+
+/*======================== train_eval DataFrame =================================================*/
+inline void train_eval(DataFrame &df, double train_size, MLP &model, double lr = 0.01, int epochs = 100)
+{
+    ColRows train_inputs, train_targets;
+    ColRows test_inputs, test_targets;
+
+    train_test_split(df, train_size, train_inputs, train_targets, test_inputs, test_targets);
+
+    // Create SGD optimizer
+    SGD optimizer(lr);
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    // Training loop
+    for (int epoch = 0; epoch < epochs; ++epoch)
+    {
+        double total_loss = 0.0;
+
+        size_t NUM_Training = train_inputs.size();
+        for (size_t i = 0; i < NUM_Training; ++i)
+        {
+            // Forward pass (training=true)
+            auto predictions = model.forward(train_inputs[i], true);
+
+            // Targets arrive from convert_to_dataset as a single class index;
+            // expand to one-hot so sizes match the prediction vector.
+            int num_classes = static_cast<int>(predictions.size());
+            int label = static_cast<int>(train_targets[i][0]->data);
+            v_shared_Value target = one_hot_encode(label, num_classes);
+
+            // Compute Cross-Entropy Loss
+            auto loss = Loss::cross_entropy(predictions, target);
+            total_loss += loss->data;
+
+            // Backpropagation
+            optimizer.zero_grad(model.parameters());
+            loss->backward();
+
+            // Update weights
+            optimizer.step(model.parameters());
+        }
+
+        std::cout << "Epoch " << epoch + 1 << "/" << epochs
+                  << ", Loss: " << total_loss / train_inputs.size() << std::endl;
+
+        // Evaluate test accuracy every 10 epochs and on the last epoch
+        if (epoch % 10 == 0 || epoch == epochs - 1)
+        {
+            int correct = 0;
+            size_t NUM_Test = test_inputs.size();
+
+            for (size_t i = 0; i < NUM_Test; ++i)
+            {
+                auto predictions = model.forward(test_inputs[i], false);
+
+                // Find the predicted class by comparing probabilities, not pointers
+                auto max_it = std::max_element(
+                    predictions.begin(), predictions.end(),
+                    [](const std::shared_ptr<Value> &a, const std::shared_ptr<Value> &b)
+                    { return a->data < b->data; });
+                int predicted_class = static_cast<int>(std::distance(predictions.begin(), max_it));
+
+                // Check if the prediction matches the target class index
+                int label = static_cast<int>(test_targets[i][0]->data);
+                if (predicted_class == label)
+                {
+                    correct++;
+                }
+            }
+
+            double accuracy = static_cast<double>(correct) / test_inputs.size();
+            std::cout << "Epoch " << epoch + 1 << ": Test Accuracy = " << accuracy * 100.0 << "%" << std::endl;
+
+            if (epoch == epochs - 1)
+            {
+                auto end = std::chrono::high_resolution_clock::now();
+                std::chrono::duration<double> duration = end - start;
+                std::cout << "Duration: " << duration.count() << " seconds" << std::endl;
+            }
+        }
+    }
+}
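Note: train_test_split above takes the first TRAIN_SIZE fraction of rows in file order, so for a class-sorted CSV such as iris.csv the held-out 20% would be a single class (easy_df.cpp hints at this with its commented-out shuffle(dataset) call). A sketch of one way to fix that, assuming DatasetType is the pair-of-vectors alias from types.hpp (shuffle_dataset itself is not part of this patch):

#include <algorithm>
#include <random>

// Sketch: shuffle the converted dataset in place before slicing off the train split.
inline void shuffle_dataset(DatasetType &dataset)
{
    static std::mt19937 rng(42); // fixed seed keeps splits reproducible across runs
    std::shuffle(dataset.begin(), dataset.end(), rng);
}

It would be called right after convert_to_dataset(df) inside train_test_split.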
diff --git a/include/types.hpp b/include/types.hpp
index 3419fd5..49774bf 100644
--- a/include/types.hpp
+++ b/include/types.hpp
@@ -44,6 +44,7 @@ namespace microgradCpp
 
     using ColumnData = std::variant<std::vector<double>, std::vector<std::string>>;
     using vv_shared_Value = std::vector<std::vector<std::shared_ptr<Value>>>;
+    using v_shared_Value = std::vector<std::shared_ptr<Value>>;
 
     using DatasetType = std::vector<std::pair<std::vector<std::shared_ptr<Value>>, std::vector<std::shared_ptr<Value>>>>;
     using ColRows = std::vector<std::vector<std::shared_ptr<Value>>>;

diff --git a/include/value.hpp b/include/value.hpp
index dfa6643..fee5f35 100644
--- a/include/value.hpp
+++ b/include/value.hpp
@@ -48,7 +48,7 @@ class Value : public std::enable_shared_from_this<Value>
     mutable bool topo_cached = false;
 
     // Constructor
-    Value(double data, const std::string &label = "", bool cache_topology = true )
+    Value(double data, const std::string &label = "", bool cache_topology = true)
         : data(data),
           grad(0.0),
           label(label),
@@ -69,6 +69,17 @@ class Value : public std::enable_shared_from_this<Value>
             parents.push_back(parent);
         }
     }
+
+    // Member function for Value == double (exact comparison of data)
+    bool operator==(double val) const
+    {
+        return data == val;
+    }
+
+    // Friend function for double == Value
+    friend bool operator==(double val, const Value &v)
+    {
+        return val == v.data;
+    }
 
     // Build topological order for backpropagation (with caching)
     std::vector<std::shared_ptr<Value>> build_topological_order()
@@ -83,6 +94,11 @@ class Value : public std::enable_shared_from_this<Value>
 
         std::function<void(Value *)> visit = [&](Value *v)
         {
+            if (!v)
+            {
+                throw std::runtime_error("Null pointer encountered in build_topological_order");
+            }
+
             if (v && visited.find(v) == visited.end())
             {
                 visited.insert(v);
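Note: the new operators compare data exactly, which is safe for one-hot targets built from the literals 0.0 and 1.0 but not for values produced by floating-point arithmetic. A minimal usage sketch:

Value v(1.0);
bool a = (v == 1.0); // true: member operator, compares v.data
bool b = (1.0 == v); // true: friend operator, same comparison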
diff --git a/main.cpp b/main.cpp
index 17a81cb..bac1336 100644
--- a/main.cpp
+++ b/main.cpp
@@ -158,4 +158,19 @@
 g++ -std=c++17 -Iinclude -O2 -o main main.cpp
 
 make run
 
+g++ -std=c++20 -Iinclude -O2 -g -o main easy_df.cpp
+
+To debug, build without optimization and run under lldb:
+g++ -std=c++20 -Iinclude -g -O0 -o main easy_df.cpp
+lldb ./main
+r
+bt
+
+AddressSanitizer reports invalid memory accesses with detailed stack traces:
+g++ -std=c++20 -fsanitize=address -Iinclude -g -O0 -o main easy_df.cpp
+./main
+
 */

diff --git a/tests/test_cross_ent.cpp b/tests/test_cross_ent.cpp
new file mode 100644
index 0000000..04430ff
--- /dev/null
+++ b/tests/test_cross_ent.cpp
@@ -0,0 +1,41 @@
+#include <gtest/gtest.h>
+#include <cmath>
+#include <memory>
+#include "value.hpp"
+#include "loss.hpp"
+#include "micrograd.hpp"
+#include "types.hpp"
+using namespace microgradCpp;
+
+// Helper function to create a Value pointer
+std::shared_ptr<Value> make_value(double data)
+{
+    return std::make_shared<Value>(data);
+}
+
+// Test for Loss::cross_entropy
+TEST(LossTest, CrossEntropyLoss)
+{
+    // Example predictions: probabilities for 3 classes
+    std::vector<std::shared_ptr<Value>> predictions = {
+        make_value(0.7), // Class 0
+        make_value(0.2), // Class 1
+        make_value(0.1)  // Class 2
+    };
+
+    // Target: class 1 (one-hot encoded)
+    std::vector<std::shared_ptr<Value>> targets = {
+        make_value(0.0), // Class 0
+        make_value(1.0), // Class 1 (correct class)
+        make_value(0.0)  // Class 2
+    };
+
+    // Compute the cross-entropy loss
+    auto loss = Loss::cross_entropy(predictions, targets);
+
+    // Expected cross-entropy loss: -log(p_1) = -log(0.2) ā‰ˆ 1.6094
+    double expected_loss = -std::log(0.2);
+
+    // Check if the computed loss is approximately equal to the expected loss
+    ASSERT_NEAR(loss->data, expected_loss, 1e-4);
+}
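Note: a compile-and-run sketch for the new test. The GoogleTest linkage flags are the conventional ones; adjust paths and library locations to your setup:

g++ -std=c++17 -Iinclude tests/test_cross_ent.cpp -lgtest -lgtest_main -pthread -o test_cross_ent
./test_cross_ent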