From 58d176b8313e1e27d82b47f164358610e866dac5 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Mon, 3 Oct 2022 00:24:35 -0400
Subject: [PATCH 01/36] add negative number support

---
 backend/utility/bin_tsv.py                 |  4 ++
 backend/utility/tsv_to_bin.cpp             | 47 +++++++++++++++++-----
 compiler/src/builtins.cpp                  | 17 +++++++-
 slog/common/tuple.py                       |  6 ++-
 slog/tests/testcase/number/ground_truth    |  3 ++
 slog/tests/testcase/number/input/foo.facts |  2 +
 slog/tests/testcase/number/number.slog     |  8 ++++
 7 files changed, 74 insertions(+), 13 deletions(-)
 create mode 100644 slog/tests/testcase/number/ground_truth
 create mode 100644 slog/tests/testcase/number/input/foo.facts
 create mode 100644 slog/tests/testcase/number/number.slog
diff --git a/backend/utility/bin_tsv.py b/backend/utility/bin_tsv.py
index 9b54643e..44054398 100755
--- a/backend/utility/bin_tsv.py
+++ b/backend/utility/bin_tsv.py
@@ -50,6 +50,8 @@
 BUCKET_MASK = 0x00003FFFF0000000
 TUPLE_ID_MASK = 0xFFFFFFFFF0000000
 VAL_MASK = ~ TAG_MASK
+SIGN_FILP_CONST = 0x0000200000000000
+SIGNED_NUM_MASK = 0xFFFFE00000000000
 
 INT_TAG = 0
 STRING_TAG = 2
@@ -115,6 +117,8 @@ def bin_to_tsv(filename, arity, output, index, meta_folder):
                 val_tag = raw_val >> 46
                 if val_tag == INT_TAG:
                     attr_val = raw_val & VAL_MASK
+                    if attr_val >= SIGN_FILP_CONST:
+                        attr_val = -(attr_val - SIGN_FILP_CONST)
                 elif val_tag == STRING_TAG:
                     attr_val = string_dict[raw_val & VAL_MASK]
                 # elif val_tag == SYMBOL_TAG:
diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp
index e3fc574e..62758039 100644
--- a/backend/utility/tsv_to_bin.cpp
+++ b/backend/utility/tsv_to_bin.cpp
@@ -2,7 +2,10 @@
 // Subsequently by Kris Micinski
 // Convert Souffle CSV (tab-separated value) files to Slog input tuple files
 // compile with >= c++14
+#include <cmath>
 #include <cstddef>
+#include <cstdint>
+#include <cstring>
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <unistd.h>
@@ -37,6 +40,8 @@
 #define BUCKET_MASK 0x00003FFFF0000000
 #define BUCKET_MASK_LENGTH 18
 #define TAG_MASK 0xFFFFC00000000000
+#define SIGN_FILP_CONST 0x0000200000000000
+#define SIGNED_NUM_MASK 0xFFFFE00000000000
 
 using namespace std;
 
@@ -232,16 +237,40 @@ void file_to_slog(char *input_file, char *output_file,
 			{
 				break;
 			}
-			try
-			{
-				// TODO: support float later
-				// FIXME: detect empty space here!
-				u64 u64_v = stoi(col);
-				tuple_buffer[col_count] = TUPLE_MASK & u64_v;
-				// cout << "number at " << col_count << " : " << u64_v << endl;
+			bool convert_success_flag = false;
+			if (!convert_success_flag) {
+				// integer
+				try {
+					// FIXME: detect empty space here!
+					long long int_v = stoll(col);
+					if (int_v < 0) {
+						int_v = SIGN_FILP_CONST - int_v;
+					}
+					tuple_buffer[col_count] = (~ TAG_MASK) & ((u64)int_v);
+					// cout << col << " number at " << col_count << " : " <<int_v << " " << tuple_buffer[col_count] << endl;
+					convert_success_flag = true;
+				} catch (...) {}
 			}
-			catch (...)
-			{
+			if (!convert_success_flag) {
+				// float -- NOTE(review): convert_success_flag is never set in this branch, so the string fallback below still runs for this column; confirm whether float encoding is meant to be live yet
+				try {
+					float float_v = stof(col);
+					float float_v_abs = abs(float_v);
+					u64 u64_v = 0;
+					memcpy(&u64_v, &float_v_abs, sizeof(float_v_abs));
+					u64 encoded_v = FLOAT_TAG;
+					if (float_v < 0) {
+						encoded_v = (encoded_v << 1) + 1;
+					} else {
+						encoded_v <<= 1;
+					}
+					encoded_v <<= TUPLE_MASK_LENGTH + BUCKET_MASK_LENGTH - 1;
+					encoded_v |= u64_v & (~ SIGNED_NUM_MASK);
+					tuple_buffer[col_count] = encoded_v;
+				} catch (...) {}
+			}
+			// string is last case
+			if (!convert_success_flag) {
 				// if not number all goes to string
 				u64 u64_v = STRING_TAG;
 				u64_v <<= TUPLE_MASK_LENGTH + BUCKET_MASK_LENGTH;
diff --git a/compiler/src/builtins.cpp b/compiler/src/builtins.cpp
index e3352516..ffd4ba50 100644
--- a/compiler/src/builtins.cpp
+++ b/compiler/src/builtins.cpp
@@ -18,6 +18,8 @@ const u64 tag_mask = 0xffffc00000000000;
 const u64 tag_position = 46;
 const u64 int_tag = 0;
 const u64 str_tag = 2;
+const u64 sign_flip_const = 0x0000200000000000;
+const u64 signed_num_mask = 0xFFFFE00000000000;
 
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position == int_tag) << "\n";
@@ -25,13 +27,24 @@ inline bool is_number(u64 datum) {
 }
 
 inline i64 datum_to_number(u64 datum) {
-  return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position);
+  // Sign-magnitude decode: sign_flip_const (bit 45) is the sign, the bits below it the magnitude.
+  const u64 payload = datum & ~tag_mask;
+  if (payload >= sign_flip_const) {
+    return -static_cast<i64>(payload - sign_flip_const);
+  }
+  return static_cast<i64>(payload);
 }
 const auto d2n = datum_to_number;
 
 inline u64 number_to_datum(i64 number) {
-  return (number & ~tag_mask) | (int_tag << tag_position);
+  // Sign-magnitude encode: magnitude in the bits below sign_flip_const, which is set for negatives.
+  // The negation is done in unsigned arithmetic so INT64_MIN cannot overflow; magnitudes that
+  // reach sign_flip_const do not fit the payload and alias other values once masked.
+  const u64 magnitude = number < 0 ? (u64{0} - static_cast<u64>(number)) : static_cast<u64>(number);
+  const u64 payload = number < 0 ? (magnitude + sign_flip_const) : magnitude;
+  return (payload & ~tag_mask) | (int_tag << tag_position);
 }
+
 const auto n2d = number_to_datum;
 
 inline u64 string_to_datum(std::string str)
diff --git a/slog/common/tuple.py b/slog/common/tuple.py
index 5cec57db..ff3d7ee4 100644
--- a/slog/common/tuple.py
+++ b/slog/common/tuple.py
@@ -9,6 +9,8 @@
 BUCKET_MASK =   0x00003FFFF0000000
 TUPLE_ID_MASK = 0xFFFFFFFFF0000000
 U32_MASK =      0x00000000FFFFFFFF
+SIGN_FILP_CONST = 0x0000200000000000
+SIGNED_NUM_MASK = 0xFFFFE00000000000
 VAL_MASK = ~ TAG_MASK
 INT_TAG = 0
 STRING_TAG = 2
@@ -67,8 +69,8 @@ def parse_tuple_row(self, u64_list, rel_name, intern_string_dict) -> SlogTuple:
             val_tag = u64 >> 46
             if val_tag == INT_TAG:
                 attr_val = (u64 & VAL_MASK)
-                if attr_val > 2 ** 31:
-                    attr_val = attr_val - 2 ** 32
+                if attr_val >= SIGN_FILP_CONST:
+                    attr_val = SIGN_FILP_CONST - attr_val
             elif val_tag == STRING_TAG:
                 attr_val = intern_string_dict[u64 & U32_MASK]
             else:
diff --git a/slog/tests/testcase/number/ground_truth b/slog/tests/testcase/number/ground_truth
new file mode 100644
index 00000000..c5a9d6d1
--- /dev/null
+++ b/slog/tests/testcase/number/ground_truth
@@ -0,0 +1,3 @@
+bar-res-check-1,1, 1
+bar-res-check-2,1, 1
+bar-res-check-3,1, 1
diff --git a/slog/tests/testcase/number/input/foo.facts b/slog/tests/testcase/number/input/foo.facts
new file mode 100644
index 00000000..5aba8f93
--- /dev/null
+++ b/slog/tests/testcase/number/input/foo.facts
@@ -0,0 +1,2 @@
+-2
+-3
\ No newline at end of file
diff --git a/slog/tests/testcase/number/number.slog b/slog/tests/testcase/number/number.slog
new file mode 100644
index 00000000..b4ae865b
--- /dev/null
+++ b/slog/tests/testcase/number/number.slog
@@ -0,0 +1,8 @@
+; testing negative and floating
+
+(foo -1)
+[(bar {+ 2 x}) <-- (foo x)]
+
+[(bar-res-check-1 "pass") <-- (bar 1)]
+[(bar-res-check-2 "pass") <-- (bar 0)]
+[(bar-res-check-3 "pass") <-- (bar -1)]

From a0f26d224c5220d313260eda37f7071bca19a077 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Wed, 2 Nov 2022 15:40:57 -0400
Subject: [PATCH 02/36] add already compiled check in client

---
 examples/datalog-example | 1 +
 slog/common/client.py    | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 160000 examples/datalog-example

diff --git a/examples/datalog-example b/examples/datalog-example
new file mode 160000
index 00000000..9b29866c
--- /dev/null
+++ b/examples/datalog-example
@@ -0,0 +1 @@
+Subproject commit 9b29866cadd18644be52da674585831b7416dfc6
diff --git a/slog/common/client.py b/slog/common/client.py
index 03217114..ccb07ed7 100644
--- a/slog/common/client.py
+++ b/slog/common/client.py
@@ -199,7 +199,7 @@ def csv_request_generator(csv_hash_map):
             writer.write(f" {response.error_msg} fail to update!")
         ftp_conn.close()
 
-    @lru_cache(maxsize=None)
+    # @lru_cache(maxsize=None)
     def compile_slog(self, filename, writer=Writer()):
         '''
         compile a slog file, and set current DB as the resultant DB.
@@ -254,6 +254,9 @@ def _compile(self, program_hashes, writer=Writer()):
         req.using_database = ""
         req.hashes.extend(program_hashes)
         response = self._stub.CompileHashes(req)
+        if response.promise_id == MAXSIZE:
+            writer.write("Already compiled!")
+            return self.cur_db
         # Wait to resolve the promise in the terminal...
         # Break when promise is resolved
         edb = self.run_until_promised(response.promise_id, PING_INTERVAL, writer)

From 4c9c25573f1c0ab4b204158deac3d337866fd1f0 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Wed, 2 Nov 2022 15:47:28 -0400
Subject: [PATCH 03/36] remove recompile message

---
 slog/common/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slog/common/client.py b/slog/common/client.py
index ccb07ed7..4d170714 100644
--- a/slog/common/client.py
+++ b/slog/common/client.py
@@ -255,7 +255,7 @@ def _compile(self, program_hashes, writer=Writer()):
         req.hashes.extend(program_hashes)
         response = self._stub.CompileHashes(req)
         if response.promise_id == MAXSIZE:
-            writer.write("Already compiled!")
+            # writer.write("Already compiled!")
             return self.cur_db
         # Wait to resolve the promise in the terminal...
         # Break when promise is resolved

From 9b81caf425b504d905094ad15fda88968d731be2 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Fri, 11 Nov 2022 15:51:59 -0500
Subject: [PATCH 04/36] single threaded

---
 backend/src/RA/parallel_agg.cpp               |  18 +-
 backend/src/RA/parallel_agg.h                 |   2 +-
 backend/src/RA/parallel_join.cpp              |  14 +-
 backend/src/RA/parallel_join.h                |  12 +-
 backend/src/RAM/RA_tasks.cpp                  |   5 -
 backend/src/lie/lie.cpp                       |  22 +-
 backend/src/parallel_RA_inc.h                 |   7 +-
 .../src/relation/balanced_hash_relation.cpp   | 137 ++++++-------
 backend/src/relation/balanced_hash_relation.h |  37 +++-
 backend/src/relation/shmap_relation.h         |  35 ++--
 backend/src/relation/shmap_relation_exp.cpp   | 190 +++++++++++++++---
 .../compiled_pre/CMakeLists.txt               |   4 +-
 .../checkpoint-final/256.edge.3.table_full    | Bin
 .../checkpoint-final/257.spath.3.table_full   | Bin 0 -> 512 bytes
 .../checkpoints/checkpoint-final/$strings.csv |   0
 .../checkpoint-final/256.edge.3.table_full    | Bin 0 -> 288 bytes
 .../checkpoint-final/257.spath.3.table_full   | Bin
 .../compiled_pre/compiler-out                 |   0
 .../compiled_pre/input-data/$strings.csv      |   0
 .../compiled_pre/input-data/256.edge.3.table  | Bin
 .../compiled_pre/input-data/257.spath.3.table |   0
 .../compiled_pre/sssp.cpp}                    | 103 +++++++++-
 .../compiled_pre/sssp.cpp.backup}             |   0
 backend/tests/sssp/sssp.slog                  |   3 +
 .../test-input-graph/edge.csv                 |   0
 backend/tests/update/sssp.slog                |   3 -
 slog/common/client.py                         |  12 +-
 slogdb                                        |  22 ++
 28 files changed, 448 insertions(+), 178 deletions(-)
 rename backend/tests/{update => sssp}/compiled_pre/CMakeLists.txt (94%)
 rename backend/tests/{update/compiled_pre/checkpoints => sssp/compiled_pre}/checkpoint-final/256.edge.3.table_full (100%)
 create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full
 rename backend/tests/{update => sssp}/compiled_pre/checkpoints/checkpoint-final/$strings.csv (100%)
 create mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full
 rename backend/tests/{update => sssp}/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full (100%)
 rename backend/tests/{update => sssp}/compiled_pre/compiler-out (100%)
 rename backend/tests/{update => sssp}/compiled_pre/input-data/$strings.csv (100%)
 rename backend/tests/{update => sssp}/compiled_pre/input-data/256.edge.3.table (100%)
 rename backend/tests/{update => sssp}/compiled_pre/input-data/257.spath.3.table (100%)
 rename backend/tests/{update/sssp_update.cpp => sssp/compiled_pre/sssp.cpp} (82%)
 rename backend/tests/{update/compiled_pre/sssp.cpp => sssp/compiled_pre/sssp.cpp.backup} (100%)
 create mode 100644 backend/tests/sssp/sssp.slog
 rename backend/tests/{update => sssp}/test-input-graph/edge.csv (100%)
 delete mode 100644 backend/tests/update/sssp.slog
 create mode 100755 slogdb

diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp
index 6e315515..90c07c3d 100644
--- a/backend/src/RA/parallel_agg.cpp
+++ b/backend/src/RA/parallel_agg.cpp
@@ -80,7 +80,7 @@ void parallel_join_aggregate::local_aggregate(
 
     u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count();
     u32** output_sub_bucket_rank = output->get_sub_bucket_rank();
-    int real_join_count = output->get_join_column_count() - 1;
+    u32 real_join_count = output->get_join_column_count() - 1;
     agg_buffer.width[ra_counter] = real_join_count + 1;
 
     shmap_relation* agg_target;
@@ -95,7 +95,7 @@ void parallel_join_aggregate::local_aggregate(
     }
 
     btree::btree_map<std::vector<u64>, u64, shmap_relation::t_comparator> res_map;
-    for (int bucket=0; bucket < buckets; bucket ++) {
+    for (u32 bucket=0; bucket < buckets; bucket ++) {
         for (auto tuple: input->get_full()[bucket]) {
             std::vector<u64> data_v(tuple.begin(), tuple.begin()+target->get_join_column_count());
             // std::cout << "On rank " << mcomm.get_rank() << " bucket " << *(target->get_sub_bucket_per_bucket_count()) << std::endl;
@@ -111,18 +111,18 @@ void parallel_join_aggregate::local_aggregate(
         }
     }
 
-    for (int bucket=0; bucket < buckets; bucket ++) {
+    for (u32 bucket=0; bucket < buckets; bucket ++) {
         for (auto input_tuple: input->get_full()[bucket]) {
             std::vector<u64> joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count());
             auto agg_res = res_map[joined_input_tuple];
             std::vector<u64> tuple(reorder_mapping.size(), 0);
             int reorder_agg_index = input->get_arity() + 1;
-            for (int j = 0; j < reorder_mapping.size(); j++) {
-                if (reorder_mapping[j] == reorder_agg_index) {
-                    tuple[j] = agg_res;
-                } else {
-                    tuple[j] = input_tuple[reorder_mapping[j]];
-                }
+            for (long unsigned int j = 0; j < reorder_mapping.size(); j++) {
+              if (reorder_mapping[j] == reorder_agg_index) {
+                tuple[j] = agg_res;
+              } else {
+                tuple[j] = input_tuple[reorder_mapping[j]];
+              }
             }
 
             uint64_t bucket_id = tuple_hash(tuple.data(), output->get_join_column_count()) % buckets;
diff --git a/backend/src/RA/parallel_agg.h b/backend/src/RA/parallel_agg.h
index 8c07c7a8..7189142d 100644
--- a/backend/src/RA/parallel_agg.h
+++ b/backend/src/RA/parallel_agg.h
@@ -79,7 +79,7 @@ class parallel_join_aggregate : public parallel_RA
     local_agg_func_t local_func;
     reduce_agg_func_t reduce_func;
     global_agg_func_t global_func;
-    std::vector<u64> reorder_mapping;
+    std::vector<int> reorder_mapping;
 
     parallel_join_aggregate(relation* output, relation* target_rel, relation* input,
                             int t_type, local_agg_func_t local_agg_func, 
diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 76cc949c..38d9e20c 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include <cstddef>
 
 
 bool parallel_join::local_join(int threshold, int* offset,
@@ -24,6 +25,13 @@ bool parallel_join::local_join(int threshold, int* offset,
     join_buffer.width[counter] = reorder_map_array.size();
 
     shmap_relation deduplicate(join_column_count, false);
+    auto out_dep_cols = output->get_dependent_column();
+    if (out_dep_cols.size() != 0) {
+        for (size_t i = 0; i < out_dep_cols.size() - 1; i++) {
+            deduplicate.dependent_column_indices.push_back(out_dep_cols[i]);
+        }
+        deduplicate.update_compare_func = output->get_update_compare_func();
+    }
     u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count();
     u32** output_sub_bucket_rank = output->get_sub_bucket_rank();
 
@@ -53,7 +61,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 join_column_count, deduplicate,
                 &local_join_count, global_join_duplicates,
                 global_join_inserts, output->get_join_column_count(),
-                output->get_is_canonical());
+                output->get_is_canonical(),
+                generator_mode, generator_func);
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
@@ -84,7 +93,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 join_column_count, deduplicate,
                 &local_join_count, global_join_duplicates,
                 global_join_inserts,
-                output->get_join_column_count(),output->get_is_canonical());
+                output->get_join_column_count(),output->get_is_canonical(),
+                generator_mode, generator_func);
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h
index f7b20979..30e15000 100644
--- a/backend/src/RA/parallel_join.h
+++ b/backend/src/RA/parallel_join.h
@@ -6,7 +6,9 @@
 
 
 #pragma once
+#include "../parallel_RA_inc.h"
 #include "../ds.h"
+#include <vector>
 
 class parallel_join: public parallel_RA {
 
@@ -23,6 +25,11 @@ class parallel_join: public parallel_RA {
     std::vector<int> projection_reorder_index_array;
     int projection_reorder_index_array_length;
 
+    // a function used to generate new tuple based on join input, target tuple (optional)
+    // if this is provided, it will make join works similar to `copy_generate`
+    join_generator_func_t generator_func;
+    bool generator_mode = false;
+
 public:
     parallel_join()
     {
@@ -64,6 +71,7 @@ class parallel_join: public parallel_RA {
     int get_join_input1_graph_type()    {return join_input1_graph_type;}
     relation* get_join_output() {return join_output_table;}
     void get_join_projection_index(std::vector<int>* projection_reorder_index_array)    {*projection_reorder_index_array = this->projection_reorder_index_array; }
+    void set_generator_func(join_generator_func_t func) { generator_func = func; generator_mode = true; }
 
 #ifdef GOOGLE_MAP
     bool local_join(int threshold, int* offset,
@@ -75,7 +83,7 @@ class parallel_join: public parallel_RA {
                     relation* output,
                     all_to_allv_buffer& join_buffer,
                     int counter,
-                    int join_colun_count,
+                    int join_column_count,
                     u32* local_join_duplicates,
                     u32* local_join_inserts);
 #else
@@ -88,7 +96,7 @@ class parallel_join: public parallel_RA {
                     relation* output,
                     all_to_allv_buffer& join_buffer,
                     int counter,
-                    int join_colun_count,
+                    int join_column_count,
                     u32* local_join_duplicates,
                     u32* local_join_inserts);
 
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 9528aeb7..b7a8a029 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -1003,15 +1003,10 @@ void RAM::local_insert_in_newt(std::map<u64, u64>& intern_map)
 void RAM::local_insert_in_full()
 {
     for (u32 i=0; i < ram_relation_count; i++)
-        //for (std::map<relation*, bool>::iterator it = ram_relations.begin() ; it != ram_relations.end(); ++it)
     {
-        //relation* current_r = it->first;
         relation* current_r = ram_relations[i];
         current_r->insert_delta_in_full();
         current_r->local_insert_in_delta();
-
-        //if (current_r->get_debug_id() == 11)
-        //    current_r->print();
     }
     return;
 }
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 6018caf6..ad504f1d 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -80,7 +80,7 @@ void LIE::update_task_graph(RAM* executable_task)
             taskgraph.erase(lie_sccs[i]);
             // check if relation in this scc need gc
             auto gc_rels = executable_task->get_gc_relation();
-            for (int j=0; j < gc_rels.size(); j++) {
+            for (size_t j=0; j < gc_rels.size(); j++) {
                 auto pos = std::find(lie_relations.begin(), lie_relations.end(), gc_rels[j]);
                 if (pos != lie_relations.end()) {
                     lie_relations.erase(pos);
@@ -440,7 +440,7 @@ bool LIE::execute ()
                     delta_filename = delta_filename + "_" + std::to_string(mcomm.get_local_rank());
 
                 scc_relation[i]->set_filename(delta_filename);
-                scc_relation[i]->set_initailization_type(0);
+                scc_relation[i]->set_initialization_type(0);
 
                 int is_access = access(delta_filename.c_str(), F_OK);
                 int access_sum = 0;
@@ -476,9 +476,9 @@ bool LIE::execute ()
             else
                 executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
 
-            // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
-            // for (u32 i = 0 ; i < scc_relation_count; i++)
-            //     print_relation_size(scc_relation[i]);
+            std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
+            for (u32 i = 0 ; i < scc_relation_count; i++)
+                print_relation_size(scc_relation[i]);
             // stat_intermediate();
             //executed_scc_id.push_back(executable_task->get_id());
 #if 0
@@ -544,10 +544,12 @@ bool LIE::execute ()
                     //    std::cout << "Writing checkpoint dump " << checkpoint_dumps_num << " takes " << max_write_cp_time << "(s)" << std::endl;
                     checkpoint_dumps_num++;
                 }
-#endif
-                // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
-                // for (u32 i = 0 ; i < scc_relation_count; i++)
-                //     print_relation_size(scc_relation[i]);
+#endif 
+                // if (loop_counter < 20) {
+                std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
+                for (u32 i = 0 ; i < scc_relation_count; i++)
+                    print_relation_size(scc_relation[i]);
+                // }
                 // stat_intermediate();
                 // loop_counter++;
                 //iteration_count[executable_task->get_id()] = loop_counter;
@@ -565,7 +567,7 @@ bool LIE::execute ()
 
         if (mcomm.get_rank() == 0)
         {
-            // std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." << std::endl;
+            std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." << std::endl;
             // print_all_relation_size();
         }
         full_iteration_count += loop_counter;
diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h
index 957d8c42..e02739c5 100644
--- a/backend/src/parallel_RA_inc.h
+++ b/backend/src/parallel_RA_inc.h
@@ -13,11 +13,13 @@
 #include "compat.h"
 // #include "shmap/shmap.h"
 #include "shmap/shmap_goog.h"
-#include <functional>
 
 //#define DEBUG_OUTPUT 1
 #define MAX_LOOP_COUNT 120000
 
+using update_partial_compare_func_t = std::function<std::optional<bool>(std::vector<u64> old_v, std::vector<u64> new_v)>;
+using join_generator_func_t = std::function<void(std::vector<u64>& target_v, std::vector<u64>& input_v, u64* res)>;
+
 #include "log/logger.h"
 #include "hash/hash.h"
 #include "comm/comm.h"
@@ -33,7 +35,7 @@ enum class SpecialAggregator {
   count,
   maximum,
   minimum,
-  recusive
+  recursive
 };
 
 // TODO: remove unused argument
@@ -46,6 +48,7 @@ using global_agg_func_t = std::function<u64(local_agg_res_t a, local_agg_res_t b
 // typedef local_agg_res_t *reduce_agg_func_t (local_agg_res_t x, local_agg_res_t y);
 // typedef int *global_agg_func_t (std::vector<u64>& data, local_agg_res_t agg_data, int agg_data_count, std::vector<u64>& output); 
 
+
 #include "relation/balanced_hash_relation.h"
 #include "RA/parallel_RA.h"
 #include "RA/fact.h"
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 50d31ea0..7b5deef9 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -528,9 +528,9 @@ void relation::load_data_from_separate_files()
 	double read_data_end = MPI_Wtime();
 	double read_data_time = read_data_end - read_data_start;
 
-	if (initailization_type == DELTA)
+	if (initialization_type == DELTA)
 		 populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
-	else if (initailization_type == FULL)
+	else if (initialization_type == FULL)
 		populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
 
 	file_io.delete_hash_buffers();
@@ -539,7 +539,7 @@ void relation::load_data_from_separate_files()
     MPI_Reduce(&read_data_time, &max_read_data_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm());
 
     std::string read_io = (share_io == true)? "MPI IO": "POSIX IO";
-    std::string type = (initailization_type == DELTA)? "DELTA": "FULL";
+    std::string type = (initialization_type == DELTA)? "DELTA": "FULL";
 
     if (mcomm.get_rank() == 0 && restart_flag == true)
     	std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n  " << type << " [RD], " <<
@@ -554,9 +554,9 @@ void relation::load_data_from_file_with_offset()
 	double read_data_end = MPI_Wtime();
 	double read_data_time = read_data_end - read_data_start;
 
-	if (initailization_type == DELTA)
+	if (initialization_type == DELTA)
 		 populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
-	else if (initailization_type == FULL)
+	else if (initialization_type == FULL)
 		populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
 
 	file_io.delete_hash_buffers();
@@ -565,7 +565,7 @@ void relation::load_data_from_file_with_offset()
     MPI_Reduce(&read_data_time, &max_read_data_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm());
 
     std::string read_io = (share_io == true)? "MPI IO": "POSIX IO";
-    std::string type = (initailization_type == DELTA)? "DELTA": "FULL";
+    std::string type = (initialization_type == DELTA)? "DELTA": "FULL";
 
     if (mcomm.get_rank() == 0 && restart_flag == true)
     	std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n  " << type << " [RD], " <<
@@ -584,7 +584,7 @@ void relation::load_data_from_file()
     //         //   << "c++ object " << this
     //           << "start normal IO" << std::endl;
     /// reading from file
-    if (initailization_type != -1)
+    if (initialization_type != -1)
     {
         /// Main : Execute : init : io : end
     	double read_data_start = MPI_Wtime();
@@ -601,10 +601,10 @@ void relation::load_data_from_file()
         file_io.delete_raw_buffers();
 
         /* Copy data from buffer to relation */
-        if (initailization_type == DELTA)
+        if (initialization_type == DELTA)
             populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
 
-        else if (initailization_type == FULL)
+        else if (initialization_type == FULL)
             populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer());
 
         file_io.delete_hash_buffers();
@@ -615,7 +615,7 @@ void relation::load_data_from_file()
         MPI_Reduce(&all_to_all_time, &max_all_to_all_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm());
 
         std::string read_io = (share_io == true)? "MPI IO": "POSIX IO";
-        std::string type = (initailization_type == DELTA)? "DELTA": "FULL";
+        std::string type = (initialization_type == DELTA)? "DELTA": "FULL";
 
         if (mcomm.get_rank() == 0 && restart_flag == true)
         	std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n " << type << " [RD] [AC], " <<
@@ -655,13 +655,16 @@ void relation::initialize_relation(mpi_comm& mcomm, std::map<u64, u64>& intern_m
     full = new shmap_relation[buckets];
     newt = new shmap_relation[buckets];
 
-    for (int i = 0 ; i < buckets; i++) {
+    for (u32 i = 0 ; i < buckets; i++) {
         delta[i].arity = arity;
-        delta[i].dependant_column_index = dependant_column_index;
+        delta[i].dependent_column_indices = dependent_column_indices;
+        delta[i].update_compare_func = update_compare_func;
         full[i].arity = arity;
-        full[i].dependant_column_index = dependant_column_index;
+        full[i].dependent_column_indices = dependent_column_indices;
+        full[i].update_compare_func = update_compare_func;
         newt[i].arity = arity;
-        newt[i].dependant_column_index = dependant_column_index;
+        newt[i].dependent_column_indices = dependent_column_indices;
+        newt[i].update_compare_func = update_compare_func;
     }
 #endif
 
@@ -777,6 +780,7 @@ void relation::populate_full(int buffer_size, u64* buffer)
     u32 counter = 0;
     u64 t[arity+1];
     u32 buckets = get_bucket_count();
+    std::cout << "populating full for " << intern_tag << std::endl;
 
     for (int i = 0; i < buffer_size; i = i + (arity+1))
     {
@@ -800,6 +804,7 @@ void relation::populate_delta (int buffer_size, u64* buffer)
 {
     u64 t[arity+1];
     u32 buckets = get_bucket_count();
+    std::cout << "populating delta for " << intern_tag << std::endl;
 
     for (int i = 0; i < buffer_size; i = i + (arity+1))
     {
@@ -908,7 +913,7 @@ void relation::finalize_relation()
     full_element_count = 0;
     delta_element_count = 0;
 
-    initailization_type = -1;
+    initialization_type = -1;
 
     delete[] distinct_sub_bucket_rank_count;
     for (u64 b = 0; b < buckets; b++)
@@ -1064,7 +1069,7 @@ void relation::copy_relation(relation*& recv_rel, mpi_comm output_comm, int targ
 
 
     finalize_relation();
-    recv_rel->set_initailization_type(-1);
+    recv_rel->set_initialization_type(-1);
     //recv_rel->initialize_relation(output_comm);
 
 
@@ -1128,6 +1133,7 @@ bool relation::insert_in_delta(u64* t)
     if (is_canonical == false   && arity != 0 && arity >= join_column_count)
         sub_bucket_id = tuple_hash(t + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id];
 
+    // std::cout << "inserting delta for " << intern_tag << std::endl;
     //assert((int)bucket_id == mcomm.get_local_rank());
     if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true)
     {
@@ -1150,6 +1156,7 @@ bool relation::insert_in_newt(u64* t)
     if (is_canonical == false && arity != 0 && arity >= join_column_count)
         sub_bucket_id = tuple_hash(t + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id];
 
+    // std::cout << "inserting newt for " << intern_tag << std::endl;
     //assert((int)bucket_id == mcomm.get_local_rank());
     if (newt[bucket_id].insert_tuple_from_array(t, arity+1) == true)
     {
@@ -1184,7 +1191,9 @@ bool relation::insert_in_full(u64* t)
         std::cout << std::endl;
     }
 #endif
+    // std::cout << "inserting full for " << intern_tag << std::endl;
 
+    // TODO: use normal insert here!
     if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true)
     {
         full_element_count++;
@@ -1274,75 +1283,58 @@ int relation::insert_full_in_delta()
     return insert_success;
 }
 
-
-
+/// Promote the tuples gathered in newt to delta and leave newt empty.
+///
+/// With dependent columns, delta is rebuilt bucket by bucket from the newt tuples
+/// that full[i].check_dependent_insertion() accepts. Without them, the newt
+/// array is adopted as delta and a fresh newt array is allocated.
 void relation::local_insert_in_delta()
 {
     int rank;
     MPI_Comm_rank(mcomm.get_comm(), &rank);
     u32 buckets = get_bucket_count();
 
-    delete[] delta;
-
-
-    delta = newt;
-
-
-    /*
-    u32 i = mcomm.get_rank();
-    vector_buffer *vb_newt = new vector_buffer[buckets];
-    vb_newt[i].vector_buffer_create_empty();
-    std::vector<u64> prefix = {};
-    newt[i].as_vector_buffer_recursive(&(vb_newt[i]), prefix);
-
-    if (i == 0)
-        std::cout << "XX [" << get_debug_id() << "] Test " << mcomm.get_rank() << " DELTA " << vb_newt[i].size/(sizeof(u64) * (arity + 1)) << " arity " << arity + 1 << std::endl;
-
-    vb_newt[i].vector_buffer_free();
-
-    delete[] vb_newt;
-
-
-
-    //u32 i = mcomm.get_rank();
-    vector_buffer *vb_delta = new vector_buffer[buckets];
-    vb_delta[i].vector_buffer_create_empty();
-    //std::vector<u64> prefix = {};
-    delta[i].as_vector_buffer_recursive(&(vb_delta[i]), prefix);
-
-    if (i == 0)
-        std::cout << "YY [" << get_debug_id() << "] Test " << mcomm.get_rank() << " DELTA " << vb_delta[i].size/(sizeof(u64) * (arity + 1)) << " arity " << arity + 1 << std::endl;
-
-    vb_delta[i].vector_buffer_free();
-
-    delete[] vb_delta;
-    */
-
-
-    delta_element_count = newt_element_count;
-    //if (rank == 0)
-    //    std::cout << "[" << get_debug_id() << "] copyng newt pointer to delta   " << delta_element_count << std::endl;
-
-    memcpy(delta_bucket_element_count, newt_bucket_element_count, buckets * sizeof(u32));
-    for (u32 b = 0; b < buckets; b++)
-    {
-        memcpy(delta_sub_bucket_element_count[b], newt_sub_bucket_element_count[b], sub_bucket_per_bucket_count[b] * sizeof(u32));
-        memset(newt_sub_bucket_element_count[b], 0, sub_bucket_per_bucket_count[b] * sizeof(u32));
-    }
-
-#ifdef GOOGLE_MAP
-    newt = new google_relation[buckets];
-#else
-    newt = new shmap_relation[buckets];
-
-    for (int i = 0; i < buckets; i++) {
-        newt[i].arity = arity;
-        newt[i].dependant_column_index = dependant_column_index;
+    if (dependent_column_indices.size() > 0) {
+        delta_element_count = 0;
+        // The other branch refreshes delta_bucket_element_count through memcpy;
+        // rebuild it here too so it does not keep the counts of the old delta.
+        memset(delta_bucket_element_count, 0, buckets * sizeof(u32));
+        for (u32 i = 0; i < buckets; i++) {
+            delta[i].purge();
+            memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
+            for (auto& t: newt[i]) {
+                if (full[i].check_dependent_insertion(t)) {
+                    delta[i].insert(t);
+                    uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % buckets;
+                    u32 sub_bucket_id = 0;
+                    if (is_canonical == false   && arity != 0 && arity >= join_column_count)
+                        sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id];
+                    delta_bucket_element_count[bucket_id]++;
+                    delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
+                    delta_element_count++;
+                }
+            }
+            newt[i].purge();
+            memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
+        }
+    } else {
+        delete[] delta;
+        delta = newt;
+        delta_element_count = newt_element_count;
+        memcpy(delta_bucket_element_count, newt_bucket_element_count, buckets * sizeof(u32));
+        for (u32 b = 0; b < buckets; b++)
+        {
+            memcpy(delta_sub_bucket_element_count[b], newt_sub_bucket_element_count[b], sub_bucket_per_bucket_count[b] * sizeof(u32));
+            memset(newt_sub_bucket_element_count[b], 0, sub_bucket_per_bucket_count[b] * sizeof(u32));
+        }
+        newt = new shmap_relation[buckets];
+        for (u32 i = 0; i < buckets; i++) {
+            newt[i].arity = arity;
+            newt[i].dependent_column_indices = dependent_column_indices;
+            newt[i].update_compare_func = update_compare_func;
+        }
     }
-#endif
 
-    //for(u32 i=0; i<buckets; i++)
-    //    newt[i] = new google_relation();
     newt_element_count = 0;
     memset(newt_bucket_element_count, 0, buckets * sizeof(u32));
 }
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index f2472678..c0e88f9e 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -8,14 +8,21 @@
 #pragma once
 
 #include "../ds.h"
+#include "../parallel_RA_inc.h"
 #include <algorithm>
+#include <functional>
+#include <optional>
 #include <string>
+#include <vector>
 
 enum {LEFT=0, RIGHT};
 enum {DELTA=0, FULL, FULL_AND_DELTA};
-enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION};
+enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE};
 enum {STATIC=0, DYNAMIC};
 
+// This is the update function for columns that have a functional dependence.
+// Its vector arguments must have exactly the same size as dependent_column_indices.
+
 class relation
 {
 
@@ -27,7 +34,7 @@ class relation
     u32 intern_tag;                             /// id of relation (to be used for interning)
 
     std::string debug_id;
-    int initailization_type = -1;               /// used when task balancing is required
+    int initialization_type = -1;               /// used when task balancing is required
     std::string filename = NULL;                /// Name of file to open
 
 
@@ -79,7 +86,8 @@ class relation
     bool restart_flag;
     //bool fact_load=false;
     //std::vector<u64> init_val;
-    std::optional<int> dependant_column_index = std::nullopt; 
+    std::vector<int> dependent_column_indices;
+    update_partial_compare_func_t update_compare_func;
 
 public:
 
@@ -92,7 +100,7 @@ class relation
     /// "/var/tmp/g13236/path_2_1_2": location of data file that gets loaded in the relation
     /// FULL: load in FULL (other option is to loadin DELTA, but we alwys load in FULL)
     relation (u32 jcc, bool is_c, u32 ar, u32 tg, std::string fname, int version)
-        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initailization_type(version), filename(fname)
+        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initialization_type(version), filename(fname)
     {
         //fact_load = false;
         full_element_count=0;
@@ -100,7 +108,7 @@ class relation
     }
 
     relation (u32 jcc, bool is_c, u32 ar, u32 tg, std::string did, std::string fname, int version)
-        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), debug_id(did), initailization_type(version), filename(fname)
+        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), debug_id(did), initialization_type(version), filename(fname)
     {
         //fact_load = false;
         full_element_count=0;
@@ -108,7 +116,7 @@ class relation
     }
 
     relation (u32 jcc, bool is_c, u32 ar, u32 tg, int version)
-        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initailization_type(version), filename("")
+        :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initialization_type(version), filename("")
     {
         //fact_load = false;
         full_element_count=0;
@@ -134,14 +142,27 @@ class relation
 
     //void set_init_val(std::vector<u64> temp_init_val)   {init_val = temp_init_val;}
 
-    void set_dependant_column(int idx) { dependant_column_index = idx; }
+    void set_dependent_column_update(std::vector<int> idx, update_partial_compare_func_t f) {
+        dependent_column_indices = idx;
+        update_compare_func= f;
+        // for (int i = 0; i < get_bucket_count(); i++) {
+        //     delta[i].dependent_column_indices = dependent_column_indices;
+        //     delta[i].update_compare_func = update_compare_func;
+        //     full[i].dependent_column_indices = dependent_column_indices;
+        //     full[i].update_compare_func = update_compare_func;
+        //     newt[i].dependent_column_indices = dependent_column_indices;
+        //     newt[i].update_compare_func = update_compare_func;
+        // }
+    }
+    std::vector<int> get_dependent_column() { return dependent_column_indices; }
+    update_partial_compare_func_t get_update_compare_func() { return update_compare_func; }
 
     /// used for load balancing
     void set_last_rank(int lr)   {last_rank = lr;}
     int get_last_rank() {   return last_rank;}
 
     /// used for task-level parallelism
-    void set_initailization_type(int x) { initailization_type = x;  }
+    void set_initialization_type(int x) { initialization_type = x;  }
 
 
     bool get_is_canonical() {return is_canonical;}
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index e299cbc3..19287e53 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -11,7 +11,6 @@
 #include "../btree/btree_set.h"
 #include <cstdint>
 #include <utility>
-#include <optional>
 
 struct shmap_relation {
 
@@ -19,37 +18,28 @@ struct shmap_relation {
 
     int data_structure_type;
 
-    std::optional<int> dependant_column_index;     // some column may have functional dependance its support for lattice like language feature
+    // Some columns may have a functional dependence; this supports lattice-like language features.
+    // Please always treat the id column as a functionally dependent column.
+    std::vector<int> dependent_column_indices;     
+    update_partial_compare_func_t update_compare_func;
 
     using t_tuple = std::vector<u64>;
     struct t_comparator {
         // 0-arity compare will fail
-        t_comparator() : _id_flag(true) { dependant_column_index = std::nullopt; }
-        t_comparator(std::optional<int> dt): dependant_column_index(dt) {}
+        t_comparator() {}
         
         bool operator()(const t_tuple &a, const t_tuple &b) const {
             // make it an unroll loop when change to array
             int size = a.size();
-            if (dependant_column_index.has_value()) {
                 for (int i=0; i < size; i++) {
-                    if (i == dependant_column_index.value()) { continue; }
                     if (a[i] < b[i])
                         return true;
                     if (a[i] > b[i])
                         return false;
                 }
-            } else {
-                for (int i=0; i < size; i++) {
-                    if (a[i] < b[i])
-                        return true;
-                    if (a[i] > b[i])
-                        return false;
-                }
-            }
+
             return false;
         }
-        bool _id_flag;
-        std::optional<int> dependant_column_index; 
     };
 
     // souffle use multi set for some relation
@@ -123,7 +113,7 @@ struct shmap_relation {
     shmap_relation(int arity, bool id_flag);
     shmap_relation() {
         // id_flag = true;
-        dependant_column_index = std::nullopt;
+        // dependent_column_indices = std::nullopt;
         // ind = new t_ind(t_comparator(id_flag));
         // int rank;
         // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -134,9 +124,12 @@ struct shmap_relation {
     bool insert_tuple_from_array(u64* t, int arity);
     void remove_tuple();
     bool find_tuple_from_array(u64* t, int arity);
+    bool check_dependent_insertion(const std::vector<u64> &v);
 
     void as_vector_buffer_recursive(vector_buffer* vb, std::vector<u64> prefix);
 
+    // TODO: move all these logic to RA operation!
+
     void as_all_to_allv_copy_buffer(all_to_allv_buffer& buffer, std::vector<u64> prefix, std::vector<int> reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int head_rel_hash_col_count, bool canonical);
     
     void as_all_to_allv_copy_filter_buffer(all_to_allv_buffer& buffer, std::vector<u64> prefix, std::vector<int> reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, bool(*lambda)(const u64* const), int head_rel_hash_col_count, bool canonical);
@@ -150,7 +143,8 @@ struct shmap_relation {
         int join_column_count, shmap_relation& deduplicate,
         int* local_join_count, u32* local_join_duplicates,
         u32* local_join_inserts,
-        int head_rel_hash_col_count, bool canonical);
+        int head_rel_hash_col_count, bool canonical,
+        bool generator_mode, join_generator_func_t gen_func);
 
     void as_all_to_allv_left_join_buffer(
         std::vector<u64> prefix, all_to_allv_buffer& join_buffer,
@@ -161,7 +155,8 @@ struct shmap_relation {
         int join_column_count, shmap_relation& deduplicate,
         int* local_join_count, u32* local_join_duplicates,
         u32* local_join_inserts, int head_rel_hash_col_count,
-        bool canonical);
+        bool canonical,
+        bool generator_mode, join_generator_func_t gen_func);
     
     void as_all_to_allv_right_outer_join_buffer(
         shmap_relation* target_relation,
@@ -171,7 +166,7 @@ struct shmap_relation {
         int ra_id,
         u32 buckets, u32* output_sub_bucket_count,
         u32** output_sub_bucket_rank, std::vector<int>& reorder_map,
-        int join_column_count, int out_airty,
+        int join_column_count, int out_arity,
         int head_rel_hash_col_count, bool canonical);
 
     void as_all_to_allv_copy_generate_buffer(all_to_allv_buffer& buffer, std::vector<u64> prefix, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int(*lambda)(const u64* const, u64* const), int head_rel_hash_col_count, bool canonical);
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index d9f6c6d6..654cc350 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -12,10 +12,12 @@
 
 #include "../parallel_RA_inc.h"
 #include "shmap_relation.h"
+#include <cassert>
 #include <cstddef>
 #include <iostream>
-
-
+#include <mpi.h>
+#include <ostream>
+#include <vector>
 
 shmap_relation::shmap_relation(int arity, bool id_flag)
 {
@@ -26,8 +28,134 @@ shmap_relation::shmap_relation(int arity, bool id_flag)
 bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
 {
     t_tuple tp(t, t+width);
+    // Insert the tuple stored in t[0 .. width) into this relation.
+    //
+    //   t      pointer to at least `width` values; it is also read at every index
+    //          listed in dependent_column_indices
+    //   width  number of columns of the tuple
+    //
+    // Relations without dependent columns forward to insert(tp) and return its
+    // result. With dependent columns, the leading
+    // width - dependent_column_indices.size() columns act as a key prefix and the
+    // stored tuples between (prefix, all-min) and (prefix, all-max) are examined:
+    //   * the range lookup yields ind.end() as its first iterator
+    //       -> returns the result of insert(tp);
+    //   * update_compare_func(stored values, new values) returns an engaged `true`
+    //     for at least one stored tuple
+    //       -> those tuples are erased, then returns the result of insert(tp);
+    //   * otherwise
+    //       -> returns false and the container is left unmodified.
+    //
+    // NOTE(review): only the leading columns are pinned in the bounds, so this
+    // presumably expects the dependent columns to be the trailing ones - confirm
+    // against the callers of set_dependent_column_update.
+    if (dependent_column_indices.empty()) {
+        return insert(tp);
+    }
+
+    // Do the subtraction in int: `width - size()` would otherwise be evaluated as
+    // size_t, which mixes signedness in the loop condition and wraps around when
+    // width is smaller than the number of dependent columns.
+    const int key_width = width - static_cast<int>(dependent_column_indices.size());
+    t_tuple upper_bound(width, std::numeric_limits<u64>::max());
+    t_tuple lower_bound(width, std::numeric_limits<u64>::min());
+    for (int i = 0; i < key_width; i++) {
+        upper_bound[i] = tp[i];
+        lower_bound[i] = tp[i];
+    }
+
+    // Values of the incoming tuple at the dependent positions, in index order.
+    std::vector<u64> dependent_columns;
+    for (auto i: dependent_column_indices) {
+        dependent_columns.push_back(t[i]);
+    }
+
+    // Candidates are whatever lowerUpperRange reports for the two bounds built
+    // above.
+    auto exist_tuples_range = lowerUpperRange(lower_bound, upper_bound);
+    if (exist_tuples_range.first == ind.end()) {
+        return insert(tp);
+    }
+
+    // Remember the tuples to drop BY VALUE rather than as iterators: the erase
+    // loop below mutates `ind`, and an iterator collected earlier may no longer
+    // be valid after the first erase, so dereferencing it again is not safe.
+    std::vector<t_tuple> need_deletes;
+    for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) {
+        const t_tuple& cur_tuple = *it;
+        std::vector<u64> old_t;
+        for (auto i: dependent_column_indices) {
+            old_t.push_back(cur_tuple[i]);
+        }
+        auto compare_res = update_compare_func(old_t, dependent_columns);
+        if (compare_res.has_value() && compare_res.value()) {
+            need_deletes.push_back(cur_tuple);
+        }
+    }
+
+    if (need_deletes.empty()) {
+        // update_compare_func accepted no stored tuple in the range.
+        return false;
+    }
+
+    // Drop the accepted stored tuples before storing the new one.
+    for (const auto& d: need_deletes) {
+        ind.erase(d);
+    }
+    return insert(tp);
+}
 
-    return insert(tp);
+/// Decide whether `tp` would be accepted next to the tuples already stored here.
+///
+/// Relations without dependent columns accept every tuple. Otherwise the leading
+/// tp.size() - dependent_column_indices.size() columns are used as a key prefix
+/// and the stored tuples between (prefix, all-min) and (prefix, all-max) are
+/// compared with `tp` through update_compare_func(stored values, new values).
+///
+/// @param tp  candidate tuple; it is read at every index listed in
+///            dependent_column_indices.
+/// @return true when the range lookup yields ind.end() as its first iterator, or
+///         when update_compare_func returns an engaged `true` for at least one
+///         stored tuple in the range; false otherwise.
+bool
+shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
+    if (dependent_column_indices.empty()) {
+        return true;
+    }
+
+    // Both bounds carry the key prefix of tp; the other columns span all of u64.
+    t_tuple upper_bound(tp.size(), std::numeric_limits<u64>::max());
+    t_tuple lower_bound(tp.size(), std::numeric_limits<u64>::min());
+    for (size_t i = 0; i < tp.size()-dependent_column_indices.size();  i++) {
+        upper_bound[i] = tp[i];
+        lower_bound[i] = tp[i];
+    }
+
+    // Values of the candidate at the dependent positions, in index order.
+    std::vector<u64> dependent_columns;
+    for (auto i: dependent_column_indices) {
+        dependent_columns.push_back(tp[i]);
+    }
+
+    auto exist_tuples_range = lowerUpperRange(lower_bound, upper_bound);
+    if (exist_tuples_range.first == ind.end()) {
+        return true;
+    }
+
+    for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) {
+        // Read-only scan: bind by reference instead of copying every stored tuple.
+        const auto& cur_tuple = *it;
+        std::vector<u64> old_t;
+        for (auto i: dependent_column_indices) {
+            old_t.push_back(cur_tuple[i]);
+        }
+        auto compare_res = update_compare_func(old_t, dependent_columns);
+        if (compare_res.has_value() && compare_res.value()) {
+            return true;
+        }
+    }
+    // update_compare_func accepted no stored tuple in the range.
+    return false;
 }
 
 std::pair<shmap_relation::iterator, shmap_relation::iterator>
@@ -267,7 +395,8 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
     u32 *local_join_duplicates,
     u32 *local_join_inserts,
     int head_rel_hash_col_count,
-    bool canonical)
+    bool canonical,
+    bool generator_mode, join_generator_func_t gen_func)
 {
     if (size() == 0)
         return;
@@ -284,16 +413,20 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
     {
         auto cur_path = *it;
         u64 projected_path[join_buffer.width[ra_id]];
-        u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
-        for (int i = 0; i < input1_buffer_width; i++)
-            reordered_cur_path[i] = cur_path[i];
-
-        for (int i = join_column_count; i < input0_buffer_width; i++)
-            reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i];
-
-        for (int i =0; i < join_buffer.width[ra_id]; i++)
-            projected_path[i] = reordered_cur_path[reorder_map[i]];
-
+        if (generator_mode) {
+            std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+            gen_func(input_t, cur_path, projected_path);
+        } else {
+            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
+            for (int i = 0; i < input1_buffer_width; i++)
+                reordered_cur_path[i] = cur_path[i];
+
+            for (int i = join_column_count; i < input0_buffer_width; i++)
+                reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i];
+
+            for (int i =0; i < join_buffer.width[ra_id]; i++)
+                projected_path[i] = reordered_cur_path[reorder_map[i]];
+        }
         if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
         {
             uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
@@ -318,7 +451,6 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
             (*local_join_duplicates)++;
         }
     }
-    // std::cout << "inserted " << *local_join_inserts << std::endl;
 }
 
 void shmap_relation::as_all_to_allv_left_join_buffer(
@@ -335,7 +467,8 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
     u32 *local_join_duplicates,
     u32 *local_join_inserts,
     int head_rel_hash_col_count,
-    bool canonical)
+    bool canonical,
+    bool generator_mode, join_generator_func_t gen_func)
 {
     if (size() == 0)
         return;
@@ -352,16 +485,21 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
     {
         auto cur_path = *it;
         u64 projected_path[join_buffer.width[ra_id]];
-        u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
-        for (int i = 0; i < input0_buffer_width; i++)
-            reordered_cur_path[i] = input0_buffer[i];
-
-        for (int i = join_column_count; i < input1_buffer_width; i++)
-            reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i];
-
-        for (int i =0; i < join_buffer.width[ra_id]; i++)
-            projected_path[i] = reordered_cur_path[reorder_map[i]];
-
+        if (generator_mode) {
+            std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+            gen_func(cur_path, input_t, projected_path);
+        } else {
+            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
+            for (int i = 0; i < input0_buffer_width; i++)
+                reordered_cur_path[i] = input0_buffer[i];
+
+            for (int i = join_column_count; i < input1_buffer_width; i++)
+                reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i];
+
+            for (int i =0; i < join_buffer.width[ra_id]; i++)
+                projected_path[i] = reordered_cur_path[reorder_map[i]];
+        }
+        
         //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl;
         if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
         {
diff --git a/backend/tests/update/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt
similarity index 94%
rename from backend/tests/update/compiled_pre/CMakeLists.txt
rename to backend/tests/sssp/compiled_pre/CMakeLists.txt
index cb2c1d5f..a5e5801d 100644
--- a/backend/tests/update/compiled_pre/CMakeLists.txt
+++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt
@@ -15,8 +15,8 @@ find_package(MPI REQUIRED)
 # endif()
 
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive")
-set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
-set (source_dir "${base_dir}/src")
+# set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
+set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
 file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
 file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp.cpp")
diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full
similarity index 100%
rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full
rename to backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full
diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full
new file mode 100644
index 0000000000000000000000000000000000000000..478adf3b2a8e2bf5756cf0669e86d003e15ca8d3
GIT binary patch
literal 512
zcmZ9ITMB?M5CdD^qKKl1$Lak~<#x?rsXw8couoB}?`x!;^vIf0h&s2@LHcAZe7Iiz
zS^3P#@m%BSMTLh;JaahCuIcCL=YDs^)6bsg8BZ=s|1O?6+^hvZ8_%4*QNH&TK6~VE
d%I98<4F0#j!+3o1(Y4~qCugdBa#Rx*`7ihw2~z+7

literal 0
HcmV?d00001

diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/$strings.csv b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv
similarity index 100%
rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/$strings.csv
rename to backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv
diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full
new file mode 100644
index 0000000000000000000000000000000000000000..a5b47390726befd417416b8e76e64db49a1e53f8
GIT binary patch
literal 288
zcmYL@+YP`l3<61CX)=}fpQ=*f<ouir24~3c2zO|dM&>&_L+d4S=*-u@*FJgclJ_f=
g&{z83<k9J$@y!yw+~vQX+b8c{CD%T8=y`Jg18>&?&Hw-a

literal 0
HcmV?d00001

diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full
similarity index 100%
rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full
rename to backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full
diff --git a/backend/tests/update/compiled_pre/compiler-out b/backend/tests/sssp/compiled_pre/compiler-out
similarity index 100%
rename from backend/tests/update/compiled_pre/compiler-out
rename to backend/tests/sssp/compiled_pre/compiler-out
diff --git a/backend/tests/update/compiled_pre/input-data/$strings.csv b/backend/tests/sssp/compiled_pre/input-data/$strings.csv
similarity index 100%
rename from backend/tests/update/compiled_pre/input-data/$strings.csv
rename to backend/tests/sssp/compiled_pre/input-data/$strings.csv
diff --git a/backend/tests/update/compiled_pre/input-data/256.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/256.edge.3.table
similarity index 100%
rename from backend/tests/update/compiled_pre/input-data/256.edge.3.table
rename to backend/tests/sssp/compiled_pre/input-data/256.edge.3.table
diff --git a/backend/tests/update/compiled_pre/input-data/257.spath.3.table b/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table
similarity index 100%
rename from backend/tests/update/compiled_pre/input-data/257.spath.3.table
rename to backend/tests/sssp/compiled_pre/input-data/257.spath.3.table
diff --git a/backend/tests/update/sssp_update.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
similarity index 82%
rename from backend/tests/update/sssp_update.cpp
rename to backend/tests/sssp/compiled_pre/sssp.cpp
index 6c399409..00313a55 100644
--- a/backend/tests/update/sssp_update.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <iterator>
 #include <map>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <unordered_set>
@@ -401,24 +402,116 @@ int main(int argc, char **argv) {
       slog_input_dir + "/" +
           std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
       FULL);
+  relation* rel__edge__3__1 = new relation(
+    1, false, 3, get_tag_for_rel("edge","1"),
+    std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
+    FULL);
+
+  // the dependent column must be exclude from hash computation, so join column count is 3 - 1 = 2
   relation *rel__spath__3__1__2__3 = new relation(
-      3, true, 3, get_tag_for_rel("spath", "1__2__3"),
+      2, true, 3, get_tag_for_rel("spath", "1__2__3"),
       std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
       slog_input_dir + "/" +
           std::to_string(get_tag_for_rel("spath", "1__2__3")) +
           ".spath.3.table",
       FULL);
+  // set functional dependency for spath
+  rel__spath__3__1__2__3->set_dependent_column_update(
+    {2, 3},   // len and id column
+    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool> {
+      // if (new_v[0] < old_v[0]) {
+      // std::cout << "Comparing  >>>> ";
+      // for (auto v: old_v) {
+      //   std::cout << v << " ";
+      // }
+      // std::cout << " with ";
+      // for (auto v: new_v) {
+      //   std::cout << v << " ";
+      // }
+      // std::cout << std::endl;
+      // }
+      return new_v[0] < old_v[0]; 
+    }
+  );
+  relation* rel__spath__3__2 = new relation(
+    1, false, 3, get_tag_for_rel("spath","2"),
+    std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
+    FULL);
+  rel__spath__3__2->set_dependent_column_update(
+    {2, 3},
+    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool> {
+      // if (new_v[0] < old_v[0]) {
+      // std::cout << "Comparing  >>>> ";
+      // for (auto v: old_v) {
+      //   std::cout << v << " ";
+      // }
+      // std::cout << " with ";
+      // for (auto v: new_v) {
+      //   std::cout << v << " ";
+      // }
+      // std::cout << std::endl;
+      // }
+      return new_v[0] < old_v[0];
+    }
+  );
 
-  RAM *scc0 = new RAM(false, 0);
-  scc0->add_relation(rel__edge__3__1__2__3, false, false);
-  scc0->add_relation(rel__spath__3__1__2__3, true, false);
-  scc0->add_rule(new parallel_copy(rel__spath__3__1__2__3,
+  RAM* scc0 = new RAM(false, 0);
+  scc0->add_relation(rel__edge__3__1, true, false);
+  scc0->add_relation(rel__edge__3__1__2__3, true, false);
+  scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, DELTA, {0, 3, 1, 2}));
+
+  RAM *scc1 = new RAM(false, 0);
+  scc1->add_relation(rel__edge__3__1__2__3, false, false);
+  scc1->add_relation(rel__spath__3__1__2__3, true, false);
+  scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
                                    rel__edge__3__1__2__3, FULL, {0, 1, 2}));
 
+  RAM *scc2 = new RAM(true, 1);
+  scc2->add_relation(rel__edge__3__1__2__3, false, false);
+  scc2->add_relation(rel__spath__3__2, true, false);
+  scc2->add_relation(rel__spath__3__1__2__3, true, false);
+  //  the order of non join column also need to be carefully arranged because, dependent column
+  //  should always at last
+  scc2->add_rule(new parallel_acopy(
+    rel__spath__3__2,
+    rel__spath__3__1__2__3, DELTA,
+    {1, 0, 2, 3})); // 2, 1, 3, id
+  parallel_join* update_spath_j = new parallel_join(
+    rel__spath__3__1__2__3,
+    rel__edge__3__1, FULL,
+    rel__spath__3__2, DELTA,
+    {5, 2, 3}// useless
+  );
+  update_spath_j->set_generator_func([](std::vector<u64>& target_v, std::vector<u64>& input_v, u64* res) {
+    // std::cout << "Join  >>>> ";
+    // for (auto v: target_v) {
+    //   std::cout << v << " ";
+    // }
+    // std::cout << " with ";
+    // for (auto v: input_v) {
+    //   std::cout << v << " ";
+    // }
+    // std::cout << std::endl;
+    res[0] = target_v[1];
+    res[1] = input_v[2];
+    if (res[0] == res[1]) {
+      res[2] = 0;      
+    } else {
+      res[2] = target_v[2] + input_v[3];
+    }
+  });
+  scc2->add_rule(update_spath_j);
+
   LIE *lie = new LIE();
+  lie->add_relation(rel__edge__3__1);
   lie->add_relation(rel__edge__3__1__2__3);
+  lie->add_relation(rel__spath__3__2);
   lie->add_relation(rel__spath__3__1__2__3);
   lie->add_scc(scc0);
+  lie->add_scc(scc1);
+  lie->add_scc(scc2);
+  lie->add_scc_dependance(scc0, scc2);
+  lie->add_scc_dependance(scc1, scc2);
 
   // Enable IO
   lie->enable_all_to_all_dump();
diff --git a/backend/tests/update/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp.backup
similarity index 100%
rename from backend/tests/update/compiled_pre/sssp.cpp
rename to backend/tests/sssp/compiled_pre/sssp.cpp.backup
diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog
new file mode 100644
index 00000000..abdace44
--- /dev/null
+++ b/backend/tests/sssp/sssp.slog
@@ -0,0 +1,3 @@
+
+[(spath from to dist) <-- (edge from to dist)]
+[(spath from to l) <-- (spath from mid dist) (edge mid to l)]
diff --git a/backend/tests/update/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv
similarity index 100%
rename from backend/tests/update/test-input-graph/edge.csv
rename to backend/tests/sssp/test-input-graph/edge.csv
diff --git a/backend/tests/update/sssp.slog b/backend/tests/update/sssp.slog
deleted file mode 100644
index 71bfbe56..00000000
--- a/backend/tests/update/sssp.slog
+++ /dev/null
@@ -1,3 +0,0 @@
-
-[(spath from to dist) <-- (edge from to dist)]
-
diff --git a/slog/common/client.py b/slog/common/client.py
index 4d170714..6398146c 100644
--- a/slog/common/client.py
+++ b/slog/common/client.py
@@ -318,12 +318,12 @@ def _run(self, program_hashes:list, input_database:str, cores=2, writer=Writer()
     def _update_intern_strings(self, db_id):
         """ update cached string.csv data """
         if self.local_db_path:
-            with open(os.path.join(self.local_db_path, '$strings.csv'), 'r') as string_file:
-                for s_line in string_file:
-                    if s_line.strip() == '':
-                        continue
-                    sv = s_line.split('\t')[1]
-                    self.intern_string_dict[string_hash(sv.strip())] = sv.strip()
+            # with open(os.path.join(self.local_db_path, '$strings.csv'), 'r') as string_file:
+            #     for s_line in string_file:
+            #         if s_line.strip() == '':
+            #             continue
+            #         sv = s_line.split('\t')[1]
+            #         self.intern_string_dict[string_hash(sv.strip())] = sv.strip()
             return
         req = slog_pb2.StringRequest()
         req.database_id = db_id
diff --git a/slogdb b/slogdb
new file mode 100755
index 00000000..aefb6b59
--- /dev/null
+++ b/slogdb
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""
+show a local database with limited feature in REPL
+
+Yihao Sun
+"""
+
+import argparse
+
+from slog.repl.repl import Repl
+
+def run_repl(db_path):  # db_path: folder path of the local slog database to open
+    repl = Repl(local_db_path=db_path)
+    repl.loop()
+
+if __name__ == "__main__":  # CLI entry point: one positional argument, the database folder
+    parser = argparse.ArgumentParser()
+    parser.add_argument("db_path", help="The file folder path of a slog database.")
+
+    args = parser.parse_args()
+    run_repl(args.db_path)

From 357c7c49f9c7a7749d5dd3f51731fee306fd7b61 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Sun, 13 Nov 2022 23:49:38 -0500
Subject: [PATCH 05/36] use real sssp

---
 backend/src/lie/lie.cpp                       |  12 +-
 backend/src/relation/shmap_relation_exp.cpp   |   1 +
 .../checkpoint-final/257.spath.3.table_full   | Bin 512 -> 0 bytes
 .../checkpoint-final/258.spath.2.table_full   | Bin 0 -> 96 bytes
 backend/tests/sssp/compiled_pre/sssp.cpp      | 261 ++++++++++++------
 backend/tests/sssp/sssp.slog                  |   4 +-
 slog/tests/benchmark.py                       | 191 +++++++++++++
 7 files changed, 373 insertions(+), 96 deletions(-)
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full
 create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full
 create mode 100644 slog/tests/benchmark.py

diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index ad504f1d..b2d16761 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -476,9 +476,9 @@ bool LIE::execute ()
             else
                 executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
 
-            std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
-            for (u32 i = 0 ; i < scc_relation_count; i++)
-                print_relation_size(scc_relation[i]);
+            // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
+            // for (u32 i = 0 ; i < scc_relation_count; i++)
+            //     print_relation_size(scc_relation[i]);
             // stat_intermediate();
             //executed_scc_id.push_back(executable_task->get_id());
 #if 0
@@ -546,9 +546,9 @@ bool LIE::execute ()
                 }
 #endif 
                 // if (loop_counter < 20) {
-                std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
-                for (u32 i = 0 ; i < scc_relation_count; i++)
-                    print_relation_size(scc_relation[i]);
+                // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
+                // for (u32 i = 0 ; i < scc_relation_count; i++)
+                //     print_relation_size(scc_relation[i]);
                 // }
                 // stat_intermediate();
                 // loop_counter++;
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 654cc350..2db1941b 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -417,6 +417,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
             std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
             gen_func(input_t, cur_path, projected_path);
         } else {
+            // std::cout << "here" << std::endl;
             u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
             for (int i = 0; i < input1_buffer_width; i++)
                 reordered_cur_path[i] = cur_path[i];
diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full
deleted file mode 100644
index 478adf3b2a8e2bf5756cf0669e86d003e15ca8d3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 512
zcmZ9ITMB?M5CdD^qKKl1$Lak~<#x?rsXw8couoB}?`x!;^vIf0h&s2@LHcAZe7Iiz
zS^3P#@m%BSMTLh;JaahCuIcCL=YDs^)6bsg8BZ=s|1O?6+^hvZ8_%4*QNH&TK6~VE
d%I98<4F0#j!+3o1(Y4~qCugdBa#Rx*`7ihw2~z+7

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full
new file mode 100644
index 0000000000000000000000000000000000000000..d164029436e14bdd3dce073e8e69fa10f32f91de
GIT binary patch
literal 96
ycmZQ#fB-Hi4W=3#7+9fvW+=@FqJiQ(P=3O^f}TJ13=Aw#aTuQoDi78NR1W}19SRiy

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 00313a55..2c95a173 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -396,121 +396,201 @@ int main(int argc, char **argv) {
   mpi_comm mcomm;
   mcomm.create(argc, argv);
 
+  // relation *rel__edge__3__1__2__3 = new relation(
+  //     3, true, 3, get_tag_for_rel("edge", "1__2__3"),
+  //     std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
+  //     slog_input_dir + "/" +
+  //         std::to_string(get_tag_for_rel("edge", "1__2__3")) +
+  //         ".edge.3.table",
+  //     FULL);
+  // relation* rel__edge__3__1 = new relation(
+  //   1, false, 3, get_tag_for_rel("edge","1"),
+  //   std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
+  //   FULL);
+
+  // // the dependent column must be exclude from hash computation, so join
+  // column count is 3 - 1 = 2 relation *rel__spath__3__1__2__3 = new relation(
+  //     2, true, 3, get_tag_for_rel("spath", "1__2__3"),
+  //     std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
+  //     slog_input_dir + "/" +
+  //         std::to_string(get_tag_for_rel("spath", "1__2__3")) +
+  //         ".spath.3.table",
+  //     FULL);
+  // // set functional dependency for spath
+  // rel__spath__3__1__2__3->set_dependent_column_update(
+  //   {2, 3},   // len and id column
+  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+  //   {
+  //     return new_v[0] < old_v[0];
+  //   }
+  // );
+  // relation* rel__spath__3__2 = new relation(
+  //   1, false, 3, get_tag_for_rel("spath","2"),
+  //   std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
+  //   FULL);
+  // rel__spath__3__2->set_dependent_column_update(
+  //   {2, 3},
+  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+  //   {
+  //     return new_v[0] < old_v[0];
+  //   }
+  // );
+
+  // RAM* scc0 = new RAM(false, 0);
+  // scc0->add_relation(rel__edge__3__1, true, false);
+  // scc0->add_relation(rel__edge__3__1__2__3, true, false);
+  // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
+  // DELTA, {0, 3, 1, 2}));
+
+  // RAM *scc1 = new RAM(false, 0);
+  // scc1->add_relation(rel__edge__3__1__2__3, false, false);
+  // scc1->add_relation(rel__spath__3__1__2__3, true, false);
+  // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
+  //                                  rel__edge__3__1__2__3, FULL, {0, 1, 2}));
+
+  // RAM *scc2 = new RAM(true, 1);
+  // scc2->add_relation(rel__edge__3__1__2__3, false, false);
+  // scc2->add_relation(rel__spath__3__2, true, false);
+  // scc2->add_relation(rel__spath__3__1__2__3, true, false);
+  // //  the order of non join column also need to be carefully arranged
+  // because, dependent column
+  // //  should always at last
+  // scc2->add_rule(new parallel_acopy(
+  //   rel__spath__3__2,
+  //   rel__spath__3__1__2__3, DELTA,
+  //   {1, 0, 2, 3})); // 2, 1, 3, id
+  // parallel_join* update_spath_j = new parallel_join(
+  //   rel__spath__3__1__2__3,
+  //   rel__edge__3__1, FULL,
+  //   rel__spath__3__2, DELTA,
+  //   {5, 2, 3}// useless
+  // );
+  // update_spath_j->set_generator_func([](std::vector<u64>& target_v,
+  // std::vector<u64>& input_v, u64* res) {
+  //   res[0] = target_v[1];
+  //   res[1] = input_v[2];
+  //   if (res[0] == res[1]) {
+  //     res[2] = 0;
+  //   } else {
+  //     res[2] = target_v[2] + input_v[3];
+  //   }
+  // });
+  // scc2->add_rule(update_spath_j);
+
+  // LIE *lie = new LIE();
+  // lie->add_relation(rel__edge__3__1);
+  // lie->add_relation(rel__edge__3__1__2__3);
+  // lie->add_relation(rel__spath__3__2);
+  // lie->add_relation(rel__spath__3__1__2__3);
+  // lie->add_scc(scc0);
+  // lie->add_scc(scc1);
+  // lie->add_scc(scc2);
+  // lie->add_scc_dependance(scc0, scc2);
+  // lie->add_scc_dependance(scc1, scc2);
+
+  relation *rel__spath__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("spath", "1__2"),
+      std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
+          ".spath.2.table",
+      FULL);
+  rel__spath__2__1__2->set_dependent_column_update(
+    {1, 2},   // len and id column
+    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+    {
+      return new_v[0] < old_v[0];
+    }
+  );
+  relation *rel__edge__3__1 = new relation(
+      1, false, 3, get_tag_for_rel("edge", "1"),
+      std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL);
   relation *rel__edge__3__1__2__3 = new relation(
       3, true, 3, get_tag_for_rel("edge", "1__2__3"),
       std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
       slog_input_dir + "/" +
           std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
       FULL);
-  relation* rel__edge__3__1 = new relation(
-    1, false, 3, get_tag_for_rel("edge","1"),
-    std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
-    FULL);
-
-  // the dependent column must be exclude from hash computation, so join column count is 3 - 1 = 2
-  relation *rel__spath__3__1__2__3 = new relation(
-      2, true, 3, get_tag_for_rel("spath", "1__2__3"),
-      std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
-      slog_input_dir + "/" +
-          std::to_string(get_tag_for_rel("spath", "1__2__3")) +
-          ".spath.3.table",
-      FULL);
-  // set functional dependency for spath
-  rel__spath__3__1__2__3->set_dependent_column_update(
-    {2, 3},   // len and id column
-    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool> {
-      // if (new_v[0] < old_v[0]) {
-      // std::cout << "Comparing  >>>> ";
-      // for (auto v: old_v) {
-      //   std::cout << v << " ";
-      // }
-      // std::cout << " with ";
-      // for (auto v: new_v) {
-      //   std::cout << v << " ";
-      // }
-      // std::cout << std::endl;
-      // }
-      return new_v[0] < old_v[0]; 
-    }
-  );
-  relation* rel__spath__3__2 = new relation(
-    1, false, 3, get_tag_for_rel("spath","2"),
-    std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
-    FULL);
-  rel__spath__3__2->set_dependent_column_update(
-    {2, 3},
-    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool> {
-      // if (new_v[0] < old_v[0]) {
-      // std::cout << "Comparing  >>>> ";
-      // for (auto v: old_v) {
-      //   std::cout << v << " ";
-      // }
-      // std::cout << " with ";
-      // for (auto v: new_v) {
-      //   std::cout << v << " ";
-      // }
-      // std::cout << std::endl;
-      // }
+  relation *rel__spath__2__1 = new relation(
+      1, false, 2, get_tag_for_rel("spath", "1"),
+      std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL);
+  rel__spath__2__1->set_dependent_column_update(
+    {1, 2},
+    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+    {
       return new_v[0] < old_v[0];
     }
   );
 
-  RAM* scc0 = new RAM(false, 0);
+  RAM *scc0 = new RAM(false, 0);
   scc0->add_relation(rel__edge__3__1, true, false);
   scc0->add_relation(rel__edge__3__1__2__3, true, false);
-  scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, DELTA, {0, 3, 1, 2}));
-
-  RAM *scc1 = new RAM(false, 0);
-  scc1->add_relation(rel__edge__3__1__2__3, false, false);
-  scc1->add_relation(rel__spath__3__1__2__3, true, false);
-  scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
-                                   rel__edge__3__1__2__3, FULL, {0, 1, 2}));
-
-  RAM *scc2 = new RAM(true, 1);
-  scc2->add_relation(rel__edge__3__1__2__3, false, false);
-  scc2->add_relation(rel__spath__3__2, true, false);
-  scc2->add_relation(rel__spath__3__1__2__3, true, false);
-  //  the order of non join column also need to be carefully arranged because, dependent column
-  //  should always at last
-  scc2->add_rule(new parallel_acopy(
-    rel__spath__3__2,
-    rel__spath__3__1__2__3, DELTA,
-    {1, 0, 2, 3})); // 2, 1, 3, id
+  scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
+                                    DELTA, {0, 3, 1, 2}));
+
+  RAM *scc1 = new RAM(false, 1);
+  scc1->add_relation(rel__spath__2__1__2, true, false);
+  scc1->add_relation(rel__edge__3__1, false, false);
+  scc1->add_rule(new parallel_copy_generate(
+      rel__spath__2__1__2, rel__edge__3__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        auto args_for_old_bi = std::array<u64, 1>{data[0]};
+        using TState = std::tuple<const u64 *, u64 *>;
+        TState state = std::make_tuple(data, output);
+        auto callback = [](u64 res_0, TState state) -> TState {
+          auto [data, output] = state;
+          auto head_tuple = output;
+
+          bool compatible = true && res_0 == n2d(1);
+          if (!compatible)
+            return state;
+
+          head_tuple[0] = data[2];
+          head_tuple[1] = data[3];
+          return std::make_tuple(data, output + 2);
+        };
+        auto [_, new_ptr] =
+            builtin_eq_1<TState>(args_for_old_bi.data(), state, callback);
+        auto tuples_count = (new_ptr - output) / 2;
+        return tuples_count;
+      }));
+
+  RAM *scc2 = new RAM(true, 2);
+  scc2->add_relation(rel__spath__2__1__2, true, false);
+  scc2->add_relation(rel__edge__3__1, false, false);
+  scc2->add_relation(rel__spath__2__1, true, false);
+  // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA,
+  //                                  rel__edge__3__1, FULL, {4, 5}));
   parallel_join* update_spath_j = new parallel_join(
-    rel__spath__3__1__2__3,
+    rel__spath__2__1__2,
     rel__edge__3__1, FULL,
-    rel__spath__3__2, DELTA,
-    {5, 2, 3}// useless
+    rel__spath__2__1, DELTA,
+    {5,4}// useless
   );
-  update_spath_j->set_generator_func([](std::vector<u64>& target_v, std::vector<u64>& input_v, u64* res) {
-    // std::cout << "Join  >>>> ";
-    // for (auto v: target_v) {
-    //   std::cout << v << " ";
-    // }
-    // std::cout << " with ";
-    // for (auto v: input_v) {
-    //   std::cout << v << " ";
-    // }
-    // std::cout << std::endl;
-    res[0] = target_v[1];
-    res[1] = input_v[2];
-    if (res[0] == res[1]) {
-      res[2] = 0;      
+  update_spath_j->set_generator_func([](std::vector<u64>& target_v,
+  std::vector<u64>& input_v, u64* res) {
+    res[0] = target_v[0];
+    // res[1] = input_v[2];
+    if (res[0] == input_v[2]) {
+      res[1] = 0;
     } else {
-      res[2] = target_v[2] + input_v[3];
+      res[1] = target_v[1] + input_v[3];
     }
   });
+  scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2,
+                                    DELTA, {0, 1, 2}));
   scc2->add_rule(update_spath_j);
 
+
   LIE *lie = new LIE();
+  lie->add_relation(rel__spath__2__1__2);
   lie->add_relation(rel__edge__3__1);
   lie->add_relation(rel__edge__3__1__2__3);
-  lie->add_relation(rel__spath__3__2);
-  lie->add_relation(rel__spath__3__1__2__3);
+  lie->add_relation(rel__spath__2__1);
   lie->add_scc(scc0);
   lie->add_scc(scc1);
   lie->add_scc(scc2);
   lie->add_scc_dependance(scc0, scc2);
+  lie->add_scc_dependance(scc0, scc1);
   lie->add_scc_dependance(scc1, scc2);
 
   // Enable IO
@@ -524,6 +604,11 @@ int main(int argc, char **argv) {
   lie->print_all_relation_size(); // Continuously print relation sizes
   lie->stat_intermediate();
 
+  // rel__spath__2__1__2->print();
+  // rel__spath__2__1->print();
+  // rel__edge__3__1->print();
+  // rel__edge__3__1__2__3->print();
+
   // print all variants(non-canonical index of each relation)
   if (mcomm.get_rank() == 0) {
     std::cout << "rel_name"
diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog
index abdace44..ae617a6d 100644
--- a/backend/tests/sssp/sssp.slog
+++ b/backend/tests/sssp/sssp.slog
@@ -1,3 +1,3 @@
 
-[(spath from to dist) <-- (edge from to dist)]
-[(spath from to l) <-- (spath from mid dist) (edge mid to l)]
+[(spath to dist) <-- (edge 1 to dist)]
+[(spath to l) <-- (spath mid dist) (edge mid to l)]
diff --git a/slog/tests/benchmark.py b/slog/tests/benchmark.py
new file mode 100644
index 00000000..db1bd2d1
--- /dev/null
+++ b/slog/tests/benchmark.py
@@ -0,0 +1,191 @@
+"""
+Benchmark Harness
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+from typing import Iterator
+
+
+# class ExecutionResult:
+#     """ result class for each datalog run """
+
+#     def __init__(self, engine_name, dataset_name, cores,
+#                  runtime, memory_usage) -> None:
+#         self.engine_name = engine_name
+#         self.dataset_name = dataset_name
+#         self.cores = cores
+#         self.runtime = runtime
+#         self.memory_usage = memory_usage
+
+
+class Dataset:
+    """ a folder of relation files; every file in it must be csv/tsv/facts """
+
+    def __init__(self, name: str, data_dir: str, row_sep: str) -> None:
+        """ row_sep is the column separator used inside each row, e.g. tab or comma """
+        self.name = name
+        self.data_dir = data_dir
+        self.row_sep = row_sep
+        self.files = os.listdir(data_dir)
+
+    def fetch_data(self, rel_fname) -> Iterator[list]:
+        """
+        yield every non-blank row of a relation file as a list of stripped column strings
+        """
+        rel_file_path = os.path.join(self.data_dir, rel_fname)
+        if os.path.exists(rel_file_path):
+            with open(rel_file_path) as rel_f:
+                for row in rel_f:
+                    # str.split never returns [], so blank rows have to be filtered here
+                    if row.strip():
+                        yield [col.strip() for col in row.split(self.row_sep)]
+        else:
+            logging.error("Relation %s not exists in dataset %s",
+                          rel_fname, self.name)
+            return
+
+    def dump(self, out_dir, fname_mapping, data_format="tsv", customize_format_function=None):
+        """
+        re-create out_dir and write each mapped dataset file into it under its mapped name
+        """
+        if os.path.exists(out_dir):
+            shutil.rmtree(out_dir)
+        os.mkdir(out_dir)
+        if (self.row_sep == '\t' and data_format in ['tsv', 'facts']) or \
+                (self.row_sep == ',' and data_format in ['csv']):
+            for fname in self.files:
+                if fname in fname_mapping:  # files without a mapping are not dumped
+                    shutil.copyfile(os.path.join(self.data_dir, fname),
+                                    os.path.join(out_dir, fname_mapping[fname]))
+        else:
+            for fname, out_name in fname_mapping.items():
+                with open(os.path.join(out_dir, out_name), "w+") as out_f:  # inside out_dir, not CWD
+                    for row in self.fetch_data(fname):
+                        if data_format in ['tsv', 'facts']:
+                            new_row_txt = "\t".join(row)
+                        elif data_format in ['csv']:
+                            new_row_txt = ",".join(row)
+                        else:
+                            new_row_txt = customize_format_function(row)
+                        out_f.write(new_row_txt+'\n')
+
+
+class DatalogEngine:
+    """ datalog engine abstract class """
+
+    def __init__(self, name, verbose=False) -> None:
+        self.name = name
+        self.verbose = verbose
+
+    def run(self, dataset: Dataset, output_file, src, file_mapping, cores):
+        """
+        dataset: input dataset; output_file: statistic info output path
+        src: path of the datalog program to run
+        cores: core counts used to run benchmark
+        file_mapping: mapping from dataset file to datalog input facts file
+        """
+
+
+class BenchmarkCase:
+    """ one benchmark run: a single engine, dataset, program and core count """
+
+    def __init__(self, datalog: DatalogEngine, dataset: Dataset, src, file_mapping, cores) -> None:
+        self.datalog = datalog
+        self.dataset = dataset
+        self.file_mapping = file_mapping
+        self.cores = cores
+        self.datalog_file = src  # path of the datalog program handed to the engine
+
+    def run(self, output_file):  # delegates to the engine; output_file is passed through
+        self.datalog.run(self.dataset, output_file, self.datalog_file,
+                         self.file_mapping, self.cores)
+
+    def __str__(self) -> str:  # Benchmark.run uses this string as the output file name
+        prog_name = os.path.basename(self.datalog_file)
+        return f"{self.datalog.name}_{self.dataset.name}_{prog_name}_{self.cores}"
+
+
+class Slog(DatalogEngine):
+    """ slog test harness: runs runslog on the program, then times ./<program> under mpirun """
+
+    def __init__(self, verbose=False) -> None:
+        super().__init__("slog", verbose)
+
+    def run(self, dataset: Dataset, output_file, src, file_mapping, cores):
+        program_name = os.path.splitext(os.path.basename(src))[0]  # file name without extension
+        with tempfile.TemporaryDirectory() as tempdir_name:
+            dataset.dump(tempdir_name+'/in', file_mapping, 'facts')
+            print(os.listdir(tempdir_name+'/in'))
+            logging.info(
+                "Running slog %d cores, dataset %s ..., file %s", cores, dataset.data_dir, src)
+            os.system(
+                f"cd /slog && ./runslog -v -co -j {cores} -f {tempdir_name}/in {src} out")
+            os.system(
+                f"cd /slog/out/build && /usr/bin/time -v -o {output_file} mpirun -np {cores} ./{program_name} ../input-data ../")
+
+
+class Souffle(DatalogEngine):
+    """ souffle test harness: runs souffle -o on the program, then times the produced binary """
+
+    def __init__(self, verbose=False) -> None:
+        super().__init__("souffle", verbose)
+
+    def run(self, dataset: Dataset, output_file, src, file_mapping, cores):
+        program_name = os.path.splitext(os.path.basename(src))[0]  # file name without extension
+        with tempfile.TemporaryDirectory() as tempdir_name:
+            dataset.dump(tempdir_name+'/in', file_mapping, 'facts')
+            print(os.listdir(tempdir_name+'/in'))
+            out_dir = os.path.join(tempdir_name, "out")
+            os.mkdir(out_dir)
+            logging.info(
+                "Running souffle %d cores, dataset %s ..., file %s", cores, dataset.data_dir, src)
+            os.system(
+                f"souffle -o {tempdir_name}/{program_name} -j {cores} -F {tempdir_name}/in -D {out_dir} {src}")
+            os.system(
+                f"/usr/bin/time -v -o {output_file} {tempdir_name}/{program_name} -j {cores} -F {tempdir_name}/in -D {out_dir}")
+
+
+class Benchmark:
+    """ benchmark entrance class: runs every case and stores its output under output_dir """
+
+    def __init__(self, case_list, output_dir) -> None:
+        self.case_list = case_list
+        self.output_dir = output_dir
+
+    def run(self):
+        """ run each case in order; the output file is named after str(case) """
+        for bench_case in self.case_list:
+            output_fpath = os.path.join(self.output_dir, str(bench_case))
+            bench_case.run(output_fpath)
+            print(f"case finish, output in {output_fpath}")
+
+
+if __name__ == "__main__":
+    """ test code """
+    test_dataset = Dataset("test", "/slog/slog/tests/testcase/tc/input", "\t")
+    souffle_engine = Souffle()
+    slog_engine = Slog()
+    target_slog_program = "/slog/slog/tests/testcase/tc/tc.slog"
+    target_souffle_program = "/slog/examples/souffle/tc.dl"
+    case_list = []
+    for i in [1, 3, 6]:  # core counts; each one gets a slog case and a souffle case
+        case_list.append(BenchmarkCase(
+            slog_engine, test_dataset, target_slog_program,
+            {
+                "edge.facts": "edge.facts"
+            },
+            i
+        ))
+        case_list.append(BenchmarkCase(
+            souffle_engine, test_dataset, target_souffle_program,
+            {
+                "edge.facts": "edge.facts"
+            },
+            i
+        ))
+    bench_out = "/benchmark_out"  # directory the per-case output files are written under
+    Benchmark(case_list, bench_out).run()
+    print(f"Benchmark finished, result in {bench_out}.")

From d2c7ed617be11bda0d652d522a285eba254678d2 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Mon, 14 Nov 2022 01:07:55 -0500
Subject: [PATCH 06/36] add loop

---
 .../checkpoint-final/258.spath.2.table_full   | Bin 96 -> 96 bytes
 backend/tests/sssp/compiled_pre/sssp.cpp      |  48 +++++++++++-------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full
index d164029436e14bdd3dce073e8e69fa10f32f91de..32cf0a911bf6c18b1d59c6e2b5f3f49f85a4ea6a 100644
GIT binary patch
literal 96
vcmZQ(fB<$V4W=3#7+9fv7Bq1lC_iCdLC>Fh1_rP^P(6eIq7(LiwkM|}zt

literal 96
ycmZQ#fB-Hi4W=3#7+9fvW+=@FqJiQ(P=3O^f}TJ13=Aw#aTuQoDi78NR1W}19SRiy

diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 2c95a173..499370d0 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -33,6 +33,7 @@ const u64 int_tag = 0;
 const u64 str_tag = 2;
 const u64 sign_flip_const = 0x0000200000000000;
 const u64 signed_num_mask = 0xFFFFE00000000000;
+int start_node = 1;
 
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
@@ -381,20 +382,10 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   return max_rel;
 }
 
-int main(int argc, char **argv) {
-  // input dir from compiler
-  std::string slog_input_dir =
-      "/home/stargazermiao/workspace/PL/slog/out/input-data";
-  // output dir from compiler
-  std::string slog_output_dir =
-      "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
-  if (argc == 3) {
-    slog_input_dir = argv[1];
-    slog_output_dir = argv[2];
-  }
-  load_input_relation(slog_input_dir);
-  mpi_comm mcomm;
-  mcomm.create(argc, argv);
+void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::string output_dir, int argc, char **argv) {
+  start_node = sp;
+  load_input_relation(input_dir);
+
 
   // relation *rel__edge__3__1__2__3 = new relation(
   //     3, true, 3, get_tag_for_rel("edge", "1__2__3"),
@@ -491,7 +482,7 @@ int main(int argc, char **argv) {
   relation *rel__spath__2__1__2 = new relation(
       2, true, 2, get_tag_for_rel("spath", "1__2"),
       std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table",
-      slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
+      input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
           ".spath.2.table",
       FULL);
   rel__spath__2__1__2->set_dependent_column_update(
@@ -507,7 +498,7 @@ int main(int argc, char **argv) {
   relation *rel__edge__3__1__2__3 = new relation(
       3, true, 3, get_tag_for_rel("edge", "1__2__3"),
       std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
-      slog_input_dir + "/" +
+      input_dir + "/" +
           std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
       FULL);
   relation *rel__spath__2__1 = new relation(
@@ -540,7 +531,7 @@ int main(int argc, char **argv) {
           auto [data, output] = state;
           auto head_tuple = output;
 
-          bool compatible = true && res_0 == n2d(1);
+          bool compatible = true && res_0 == n2d(start_node);
           if (!compatible)
             return state;
 
@@ -597,7 +588,7 @@ int main(int argc, char **argv) {
   lie->enable_all_to_all_dump();
   lie->enable_data_IO();
   lie->enable_IO();
-  lie->set_output_dir(slog_output_dir); // Write to this directory
+  lie->set_output_dir(output_dir); // Write to this directory
   lie->set_comm(mcomm);
   lie->set_batch_size(1);
   lie->execute();
@@ -624,7 +615,26 @@ int main(int argc, char **argv) {
 
   delete lie;
 
-  mcomm.destroy();
+}
 
+int main(int argc, char **argv) {
+  // input dir from compiler
+  std::string slog_input_dir =
+      "/home/stargazermiao/workspace/PL/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir =
+      "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
+  if (argc == 3) {
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
+
+  for (int i = 0; i < 5; i++) {
+    compute_sssp_from(mcomm, i, slog_input_dir, slog_output_dir, argc, argv);
+  }
+
+  mcomm.destroy();
   return 0;
 }

From 649a9d708dd3b2a60dd9638578bfa40ea82eed03 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Mon, 14 Nov 2022 01:25:40 -0500
Subject: [PATCH 07/36] sssp: read start node from argv[3]; add run_sssp.sh driver

---
 backend/tests/sssp/compiled_pre/run_sssp.sh | 6 ++++++
 backend/tests/sssp/compiled_pre/sssp.cpp    | 4 +---
 2 files changed, 7 insertions(+), 3 deletions(-)
 create mode 100755 backend/tests/sssp/compiled_pre/run_sssp.sh

diff --git a/backend/tests/sssp/compiled_pre/run_sssp.sh b/backend/tests/sssp/compiled_pre/run_sssp.sh
new file mode 100755
index 00000000..9866b9b6
--- /dev/null
+++ b/backend/tests/sssp/compiled_pre/run_sssp.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+for i in {1..5}
+do
+    mpirun -np 1 ./build/sssp ./input-data ./ $i
+done
diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 499370d0..56846cc4 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -631,9 +631,7 @@ int main(int argc, char **argv) {
   mpi_comm mcomm;
   mcomm.create(argc, argv);
 
-  for (int i = 0; i < 5; i++) {
-    compute_sssp_from(mcomm, i, slog_input_dir, slog_output_dir, argc, argv);
-  }
+  compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, argv);
 
   mcomm.destroy();
   return 0;

From 80f2a14959c86daa5ab454292918aa1eb91e65a3 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Mon, 14 Nov 2022 01:41:41 -0500
Subject: [PATCH 08/36] add loop script to run sssp

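Also silence debug prints in lie.cpp and balanced_hash_relation.cpp, stop
run_sssp.sh on the first failing run, accept extra arguments in sssp.cpp,
and sum edge length with path distance in the recursive spath rule.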
---
 backend/src/lie/lie.cpp                         | 2 +-
 backend/src/relation/balanced_hash_relation.cpp | 2 +-
 backend/tests/sssp/compiled_pre/run_sssp.sh     | 2 +-
 backend/tests/sssp/compiled_pre/sssp.cpp        | 2 +-
 backend/tests/sssp/sssp.slog                    | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index b2d16761..59a0ec57 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -589,7 +589,7 @@ bool LIE::execute ()
 
     write_final_checkpoint_dump();
 
-    std::cout << "finish writting checkpoint!" << std::endl;
+    // std::cout << "finish writting checkpoint!" << std::endl;
 
     delete[] rotate_index_array;
     for (int i=0; i < nprocs; i++)
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 7b5deef9..b3f4e879 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -780,7 +780,7 @@ void relation::populate_full(int buffer_size, u64* buffer)
     u32 counter = 0;
     u64 t[arity+1];
     u32 buckets = get_bucket_count();
-    std::cout << "populating full for " << intern_tag << std::endl;
+    // std::cout << "populating full for " << intern_tag << std::endl;
 
     for (int i = 0; i < buffer_size; i = i + (arity+1))
     {
diff --git a/backend/tests/sssp/compiled_pre/run_sssp.sh b/backend/tests/sssp/compiled_pre/run_sssp.sh
index 9866b9b6..c2af3466 100755
--- a/backend/tests/sssp/compiled_pre/run_sssp.sh
+++ b/backend/tests/sssp/compiled_pre/run_sssp.sh
@@ -2,5 +2,5 @@
 
 for i in {1..5}
 do
-    mpirun -np 1 ./build/sssp ./input-data ./ $i
+    mpirun -np 1 ./build/sssp ./input-data ./ $i || exit 1;
 done
diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 56846cc4..2c0603e8 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -624,7 +624,7 @@ int main(int argc, char **argv) {
   // output dir from compiler
   std::string slog_output_dir =
       "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
-  if (argc == 3) {
+  if (argc > 2) {
     slog_input_dir = argv[1];
     slog_output_dir = argv[2];
   }
diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog
index ae617a6d..bd00df2c 100644
--- a/backend/tests/sssp/sssp.slog
+++ b/backend/tests/sssp/sssp.slog
@@ -1,3 +1,3 @@
 
-[(spath to dist) <-- (edge 1 to dist)]
-[(spath to l) <-- (spath mid dist) (edge mid to l)]
+[(spath to dist) <-- (edge 1 to dist)]  ; loop from 1 ~ 10000 compute average (in c++)
+[(spath to {l+dist}) <-- (spath mid dist) (edge mid to l)]

From a707d5353351293b9e51c8bf51634f0c953754c0 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Mon, 14 Nov 2022 22:28:57 -0500
Subject: [PATCH 09/36] bump examples/datalog-example submodule

---
 examples/datalog-example | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/datalog-example b/examples/datalog-example
index 30c6423b..be103a21 160000
--- a/examples/datalog-example
+++ b/examples/datalog-example
@@ -1 +1 @@
-Subproject commit 30c6423bf1b1a101075e9712f77c156a688459a2
+Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa

From 058f1dccc7144ee66113b539b481dcbef2906890 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Tue, 15 Nov 2022 01:29:24 -0500
Subject: [PATCH 10/36] fix reorder

---
 .../src/relation/balanced_hash_relation.cpp   |   6 ++--
 backend/src/relation/shmap_relation_exp.cpp   |  31 ++++++++++++++++++
 .../checkpoint-final/258.edge.3.table_full    | Bin 0 -> 288 bytes
 .../checkpoint-final/259.spath.2.table_full   | Bin 0 -> 144 bytes
 .../compiled_pre/input-data/256.edge.3.table  | Bin 288 -> 0 bytes
 .../compiled_pre/input-data/258.edge.3.table  | Bin 0 -> 288 bytes
 backend/tests/sssp/compiled_pre/sssp.cpp      |  24 +++++++++++---
 backend/tests/sssp/test-input-graph/edge.csv  |  16 ++++-----
 8 files changed, 62 insertions(+), 15 deletions(-)
 create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full
 create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full
 delete mode 100644 backend/tests/sssp/compiled_pre/input-data/256.edge.3.table
 create mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.3.table

diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index b3f4e879..7663343b 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -1307,6 +1307,7 @@ void relation::local_insert_in_delta()
             }
             newt[i].purge();
             memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
+            newt_element_count = 0;
         }
     } else {
         delete[] delta;
@@ -1324,8 +1325,7 @@ void relation::local_insert_in_delta()
             newt[i].dependent_column_indices = dependent_column_indices;
             newt[i].update_compare_func = update_compare_func;
         }
+        newt_element_count = 0;
+        memset(newt_bucket_element_count, 0, buckets * sizeof(u32));
     }
-
-    newt_element_count = 0;
-    memset(newt_bucket_element_count, 0, buckets * sizeof(u32));
 }
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 2db1941b..fec862d4 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -408,6 +408,14 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
         upper_bound[i] = prefix[i];
         lower_bound[i] = prefix[i];
     }
+    // std::cout << "cur tree >>> " << std::endl;
+    // for (auto r:  ind) {
+    //     std::cout << ">>> ";
+    //     for (auto c: r) {
+    //         std::cout << c << " ";
+    //     }
+    //     std::cout << std::endl;
+    // }
     auto joined_range = lowerUpperRange(lower_bound, upper_bound);
     for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
     {
@@ -415,6 +423,11 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
         u64 projected_path[join_buffer.width[ra_id]];
         if (generator_mode) {
             std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+            // std::cout << "join facts ";
+            // for (auto c: input_t) {
+            //     std::cout << c << " ";
+            // }
+            // std::cout << std::endl;
             gen_func(input_t, cur_path, projected_path);
         } else {
             // std::cout << "here" << std::endl;
@@ -428,6 +441,11 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
             for (int i =0; i < join_buffer.width[ra_id]; i++)
                 projected_path[i] = reordered_cur_path[reorder_map[i]];
         }
+        // std::cout << "add new facts ";
+        // for (auto c: projected_path) {
+        //     std::cout << c << " ";
+        // }
+        // std::cout << std::endl;
         if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
         {
             uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
@@ -481,6 +499,19 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
         upper_bound[i] = prefix[i];
         lower_bound[i] = prefix[i];
     }
+    // std::cout << "join >>> ";
+    // for (auto c: prefix) {
+    //     std::cout << c << " ";
+    // }
+    // std::cout << std::endl;
+    // std::cout << "cur tree >>> " << std::endl;
+    // for (auto r:  ind) {
+    //     std::cout << ">>> ";
+    //     for (auto c: r) {
+    //         std::cout << c << " ";
+    //     }
+    //     std::cout << std::endl;
+    // }
     auto joined_range = lowerUpperRange(lower_bound, upper_bound);
     for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
     {
diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full
new file mode 100644
index 0000000000000000000000000000000000000000..ff8d44ee05bfce6725eb4fca00771199d6e4af8b
GIT binary patch
literal 288
zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz}
bX3h>*zs=d<>fM|jKB_<F+=r`AbH0OrtJMcc

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full
new file mode 100644
index 0000000000000000000000000000000000000000..f9fa0f4550a97cb1cde84f74525d202b0c48713a
GIT binary patch
literal 144
zcmXwu$qfJ?48uyf)?#!<BMNRlR_u(w290~7^Aozu4}Nu1e&VZ}@(W+xl;8O3ru+kk
C_Xb1&

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/input-data/256.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/256.edge.3.table
deleted file mode 100644
index fc8e3fb858cd084514efb450c76bdd6302277e4d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmY+8$qm3D5Ccsvl2o<*R~0GtXa%1UzF^2_3*XQvjoi<053QHb>67=pgih{QcG_o`
fymP&ee=q-OpI!RbpDUr$pXr+=baGF8zf<}EYF7fz

diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table
new file mode 100644
index 0000000000000000000000000000000000000000..ff8d44ee05bfce6725eb4fca00771199d6e4af8b
GIT binary patch
literal 288
zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz}
bX3h>*zs=d<>fM|jKB_<F+=r`AbH0OrtJMcc

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 2c0603e8..284ba9eb 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -489,6 +489,14 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
     {1, 2},   // len and id column
     [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
     {
+      // std::cout << "Comparing  ";
+      // for (auto c : old_v) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << " <<<<<<  ";
+      // for (auto c : new_v) {
+      //   std::cout << c << " ";
+      // }
       return new_v[0] < old_v[0];
     }
   );
@@ -508,6 +516,14 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
     {1, 2},
     [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
     {
+      // std::cout << "Comparing  ";
+      // for (auto c : old_v) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << " <<<<<<  ";
+      // for (auto c : new_v) {
+      //   std::cout << c << " ";
+      // }
       return new_v[0] < old_v[0];
     }
   );
@@ -559,17 +575,17 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
   );
   update_spath_j->set_generator_func([](std::vector<u64>& target_v,
   std::vector<u64>& input_v, u64* res) {
-    res[0] = target_v[0];
-    // res[1] = input_v[2];
-    if (res[0] == input_v[2]) {
+    // res[0] = target_v[0];
+    res[0] = input_v[2];
+    if (res[0] == start_node) {
       res[1] = 0;
     } else {
       res[1] = target_v[1] + input_v[3];
     }
   });
+  scc2->add_rule(update_spath_j);
   scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2,
                                     DELTA, {0, 1, 2}));
-  scc2->add_rule(update_spath_j);
 
 
   LIE *lie = new LIE();
diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv
index 936bb262..de8668e0 100644
--- a/backend/tests/sssp/test-input-graph/edge.csv
+++ b/backend/tests/sssp/test-input-graph/edge.csv
@@ -1,9 +1,9 @@
-1	2	10
-1	5	3
-5	2	1
-2	5	4
+1	2	1
+1	3	2
 2	3	2
-5	3	8
-5	4	2
-4	3	7
-3	4	9
+3	4	1
+4	5	1
+5	6	1
+6	7	1
+8	9	1
+9	10	1

From 329e9417212e86ce2281df397bfe61b53c1965ce Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Wed, 16 Nov 2022 21:50:02 +0000
Subject: [PATCH 11/36] change to compute multi sssp

---
 backend/src/parallel_RA_inc.h                 |   3 +-
 backend/src/relation/shmap_relation_exp.cpp   |   4 +-
 .../checkpoint-final/256.edge.3.table_full    | Bin 288 -> 0 bytes
 .../checkpoint-final/258.edge.3.table_full    | Bin 288 -> 0 bytes
 .../checkpoint-final/258.spath.2.table_full   | Bin 96 -> 0 bytes
 .../checkpoint-final/259.spath.2.table_full   | Bin 144 -> 0 bytes
 .../sssp/compiled_pre/res-128-1000.output     |  24 ++
 backend/tests/sssp/compiled_pre/run_sssp.sh   |   4 +-
 backend/tests/sssp/compiled_pre/sssp.cpp      | 359 ++++++++++--------
 examples/datalog-example                      |   2 +-
 10 files changed, 224 insertions(+), 172 deletions(-)
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full
 create mode 100644 backend/tests/sssp/compiled_pre/res-128-1000.output

diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h
index e02739c5..00c3f688 100644
--- a/backend/src/parallel_RA_inc.h
+++ b/backend/src/parallel_RA_inc.h
@@ -13,11 +13,12 @@
 #include "compat.h"
 // #include "shmap/shmap.h"
 #include "shmap/shmap_goog.h"
+#include <vector>
 
 //#define DEBUG_OUTPUT 1
 #define MAX_LOOP_COUNT 120000
 
-using update_partial_compare_func_t = std::function<std::optional<bool>(std::vector<u64> old_v, std::vector<u64> new_v)>;
+using update_partial_compare_func_t = std::function<std::optional<bool>(const std::vector<u64>& old_v, const std::vector<u64>& new_v, const std::vector<u64>& prefix)>;
 using join_generator_func_t = std::function<void(std::vector<u64>& target_v, std::vector<u64>& input_v, u64* res)>;
 
 #include "log/logger.h"
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index fec862d4..700c7a6f 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -76,7 +76,7 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
                 for (auto i: dependent_column_indices) {
                     old_t.push_back(cur_tuple[i]);
                 }
-                auto compare_res = update_compare_func(old_t, dependent_columns);
+                auto compare_res = update_compare_func(old_t, dependent_columns, tp);
                 if (compare_res.has_value() && compare_res.value()) {
                     need_deletes.push_back(it);
                     // std::cout << "update with <<<<<< ";
@@ -129,7 +129,7 @@ shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
                 for (auto i: dependent_column_indices) {
                     old_t.push_back(cur_tuple[i]);
                 }
-                auto compare_res = update_compare_func(old_t, dependent_columns);
+                auto compare_res = update_compare_func(old_t, dependent_columns, tp);
                 if (compare_res.has_value() && compare_res.value()) {
                     return true;
                 }
diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full
deleted file mode 100644
index a5b47390726befd417416b8e76e64db49a1e53f8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmYL@+YP`l3<61CX)=}fpQ=*f<ouir24~3c2zO|dM&>&_L+d4S=*-u@*FJgclJ_f=
g&{z83<k9J$@y!yw+~vQX+b8c{CD%T8=y`Jg18>&?&Hw-a

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full
deleted file mode 100644
index ff8d44ee05bfce6725eb4fca00771199d6e4af8b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz}
bX3h>*zs=d<>fM|jKB_<F+=r`AbH0OrtJMcc

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full
deleted file mode 100644
index 32cf0a911bf6c18b1d59c6e2b5f3f49f85a4ea6a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 96
vcmZQ(fB<$V4W=3#7+9fv7Bq1lC_iCdLC>Fh1_rP^P(6eIq7(LiwkM|}zt

diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full
deleted file mode 100644
index f9fa0f4550a97cb1cde84f74525d202b0c48713a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 144
zcmXwu$qfJ?48uyf)?#!<BMNRlR_u(w290~7^Aozu4}Nu1e&VZ}@(W+xl;8O3ru+kk
C_Xb1&

diff --git a/backend/tests/sssp/compiled_pre/res-128-1000.output b/backend/tests/sssp/compiled_pre/res-128-1000.output
new file mode 100644
index 00000000..314f84c1
--- /dev/null
+++ b/backend/tests/sssp/compiled_pre/res-128-1000.output
@@ -0,0 +1,24 @@
+        Command being timed: "mpirun --use-hwthread-cpus -np 128 ./sssp /home/ubuntu/workspace/dataset/livejournal-bin/ ../ 100"
+        User time (seconds): 258281.11
+        System time (seconds): 1206.51
+        Percent of CPU this job got: 12775%
+        Elapsed (wall clock) time (h:mm:ss or m:ss): 33:51.09
+        Average shared text size (kbytes): 0
+        Average unshared data size (kbytes): 0
+        Average stack size (kbytes): 0
+        Average total size (kbytes): 0
+        Maximum resident set size (kbytes): 3137436
+        Average resident set size (kbytes): 0
+        Major (requiring I/O) page faults: 30763
+        Minor (reclaiming a frame) page faults: 904770632
+        Voluntary context switches: 826718
+        Involuntary context switches: 129160
+        Swaps: 0
+        File system inputs: 0
+        File system outputs: 31831464
+        Socket messages sent: 0
+        Socket messages received: 0
+        Signals delivered: 0
+        Page size (bytes): 4096
+        Exit status: 0
+
diff --git a/backend/tests/sssp/compiled_pre/run_sssp.sh b/backend/tests/sssp/compiled_pre/run_sssp.sh
index c2af3466..530faefe 100755
--- a/backend/tests/sssp/compiled_pre/run_sssp.sh
+++ b/backend/tests/sssp/compiled_pre/run_sssp.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-for i in {1..5}
+for i in {0..5000}
 do
-    mpirun -np 1 ./build/sssp ./input-data ./ $i || exit 1;
+    /usr/bin/time --verbose mpirun --use-hwthread-cpus -np 128 ./build/sssp /home/ubuntu/workspace/dataset/livejournal-bin ../ $i || exit 2;
 done
diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index 284ba9eb..e99cd477 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
 #include <iostream>
 #include <iterator>
@@ -333,7 +333,7 @@ void load_input_relation(std::string db_dir) {
   for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
     // check if ends with table
     std::string filename_ss = entry.path().filename().string();
-    std::cout << "input database has file " << filename_ss << std::endl;
+    //std::cout << "input database has file " << filename_ss << std::endl;
     std::string suffix = ".table";
     int ft = filename_ss.size() - suffix.size();
     if (ft < 0)
@@ -356,8 +356,8 @@ void load_input_relation(std::string db_dir) {
     }
     if (tag > max_rel)
       max_rel = tag;
-    std::cout << "load " << tag << "." << index_stream.str() << "has arity "
-              << arity << std::endl;
+    //std::cout << "load " << tag << "." << index_stream.str() << "has arity "
+    //          << arity << std::endl;
     rel_tag_map[index_stream.str()] = tag;
   }
 }
@@ -377,8 +377,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   }
   max_rel++;
   rel_tag_map[name_arity] = max_rel;
-  std::cout << "generate rel tag: " << name_arity << " " << max_rel
-            << std::endl;
+  //std::cout << "generate rel tag: " << name_arity << " " << max_rel
+  //          << std::endl;
   return max_rel;
 }
 
@@ -386,173 +386,75 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
   start_node = sp;
   load_input_relation(input_dir);
 
-
-  // relation *rel__edge__3__1__2__3 = new relation(
-  //     3, true, 3, get_tag_for_rel("edge", "1__2__3"),
-  //     std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
-  //     slog_input_dir + "/" +
-  //         std::to_string(get_tag_for_rel("edge", "1__2__3")) +
-  //         ".edge.3.table",
-  //     FULL);
-  // relation* rel__edge__3__1 = new relation(
-  //   1, false, 3, get_tag_for_rel("edge","1"),
-  //   std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
-  //   FULL);
-
-  // // the dependent column must be exclude from hash computation, so join
-  // column count is 3 - 1 = 2 relation *rel__spath__3__1__2__3 = new relation(
-  //     2, true, 3, get_tag_for_rel("spath", "1__2__3"),
-  //     std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
-  //     slog_input_dir + "/" +
-  //         std::to_string(get_tag_for_rel("spath", "1__2__3")) +
-  //         ".spath.3.table",
-  //     FULL);
-  // // set functional dependency for spath
-  // rel__spath__3__1__2__3->set_dependent_column_update(
-  //   {2, 3},   // len and id column
-  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
-  //   {
-  //     return new_v[0] < old_v[0];
-  //   }
-  // );
-  // relation* rel__spath__3__2 = new relation(
-  //   1, false, 3, get_tag_for_rel("spath","2"),
-  //   std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
-  //   FULL);
-  // rel__spath__3__2->set_dependent_column_update(
-  //   {2, 3},
-  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
-  //   {
-  //     return new_v[0] < old_v[0];
-  //   }
-  // );
-
-  // RAM* scc0 = new RAM(false, 0);
-  // scc0->add_relation(rel__edge__3__1, true, false);
-  // scc0->add_relation(rel__edge__3__1__2__3, true, false);
-  // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
-  // DELTA, {0, 3, 1, 2}));
-
-  // RAM *scc1 = new RAM(false, 0);
-  // scc1->add_relation(rel__edge__3__1__2__3, false, false);
-  // scc1->add_relation(rel__spath__3__1__2__3, true, false);
-  // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
-  //                                  rel__edge__3__1__2__3, FULL, {0, 1, 2}));
-
-  // RAM *scc2 = new RAM(true, 1);
-  // scc2->add_relation(rel__edge__3__1__2__3, false, false);
-  // scc2->add_relation(rel__spath__3__2, true, false);
-  // scc2->add_relation(rel__spath__3__1__2__3, true, false);
-  // //  the order of non join column also need to be carefully arranged
-  // because, dependent column
-  // //  should always at last
-  // scc2->add_rule(new parallel_acopy(
-  //   rel__spath__3__2,
-  //   rel__spath__3__1__2__3, DELTA,
-  //   {1, 0, 2, 3})); // 2, 1, 3, id
-  // parallel_join* update_spath_j = new parallel_join(
-  //   rel__spath__3__1__2__3,
-  //   rel__edge__3__1, FULL,
-  //   rel__spath__3__2, DELTA,
-  //   {5, 2, 3}// useless
-  // );
-  // update_spath_j->set_generator_func([](std::vector<u64>& target_v,
-  // std::vector<u64>& input_v, u64* res) {
-  //   res[0] = target_v[1];
-  //   res[1] = input_v[2];
-  //   if (res[0] == res[1]) {
-  //     res[2] = 0;
-  //   } else {
-  //     res[2] = target_v[2] + input_v[3];
-  //   }
-  // });
-  // scc2->add_rule(update_spath_j);
-
-  // LIE *lie = new LIE();
-  // lie->add_relation(rel__edge__3__1);
-  // lie->add_relation(rel__edge__3__1__2__3);
-  // lie->add_relation(rel__spath__3__2);
-  // lie->add_relation(rel__spath__3__1__2__3);
-  // lie->add_scc(scc0);
-  // lie->add_scc(scc1);
-  // lie->add_scc(scc2);
-  // lie->add_scc_dependance(scc0, scc2);
-  // lie->add_scc_dependance(scc1, scc2);
-
-  relation *rel__spath__2__1__2 = new relation(
-      2, true, 2, get_tag_for_rel("spath", "1__2"),
-      std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table",
-      input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
-          ".spath.2.table",
-      FULL);
-  rel__spath__2__1__2->set_dependent_column_update(
-    {1, 2},   // len and id column
-    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
-    {
-      // std::cout << "Comparing  ";
-      // for (auto c : old_v) {
-      //   std::cout << c << " ";
-      // }
-      // std::cout << " <<<<<<  ";
-      // for (auto c : new_v) {
-      //   std::cout << c << " ";
-      // }
-      return new_v[0] < old_v[0];
-    }
-  );
-  relation *rel__edge__3__1 = new relation(
-      1, false, 3, get_tag_for_rel("edge", "1"),
-      std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL);
   relation *rel__edge__3__1__2__3 = new relation(
       3, true, 3, get_tag_for_rel("edge", "1__2__3"),
       std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
       input_dir + "/" +
-          std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
+          std::to_string(get_tag_for_rel("edge", "1__2__3")) +
+          ".edge.3.table",
       FULL);
-  relation *rel__spath__2__1 = new relation(
-      1, false, 2, get_tag_for_rel("spath", "1"),
-      std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL);
-  rel__spath__2__1->set_dependent_column_update(
-    {1, 2},
-    [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+  relation* rel__edge__3__1 = new relation(
+    1, false, 3, get_tag_for_rel("edge","1"),
+    std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
+    FULL);
+
+  // the dependent column must be exclude from hash computation, so join
+  // column count is 3 - 1 = 2
+  relation *rel__spath__3__1__2__3 = new relation(
+      2, true, 3, get_tag_for_rel("spath", "1__2__3"),
+      std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
+      input_dir + "/" +
+          std::to_string(get_tag_for_rel("spath", "1__2__3")) +
+          ".spath.3.table",
+      FULL);
+  // set functional dependency for spath
+  rel__spath__3__1__2__3->set_dependent_column_update(
+    {2, 3},   // len and id column
+    [](const std::vector<u64>& old_v, const std::vector<u64>& new_v, const vector<u64>& nt) -> std::optional<bool>
+    {
+      return new_v[0] < old_v[0];
+    }
+  );
+  relation* rel__spath__3__2 = new relation(
+    1, false, 3, get_tag_for_rel("spath","2"),
+    std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
+    FULL);
+  rel__spath__3__2->set_dependent_column_update(
+    {2, 3},
+    [](const std::vector<u64>& old_v, const std::vector<u64>& new_v, const vector<u64>& nt) -> std::optional<bool>
     {
-      // std::cout << "Comparing  ";
-      // for (auto c : old_v) {
-      //   std::cout << c << " ";
-      // }
-      // std::cout << " <<<<<<  ";
-      // for (auto c : new_v) {
-      //   std::cout << c << " ";
-      // }
       return new_v[0] < old_v[0];
     }
   );
 
-  RAM *scc0 = new RAM(false, 0);
+  RAM* scc0 = new RAM(false, 0);
   scc0->add_relation(rel__edge__3__1, true, false);
   scc0->add_relation(rel__edge__3__1__2__3, true, false);
   scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
-                                    DELTA, {0, 3, 1, 2}));
+  DELTA, {0, 3, 1, 2}));
 
   RAM *scc1 = new RAM(false, 1);
-  scc1->add_relation(rel__spath__2__1__2, true, false);
-  scc1->add_relation(rel__edge__3__1, false, false);
+  scc1->add_relation(rel__edge__3__1__2__3, false, false);
+  scc1->add_relation(rel__spath__3__1__2__3, true, false);
+  // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
+  //                                  rel__edge__3__1__2__3, FULL, {0, 1, 2}));
   scc1->add_rule(new parallel_copy_generate(
-      rel__spath__2__1__2, rel__edge__3__1, FULL,
+      rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL,
       [](const u64 *const data, u64 *const output) -> int {
-        auto args_for_old_bi = std::array<u64, 1>{data[0]};
+        auto args_for_old_bi = std::array<u64, 3>{data[0], data[1], data[2]};
         using TState = std::tuple<const u64 *, u64 *>;
         TState state = std::make_tuple(data, output);
         auto callback = [](u64 res_0, TState state) -> TState {
           auto [data, output] = state;
           auto head_tuple = output;
 
-          bool compatible = true && res_0 == n2d(start_node);
+          bool compatible = true && res_0 < n2d(start_node);
           if (!compatible)
             return state;
 
-          head_tuple[0] = data[2];
-          head_tuple[1] = data[3];
+          head_tuple[0] = data[0];
+          head_tuple[1] = data[1];
+          head_tuple[2] = data[2];
           return std::make_tuple(data, output + 2);
         };
         auto [_, new_ptr] =
@@ -562,44 +464,167 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
       }));
 
   RAM *scc2 = new RAM(true, 2);
-  scc2->add_relation(rel__spath__2__1__2, true, false);
-  scc2->add_relation(rel__edge__3__1, false, false);
-  scc2->add_relation(rel__spath__2__1, true, false);
-  // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA,
-  //                                  rel__edge__3__1, FULL, {4, 5}));
+  scc2->add_relation(rel__edge__3__1__2__3, false, false);
+  scc2->add_relation(rel__spath__3__2, true, false);
+  scc2->add_relation(rel__spath__3__1__2__3, true, false);
+  // The order of the non-join columns must also be arranged carefully,
+  // because the dependent column
+  // should always come last.
+  scc2->add_rule(new parallel_acopy(
+    rel__spath__3__2,
+    rel__spath__3__1__2__3, DELTA,
+    {1, 0, 2, 3})); // 2, 1, 3, id
   parallel_join* update_spath_j = new parallel_join(
-    rel__spath__2__1__2,
+    rel__spath__3__1__2__3,
     rel__edge__3__1, FULL,
-    rel__spath__2__1, DELTA,
-    {5,4}// useless
+    rel__spath__3__2, DELTA,
+    {5, 2, 3}// useless
   );
   update_spath_j->set_generator_func([](std::vector<u64>& target_v,
   std::vector<u64>& input_v, u64* res) {
-    // res[0] = target_v[0];
-    res[0] = input_v[2];
-    if (res[0] == start_node) {
-      res[1] = 0;
+    res[0] = target_v[1];
+    res[1] = input_v[2];
+    if (res[0] == res[1]) {
+      res[2] = 0;
     } else {
-      res[1] = target_v[1] + input_v[3];
+      res[2] = target_v[2] + input_v[3];
     }
   });
   scc2->add_rule(update_spath_j);
-  scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2,
-                                    DELTA, {0, 1, 2}));
-
 
   LIE *lie = new LIE();
-  lie->add_relation(rel__spath__2__1__2);
   lie->add_relation(rel__edge__3__1);
   lie->add_relation(rel__edge__3__1__2__3);
-  lie->add_relation(rel__spath__2__1);
+  lie->add_relation(rel__spath__3__2);
+  lie->add_relation(rel__spath__3__1__2__3);
   lie->add_scc(scc0);
   lie->add_scc(scc1);
   lie->add_scc(scc2);
   lie->add_scc_dependance(scc0, scc2);
-  lie->add_scc_dependance(scc0, scc1);
   lie->add_scc_dependance(scc1, scc2);
 
+  // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+  // relation *rel__spath__2__1__2 = new relation(
+  //     2, true, 2, get_tag_for_rel("spath", "1__2"),
+  //     std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table",
+  //     input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
+  //         ".spath.2.table",
+  //     FULL);
+  // rel__spath__2__1__2->set_dependent_column_update(
+  //   {1, 2},   // len and id column
+  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+  //   {
+  //     // std::cout << "Comparing  ";
+  //     // for (auto c : old_v) {
+  //     //   std::cout << c << " ";
+  //     // }
+  //     // std::cout << " <<<<<<  ";
+  //     // for (auto c : new_v) {
+  //     //   std::cout << c << " ";
+  //     // }
+  //     return new_v[0] < old_v[0];
+  //   }
+  // );
+  // relation *rel__edge__3__1 = new relation(
+  //     1, false, 3, get_tag_for_rel("edge", "1"),
+  //     std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL);
+  // relation *rel__edge__3__1__2__3 = new relation(
+  //     3, true, 3, get_tag_for_rel("edge", "1__2__3"),
+  //     std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
+  //     input_dir + "/" +
+  //         std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
+  //     FULL);
+  // relation *rel__spath__2__1 = new relation(
+  //     1, false, 2, get_tag_for_rel("spath", "1"),
+  //     std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL);
+  // rel__spath__2__1->set_dependent_column_update(
+  //   {1, 2},
+  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
+  //   {
+  //     // std::cout << "Comparing  ";
+  //     // for (auto c : old_v) {
+  //     //   std::cout << c << " ";
+  //     // }
+  //     // std::cout << " <<<<<<  ";
+  //     // for (auto c : new_v) {
+  //     //   std::cout << c << " ";
+  //     // }
+  //     return new_v[0] < old_v[0];
+  //   }
+  // );
+
+  // RAM *scc0 = new RAM(false, 0);
+  // scc0->add_relation(rel__edge__3__1, true, false);
+  // scc0->add_relation(rel__edge__3__1__2__3, true, false);
+  // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
+  //                                   DELTA, {0, 3, 1, 2}));
+
+  // RAM *scc1 = new RAM(false, 1);
+  // scc1->add_relation(rel__spath__2__1__2, true, false);
+  // scc1->add_relation(rel__edge__3__1, false, false);
+  // scc1->add_rule(new parallel_copy_generate(
+  //     rel__spath__2__1__2, rel__edge__3__1, FULL,
+  //     [](const u64 *const data, u64 *const output) -> int {
+  //       auto args_for_old_bi = std::array<u64, 1>{data[0]};
+  //       using TState = std::tuple<const u64 *, u64 *>;
+  //       TState state = std::make_tuple(data, output);
+  //       auto callback = [](u64 res_0, TState state) -> TState {
+  //         auto [data, output] = state;
+  //         auto head_tuple = output;
+
+  //         bool compatible = true && res_0 == n2d(start_node);
+  //         if (!compatible)
+  //           return state;
+
+  //         head_tuple[0] = data[2];
+  //         head_tuple[1] = data[3];
+  //         return std::make_tuple(data, output + 2);
+  //       };
+  //       auto [_, new_ptr] =
+  //           builtin_eq_1<TState>(args_for_old_bi.data(), state, callback);
+  //       auto tuples_count = (new_ptr - output) / 2;
+  //       return tuples_count;
+  //     }));
+
+  // RAM *scc2 = new RAM(true, 2);
+  // scc2->add_relation(rel__spath__2__1__2, true, false);
+  // scc2->add_relation(rel__edge__3__1, false, false);
+  // scc2->add_relation(rel__spath__2__1, true, false);
+  // // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA,
+  // //                                  rel__edge__3__1, FULL, {4, 5}));
+  // parallel_join* update_spath_j = new parallel_join(
+  //   rel__spath__2__1__2,
+  //   rel__edge__3__1, FULL,
+  //   rel__spath__2__1, DELTA,
+  //   {5,4}// useless
+  // );
+  // update_spath_j->set_generator_func([](std::vector<u64>& target_v,
+  // std::vector<u64>& input_v, u64* res) {
+  //   // res[0] = target_v[0];
+  //   res[0] = input_v[2];
+  //   if (res[0] == start_node) {
+  //     res[1] = 0;
+  //   } else {
+  //     res[1] = target_v[1] + input_v[3];
+  //   }
+  // });
+  // scc2->add_rule(update_spath_j);
+  // scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2,
+  //                                   DELTA, {0, 1, 2}));
+
+
+  // LIE *lie = new LIE();
+  // lie->add_relation(rel__spath__2__1__2);
+  // lie->add_relation(rel__edge__3__1);
+  // lie->add_relation(rel__edge__3__1__2__3);
+  // lie->add_relation(rel__spath__2__1);
+  // lie->add_scc(scc0);
+  // lie->add_scc(scc1);
+  // lie->add_scc(scc2);
+  // lie->add_scc_dependance(scc0, scc2);
+  // lie->add_scc_dependance(scc0, scc1);
+  // lie->add_scc_dependance(scc1, scc2);
+
   // Enable IO
   lie->enable_all_to_all_dump();
   lie->enable_data_IO();
@@ -611,6 +636,8 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
   lie->print_all_relation_size(); // Continuously print relation sizes
   lie->stat_intermediate();
 
+  // rel__spath__3__1__2__3->print();
+
   // rel__spath__2__1__2->print();
   // rel__spath__2__1->print();
   // rel__edge__3__1->print();
diff --git a/examples/datalog-example b/examples/datalog-example
index be103a21..87266643 160000
--- a/examples/datalog-example
+++ b/examples/datalog-example
@@ -1 +1 @@
-Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa
+Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891

From f155e41352f9557716dd693fd6fccd3a2e3b6bd5 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Sun, 20 Nov 2022 15:26:17 -0500
Subject: [PATCH 12/36] delete copy rule

---
 .../tests/sssp/compiled_pre/CMakeLists.txt    |   2 +-
 .../compiled_pre/input-data/257.spath.3.table |   0
 .../compiled_pre/input-data/258.edge.3.table  | Bin 288 -> 0 bytes
 .../sssp/compiled_pre/res-128-1000.output     |  24 -
 backend/tests/sssp/compiled_pre/sssp.cpp      |   2 +-
 backend/tests/sssp/compiled_pre/sssp_opt.cpp  | 525 ++++++++++++++++++
 backend/tests/sssp/test-input-graph/edge.csv  |  18 +-
 examples/datalog-example                      |   2 +-
 runslog                                       |   2 +
 9 files changed, 539 insertions(+), 36 deletions(-)
 delete mode 100644 backend/tests/sssp/compiled_pre/input-data/257.spath.3.table
 delete mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.3.table
 delete mode 100644 backend/tests/sssp/compiled_pre/res-128-1000.output
 create mode 100644 backend/tests/sssp/compiled_pre/sssp_opt.cpp

diff --git a/backend/tests/sssp/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt
index a5e5801d..89ee3ea4 100644
--- a/backend/tests/sssp/compiled_pre/CMakeLists.txt
+++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt
@@ -19,7 +19,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla
 set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
 file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
-file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp.cpp")
+file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp_opt.cpp")
 
 ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
 
diff --git a/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table b/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table
deleted file mode 100644
index ff8d44ee05bfce6725eb4fca00771199d6e4af8b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz}
bX3h>*zs=d<>fM|jKB_<F+=r`AbH0OrtJMcc

diff --git a/backend/tests/sssp/compiled_pre/res-128-1000.output b/backend/tests/sssp/compiled_pre/res-128-1000.output
deleted file mode 100644
index 314f84c1..00000000
--- a/backend/tests/sssp/compiled_pre/res-128-1000.output
+++ /dev/null
@@ -1,24 +0,0 @@
-        Command being timed: "mpirun --use-hwthread-cpus -np 128 ./sssp /home/ubuntu/workspace/dataset/livejournal-bin/ ../ 100"
-        User time (seconds): 258281.11
-        System time (seconds): 1206.51
-        Percent of CPU this job got: 12775%
-        Elapsed (wall clock) time (h:mm:ss or m:ss): 33:51.09
-        Average shared text size (kbytes): 0
-        Average unshared data size (kbytes): 0
-        Average stack size (kbytes): 0
-        Average total size (kbytes): 0
-        Maximum resident set size (kbytes): 3137436
-        Average resident set size (kbytes): 0
-        Major (requiring I/O) page faults: 30763
-        Minor (reclaiming a frame) page faults: 904770632
-        Voluntary context switches: 826718
-        Involuntary context switches: 129160
-        Swaps: 0
-        File system inputs: 0
-        File system outputs: 31831464
-        Socket messages sent: 0
-        Socket messages received: 0
-        Signals delivered: 0
-        Page size (bytes): 4096
-        Exit status: 0
-
diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp
index e99cd477..ee96ecb9 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
 
 #include <iostream>
 #include <iterator>
diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
new file mode 100644
index 00000000..748dc4ca
--- /dev/null
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -0,0 +1,525 @@
+// location of `parallel_RA_inc.h` here
+#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
+
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+
+// builtins.cpp goes here!
+// builtins.cpp
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace std;
+#define u64 uint64_t
+#define u32 uint32_t
+using i64 = int64_t;
+
+const u64 tag_mask = 0xffffc00000000000;
+const u64 tag_position = 46;
+const u64 int_tag = 0;
+const u64 str_tag = 2;
+const u64 sign_flip_const = 0x0000200000000000;
+const u64 signed_num_mask = 0xFFFFE00000000000;
+int start_node = 1;
+
+inline bool is_number(u64 datum) {
+  // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
+  // int_tag) << "\n";
+  return datum >> tag_position == int_tag;
+}
+
+inline i64 datum_to_number(u64 datum) {
+  i64 signed_val =
+      (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position);
+  if (signed_val >= sign_flip_const) {
+    signed_val = sign_flip_const - signed_val;
+  }
+  return signed_val;
+  // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 -
+  // tag_position);
+}
+const auto d2n = datum_to_number;
+
+inline u64 number_to_datum(i64 number) {
+  i64 unsigned_value = number;
+  if (number < 0) {
+    unsigned_value = (-number) + sign_flip_const;
+  }
+  return (unsigned_value & ~tag_mask) | (int_tag << tag_position);
+  // return (number & ~tag_mask) | (int_tag << tag_position);
+}
+
+const auto n2d = number_to_datum;
+
+inline u64 string_to_datum(std::string str) {
+  u32 str_hash = string_hash(str);
+  return (str_hash & ~tag_mask) | (str_tag << tag_position);
+}
+const auto s2d = string_to_datum;
+
+// Returns {quotient, remainder} datums for data[0] / data[1]. No tuple is
+// produced for non-numbers or a zero divisor (integer division by 0 is UB).
+vector<array<u64, 2>> builtin_div_rem(const u64 *const data) {
+  if (!is_number(data[0]) || !is_number(data[1])) return {};
+  const i64 num = d2n(data[0]);
+  const i64 den = d2n(data[1]);
+  if (den == 0) return {};
+  return {{number_to_datum(num / den), number_to_datum(num % den)}};
+}
+
+#define BUILTIN_BINARY_NUMBER_PRED(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(TState state)) {                       \
+    if (is_number(data[0]) && is_number(data[1]) &&                            \
+        datum_to_number(data[0]) op datum_to_number(data[1])) {                \
+      return callback(init_state);                                             \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_PRED(builtin_less, <)
+BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >)
+BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=)
+BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=)
+
+#define BUILTIN_BINARY_NUMBER_FUNC(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(datum_to_number(data[0])                      \
+                                     op datum_to_number(data[1]));             \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /)
+
+#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl)                                \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(                                              \
+          impl(datum_to_number(data[0]), datum_to_number(data[1])));           \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; }
+BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1)
+
+#define BUILTIN_UNARY_NUMBER_FUNC(name, impl)                                  \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0])) {                                                  \
+      auto res = number_to_datum(impl(datum_to_number(data[0])));              \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 add1(u64 x) { return x + 1; }
+inline u64 sub1(u64 x) { return x - 1; }
+
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1)
+
+// Enumerates the integers in [data[0], data[1]); empty for non-number bounds.
+vector<array<u64, 1>> builtin_range(const u64 *const data) {
+  vector<array<u64, 1>> res;
+  if (!is_number(data[0]) || !is_number(data[1])) return res;
+  const i64 lb = datum_to_number(data[0]);
+  const i64 ub = datum_to_number(data[1]);
+  // Signed counter: with u64, a negative lb made `x < ub` false immediately.
+  if (lb < ub) res.reserve(ub - lb); // ub < lb used to request a huge size
+  for (i64 x = lb; x < ub; x++) res.push_back({number_to_datum(x)});
+  return res;
+}
+
+// Folds `callback` over every integer in [data[0], data[1]), threading state.
+template <typename TState>
+TState callback_builtin_range(const u64 *data, TState init_state,
+                              TState (*callback)(u64 res, TState state)) {
+  if (!is_number(data[0]) || !is_number(data[1])) return init_state;
+  const i64 lb = datum_to_number(data[0]);
+  const i64 ub = datum_to_number(data[1]);
+  auto state = init_state;
+  // Signed counter: with u64, mixed-sign bounds skipped or overran the range.
+  for (i64 x = lb; x < ub; x++) state = callback(number_to_datum(x), state);
+  return state;
+}
+
+#define BUILTIN_BINARY_PRED(name, op)                                          \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (data[0] op data[1])                                                    \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+BUILTIN_BINARY_PRED(builtin_eq, ==)
+BUILTIN_BINARY_PRED(builtin_neq, !=)
+
+template <typename TState>
+TState builtin_eq_1(const u64 *data, TState init_state,
+                    TState (*callback)(u64 res, TState state)) {
+  return callback(data[0], init_state);
+}
+
+#define BUILTIN_UNARY_PRED(name, pred)                                         \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (pred(data[0]))                                                         \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+
+bool is_not_number(u64 datum) { return !is_number(datum); }
+BUILTIN_UNARY_PRED(builtin_number_huh, is_number)
+BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number)
+
+// for generate-cpp-lambda-for-computational-join
+struct CL2CB_State {
+  void *original_callback; // There be dragons?
+  void *original_state;
+  const u64 *original_data;
+  u64 *cl1_output_args;
+};
+
+// for generate-cpp-lambda-for-computational-copy
+struct BCLCB_State {
+  void *original_callback;
+  void *original_state;
+  const u64 *original_data;
+};
+
+// Experiment: lift a binary predicate on decoded numbers to a datum predicate.
+template <bool f(u64, u64)> bool builtin_binary_number_pred(const u64 *data) {
+  if (!is_number(data[0]) || !is_number(data[1])) return false;
+  return f(datum_to_number(data[0]), datum_to_number(data[1]));
+}
+// datum_to_number yields i64, but the predicate receives it converted to u64.
+// Cast back before comparing so that negative numbers order below
+// non-negative ones instead of comparing as huge unsigned values.
+bool _less(u64 x, u64 y) { return static_cast<i64>(x) < static_cast<i64>(y); }
+auto builtin_less2 = builtin_binary_number_pred<_less>;
+
+template <typename TState>
+inline TState builtin_nop(const u64 *data, TState init_state,
+                          TState (*callback)(TState state)) {
+  return callback(init_state);
+}
+
+// //////////////////// AGGREGATORS Alternative design ////////////////////
+
+// TODO: add number type check
+//////////////////////////////  count /////////////////////////////////////
+
+local_agg_res_t
+agg_count_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                    joined_range) {
+  local_agg_res_t cnt = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    cnt++;
+  }
+  return cnt;
+}
+
+local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+//////////////////////////////  sum /////////////////////////////////////
+
+local_agg_res_t
+agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                  joined_range) {
+  local_agg_res_t sum_res = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    sum_res += tuple[tuple.size() - 1];
+  }
+  return sum_res;
+}
+
+local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+//////////////////////////////  maximum  /////////////////////////////////////
+
+local_agg_res_t
+agg_maximum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t max_res = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1];
+    if (current_v > max_res) {
+      max_res = current_v;
+    }
+  }
+  return max_res;
+}
+
+local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x > y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+//////////////////////////////  minimum  /////////////////////////////////////
+
+local_agg_res_t
+agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t min_res = std::numeric_limits<u32>::max();
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1];
+    if (current_v < min_res) {
+      min_res = current_v;
+    }
+  }
+  return min_res;
+}
+
+local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x < y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+// // end of builtins.cpp
+
+// global definitions:
+
+int max_rel = 255;
+std::map<std::string, int> rel_tag_map;
+std::map<std::string, std::unordered_set<std::string>> rel_index_map;
+
+// load all relation inside input database
+void load_input_relation(std::string db_dir) {
+  for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
+    // check if ends with table
+    std::string filename_ss = entry.path().filename().string();
+    // std::cout << "input database has file " << filename_ss << std::endl;
+    std::string suffix = ".table";
+    int ft = filename_ss.size() - suffix.size();
+    if (ft < 0)
+      ft = 0;
+    if (filename_ss.rfind(suffix) != ft) {
+      continue;
+    }
+    std::string filename_s = entry.path().stem().string();
+    int tag = std::stoi(filename_s.substr(0, filename_s.find(".")));
+    std::string name_arity = filename_s.substr(
+        filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1);
+    std::string name = name_arity.substr(0, name_arity.rfind("."));
+    std::string arity_s =
+        name_arity.substr(name_arity.rfind(".") + 1, name_arity.size());
+    int arity = std::stoi(arity_s);
+    std::stringstream index_stream;
+    index_stream << name;
+    for (int i = 1; i <= arity; i++) {
+      index_stream << "__" << i;
+    }
+    if (tag > max_rel)
+      max_rel = tag;
+    // std::cout << "load " << tag << "." << index_stream.str() << "has arity "
+    //           << arity << std::endl;
+    rel_tag_map[index_stream.str()] = tag;
+  }
+}
+
+int get_tag_for_rel(std::string relation_name, std::string index_str) {
+  std::string name_arity = relation_name + "__" + index_str;
+  if (rel_index_map.find(relation_name) != rel_index_map.end()) {
+    rel_index_map[relation_name].insert(index_str);
+  } else {
+    rel_index_map[relation_name] = {index_str};
+  }
+
+  if (rel_tag_map.find(name_arity) != rel_tag_map.end()) {
+    // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] <<
+    // std::endl;
+    return rel_tag_map[name_arity];
+  }
+  max_rel++;
+  rel_tag_map[name_arity] = max_rel;
+  // std::cout << "generate rel tag: " << name_arity << " " << max_rel
+  //           << std::endl;
+  return max_rel;
+}
+
+void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
+                       std::string output_dir, int argc, char **argv) {
+  start_node = sp;
+  load_input_relation(input_dir);
+
+  relation *rel__edge__2__1__2 = new relation(
+      1, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
+      FULL);
+
+  relation *rel__spath__3__2 = new relation(
+      1, true, 3, get_tag_for_rel("spath", "2"),
+      std::to_string(get_tag_for_rel("spath", "2")) + ".spath.3.table",
+      std::to_string(get_tag_for_rel("spath", "2")) + ".spath.3.table", FULL);
+  rel__spath__3__2->set_dependent_column_update(
+      {2, 3},
+      [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+        return new_v[0] < old_v[0];
+      });
+
+  RAM *scc0 = new RAM(false, 0);
+  scc0->add_relation(rel__edge__2__1__2, false, false);
+  scc0->add_relation(rel__spath__3__2, true, false);
+  scc0->add_rule(new parallel_copy_generate(
+      rel__spath__3__2, rel__edge__2__1__2, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        auto args_for_old_bi = std::array<u64, 3>{data[0], data[1], n2d(1)};
+        using TState = std::tuple<const u64 *, u64 *>;
+        TState state = std::make_tuple(args_for_old_bi.data(), output);
+        auto callback = [](u64 res_0, TState state) -> TState {
+          auto [data, output] = state;
+          auto head_tuple = output;
+
+          bool compatible = true && res_0 < n2d(start_node);
+          if (!compatible)
+            return state;
+
+          head_tuple[0] = data[1];
+          head_tuple[1] = data[0];
+          head_tuple[2] = data[2];
+          return std::make_tuple(data, output + 2);
+        };
+        auto [_, new_ptr] =
+            builtin_eq_1<TState>(args_for_old_bi.data(), state, callback);
+        auto tuples_count = (new_ptr - output) / 2;
+        return tuples_count;
+      }));
+
+  RAM *scc1 = new RAM(true, 1);
+  scc1->add_relation(rel__edge__2__1__2, false, false);
+  scc1->add_relation(rel__spath__3__2, true, false);
+  parallel_join *update_spath_j =
+      new parallel_join(rel__spath__3__2, rel__edge__2__1__2, FULL,
+                        rel__spath__3__2, DELTA, {5, 2, 3} // useless
+      );
+  update_spath_j->set_generator_func(
+      [](std::vector<u64> &target_v, std::vector<u64> &input_v, u64 *res) {
+        // std::cout << "Joining  >>> ";
+        // for (auto c : input_v) {
+        //   std::cout << c << " ";
+        // }
+        // std::cout << " and >>>>>>>";
+        // for (auto c : target_v) {
+        //     std::cout << c << " ";
+        // }
+        // std::cout << std::endl;
+        res[0] = input_v[1];
+        res[1] = target_v[1];
+        if (res[0] == res[1]) {
+          res[2] = 0;
+        } else {
+          res[2] = target_v[2] + 1;
+        }
+      });
+  scc1->add_rule(update_spath_j);
+
+  LIE *lie = new LIE();
+  lie->add_relation(rel__edge__2__1__2);
+  lie->add_relation(rel__spath__3__2);
+  lie->add_scc(scc0);
+  lie->add_scc(scc1);
+  lie->add_scc_dependance(scc0, scc1);
+
+  // Enable IO
+  lie->enable_all_to_all_dump();
+  lie->enable_data_IO();
+  //   lie->enable_share_io();
+  lie->enable_IO();
+  lie->set_output_dir(output_dir); // Write to this directory
+  lie->set_comm(mcomm);
+  lie->set_batch_size(1);
+  lie->execute();
+  lie->print_all_relation_size(); // Continuously print relation sizes
+                                  //   lie->stat_intermediate();
+
+  // rel__spath__3__1__2__3->print();
+
+  // rel__spath__2__1__2->print();
+//   rel__spath__3__2->print();
+  // rel__edge__3__1->print();
+  // rel__edge__3__1__2__3->print();
+
+  // print all variants(non-canonical index of each relation)
+  if (mcomm.get_rank() == 0) {
+    std::cout << "rel_name"
+              << ",\t"
+              << "indices\n";
+    for (auto const &rel_p : rel_index_map) {
+      std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
+    }
+    std::cout << std::endl;
+  }
+
+  // lie->print_all_relation_size(); // Continuously print relation sizes
+
+  delete lie;
+}
+
+int main(int argc, char **argv) {
+  // Defaults; overridden by argv[1] (input dir) and argv[2] (output dir).
+  std::string slog_input_dir =
+      "/home/stargazermiao/workspace/PL/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir =
+      "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
+  if (argc > 2) {
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
+  // argv[3] is the start node. Without it, keep the global default instead of
+  // reading past the arguments (argv[argc] is null and atoi(null) is UB).
+  const int source = argc > 3 ? atoi(argv[3]) : start_node;
+  compute_sssp_from(mcomm, source, slog_input_dir, slog_output_dir, argc, argv);
+  mcomm.destroy();
+  return 0;
+}
diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv
index de8668e0..1d997fb7 100644
--- a/backend/tests/sssp/test-input-graph/edge.csv
+++ b/backend/tests/sssp/test-input-graph/edge.csv
@@ -1,9 +1,9 @@
-1	2	1
-1	3	2
-2	3	2
-3	4	1
-4	5	1
-5	6	1
-6	7	1
-8	9	1
-9	10	1
+1	2
+1	3
+2	3
+3	4
+4	5
+5	6
+6	7
+8	9
+9	10
diff --git a/examples/datalog-example b/examples/datalog-example
index 87266643..be103a21 160000
--- a/examples/datalog-example
+++ b/examples/datalog-example
@@ -1 +1 @@
-Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891
+Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa
diff --git a/runslog b/runslog
index 13112e90..73e747e0 100755
--- a/runslog
+++ b/runslog
@@ -89,6 +89,8 @@ def ingest_facts(factloc, inputloc, tsv_bin_path, cores):
             tableloc = os.path.join(inputloc, table[1])
         try:
             # idk why 16 is buckets, got from rpc.py
+            print(" ".join([tsv_bin_path, factfile, str(arity),
+                            tableloc, str(cores), str(tabletag), inputloc]))
             subprocess.check_output([tsv_bin_path, factfile, str(arity),
                                      tableloc, str(cores), str(tabletag), inputloc])
         except subprocess.CalledProcessError as e:

From 61b9eba688bb488a9d912406fc17298c0b88dbdd Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Mon, 21 Nov 2022 13:01:05 -0500
Subject: [PATCH 13/36] add debug timing output

---
 backend/src/IO/parallel_io.cpp                |   2 +-
 backend/src/RAM/RA_tasks.cpp                  |  29 +++++++++++++-----
 backend/src/RAM/RA_tasks.h                    |   2 ++
 backend/src/lie/lie.cpp                       |   4 +--
 .../checkpoints/checkpoint-final/$strings.csv |   0
 .../checkpoint-final/256.edge.3.table_full    | Bin 288 -> 0 bytes
 .../checkpoint-final/257.spath.3.table_full   | Bin 288 -> 0 bytes
 backend/tests/sssp/compiled_pre/sssp_opt.cpp  |   5 +--
 8 files changed, 30 insertions(+), 12 deletions(-)
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full
 delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full

diff --git a/backend/src/IO/parallel_io.cpp b/backend/src/IO/parallel_io.cpp
index 123b11a1..ce57ae83 100644
--- a/backend/src/IO/parallel_io.cpp
+++ b/backend/src/IO/parallel_io.cpp
@@ -158,7 +158,7 @@ void parallel_io::parallel_read_input_relation_from_file_to_local_buffer(u32 ari
 
     /* Read all data in parallel */
     uint64_t read_offset;
-    read_offset = ceil((float)global_row_count / nprocs) * rank;
+    read_offset = (int)ceil((float)global_row_count / nprocs) * rank;
 
     if (read_offset > (uint64_t)global_row_count)
     {
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index b7a8a029..71a85d34 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include "mpi.h"
 #include <iostream>
 
 RAM::~RAM()
@@ -579,6 +580,7 @@ bool RAM::local_compute(int* offset)
 
         else if ((*it)->get_RA_type() == JOIN)
         {
+            // auto before_time = MPI_Wtime();
             parallel_join* current_ra = (parallel_join*) *it;
             relation* output_relation = current_ra->get_join_output();
 
@@ -603,6 +605,7 @@ bool RAM::local_compute(int* offset)
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
                 total_join_tuples = total_join_tuples + join_tuples;
+
             }
             else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
             {
@@ -654,8 +657,12 @@ bool RAM::local_compute(int* offset)
                                                                          &join_tuples);
                 total_join_tuples = total_join_tuples + join_tuples;
             }
+            // auto after_time = MPI_Wtime();
+            // if (mcomm.get_local_rank() == 0) {
+            //     std::cout << "local join on rank " << mcomm.get_local_rank() << " takes " << after_time - before_time << std::endl;
+            // }
         }
-        counter++;
+        counter++;      
     }
 
 #if 0
@@ -714,12 +721,14 @@ void RAM::local_comm()
     int cnt=0;
     cumulative_all_to_allv_buffer_cmp = new u64*[RA_list.size()];
     cumulative_all_to_allv_recv_process_size_array_cmp = new int[RA_list.size()];
-
+    auto before_time = MPI_Wtime();
     for (std::vector<parallel_RA*>::iterator it = RA_list.begin() ; it != RA_list.end(); ++it)
     {
         all_to_all_comm(compute_buffer.local_compute_output[cnt], compute_buffer.local_compute_output_size_rel[cnt], compute_buffer.local_compute_output_size[cnt], &cumulative_all_to_allv_recv_process_size_array_cmp[cnt], &cumulative_all_to_allv_buffer_cmp[cnt], mcomm.get_local_comm());
         cnt++;
     }
+    auto after_time = MPI_Wtime();
+    all_to_all_time += (after_time - before_time);
 }
 #endif
 
@@ -1226,22 +1235,28 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         bool local_join_status = false;
         while (local_join_status == false)
         {
+            auto allocate_buffers_start = MPI_Wtime();
             allocate_compute_buffers();
+            auto allocate_buffers_end = MPI_Wtime();
 
-
+            auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
+            auto compute_end = MPI_Wtime();
 
+            auto all_to_all_start = MPI_Wtime();
             comm_compaction_all_to_all(compute_buffer, &cumulative_all_to_allv_recv_process_count_array, &cumulative_all_to_allv_buffer, mcomm.get_local_comm(), *loop_counter, task_id, output_dir, all_to_all_record, sloav_mode, rotate_index_array, send_indexes, sendb_num);
+            auto all_to_all_end = MPI_Wtime();
 
-
+            auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
+            auto free_buffers_end = MPI_Wtime();
 
-
-
+            auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt_comm_compaction(intern_map);
+            auto insert_in_newt_end = MPI_Wtime();
 
 
-#if DEBUG_OUTPUT
+#if 1
             if (mcomm.get_rank() == 0)
             {
 #if 0
diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h
index 0c650a8d..8b5d8e0d 100644
--- a/backend/src/RAM/RA_tasks.h
+++ b/backend/src/RAM/RA_tasks.h
@@ -56,6 +56,8 @@ class RAM
 
 public:
 
+    double all_to_all_time = 0;
+
     ~RAM();
     RAM (bool ic, int ram_id);
 
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 59a0ec57..7b25b044 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -511,8 +511,8 @@ bool LIE::execute ()
         /// For SCCs that runs till fixed point is reached
         else
         {
-            //if (mcomm.get_rank() == 0)
-            //    std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl;
+            if (mcomm.get_rank() == 0)
+                std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl;
             u64 delta_in_scc = 0;
             do
             {
diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full
deleted file mode 100644
index a5b47390726befd417416b8e76e64db49a1e53f8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmYL@+YP`l3<61CX)=}fpQ=*f<ouir24~3c2zO|dM&>&_L+d4S=*-u@*FJgclJ_f=
g&{z83<k9J$@y!yw+~vQX+b8c{CD%T8=y`Jg18>&?&Hw-a

diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full
deleted file mode 100644
index b6a5a0c18677b3766072782e2050000c95305bde..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 288
zcmYk0+Y!JZ5CYXaOj4EoR~16onezv^J<#6%Gonw_vR3QyM|6vNE>4a)-dZ^Mz4nNc
jV}6I*i!*0$hPzLk-ckRh`}z68$zR!XPvN{nkN3o1lcog@

diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
index 748dc4ca..dc3e9c86 100644
--- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -1,7 +1,7 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
-#include <iostream>
+#include <optional>
 #include <iterator>
 #include <map>
 #include <optional>
@@ -21,6 +21,7 @@
 #include <tuple>
 #include <utility>
 #include <vector>
+#include <filesystem>
 
 using namespace std;
 #define u64 uint64_t

From a11ac849f0d5f74a4cfe04ece0fe02ae3825b38a Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Wed, 23 Nov 2022 07:04:56 +0000
Subject: [PATCH 14/36] add support for multiple dependent columns

---
 backend/src/RA/parallel_join.cpp              |   2 +
 backend/src/RAM/RA_tasks.cpp                  | 168 ++++---
 backend/src/lie/lie.cpp                       |   4 +-
 backend/src/parallel_RA_inc.h                 |  20 +-
 .../src/relation/balanced_hash_relation.cpp   |  65 ++-
 backend/src/relation/balanced_hash_relation.h |   2 +
 backend/src/relation/shmap_relation_exp.cpp   | 223 ++++++---
 .../tests/msum/compiled_pre/CMakeLists.txt    |  28 ++
 backend/tests/msum/compiled_pre/compiler-out  |  18 +
 .../msum/compiled_pre/input-data/$strings.csv |   0
 .../compiled_pre/input-data/257.edge.2.table  | Bin 0 -> 264 bytes
 .../sssp.cpp => msum/compiled_pre/msum.cpp}   | 344 ++++----------
 backend/tests/msum/msum.slog                  |   3 +
 backend/tests/pagerank/pagerank.slog          |   3 +
 .../compiled_pre/input-data/258.edge.2.table  | Bin 0 -> 528 bytes
 .../tests/sssp/compiled_pre/sssp.cpp.backup   | 429 ------------------
 backend/tests/sssp/compiled_pre/sssp_opt.cpp  |  14 +-
 backend/tests/sssp/sssp.py                    |  18 +
 backend/tests/sssp/test-input-graph/edge.csv  |   2 +
 examples/datalog-example                      |   2 +-
 20 files changed, 491 insertions(+), 854 deletions(-)
 create mode 100644 backend/tests/msum/compiled_pre/CMakeLists.txt
 create mode 100644 backend/tests/msum/compiled_pre/compiler-out
 create mode 100644 backend/tests/msum/compiled_pre/input-data/$strings.csv
 create mode 100644 backend/tests/msum/compiled_pre/input-data/257.edge.2.table
 rename backend/tests/{sssp/compiled_pre/sssp.cpp => msum/compiled_pre/msum.cpp} (58%)
 create mode 100644 backend/tests/msum/msum.slog
 create mode 100644 backend/tests/pagerank/pagerank.slog
 create mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.2.table
 delete mode 100644 backend/tests/sssp/compiled_pre/sssp.cpp.backup
 create mode 100644 backend/tests/sssp/sssp.py

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 38d9e20c..59b2fd48 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -7,6 +7,7 @@
 
 #include "../parallel_RA_inc.h"
 #include <cstddef>
+#include <iostream>
 
 
 bool parallel_join::local_join(int threshold, int* offset,
@@ -34,6 +35,7 @@ bool parallel_join::local_join(int threshold, int* offset,
     }
     u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count();
     u32** output_sub_bucket_rank = output->get_sub_bucket_rank();
+    // std::cout << "wwwwwwwww " << input0_buffer_size << " " << input0_buffer_size << " " << i1_size << std::endl;
 
     if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0)
         return true;
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 71a85d34..2712e343 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -7,7 +7,9 @@
 
 #include "../parallel_RA_inc.h"
 #include "mpi.h"
+#include <iomanip>
 #include <iostream>
+#include <vector>
 
 RAM::~RAM()
 {
@@ -256,12 +258,22 @@ u64 RAM::intra_bucket_comm_execute()
             /// Join between full and delta
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
             {
-                intra_bucket_comm(get_bucket_count(),
-                                  input1->get_delta(),
-                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                // std::cout << "here>>>>>>>>>>>>>"  << std::endl;
+                if (input1->get_dependent_column().size() > 0) {
+                    intra_bucket_comm(get_bucket_count(),
+                                  input0->get_full(),
                                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
                                   mcomm.get_local_comm());
+                } else {
+                    intra_bucket_comm(get_bucket_count(),
+                                    input1->get_delta(),
+                                    input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                                    input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                                    &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
+                                    mcomm.get_local_comm());
+                }
                 total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
             }
 
@@ -626,19 +638,33 @@ bool RAM::local_compute(int* offset)
             }
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
             {
-
-                join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                         RIGHT,
-                                                                         get_bucket_count(),
-                                                                         intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                         input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
-                                                                         reorder_map_array,
-                                                                         output_relation,
-                                                                         compute_buffer,
-                                                                         counter,
-                                                                         join_column_count,
-                                                                         &join_tuples_duplicates,
-                                                                         &join_tuples);
+                if (input1->get_dependent_column().size() > 0) {
+                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                                                                            LEFT,
+                                                                            get_bucket_count(),
+                                                                            intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
+                                                                            input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
+                                                                            reorder_map_array,
+                                                                            output_relation,
+                                                                            compute_buffer,
+                                                                            counter,
+                                                                            join_column_count,
+                                                                            &join_tuples_duplicates,
+                                                                            &join_tuples); 
+                } else {
+                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                                                                            RIGHT,
+                                                                            get_bucket_count(),
+                                                                            intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
+                                                                            input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
+                                                                            reorder_map_array,
+                                                                            output_relation,
+                                                                            compute_buffer,
+                                                                            counter,
+                                                                            join_column_count,
+                                                                            &join_tuples_duplicates,
+                                                                            &join_tuples);
+                }
                 total_join_tuples = total_join_tuples + join_tuples;
             }
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
@@ -824,10 +850,22 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
             for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
             {
                 u32 x = starting + tuple_ind * width;
-                if (output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false &&
+                bool insert_flag = true;
+                if (output->get_dependent_column().size() > 1) {
+                    std::vector<u64> tt;
+                    for (int i = 0; i < width; i++) {
+                        tt.push_back(cumulative_all_to_allv_buffer[x+i]);
+                    }
+                    // placeholder for the index column, appended only so the tuple has the expected number of columns
+                    tt.push_back(0);
+                    insert_flag = output->check_dependent_value_insert_avalible(tt);
+
+                } else {
+                    insert_flag = output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false &&
                         output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false &&
-                        output->find_in_newt(cumulative_all_to_allv_buffer + x, width) == false)
-                {
+                        output->find_in_newt(cumulative_all_to_allv_buffer + x, width) == false;
+                }
+                if (insert_flag){
                     for (u32 i = 0; i < width; i++)
                         tuple[i] = cumulative_all_to_allv_buffer[x+i];
 
@@ -850,17 +888,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
 
                     if (output->insert_in_newt(tuple) == true)
                         successful_insert++;
-
-                    //if (RA_list[ra_id]->get_RA_type() == FACT)
-                    //    std::cout << "FFFFFFFFFF "<< tuple[0] << " " << tuple[1] << " " << successful_insert << std::endl;
                 } 
-                // else {
-                    // std::cout << "insert fail ";
-                    // for (int i = 0; i < width; i++) {
-                    //     std::cout << cumulative_all_to_allv_buffer[i] << " ";
-                // }
-                // std::cout << std::endl;
-                // }
             }
             starting = starting + elements_to_read;
         }
@@ -1097,35 +1125,35 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
         //    std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl;
 #endif
 
-
+        auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
+        auto intra_end = MPI_Wtime(); 
 
-
+        std::cout << std::setiosflags(std::ios::fixed);
         bool local_join_status = false;
         while (local_join_status == false)
         {
-
+            auto allocate_buffers_start = MPI_Wtime();
             allocate_compute_buffers();
+            auto allocate_buffers_end = MPI_Wtime();
 
-
-
+            auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
+            auto compute_end = MPI_Wtime();
 
-
-
+            auto all_to_all_start = MPI_Wtime();
             local_comm();
+            auto all_to_all_end = MPI_Wtime();
 
-
-
-
+            auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
+            auto free_buffers_end = MPI_Wtime();
 
-
-
+            auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt(intern_map);
+            auto insert_in_newt_end = MPI_Wtime();
 
-
-#if DEBUG_OUTPUT
+#if 1
             if (mcomm.get_rank() == 0)
             {
 #if 0
@@ -1145,19 +1173,24 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
                           << " newt " << *running_insert_newt
                           << std::endl;
 #endif
-                std::cout << loop_count_tracker << "\t"
-                          << (allocate_buffers_end - allocate_buffers_start) << "\t"
-                          << (compute_end - compute_start) << "\t"
-                          << (all_to_all_end - all_to_all_start) << "\t"
-                          << (free_buffers_end - free_buffers_start) << "\t"
-                          << (insert_in_newt_end - insert_in_newt_start) << "\t";
+                std::cout << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12)
+                          << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12)
+                          << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ;
+                std::cout << loop_count_tracker << std::setprecision(4) << std::setw(12)
+                          << (allocate_buffers_end - allocate_buffers_start) << std::setprecision(4) << std::setw(12)
+                          << (compute_end - compute_start) << std::setprecision(4) << std::setw(12)
+                          << (all_to_all_end - all_to_all_start) << std::setprecision(4) << std::setw(12)
+                          << (free_buffers_end - free_buffers_start) << std::setprecision(4) << std::setw(12)
+                          << (insert_in_newt_end - insert_in_newt_start) << std::setprecision(4) << std::setw(12);
             }
 #endif
             inner_loop++;
         }
+        auto insert_in_full_start = MPI_Wtime(); 
         local_insert_in_full();
-
-#if DEBUG_OUTPUT
+        auto insert_in_full_end = MPI_Wtime(); 
+        
+#if 1
         if (mcomm.get_rank() == 0)
         {
 #if 0
@@ -1174,8 +1207,8 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
                        << " full " << *running_insert_in_full
                        << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl;
 #endif
-            std::cout << (intra_end - intra_start) << "\t"
-                      << (insert_in_full_end - insert_in_full_start)  << "\t"
+            std::cout << (intra_end - intra_start) << std::setw(12)
+                      << (insert_in_full_end - insert_in_full_start)  << std::setw(12)
                       << (insert_in_full_end - intra_start) << std::endl;
 
         }
@@ -1229,8 +1262,10 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         //    std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl;
 #endif
 
-
+        std::cout << std::setiosflags(std::ios::fixed);
+        auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
+        auto intra_end = MPI_Wtime(); 
 
         bool local_join_status = false;
         while (local_join_status == false)
@@ -1276,22 +1311,25 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
                           << " newt " << *running_insert_newt
                           << std::endl;
 #endif
-                std::cout << loop_count_tracker << "\t"
-                          << (allocate_buffers_end - allocate_buffers_start) << "\t"
-                          << (compute_end - compute_start) << "\t"
-                          << (all_to_all_end - all_to_all_start) << "\t"
-                          << (free_buffers_end - free_buffers_start) << "\t"
-                          << (insert_in_newt_end - insert_in_newt_start) << "\t";
+                std::cout << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12)
+                          << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12)
+                          << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ;
+                std::cout << loop_count_tracker << std::setprecision(4) << std::setw(12)
+                          << (allocate_buffers_end - allocate_buffers_start) << std::setprecision(4) << std::setw(12)
+                          << (compute_end - compute_start) << std::setprecision(4) << std::setw(12)
+                          << (all_to_all_end - all_to_all_start) << std::setprecision(4) << std::setw(12)
+                          << (free_buffers_end - free_buffers_start) << std::setprecision(4) << std::setw(12)
+                          << (insert_in_newt_end - insert_in_newt_start) << std::setprecision(4) << std::setw(12);
             }
 #endif
             inner_loop++;
         }
 
-
+        auto insert_in_full_start = MPI_Wtime(); 
         local_insert_in_full();
+        auto insert_in_full_end = MPI_Wtime(); 
 
-
-#if DEBUG_OUTPUT
+#if 1
         if (mcomm.get_rank() == 0)
         {
 #if 0
@@ -1308,8 +1346,8 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
                        << " full " << *running_insert_in_full
                        << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl;
 #endif
-            std::cout << (intra_end - intra_start) << "\t"
-                      << (insert_in_full_end - insert_in_full_start)  << "\t"
+            std::cout << (intra_end - intra_start) << std::setw(12)
+                      << (insert_in_full_end - insert_in_full_start)  << std::setw(12)
                       << (insert_in_full_end - intra_start) << std::endl;
 
         }
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 7b25b044..59a0ec57 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -511,8 +511,8 @@ bool LIE::execute ()
         /// For SCCs that runs till fixed point is reached
         else
         {
-            if (mcomm.get_rank() == 0)
-                std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl;
+            //if (mcomm.get_rank() == 0)
+            //    std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl;
             u64 delta_in_scc = 0;
             do
             {
diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h
index 00c3f688..a0047436 100644
--- a/backend/src/parallel_RA_inc.h
+++ b/backend/src/parallel_RA_inc.h
@@ -18,8 +18,26 @@
 //#define DEBUG_OUTPUT 1
 #define MAX_LOOP_COUNT 120000
 
+struct vec_comparator {
+  vec_comparator() {}
+
+  bool operator()(const std::vector<u64> &a, const std::vector<u64> &b) const {
+      // TODO: turn this into an unrolled loop once the vectors are changed to arrays
+      int size = a.size();
+          for (int i=0; i < size; i++) {
+              if (a[i] < b[i])
+                  return true;
+              if (a[i] > b[i])
+                  return false;
+          }
+
+      return false;
+  }
+};
+
+using depend_val_t = std::vector<std::vector<u64>>;
 using update_partial_compare_func_t = std::function<std::optional<bool>(const std::vector<u64>& old_v, const std::vector<u64>& new_v, const std::vector<u64>& prefix)>;
-using join_generator_func_t = std::function<void(std::vector<u64>& target_v, std::vector<u64>& input_v, u64* res)>;
+using join_generator_func_t = std::function<bool(const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set)>;
 
 #include "log/logger.h"
 #include "hash/hash.h"
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 7663343b..7182fc24 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include "balanced_hash_relation.h"
 #include <cassert>
 #include <cstddef>
 #include <filesystem>
@@ -1196,6 +1197,7 @@ bool relation::insert_in_full(u64* t)
     // TODO: use normal insert here!
     if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true)
     {
+        // TODO: change how to deal with element counts
         full_element_count++;
         full_bucket_element_count[bucket_id]++;
         full_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
@@ -1289,27 +1291,27 @@ void relation::local_insert_in_delta()
     MPI_Comm_rank(mcomm.get_comm(), &rank);
     u32 buckets = get_bucket_count();
 
-    if (dependent_column_indices.size() > 0) {
-        delta_element_count = 0;
-        for (u32 i = 0; i < buckets; i++) {
-            delta[i].purge();
-            memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
-            for (auto& t: newt[i]) {
-                if (full[i].check_dependent_insertion(t)) {
-                    delta[i].insert(t);
-                    uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % get_bucket_count();
-                    u32 sub_bucket_id = 0;
-                    if (is_canonical == false   && arity != 0 && arity >= join_column_count)
-                        sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id];
-                    delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
-                    delta_element_count++;
-                }
-            }
-            newt[i].purge();
-            memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
-            newt_element_count = 0;
-        }
-    } else {
+    // if (dependent_column_indices.size() > 0) {
+    //     delta_element_count = 0;
+    //     for (u32 i = 0; i < buckets; i++) {
+    //         delta[i].purge();
+    //         memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
+    //         for (auto& t: newt[i]) {
+    //             if (full[i].check_dependent_insertion(t)) {
+    //                 delta[i].insert(t);
+    //                 uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % get_bucket_count();
+    //                 u32 sub_bucket_id = 0;
+    //                 if (is_canonical == false   && arity != 0 && arity >= join_column_count)
+    //                     sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id];
+    //                 delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
+    //                 delta_element_count++;
+    //             }
+    //         }
+    //         newt[i].purge();
+    //         memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32));
+    //         newt_element_count = 0;
+    //     }
+    // } else {
         delete[] delta;
         delta = newt;
         delta_element_count = newt_element_count;
@@ -1327,5 +1329,24 @@ void relation::local_insert_in_delta()
         }
         newt_element_count = 0;
         memset(newt_bucket_element_count, 0, buckets * sizeof(u32));
-    }
+    // }
+}
+
+// Lattice value check: reports whether `tuple` may be inserted, by asking the
+// delta and the full relation of one bucket to run their dependent-insertion
+// check on it.
+//
+// The bucket is picked by hashing the first `join_column_count` columns of
+// `tuple`. The result is true only when both relations accept the tuple;
+// `full` is not consulted once `delta` has rejected it.
+bool relation::check_dependent_value_insert_avalible(const std::vector<u64>& tuple) {
+    const uint64_t target_bucket =
+        tuple_hash(tuple.data(), join_column_count) % get_bucket_count();
+
+    // Delta goes first, matching the evaluation order of a short-circuit `&&`.
+    if (!delta[target_bucket].check_dependent_insertion(tuple)) {
+        return false;
+    }
+
+    return full[target_bucket].check_dependent_insertion(tuple);
 }
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index c0e88f9e..cfd322ad 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -267,6 +267,8 @@ class relation
     void local_insert_in_delta();
     void copy_newt_to_delta()   {delta = newt;}
 
+    // lattice value check
+    bool check_dependent_value_insert_avalible(const std::vector<u64>& tuple);
 
     /// for load balancing (implemented in relation_load_balance.cpp)
     bool load_balance_merge_full_and_delta(float rf);
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 700c7a6f..9ceff570 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -123,6 +123,7 @@ shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
         if (exist_tuples_range.first == ind.end()) {
             return true;
         } else {
+            auto joined = false;
             for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) {
                 auto cur_tuple = *it;
                 std::vector<u64> old_t;
@@ -130,8 +131,14 @@ shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
                     old_t.push_back(cur_tuple[i]);
                 }
                 auto compare_res = update_compare_func(old_t, dependent_columns, tp);
-                if (compare_res.has_value() && compare_res.value()) {
+                if (!compare_res.has_value()) {
+                    continue;
+                }
+                if (compare_res.value()) {
+                    joined = true;
                     return true;
+                } else {
+                    joined = true;
                 }
             }
             // std::cout << " not adding to lattice with <<<<<< ";
@@ -151,7 +158,11 @@ shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
             //     }
             //     std::cout << std::endl;
             // }
-            return false;
+            if (!joined) {
+                return true;
+            } else {
+                return false;        
+            }
         }
     } else {
         return true;
@@ -416,42 +427,41 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
     //     }
     //     std::cout << std::endl;
     // }
+    // std::cout << "upper bound >> ";
+    // for (auto c: upper_bound) {
+    //     std::cout << c << " ";
+    // }
+    // std::cout << std::endl;
     auto joined_range = lowerUpperRange(lower_bound, upper_bound);
-    for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
-    {
-        auto cur_path = *it;
-        u64 projected_path[join_buffer.width[ra_id]];
-        if (generator_mode) {
-            std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
-            // std::cout << "join facts ";
-            // for (auto c: input_t) {
-            //     std::cout << c << " ";
-            // }
-            // std::cout << std::endl;
-            gen_func(input_t, cur_path, projected_path);
-        } else {
-            // std::cout << "here" << std::endl;
-            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
-            for (int i = 0; i < input1_buffer_width; i++)
-                reordered_cur_path[i] = cur_path[i];
-
-            for (int i = join_column_count; i < input0_buffer_width; i++)
-                reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i];
 
-            for (int i =0; i < join_buffer.width[ra_id]; i++)
-                projected_path[i] = reordered_cur_path[reorder_map[i]];
+    if (generator_mode) {
+        std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+        std::vector<std::vector<u64>> eq_tuple_set;
+        std::vector<std::vector<u64>> generated_tuple_set;
+        std::vector<u64> prev_non_dependent_columns;
+        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){
+            auto cur_path = *it;
+            std::vector<u64> cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size());
+            if (cur_non_dependent_columns == prev_non_dependent_columns) {
+                eq_tuple_set.push_back(cur_path);
+                continue;
+            } else {
+                if (eq_tuple_set.size() != 0) {
+                    gen_func(eq_tuple_set, input_t, generated_tuple_set);
+                    eq_tuple_set.clear();
+                }
+                prev_non_dependent_columns = cur_non_dependent_columns;
+                eq_tuple_set.push_back(cur_path);
+            }
         }
-        // std::cout << "add new facts ";
-        // for (auto c: projected_path) {
-        //     std::cout << c << " ";
-        // }
-        // std::cout << std::endl;
-        if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
-        {
-            uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
+        if (eq_tuple_set.size() != 0) {
+            gen_func(eq_tuple_set, input_t, generated_tuple_set);
+        }
+        for (auto& tp: generated_tuple_set) {
+            uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets;
             uint64_t sub_bucket_id=0;
             if (canonical == false)
-                sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
+                sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
 
             int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
 
@@ -459,15 +469,55 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
             join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id];
             join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id];
             join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++;
-            join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
 
+            join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
             join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
-            join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]);
+            join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]);
             (*local_join_inserts)++;
             (*local_join_count)++;
         }
-        else {
-            (*local_join_duplicates)++;
+    } else {
+        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
+        {
+            auto cur_path = *it;
+            u64 projected_path[join_buffer.width[ra_id]];
+            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
+            for (int i = 0; i < input1_buffer_width; i++)
+                reordered_cur_path[i] = cur_path[i];
+
+            for (int i = join_column_count; i < input0_buffer_width; i++)
+                reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i];
+
+            for (int i =0; i < join_buffer.width[ra_id]; i++)
+                projected_path[i] = reordered_cur_path[reorder_map[i]];
+            // std::cout << "add new facts ";
+            // for (auto c: projected_path) {
+            //     std::cout << c << " ";
+            // }
+            // std::cout << std::endl;
+            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
+            {
+                uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
+                uint64_t sub_bucket_id=0;
+                if (canonical == false)
+                    sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
+
+                int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
+
+                join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++;
+                join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
+
+                join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]);
+                (*local_join_inserts)++;
+                (*local_join_count)++;
+            }
+            else {
+                (*local_join_duplicates)++;
+            }
         }
     }
 }
@@ -499,11 +549,9 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
         upper_bound[i] = prefix[i];
         lower_bound[i] = prefix[i];
     }
-    // std::cout << "join >>> ";
-    // for (auto c: prefix) {
-    //     std::cout << c << " ";
-    // }
-    // std::cout << std::endl;
+
+    auto joined_range = lowerUpperRange(lower_bound, upper_bound);
+
     // std::cout << "cur tree >>> " << std::endl;
     // for (auto r:  ind) {
     //     std::cout << ">>> ";
@@ -512,33 +560,35 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
     //     }
     //     std::cout << std::endl;
     // }
-    auto joined_range = lowerUpperRange(lower_bound, upper_bound);
-    for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
-    {
-        auto cur_path = *it;
-        u64 projected_path[join_buffer.width[ra_id]];
-        if (generator_mode) {
-            std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
-            gen_func(cur_path, input_t, projected_path);
-        } else {
-            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
-            for (int i = 0; i < input0_buffer_width; i++)
-                reordered_cur_path[i] = input0_buffer[i];
 
-            for (int i = join_column_count; i < input1_buffer_width; i++)
-                reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i];
-
-            for (int i =0; i < join_buffer.width[ra_id]; i++)
-                projected_path[i] = reordered_cur_path[reorder_map[i]];
+    if (generator_mode) {
+        std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+        std::vector<std::vector<u64>> eq_tuple_set;
+        std::vector<std::vector<u64>> generated_tuple_set;
+        std::vector<u64> prev_non_dependent_columns;
+        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){
+            auto cur_path = *it;
+            std::vector<u64> cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size());
+            if (cur_non_dependent_columns == prev_non_dependent_columns) {
+                eq_tuple_set.push_back(cur_path);
+                continue;
+            } else {
+                if (eq_tuple_set.size() != 0) {
+                    gen_func(eq_tuple_set, input_t, generated_tuple_set);
+                    eq_tuple_set.clear();
+                }
+                prev_non_dependent_columns = cur_non_dependent_columns;
+                eq_tuple_set.push_back(cur_path);
+            }
         }
-        
-        //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl;
-        if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
-        {
-            uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
+        if (eq_tuple_set.size() != 0) {
+            gen_func(eq_tuple_set, input_t, generated_tuple_set);
+        }
+        for (auto& tp: generated_tuple_set) {
+            uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets;
             uint64_t sub_bucket_id=0;
             if (canonical == false)
-                sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
+                sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
 
             int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
 
@@ -549,12 +599,49 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
 
             join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
             join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
-            join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]);
+            join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]);
             (*local_join_inserts)++;
             (*local_join_count)++;
         }
-        else {
-            (*local_join_duplicates)++;
+    } else {
+        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
+        {
+            auto cur_path = *it;
+            u64 projected_path[join_buffer.width[ra_id]];
+            u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count];
+            for (int i = 0; i < input0_buffer_width; i++)
+                reordered_cur_path[i] = input0_buffer[i];
+
+            for (int i = join_column_count; i < input1_buffer_width; i++)
+                reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i];
+
+            for (int i =0; i < join_buffer.width[ra_id]; i++)
+                projected_path[i] = reordered_cur_path[reorder_map[i]];
+            
+            //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl;
+            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
+            {
+                uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
+                uint64_t sub_bucket_id=0;
+                if (canonical == false)
+                    sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
+
+                int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
+
+                join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++;
+
+                join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
+                join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]);
+                (*local_join_inserts)++;
+                (*local_join_count)++;
+            }
+            else {
+                (*local_join_duplicates)++;
+            }
         }
     }
     // std::cout << "inserted " << *local_join_inserts << std::endl;
diff --git a/backend/tests/msum/compiled_pre/CMakeLists.txt b/backend/tests/msum/compiled_pre/CMakeLists.txt
new file mode 100644
index 00000000..2930b4c2
--- /dev/null
+++ b/backend/tests/msum/compiled_pre/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required (VERSION 3.9)
+
+project (msum)
+
+add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive)
+
+link_libraries(stdc++fs)
+
+find_package(MPI REQUIRED)
+# find_package(OpenMP)
+# if (OPENMP_FOUND)
+#     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+#     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+#     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+# endif()
+
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive")
+# set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
+set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
+
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_msum "${PROJECT_SOURCE_DIR}/msum.cpp")
+
+ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
+
+add_executable(msum ${source_files_msum})
+INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH})
+TARGET_LINK_LIBRARIES(msum parallel_RA ${MPI_LIBRARIES})
diff --git a/backend/tests/msum/compiled_pre/compiler-out b/backend/tests/msum/compiled_pre/compiler-out
new file mode 100644
index 00000000..d617368b
--- /dev/null
+++ b/backend/tests/msum/compiled_pre/compiler-out
@@ -0,0 +1,18 @@
+parsing + compilation took 4 ms.
+ir-small: 
+RULES:
+------------------------------------------------------
+/home/ubuntu/workspace/slog/backend/tests/msum/msum.slog 1:
+[((rel-arity cpath 4 db) x y x $=1) <--
+  (= $_1 ((rel-arity edge 2 db) x y))
+  (= $_5 ((rel-arity = 2 comp) $=1 1))]
+------------------------------------------------------
+/home/ubuntu/workspace/slog/backend/tests/msum/msum.slog 2:
+[((rel-arity cpath 4 db) x z y l) <--
+  (= $_3 ((rel-arity cpath 4 db) x y prev l))
+  (= $_4 ((rel-arity edge 2 db) y z))]
+
+
+All rules: 5, arules: 3, copy rules: 0, join rules: 2, facts: 0
+rels: 2, sccs: 4
+[wrote C++ driver and data to "/home/ubuntu/workspace/slog/out/msum.cpp"]
diff --git a/backend/tests/msum/compiled_pre/input-data/$strings.csv b/backend/tests/msum/compiled_pre/input-data/$strings.csv
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/msum/compiled_pre/input-data/257.edge.2.table b/backend/tests/msum/compiled_pre/input-data/257.edge.2.table
new file mode 100644
index 0000000000000000000000000000000000000000..a1596313ab507848e78f8864707b0e9340f23b48
GIT binary patch
literal 264
zcmYk%K@NZ*00Th~5%g93|5Y?=o4}>n2^^^9KbMzTt5kRM((>Gk)AICvzLTfBotCE`
bJ1tKSJ1tL7J1tMYc3Pg!Kj_0h-)H3;;?@Q!

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/msum/compiled_pre/msum.cpp
similarity index 58%
rename from backend/tests/sssp/compiled_pre/sssp.cpp
rename to backend/tests/msum/compiled_pre/msum.cpp
index ee96ecb9..2df29aa4 100644
--- a/backend/tests/sssp/compiled_pre/sssp.cpp
+++ b/backend/tests/msum/compiled_pre/msum.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
 #include <iostream>
 #include <iterator>
@@ -33,7 +33,6 @@ const u64 int_tag = 0;
 const u64 str_tag = 2;
 const u64 sign_flip_const = 0x0000200000000000;
 const u64 signed_num_mask = 0xFFFFE00000000000;
-int start_node = 1;
 
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
@@ -333,7 +332,7 @@ void load_input_relation(std::string db_dir) {
   for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
     // check if ends with table
     std::string filename_ss = entry.path().filename().string();
-    //std::cout << "input database has file " << filename_ss << std::endl;
+    std::cout << "input database has file " << filename_ss << std::endl;
     std::string suffix = ".table";
     int ft = filename_ss.size() - suffix.size();
     if (ft < 0)
@@ -356,8 +355,8 @@ void load_input_relation(std::string db_dir) {
     }
     if (tag > max_rel)
       max_rel = tag;
-    //std::cout << "load " << tag << "." << index_stream.str() << "has arity "
-    //          << arity << std::endl;
+    std::cout << "load " << tag << "." << index_stream.str() << "has arity "
+              << arity << std::endl;
     rel_tag_map[index_stream.str()] = tag;
   }
 }
@@ -377,271 +376,107 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   }
   max_rel++;
   rel_tag_map[name_arity] = max_rel;
-  //std::cout << "generate rel tag: " << name_arity << " " << max_rel
-  //          << std::endl;
+  std::cout << "generate rel tag: " << name_arity << " " << max_rel
+            << std::endl;
   return max_rel;
 }
 
-void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::string output_dir, int argc, char **argv) {
-  start_node = sp;
-  load_input_relation(input_dir);
+int main(int argc, char **argv) {
+  // input dir from compiler
+  std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
+  if (argc == 3) {
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  load_input_relation(slog_input_dir);
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
 
-  relation *rel__edge__3__1__2__3 = new relation(
-      3, true, 3, get_tag_for_rel("edge", "1__2__3"),
-      std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
-      input_dir + "/" +
-          std::to_string(get_tag_for_rel("edge", "1__2__3")) +
-          ".edge.3.table",
-      FULL);
-  relation* rel__edge__3__1 = new relation(
-    1, false, 3, get_tag_for_rel("edge","1"),
-    std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table",
-    FULL);
-
-  // the dependent column must be exclude from hash computation, so join
-  // column count is 3 - 1 = 2
-  relation *rel__spath__3__1__2__3 = new relation(
-      2, true, 3, get_tag_for_rel("spath", "1__2__3"),
-      std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table",
-      input_dir + "/" +
-          std::to_string(get_tag_for_rel("spath", "1__2__3")) +
-          ".spath.3.table",
+  relation *rel__edge__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
       FULL);
-  // set functional dependency for spath
-  rel__spath__3__1__2__3->set_dependent_column_update(
-    {2, 3},   // len and id column
-    [](const std::vector<u64>& old_v, const std::vector<u64>& new_v, const vector<u64>& nt) -> std::optional<bool>
-    {
-      return new_v[0] < old_v[0];
-    }
-  );
-  relation* rel__spath__3__2 = new relation(
-    1, false, 3, get_tag_for_rel("spath","2"),
-    std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table",
-    FULL);
-  rel__spath__3__2->set_dependent_column_update(
-    {2, 3},
-    [](const std::vector<u64>& old_v, const std::vector<u64>& new_v, const vector<u64>& nt) -> std::optional<bool>
-    {
-      return new_v[0] < old_v[0];
+  relation *rel__cpath__4__2 = new relation(
+      1, true, 4, get_tag_for_rel("cpath", "2"),
+      std::to_string(get_tag_for_rel("cpath", "2")) + ".cpath.4.table",
+      std::to_string(get_tag_for_rel("cpath", "2")) + ".cpath.4.table", FULL);
+  rel__cpath__4__2->set_dependent_column_update(
+    {2,3,4},
+    [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+      if (new_v[0] != old_v[0]) {
+        return std::nullopt;
+      } else {
+        // monotonic
+        assert(new_v[1] > old_v[1]);
+        return new_v[1] > old_v[1];
+      }
     }
   );
 
-  RAM* scc0 = new RAM(false, 0);
-  scc0->add_relation(rel__edge__3__1, true, false);
-  scc0->add_relation(rel__edge__3__1__2__3, true, false);
-  scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
-  DELTA, {0, 3, 1, 2}));
-
-  RAM *scc1 = new RAM(false, 1);
-  scc1->add_relation(rel__edge__3__1__2__3, false, false);
-  scc1->add_relation(rel__spath__3__1__2__3, true, false);
-  // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3,
-  //                                  rel__edge__3__1__2__3, FULL, {0, 1, 2}));
-  scc1->add_rule(new parallel_copy_generate(
-      rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL,
+  RAM *scc0 = new RAM(false, 0);
+  scc0->add_relation(rel__edge__2__1__2, false, false);
+  scc0->add_relation(rel__cpath__4__2, true, false);
+  scc0->add_rule(new parallel_copy_generate(
+      rel__cpath__4__2, rel__edge__2__1__2, FULL,
       [](const u64 *const data, u64 *const output) -> int {
-        auto args_for_old_bi = std::array<u64, 3>{data[0], data[1], data[2]};
-        using TState = std::tuple<const u64 *, u64 *>;
-        TState state = std::make_tuple(data, output);
-        auto callback = [](u64 res_0, TState state) -> TState {
-          auto [data, output] = state;
-          auto head_tuple = output;
-
-          bool compatible = true && res_0 < n2d(start_node);
-          if (!compatible)
-            return state;
-
-          head_tuple[0] = data[0];
-          head_tuple[1] = data[1];
-          head_tuple[2] = data[2];
-          return std::make_tuple(data, output + 2);
-        };
-        auto [_, new_ptr] =
-            builtin_eq_1<TState>(args_for_old_bi.data(), state, callback);
-        auto tuples_count = (new_ptr - output) / 2;
-        return tuples_count;
+        output[0] = data[1];
+        output[1] = data[0];
+        output[2] = data[1];
+        output[3] = n2d(1);
+        return 1;
       }));
 
-  RAM *scc2 = new RAM(true, 2);
-  scc2->add_relation(rel__edge__3__1__2__3, false, false);
-  scc2->add_relation(rel__spath__3__2, true, false);
-  scc2->add_relation(rel__spath__3__1__2__3, true, false);
-  //  the order of non join column also need to be carefully arranged
-  // because, dependent column
-  //  should always at last
-  scc2->add_rule(new parallel_acopy(
-    rel__spath__3__2,
-    rel__spath__3__1__2__3, DELTA,
-    {1, 0, 2, 3})); // 2, 1, 3, id
-  parallel_join* update_spath_j = new parallel_join(
-    rel__spath__3__1__2__3,
-    rel__edge__3__1, FULL,
-    rel__spath__3__2, DELTA,
-    {5, 2, 3}// useless
+  RAM *scc1 = new RAM(true, 1);
+  scc1->add_relation(rel__cpath__4__2, true, false);
+  scc1->add_relation(rel__edge__2__1__2, false, false);
+  auto pj = new parallel_join(
+    rel__cpath__4__2,
+    rel__edge__2__1__2, FULL,
+    rel__cpath__4__2, DELTA,
+    {4, 2, 0, 6}  // useless
   );
-  update_spath_j->set_generator_func([](std::vector<u64>& target_v,
-  std::vector<u64>& input_v, u64* res) {
-    res[0] = target_v[1];
-    res[1] = input_v[2];
-    if (res[0] == res[1]) {
-      res[2] = 0;
-    } else {
-      res[2] = target_v[2] + input_v[3];
+  pj->set_generator_func(
+    [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
+      auto sum_res = 0;
+      for (auto& tv: target_vs) {
+        sum_res += tv[3];
+      }
+      std::vector<u64> res_tuple(4, 0);
+      res_tuple[0] = input_v[1];
+      res_tuple[1] = target_vs[0][1];
+      res_tuple[2] = target_vs[0][0];
+      res_tuple[3] = sum_res;
+      res_set.push_back(res_tuple);
+      return true;
     }
-  });
-  scc2->add_rule(update_spath_j);
+  );
+  scc1->add_rule(pj);
+
+  
 
   LIE *lie = new LIE();
-  lie->add_relation(rel__edge__3__1);
-  lie->add_relation(rel__edge__3__1__2__3);
-  lie->add_relation(rel__spath__3__2);
-  lie->add_relation(rel__spath__3__1__2__3);
+  lie->add_relation(rel__edge__2__1__2);
+  lie->add_relation(rel__cpath__4__2);
   lie->add_scc(scc0);
   lie->add_scc(scc1);
-  lie->add_scc(scc2);
-  lie->add_scc_dependance(scc0, scc2);
-  lie->add_scc_dependance(scc1, scc2);
-
-  // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  // relation *rel__spath__2__1__2 = new relation(
-  //     2, true, 2, get_tag_for_rel("spath", "1__2"),
-  //     std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table",
-  //     input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) +
-  //         ".spath.2.table",
-  //     FULL);
-  // rel__spath__2__1__2->set_dependent_column_update(
-  //   {1, 2},   // len and id column
-  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
-  //   {
-  //     // std::cout << "Comparing  ";
-  //     // for (auto c : old_v) {
-  //     //   std::cout << c << " ";
-  //     // }
-  //     // std::cout << " <<<<<<  ";
-  //     // for (auto c : new_v) {
-  //     //   std::cout << c << " ";
-  //     // }
-  //     return new_v[0] < old_v[0];
-  //   }
-  // );
-  // relation *rel__edge__3__1 = new relation(
-  //     1, false, 3, get_tag_for_rel("edge", "1"),
-  //     std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL);
-  // relation *rel__edge__3__1__2__3 = new relation(
-  //     3, true, 3, get_tag_for_rel("edge", "1__2__3"),
-  //     std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
-  //     input_dir + "/" +
-  //         std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table",
-  //     FULL);
-  // relation *rel__spath__2__1 = new relation(
-  //     1, false, 2, get_tag_for_rel("spath", "1"),
-  //     std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL);
-  // rel__spath__2__1->set_dependent_column_update(
-  //   {1, 2},
-  //   [](std::vector<u64> old_v, std::vector<u64> new_v) -> std::optional<bool>
-  //   {
-  //     // std::cout << "Comparing  ";
-  //     // for (auto c : old_v) {
-  //     //   std::cout << c << " ";
-  //     // }
-  //     // std::cout << " <<<<<<  ";
-  //     // for (auto c : new_v) {
-  //     //   std::cout << c << " ";
-  //     // }
-  //     return new_v[0] < old_v[0];
-  //   }
-  // );
-
-  // RAM *scc0 = new RAM(false, 0);
-  // scc0->add_relation(rel__edge__3__1, true, false);
-  // scc0->add_relation(rel__edge__3__1__2__3, true, false);
-  // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3,
-  //                                   DELTA, {0, 3, 1, 2}));
-
-  // RAM *scc1 = new RAM(false, 1);
-  // scc1->add_relation(rel__spath__2__1__2, true, false);
-  // scc1->add_relation(rel__edge__3__1, false, false);
-  // scc1->add_rule(new parallel_copy_generate(
-  //     rel__spath__2__1__2, rel__edge__3__1, FULL,
-  //     [](const u64 *const data, u64 *const output) -> int {
-  //       auto args_for_old_bi = std::array<u64, 1>{data[0]};
-  //       using TState = std::tuple<const u64 *, u64 *>;
-  //       TState state = std::make_tuple(data, output);
-  //       auto callback = [](u64 res_0, TState state) -> TState {
-  //         auto [data, output] = state;
-  //         auto head_tuple = output;
-
-  //         bool compatible = true && res_0 == n2d(start_node);
-  //         if (!compatible)
-  //           return state;
-
-  //         head_tuple[0] = data[2];
-  //         head_tuple[1] = data[3];
-  //         return std::make_tuple(data, output + 2);
-  //       };
-  //       auto [_, new_ptr] =
-  //           builtin_eq_1<TState>(args_for_old_bi.data(), state, callback);
-  //       auto tuples_count = (new_ptr - output) / 2;
-  //       return tuples_count;
-  //     }));
-
-  // RAM *scc2 = new RAM(true, 2);
-  // scc2->add_relation(rel__spath__2__1__2, true, false);
-  // scc2->add_relation(rel__edge__3__1, false, false);
-  // scc2->add_relation(rel__spath__2__1, true, false);
-  // // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA,
-  // //                                  rel__edge__3__1, FULL, {4, 5}));
-  // parallel_join* update_spath_j = new parallel_join(
-  //   rel__spath__2__1__2,
-  //   rel__edge__3__1, FULL,
-  //   rel__spath__2__1, DELTA,
-  //   {5,4}// useless
-  // );
-  // update_spath_j->set_generator_func([](std::vector<u64>& target_v,
-  // std::vector<u64>& input_v, u64* res) {
-  //   // res[0] = target_v[0];
-  //   res[0] = input_v[2];
-  //   if (res[0] == start_node) {
-  //     res[1] = 0;
-  //   } else {
-  //     res[1] = target_v[1] + input_v[3];
-  //   }
-  // });
-  // scc2->add_rule(update_spath_j);
-  // scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2,
-  //                                   DELTA, {0, 1, 2}));
-
-
-  // LIE *lie = new LIE();
-  // lie->add_relation(rel__spath__2__1__2);
-  // lie->add_relation(rel__edge__3__1);
-  // lie->add_relation(rel__edge__3__1__2__3);
-  // lie->add_relation(rel__spath__2__1);
-  // lie->add_scc(scc0);
-  // lie->add_scc(scc1);
-  // lie->add_scc(scc2);
-  // lie->add_scc_dependance(scc0, scc2);
-  // lie->add_scc_dependance(scc0, scc1);
-  // lie->add_scc_dependance(scc1, scc2);
+  lie->add_scc_dependance(scc0, scc1);
 
   // Enable IO
   lie->enable_all_to_all_dump();
   lie->enable_data_IO();
   lie->enable_IO();
-  lie->set_output_dir(output_dir); // Write to this directory
+  // lie->enable_share_io();
+  lie->set_output_dir(slog_output_dir); // Write to this directory
   lie->set_comm(mcomm);
   lie->set_batch_size(1);
   lie->execute();
   lie->print_all_relation_size(); // Continuously print relation sizes
-  lie->stat_intermediate();
-
-  // rel__spath__3__1__2__3->print();
-
-  // rel__spath__2__1__2->print();
-  // rel__spath__2__1->print();
-  // rel__edge__3__1->print();
-  // rel__edge__3__1__2__3->print();
+  // lie->stat_intermediate();
 
   // print all variants(non-canonical index of each relation)
   if (mcomm.get_rank() == 0) {
@@ -655,27 +490,12 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri
   }
 
   // lie->print_all_relation_size(); // Continuously print relation sizes
+  rel__edge__2__1__2->print();
+  rel__cpath__4__2->print();
 
   delete lie;
 
-}
-
-int main(int argc, char **argv) {
-  // input dir from compiler
-  std::string slog_input_dir =
-      "/home/stargazermiao/workspace/PL/slog/out/input-data";
-  // output dir from compiler
-  std::string slog_output_dir =
-      "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
-  if (argc > 2) {
-    slog_input_dir = argv[1];
-    slog_output_dir = argv[2];
-  }
-  mpi_comm mcomm;
-  mcomm.create(argc, argv);
-
-  compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, argv);
-
   mcomm.destroy();
+
   return 0;
 }
diff --git a/backend/tests/msum/msum.slog b/backend/tests/msum/msum.slog
new file mode 100644
index 00000000..98152e61
--- /dev/null
+++ b/backend/tests/msum/msum.slog
@@ -0,0 +1,3 @@
+
+[(cpath x y {x 1}) <-- (edge x y)]
+[(cpath x z {y l}) <-- (cpath x y prev l) (edge y z)]
diff --git a/backend/tests/pagerank/pagerank.slog b/backend/tests/pagerank/pagerank.slog
new file mode 100644
index 00000000..f52d9663
--- /dev/null
+++ b/backend/tests/pagerank/pagerank.slog
@@ -0,0 +1,3 @@
+
+[(rank x x (computed1 x)) <-- (matrix x _ _)]
+[(rank x y (computed2 c d)) <-- (rank y _ c) (matrix y x d)]
diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.2.table b/backend/tests/sssp/compiled_pre/input-data/258.edge.2.table
new file mode 100644
index 0000000000000000000000000000000000000000..f0c99a55eee0e88f256ab0afb576049c5b55fbd2
GIT binary patch
literal 528
zcmZ{gTMmLi5Cj)gR6xMjKS%Hg9F3#XJ9LFe0zZ@L)K2Ir()yL28e^KLRG3em;$P13
z$b*%`=j~nWfiKR3<-?V8arp4nd9Zx==3E>;e0Lr!AEt+R$p24X+t>W-TRLxcIsMv$
rJ3R6`?Z=~c=RGjJd;2iG2m3I+pY~yTzwE>Gd)tS(!{^`*U+-A}u#FT7

literal 0
HcmV?d00001

diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp.backup b/backend/tests/sssp/compiled_pre/sssp.cpp.backup
deleted file mode 100644
index 0b208342..00000000
--- a/backend/tests/sssp/compiled_pre/sssp.cpp.backup
+++ /dev/null
@@ -1,429 +0,0 @@
-// location of `parallel_RA_inc.h` here
-#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h"
-
-#include <iostream>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <unordered_set>
-#include <map>
-
-// builtins.cpp goes here!
-// builtins.cpp
-#include <cstddef>
-#include <limits>
-#include <vector>
-#include <string>
-#include <cassert>
-#include <iostream>
-#include <array>
-#include <functional>
-#include <tuple>
-#include <functional>
-#include <utility>
-
-using namespace std;
-#define u64  uint64_t
-#define u32  uint32_t
-using i64 = int64_t;
-
-const u64 tag_mask = 0xffffc00000000000;
-const u64 tag_position = 46;
-const u64 int_tag = 0;
-const u64 str_tag = 2;
-const u64 sign_flip_const = 0x0000200000000000;
-const u64 signed_num_mask = 0xFFFFE00000000000;
-
-inline bool is_number(u64 datum) {
-  // cout << "is_number(" << datum << "): " << (datum >> tag_position == int_tag) << "\n";
-  return datum >> tag_position == int_tag;
-}
-
-inline i64 datum_to_number(u64 datum) {
-  i64 signed_val = (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position);
-  if (signed_val >= sign_flip_const) {
-    signed_val = sign_flip_const - signed_val;
-  }
-  return signed_val;
-  // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position);
-}
-const auto d2n = datum_to_number;
-
-inline u64 number_to_datum(i64 number) {
-  i64 unsigned_value = number;
-  if (number < 0) {
-    unsigned_value = (- number) + sign_flip_const;
-  }
-  return (unsigned_value & ~tag_mask) | (int_tag << tag_position);
-  // return (number & ~tag_mask) | (int_tag << tag_position);
-}
-
-const auto n2d = number_to_datum;
-
-inline u64 string_to_datum(std::string str)
-{
-  u32 str_hash = string_hash(str);
-  return (str_hash & ~tag_mask) | (str_tag << tag_position);
-}
-const auto s2d = string_to_datum;
-
-
-vector<array<u64,2>> builtin_div_rem(const u64* const data){
-  if (is_number(data[0]) && is_number(data[1])){
-    auto div = number_to_datum(d2n(data[0]) / d2n(data[1]));
-    auto rem = number_to_datum(d2n(data[0]) % d2n(data[1]));
-    return {{div, rem}};
-  } else {
-    return {};
-  }
-}
-
-#define BUILTIN_BINARY_NUMBER_PRED(name, op) \
-template<typename TState> inline TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \
-  if (is_number(data[0]) && is_number(data[1]) &&\
-      datum_to_number(data[0]) op datum_to_number(data[1])){\
-    return callback(init_state);\
-  } else \
-    return init_state;\
-}
-
-BUILTIN_BINARY_NUMBER_PRED(builtin_less, <)
-BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >)
-BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=)
-BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=)
-
-#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \
-template<typename TState> inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \
-  if (is_number(data[0]) && is_number(data[1])){\
-    auto res = number_to_datum(datum_to_number(data[0]) op datum_to_number(data[1]));\
-    return callback(res, init_state);\
-} else \
-  return init_state;\
-}
-
-BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +)
-BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -)
-BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *)
-BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /)
-
-#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \
-template<typename TState> inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \
-  if (is_number(data[0]) && is_number(data[1])){\
-    auto res = number_to_datum(impl(datum_to_number(data[0]), datum_to_number(data[1])));\
-    return callback(res, init_state);\
-} else \
-  return init_state;\
-}
-
-inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) {return arg2 - arg1;}
-BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1)
-
-
-#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \
-template<typename TState> inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \
-  if (is_number(data[0])){\
-    auto res = number_to_datum(impl(datum_to_number(data[0])));\
-    return callback(res, init_state);\
-} else \
-  return init_state;\
-}
-
-inline u64 add1(u64 x) {return x + 1;}
-inline u64 sub1(u64 x) {return x - 1;}
-
-BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1)
-BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1)
-BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1)
-BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1)
-
-
-vector<array<u64,1>> builtin_range(const u64* const data){
-  vector<array<u64,1>> res;
-  if (is_number(data[0]) && is_number(data[1])){
-    auto lb = datum_to_number(data[0]);
-    auto ub = datum_to_number(data[1]);
-    res.reserve(ub - lb);
-    for (u64 x = lb; x < ub; x++)
-      res.push_back({number_to_datum(x)});
-  } 
-  return res;
-}
-
-template<typename TState>
-TState callback_builtin_range(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){
-  auto state = init_state;
-  if (is_number(data[0]) && is_number(data[1])){
-    auto lb = datum_to_number(data[0]);
-    auto ub = datum_to_number(data[1]);
-    for (u64 x = lb; x < ub; x++)
-      state = callback(number_to_datum(x), state);
-  }
-  return state;
-}
-
-
-#define BUILTIN_BINARY_PRED(name, op) \
-template<typename TState> TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \
-  if (data[0] op data[1])\
-    return callback(init_state);\
-  else\
-    return init_state;\
-}
-BUILTIN_BINARY_PRED(builtin_eq, ==)
-BUILTIN_BINARY_PRED(builtin_neq, !=)
-
-template<typename TState>
-TState builtin_eq_1(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){
-  return callback(data[0], init_state);
-}
-
-#define BUILTIN_UNARY_PRED(name, pred) \
-template<typename TState> TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \
-  if (pred(data[0]))\
-    return callback(init_state);\
-  else\
-    return init_state;\
-}
-
-bool is_not_number(u64 datum) {return !is_number(datum);}
-BUILTIN_UNARY_PRED(builtin_number_huh, is_number)
-BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number)
-
-// for generate-cpp-lambda-for-computational-join
-struct CL2CB_State{
-  void* original_callback; // There be dragons?
-  void* original_state;
-  const u64* original_data;
-  u64* cl1_output_args;
-};
-
-// for generate-cpp-lambda-for-computational-copy
-struct BCLCB_State{
-  void* original_callback;
-  void* original_state;
-  const u64* original_data;
-};
-
-//an experiment:
-template<bool f (u64, u64)>
-bool builtin_binary_number_pred(const u64* data){
-  if (is_number(data[0]) && is_number(data[1])){
-    return f(datum_to_number(data[0]), datum_to_number(data[1]));
-  } else {
-    return false;
-  }
-}
-bool _less(u64 x, u64 y) { return x < y;}
-auto builtin_less2 = builtin_binary_number_pred<_less>;
-
-
-template<typename TState> inline TState builtin_nop(const u64* data, TState init_state, TState (*callback) (TState state)){ 
-  return callback(init_state);
-}
-
-// //////////////////// AGGREGATORS Alternative design ////////////////////
-
-
-// TODO: add number type check
-//////////////////////////////  count /////////////////////////////////////
-
-local_agg_res_t agg_count_local(std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range)
-{
-  local_agg_res_t cnt = 0;
-  for(auto it = joined_range.first; it != joined_range.second ; ++it) {
-    cnt ++;
-  }
-  return cnt;
-}
-
-local_agg_res_t agg_count_reduce (local_agg_res_t x, local_agg_res_t y) {
-  return x + y;
-}
-
-//////////////////////////////  sum /////////////////////////////////////
-
-local_agg_res_t agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range)
-{
-  local_agg_res_t sum_res = 0;
-  for(auto it = joined_range.first; it != joined_range.second ; ++it) {
-    auto tuple = (*it);
-    sum_res += tuple[tuple.size()-1];
-  }
-  return sum_res;
-}
-
-local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
-  return x + y;
-}
-
-//////////////////////////////  maximum  /////////////////////////////////////
-
-local_agg_res_t agg_maximum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range)
-{
-  local_agg_res_t max_res = 0;
-  for(auto it = joined_range.first; it != joined_range.second ; ++it) {
-    auto tuple = (*it);
-    auto current_v = tuple[tuple.size()-1];
-    if (current_v > max_res) {
-      max_res = current_v;
-    }
-  }
-  return max_res;
-}
-
-local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) {
-  if (x > y){
-    return x;
-  } else{
-    return y;
-  }
-}
-
-//////////////////////////////  minimum  /////////////////////////////////////
-
-local_agg_res_t agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range)
-{
-  local_agg_res_t min_res = std::numeric_limits<u32>::max();
-  for(auto it = joined_range.first; it != joined_range.second ; ++it) {
-    auto tuple = (*it);
-    auto current_v = tuple[tuple.size()-1];
-    if (current_v < min_res) {
-      min_res = current_v;
-    }
-  }
-  return min_res;
-}
-
-local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) {
-  if (x < y){
-    return x;
-  } else{
-    return y;
-  }
-}
-
-// // end of builtins.cpp
-
-
-// global definitions:
-
-
-int max_rel = 255;
-std::map<std::string, int> rel_tag_map;
-std::map<std::string, std::unordered_set<std::string>> rel_index_map;
-
-// load all relation inside input database
-void load_input_relation(std::string db_dir)
-{
-  for (const auto & entry : std::filesystem::directory_iterator(db_dir))
-  {
-    // check if ends with table
-    std::string filename_ss = entry.path().filename().string();
-    std::cout << "input database has file " << filename_ss << std::endl;
-    std::string suffix = ".table";
-    int ft = filename_ss.size()-suffix.size();
-    if (ft < 0)
-      ft = 0;
-    if (filename_ss.rfind(suffix) != ft)
-    {
-      continue;
-    }
-    std::string filename_s = entry.path().stem().string();
-    int tag = std::stoi(filename_s.substr(0, filename_s.find(".")));
-    std::string name_arity = filename_s.substr(filename_s.find(".")+1, filename_s.size()-filename_s.find(".")-1);
-    std::string name = name_arity.substr(0, name_arity.rfind("."));
-    std::string arity_s = name_arity.substr(name_arity.rfind(".")+1, name_arity.size());
-    int arity = std::stoi(arity_s);
-    std::stringstream index_stream;
-    index_stream << name;
-    for (int i = 1; i <= arity; i++)
-    {
-      index_stream << "__" <<  i;
-    }
-    if (tag > max_rel)
-      max_rel = tag;
-    std::cout << "load " << tag << "." << index_stream.str() << "has arity " << arity << std::endl;
-    rel_tag_map[index_stream.str()] = tag;
-  }
-}
-
-int get_tag_for_rel(std::string relation_name, std::string index_str) {
-  std::string name_arity = relation_name + "__" + index_str;
-  if (rel_index_map.find(relation_name) != rel_index_map.end()) {
-    rel_index_map[relation_name].insert(index_str);
-  } else {
-    rel_index_map[relation_name] = {index_str};
-  }
-
-  if (rel_tag_map.find(name_arity) != rel_tag_map.end())
-  {
-    // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << std::endl;
-    return rel_tag_map[name_arity];
-  }
-  max_rel++;
-  rel_tag_map[name_arity] = max_rel;
-  std::cout << "generate rel tag: " << name_arity << " " << max_rel << std::endl;
-  return max_rel;
-}
-
-int main(int argc, char **argv)
-{
-  // input dir from compiler
-  std::string slog_input_dir = "/home/stargazermiao/workspace/PL/slog/out/input-data";
-  // output dir from compiler
-  std::string slog_output_dir = "/home/stargazermiao/workspace/PL/slog/out/checkpoints";
-  if (argc == 3) {
-    slog_input_dir = argv[1];
-    slog_output_dir = argv[2];
-  }
-  load_input_relation(slog_input_dir);
-  mpi_comm mcomm;
-  mcomm.create(argc, argv);
-
-relation* rel__edge__3__1__2__3 = new relation(3, true, 3, get_tag_for_rel("edge","1__2__3"), std::to_string(get_tag_for_rel("edge","1__2__3")) + ".edge.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge","1__2__3")) + ".edge.3.table", FULL);
-relation* rel__spath__3__1__2__3 = new relation(3, true, 3, get_tag_for_rel("spath","1__2__3"), std::to_string(get_tag_for_rel("spath","1__2__3")) + ".spath.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath","1__2__3")) + ".spath.3.table", FULL);
-
-RAM* scc0 = new RAM(false, 0);
-scc0->add_relation(rel__edge__3__1__2__3, false, false);
-scc0->add_relation(rel__spath__3__1__2__3, true, false);
-scc0->add_rule(new parallel_copy(rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL, {0, 1, 2}));
-
-LIE* lie = new LIE();
-lie->add_relation(rel__edge__3__1__2__3);
-lie->add_relation(rel__spath__3__1__2__3);
-lie->add_scc(scc0);
-
-
-
-  
-  // Enable IO
-  lie->enable_all_to_all_dump();
-  lie->enable_data_IO();
-  // lie->enable_share_io();
-  lie->enable_IO();
-  // lie->enable_share_io();
-  lie->set_output_dir(slog_output_dir); // Write to this directory
-  lie->set_comm(mcomm);
-  lie->set_batch_size(1);
-  lie->execute();
-  lie->print_all_relation_size(); // Continuously print relation sizes 
-  lie->stat_intermediate();
-
-  // print all variants(non-canonical index of each relation)
-  if (mcomm.get_rank() == 0)
-  {
-    std::cout << "rel_name" << ",\t" << "indices\n"; 
-    for (auto const& rel_p : rel_index_map) {
-      std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
-    }
-    std::cout << std::endl;
-  }
-
-  // lie->print_all_relation_size(); // Continuously print relation sizes
-
-  delete lie;
-
-  mcomm.destroy();
-
-  return 0;
-}
diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
index dc3e9c86..6d41428b 100644
--- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
 #include <optional>
 #include <iterator>
@@ -412,7 +412,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
   scc0->add_rule(new parallel_copy_generate(
       rel__spath__3__2, rel__edge__2__1__2, FULL,
       [](const u64 *const data, u64 *const output) -> int {
-        auto args_for_old_bi = std::array<u64, 3>{data[0], data[1], n2d(1)};
+        auto args_for_old_bi = std::array<u64, 3>{data[0], data[1], 1};
         using TState = std::tuple<const u64 *, u64 *>;
         TState state = std::make_tuple(args_for_old_bi.data(), output);
         auto callback = [](u64 res_0, TState state) -> TState {
@@ -442,7 +442,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
                         rel__spath__3__2, DELTA, {5, 2, 3} // useless
       );
   update_spath_j->set_generator_func(
-      [](std::vector<u64> &target_v, std::vector<u64> &input_v, u64 *res) {
+      [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
         // std::cout << "Joining  >>> ";
         // for (auto c : input_v) {
         //   std::cout << c << " ";
@@ -452,13 +452,18 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
         //     std::cout << c << " ";
         // }
         // std::cout << std::endl;
+        auto target_v = target_vs[0];
+        std::vector<u64> res(3, 0);
         res[0] = input_v[1];
         res[1] = target_v[1];
         if (res[0] == res[1]) {
+          // std::cout << "Warning detect a loop for node " << res[0] << std::endl;
           res[2] = 0;
         } else {
           res[2] = target_v[2] + 1;
         }
+        res_set.push_back(res);
+        return true;
       });
   scc1->add_rule(update_spath_j);
 
@@ -484,7 +489,8 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
   // rel__spath__3__1__2__3->print();
 
   // rel__spath__2__1__2->print();
-//   rel__spath__3__2->print();
+  // rel__edge__2__1__2->print();
+  // rel__spath__3__2->print();
   // rel__edge__3__1->print();
   // rel__edge__3__1__2__3->print();
 
diff --git a/backend/tests/sssp/sssp.py b/backend/tests/sssp/sssp.py
new file mode 100644
index 00000000..bb7862a1
--- /dev/null
+++ b/backend/tests/sssp/sssp.py
@@ -0,0 +1,18 @@
+# Reference SSSP: prints "<target> <source> <hops>" for sources 1..9, then the total count.
+import networkx as nx
+
+# Alternative input: /home/ubuntu/workspace/slog/backend/tests/sssp/test-input-graph/edge.csv
+g = nx.DiGraph()
+# Each input line is parsed as tab-separated integers forming one directed edge.
+with open("/home/ubuntu/workspace/dataset/soc-LiveJournal1.txt") as data_f:
+    for l in data_f:
+        g.add_edge(*map(int, l.strip().split("\t")))
+
+sssp_nodes = 0
+for i in range(1,10):
+    reached_map = nx.shortest_path(g, i)  # maps each reachable target to one shortest path (node list)
+    sssp_nodes = sssp_nodes + len(reached_map.keys())
+    for k, v in reached_map.items():
+        print(f"{k} {i} {len(v)-1}")
+
+print(sssp_nodes)
diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv
index 1d997fb7..20eddd3d 100644
--- a/backend/tests/sssp/test-input-graph/edge.csv
+++ b/backend/tests/sssp/test-input-graph/edge.csv
@@ -5,5 +5,7 @@
 4	5
 5	6
 6	7
+7	8
 8	9
+9	1
 9	10
diff --git a/examples/datalog-example b/examples/datalog-example
index be103a21..87266643 160000
--- a/examples/datalog-example
+++ b/examples/datalog-example
@@ -1 +1 @@
-Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa
+Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891

From 336e1a129bfcfc6d7fcf98a6c763e5f004d3c835 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Fri, 25 Nov 2022 02:51:13 +0000
Subject: [PATCH 15/36] add page rank test

---
 backend/CMakeLists.txt                        |   2 +-
 backend/src/RA/parallel_agg.cpp               |   4 +-
 backend/src/compat.h                          |   1 +
 backend/src/relation/balanced_hash_relation.h |   8 +
 .../pagerank/compiled_pre/CMakeLists.txt      |  28 +
 .../tests/pagerank/compiled_pre/pagerank.cpp  | 631 ++++++++++++++++++
 server_log                                    |  12 +-
 7 files changed, 677 insertions(+), 9 deletions(-)
 create mode 100644 backend/tests/pagerank/compiled_pre/CMakeLists.txt
 create mode 100644 backend/tests/pagerank/compiled_pre/pagerank.cpp

diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 36256bd9..a348cc28 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -12,7 +12,7 @@ find_package(MPI REQUIRED)
 #     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 # endif()
 
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++20 -lstdc++fs -Werror=class-memaccess -fpermissive")
 set (source_dir "${PROJECT_SOURCE_DIR}/src")
 set (tests_dir "${PROJECT_SOURCE_DIR}/tests")
 set (data_dir "${PROJECT_SOURCE_DIR}/data")
diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp
index 90c07c3d..9fc5a204 100644
--- a/backend/src/RA/parallel_agg.cpp
+++ b/backend/src/RA/parallel_agg.cpp
@@ -80,8 +80,8 @@ void parallel_join_aggregate::local_aggregate(
 
     u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count();
     u32** output_sub_bucket_rank = output->get_sub_bucket_rank();
-    u32 real_join_count = output->get_join_column_count() - 1;
-    agg_buffer.width[ra_counter] = real_join_count + 1;
+    u32 real_join_count = output->get_join_column_count();
+    agg_buffer.width[ra_counter] = output->get_arity();
 
     shmap_relation* agg_target;
     if (*(target->get_sub_bucket_per_bucket_count()) == 1) {
diff --git a/backend/src/compat.h b/backend/src/compat.h
index 397a5249..dbc42cf4 100644
--- a/backend/src/compat.h
+++ b/backend/src/compat.h
@@ -25,6 +25,7 @@
 #include "btree/btree_set.h"
 #include <filesystem>
 #include <optional>
+#include <bit>
 
 
 #ifdef __GNUC__
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index cfd322ad..88d63404 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -89,6 +89,9 @@ class relation
     std::vector<int> dependent_column_indices;
     update_partial_compare_func_t update_compare_func;
 
+    // This is only used when this relation needs to be reused in another computation loop
+    bool init_flag = true;
+
 public:
 
     /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL);
@@ -283,4 +286,9 @@ class relation
         }
         return !is_canonical;
     }
+
+    // skip initialization/loading facts
+    void disable_initialization() { init_flag = false; }
+    void enable_initialization() { init_flag = true; }
+
 };
diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
new file mode 100644
index 00000000..44733818
--- /dev/null
+++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required (VERSION 3.9)
+
+project (pagerank)
+
+add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive)
+
+link_libraries(stdc++fs)
+
+find_package(MPI REQUIRED)
+# find_package(OpenMP)
+# if (OPENMP_FOUND)
+#     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+#     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+#     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+# endif()
+
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive")
+# set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
+set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
+
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank.cpp")
+
+ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
+
+add_executable(pagerank ${source_files_pagerank})
+INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH})
+TARGET_LINK_LIBRARIES(pagerank parallel_RA ${MPI_LIBRARIES})
diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp
new file mode 100644
index 00000000..21a2aa94
--- /dev/null
+++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp
@@ -0,0 +1,631 @@
+// location of `parallel_RA_inc.h` here
+#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+
+#include <iostream>
+#include <bit>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+
+// builtins.cpp goes here!
+// builtins.cpp
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace std;
+#define u64 uint64_t
+#define u32 uint32_t
+using i64 = int64_t;
+
+const u64 tag_mask = 0xffffc00000000000;
+const u64 tag_position = 46;
+const u64 int_tag = 0;
+const u64 str_tag = 2;
+const u64 sign_flip_const = 0x0000200000000000;
+const u64 signed_num_mask = 0xFFFFE00000000000;
+
+inline bool is_number(u64 datum) {
+  // A datum carries its type tag in the bits at and above tag_position;
+  // it is a number when that tag equals int_tag.
+  return datum >> tag_position == int_tag;
+}
+
+inline i64 datum_to_number(u64 datum) {
+  // Decode the payload (the bits below the tag) into a signed value.
+  // Payloads at or above sign_flip_const stand for negative numbers whose
+  // magnitude is the payload minus sign_flip_const.
+  const u64 payload = datum & ~tag_mask;
+  if (payload < sign_flip_const) {
+    return static_cast<i64>(payload);
+  }
+  return static_cast<i64>(sign_flip_const - payload);
+}
+const auto d2n = datum_to_number;
+
+inline u64 number_to_datum(i64 number) {
+  // Negatives are encoded as magnitude + sign_flip_const; the payload is then
+  // masked to the bits below the tag and combined with int_tag.
+  const u64 payload =
+      number < 0 ? static_cast<u64>(-number) + sign_flip_const
+                 : static_cast<u64>(number);
+  return (payload & ~tag_mask) | (int_tag << tag_position);
+}
+
+const auto n2d = number_to_datum;
+
+inline u64 string_to_datum(std::string str) { // builds a string-tagged datum from str
+  u32 str_hash = string_hash(str); // string_hash is defined outside this part of the file
+  return (str_hash & ~tag_mask) | (str_tag << tag_position); // hash as payload, str_tag in the tag bits
+}
+const auto s2d = string_to_datum;
+
+vector<array<u64, 2>> builtin_div_rem(const u64 *const data) {
+  // Yields {quotient, remainder} of data[0] / data[1]; yields nothing for
+  // non-number operands or a zero divisor (integer division by zero is UB).
+  if (!is_number(data[0]) || !is_number(data[1]) || d2n(data[1]) == 0) {
+    return {};
+  }
+  const i64 lhs = d2n(data[0]), rhs = d2n(data[1]);
+  return {{number_to_datum(lhs / rhs), number_to_datum(lhs % rhs)}};
+}
+
+#define BUILTIN_BINARY_NUMBER_PRED(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(TState state)) {                       \
+    if (is_number(data[0]) && is_number(data[1]) &&                            \
+        datum_to_number(data[0]) op datum_to_number(data[1])) {                \
+      return callback(init_state);                                             \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_PRED(builtin_less, <)
+BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >)
+BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=)
+BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=)
+
+#define BUILTIN_BINARY_NUMBER_FUNC(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(datum_to_number(data[0])                      \
+                                     op datum_to_number(data[1]));             \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /)
+
+#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl)                                \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(                                              \
+          impl(datum_to_number(data[0]), datum_to_number(data[1])));           \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; }
+BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1)
+
+#define BUILTIN_UNARY_NUMBER_FUNC(name, impl)                                  \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0])) {                                                  \
+      auto res = number_to_datum(impl(datum_to_number(data[0])));              \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 add1(u64 x) { return x + 1; }
+inline u64 sub1(u64 x) { return x - 1; }
+
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1)
+
+vector<array<u64, 1>> builtin_range(const u64 *const data) { // datums for [data[0], data[1])
+  vector<array<u64, 1>> res;
+  if (is_number(data[0]) && is_number(data[1])) {
+    const i64 lb = datum_to_number(data[0]);
+    const i64 ub = datum_to_number(data[1]);
+    if (ub > lb) res.reserve(ub - lb); // never reserve a wrapped-around negative size
+    for (i64 x = lb; x < ub; x++) // signed counter, so ranges crossing zero are walked
+      res.push_back({number_to_datum(x)});
+  }
+  return res;
+}
+
+template <typename TState>
+TState callback_builtin_range(const u64 *data, TState init_state,
+                              TState (*callback)(u64 res, TState state)) {
+  // Folds callback over every integer in [data[0], data[1]), in ascending order.
+  auto state = init_state;
+  if (is_number(data[0]) && is_number(data[1])) {
+    const i64 ub = datum_to_number(data[1]);
+    for (i64 x = datum_to_number(data[0]); x < ub; x++) // signed counter: ranges crossing zero work
+      state = callback(number_to_datum(x), state);
+  }
+  return state;
+}
+
+#define BUILTIN_BINARY_PRED(name, op)                                          \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (data[0] op data[1])                                                    \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+BUILTIN_BINARY_PRED(builtin_eq, ==)
+BUILTIN_BINARY_PRED(builtin_neq, !=)
+
+template <typename TState>
+TState builtin_eq_1(const u64 *data, TState init_state,
+                    TState (*callback)(u64 res, TState state)) {
+  return callback(data[0], init_state);
+}
+
+#define BUILTIN_UNARY_PRED(name, pred)                                         \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (pred(data[0]))                                                         \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+
+bool is_not_number(u64 datum) { return !is_number(datum); }
+BUILTIN_UNARY_PRED(builtin_number_huh, is_number)
+BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number)
+
+// for generate-cpp-lambda-for-computational-join
+struct CL2CB_State {
+  void *original_callback; // There be dragons?
+  void *original_state;
+  const u64 *original_data;
+  u64 *cl1_output_args;
+};
+
+// for generate-cpp-lambda-for-computational-copy
+struct BCLCB_State {
+  void *original_callback;
+  void *original_state;
+  const u64 *original_data;
+};
+
+// an experiment: wrap a plain two-argument predicate f as a predicate over a pair of datums.
+template <bool f(u64, u64)> bool builtin_binary_number_pred(const u64 *data) {
+  if (is_number(data[0]) && is_number(data[1])) {
+    return f(datum_to_number(data[0]), datum_to_number(data[1])); // decoded i64 values are passed as u64
+  } else {
+    return false; // non-number operands never satisfy the predicate
+  }
+}
+bool _less(u64 x, u64 y) { return x < y; } // NOTE(review): unsigned compare, so a negative operand looks huge — confirm
+auto builtin_less2 = builtin_binary_number_pred<_less>;
+
+template <typename TState>
+inline TState builtin_nop(const u64 *data, TState init_state,
+                          TState (*callback)(TState state)) {
+  return callback(init_state);
+}
+
+// //////////////////// AGGREGATORS Alternative design ////////////////////
+
+// TODO: add number type check
+//////////////////////////////  count /////////////////////////////////////
+
+local_agg_res_t
+agg_count_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                    joined_range) { // iterates from joined_range.first up to joined_range.second
+  local_agg_res_t cnt = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    cnt++; // one per tuple; the tuple contents are never inspected
+  }
+  return cnt;
+}
+
+local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+//////////////////////////////  sum /////////////////////////////////////
+
+local_agg_res_t
+agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                  joined_range) { // iterates from joined_range.first up to joined_range.second
+  local_agg_res_t sum_res = 0;
+  for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    sum_res += tuple[tuple.size() - 2]; // NOTE(review): second-to-last column, added raw (no d2n); max/min read the last column — confirm
+  }
+  return sum_res;
+}
+
+local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+//////////////////////////////  maximum  /////////////////////////////////////
+
+local_agg_res_t
+agg_maximum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) { // iterates from joined_range.first up to joined_range.second
+  local_agg_res_t max_res = 0; // returned unchanged when the range is empty
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1]; // last column of the tuple, compared as a raw value
+    if (current_v > max_res) {
+      max_res = current_v;
+    }
+  }
+  return max_res;
+}
+
+local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x > y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+//////////////////////////////  minimum  /////////////////////////////////////
+
+local_agg_res_t
+agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) { // iterates from joined_range.first up to joined_range.second
+  local_agg_res_t min_res = std::numeric_limits<local_agg_res_t>::max(); // seed with the result type's own max, not u32's
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1]; // last column of the tuple, compared as a raw value
+    if (current_v < min_res) {
+      min_res = current_v;
+    }
+  }
+  return min_res; // still the seed value when the range is empty
+}
+
+local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x < y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+// // end of builtins.cpp
+
+// global definitions:
+
+int max_rel = 255;
+std::map<std::string, int> rel_tag_map;
+std::map<std::string, std::unordered_set<std::string>> rel_index_map;
+
+// load all relation inside input database
+void load_input_relation(std::string db_dir) {
+  // Registers the tag of every "<tag>.<name>.<arity>.table" file found in db_dir.
+  for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
+    std::string filename_ss = entry.path().filename().string();
+    std::cout << "input database has file " << filename_ss << std::endl;
+    const std::string suffix = ".table";
+    // Skip files that do not end with the suffix (checked without mixing
+    // signed and unsigned positions).
+    if (filename_ss.size() < suffix.size() ||
+        filename_ss.compare(filename_ss.size() - suffix.size(),
+                            suffix.size(), suffix) != 0)
+      continue;
+    std::string filename_s = entry.path().stem().string();
+    int tag = std::stoi(filename_s.substr(0, filename_s.find(".")));
+    std::string name_arity = filename_s.substr(
+        filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1);
+    std::string name = name_arity.substr(0, name_arity.rfind("."));
+    std::string arity_s =
+        name_arity.substr(name_arity.rfind(".") + 1, name_arity.size());
+    int arity = std::stoi(arity_s);
+    std::stringstream index_stream;
+    index_stream << name;
+    for (int i = 1; i <= arity; i++) {
+      index_stream << "__" << i; // key is "<name>__1__2...__<arity>"
+    }
+    if (tag > max_rel)
+      max_rel = tag; // keep max_rel at the largest tag seen so far
+    std::cout << "load " << tag << "." << index_stream.str() << " has arity "
+              << arity << std::endl;
+    rel_tag_map[index_stream.str()] = tag;
+  }
+}
+
+int get_tag_for_rel(std::string relation_name, std::string index_str) {
+  // Returns the tag registered for "<relation_name>__<index_str>", allocating
+  // the next free tag (max_rel + 1) when the key has not been seen before.
+  // Every requested index is also recorded in rel_index_map.
+  const std::string key = relation_name + "__" + index_str;
+
+  // operator[] creates an empty set on first use, so one insert covers both
+  // the "known relation" and "new relation" cases.
+  rel_index_map[relation_name].insert(index_str);
+
+  const auto known = rel_tag_map.find(key);
+  if (known != rel_tag_map.end()) {
+    return known->second;
+  }
+  const int fresh_tag = ++max_rel;
+  rel_tag_map[key] = fresh_tag;
+  std::cout << "generate rel tag: " << key << " " << fresh_tag << std::endl;
+  return fresh_tag;
+}
+
+float ALPHA = 0.85;
+u64 total_node_size = 0;
+
+int main(int argc, char **argv) {
+  // input dir from compiler
+  std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
+  if (argc == 3) {
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  load_input_relation(slog_input_dir);
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
+
+  // (edge from to)
+  relation *rel__edge__2__1 = new relation(
+      1, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
+      FULL);
+
+  // >>>>>>>>>>>>>>> compute node size
+  // (node x)
+  relation *rel__node__1__1 = new relation(
+      1, true, 2, get_tag_for_rel("node", "1"),
+      std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) +
+          ".node.1.table",
+      FULL);
+
+  // (total_node_cnt n)
+  relation *rel__total_node_cnt__1__1 =
+      new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"),
+                   std::to_string(get_tag_for_rel("total_node_cnt", "1")) +
+                       ".total_node_cnt.1.table",
+                   slog_input_dir + "/" +
+                       std::to_string(get_tag_for_rel("total_node_cnt", "1")) +
+                       ".total_node_cnt.1.table",
+                   FULL);
+
+  // helper relation for non-join aggregation
+  relation *rel___dollorunit__1__1 = new relation(
+      0, true, 1, get_tag_for_rel("$unit", "1"),
+      std::to_string(get_tag_for_rel("$unit", "1")) + ".$unit.1.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("$unit", "1")) +
+          ".$unit.1.table",
+      FULL);
+
+  RAM *scc_helper_fact = new RAM(false, 0);
+  scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false);
+  scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)}));
+
+  // [(node a) (node b) <-- (edge a b)]
+  RAM *scc_compute_node = new RAM(false, 1);
+  scc_compute_node->add_relation(rel__edge__2__1, false, false);
+  scc_compute_node->add_relation(rel__node__1__1, true, false);
+  scc_compute_node->add_rule(new parallel_copy_generate(
+      rel__node__1__1, rel__edge__2__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[0];
+        return 1;
+      }));
+  scc_compute_node->add_rule(new parallel_copy_generate(
+      rel__node__1__1, rel__edge__2__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[1];
+        return 1;
+      }));
+
+  // (total_node_cnt {count node _})
+  RAM *scc_count_nodes = new RAM(false, 2);
+  scc_count_nodes->add_relation(rel__node__1__1, false, false);
+  scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false);
+  scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false);
+  scc_count_nodes->add_rule(new parallel_join_aggregate(
+      rel__total_node_cnt__1__1, rel__node__1__1, rel___dollorunit__1__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {2}));
+
+  LIE *cnt_lie = new LIE();
+  cnt_lie->add_relation(rel__edge__2__1);
+  cnt_lie->add_relation(rel__node__1__1);
+  cnt_lie->add_relation(rel___dollorunit__1__1);
+  cnt_lie->add_relation(rel__total_node_cnt__1__1);
+  cnt_lie->add_scc(scc_helper_fact);
+  cnt_lie->add_scc(scc_compute_node);
+  cnt_lie->add_scc(scc_count_nodes);
+  cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
+  cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
+
+  cnt_lie->enable_all_to_all_dump();
+  cnt_lie->set_output_dir(slog_output_dir); // Write to this directory
+  cnt_lie->set_comm(mcomm);
+  cnt_lie->set_batch_size(1);
+  cnt_lie->execute();
+  cnt_lie->print_all_relation_size(); // Continuously print relation sizes
+
+  // The relation holds a single tuple. NOTE(review): only a rank whose own get_full() slot has it leaves 0 behind — confirm
+  rel__total_node_cnt__1__1->print();
+
+  for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
+    total_node_size = t[0]; // first column is taken as the node count
+    std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl;
+  }
+
+  // >>>>>>>>>>>>>>> compute page rank
+  std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl;
+
+  rel__edge__2__1->disable_initialization();
+  rel__node__1__1->disable_initialization();
+
+//   matrix edge + successor count
+  relation *rel__matrix__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("matrix", "1"),
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
+
+  relation *rel__rank__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("rank", "1"),
+      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table",
+      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL);
+
+  rel__rank__3__1->set_dependent_column_update(
+      {2, 3, 4},
+      [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+        if (new_v[0] != old_v[0]) {
+          return std::nullopt; // first entries differ: no verdict is given
+        } else {
+          // monotonic: the second entry is asserted to strictly grow (the check vanishes under NDEBUG)
+          assert(new_v[1] > old_v[1]);
+          return new_v[1] > old_v[1];
+        }
+      });
+
+  relation *rel__result__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("result", "1__2"),
+      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
+      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", FULL);
+
+  // 
+
+  RAM* scc_compute_matrix = new RAM(false, 0);
+  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
+  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
+  scc_compute_matrix->add_rule(
+    new parallel_join_aggregate(
+      rel__matrix__3__1, rel__edge__2__1, rel__node__1__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {1, 2, 3}));
+
+  RAM* scc_page_rank = new RAM(true, 1);
+  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+  scc_page_rank->add_relation(rel__rank__3__1, true, false);
+  scc_page_rank->add_rule(new parallel_copy_generate(
+    rel__rank__3__1, rel__matrix__3__1, FULL,
+    [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[0];
+        output[1] = data[0]; // NOTE(review): overwritten two lines below, and output[2] is never written — confirm intended column
+        float init_pg_v = (1 - ALPHA) / total_node_size; // initial rank value
+        output[1] = *reinterpret_cast<u32*>(&init_pg_v); // stores the float's raw 32 bits in a u64 column
+        return 1;
+    }
+  ));
+  parallel_join* rank_join = new parallel_join(
+    rel__rank__3__1,
+    rel__matrix__3__1, FULL,
+    rel__rank__3__1, DELTA,
+    {3,1,2}     // useless
+  );
+  rank_join->set_generator_func(
+    [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
+        float pg_sum = 0.0; // accumulates ALPHA * rank / d2n(input_v[3]) over every tuple in target_vs
+        for (auto& tv: target_vs) {
+            u32 raw_succ_pg_v = (u32)d2n(tv[3]);     // all columns are u64, cast to u32 first
+            // Reinterpret the stored 32 bits as a float: take the ADDRESS of the local, not its value.
+            auto succ_pg_v = *reinterpret_cast<float*>(&raw_succ_pg_v);
+            pg_sum += ALPHA * succ_pg_v / d2n(input_v[3]);
+        }
+        std::vector<u64> res_tuple(3, 0);
+        res_tuple[0] = input_v[1];
+        res_tuple[1] = input_v[0];
+        res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum); // store the sum's raw float bits
+        res_set.push_back(res_tuple);
+        return true;
+    }
+  );
+  scc_page_rank->add_rule(rank_join);
+
+  RAM *scc_result = new RAM(false, 2);
+  scc_result->add_relation(rel__rank__3__1, false, false);
+  scc_result->add_relation(rel__result__2__1__2, true, false);
+  scc_result->add_relation(rel__node__1__1, false, false);
+  scc_result->add_rule(new parallel_join_aggregate(
+    rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
+    agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr,
+    {0, 2}
+  ));
+
+  
+  LIE *pg_lie = new LIE();
+  pg_lie->add_relation(rel__edge__2__1);
+  pg_lie->add_relation(rel__matrix__3__1);
+  pg_lie->add_relation(rel__node__1__1);
+  pg_lie->add_relation(rel__rank__3__1);
+  pg_lie->add_relation(rel__result__2__1__2);
+  pg_lie->add_scc(scc_compute_matrix);
+  pg_lie->add_scc(scc_page_rank);
+  pg_lie->add_scc(scc_result);
+  pg_lie->add_scc_dependance(scc_compute_matrix, scc_page_rank);
+  pg_lie->add_scc_dependance(scc_page_rank, scc_result);
+
+  // Enable IO
+  pg_lie->enable_all_to_all_dump();
+  pg_lie->enable_data_IO();
+  pg_lie->enable_IO();
+  // lie->enable_share_io();
+  pg_lie->set_output_dir(slog_output_dir); // Write to this directory
+  pg_lie->set_comm(mcomm);
+  pg_lie->set_batch_size(1);
+  pg_lie->execute();
+  pg_lie->print_all_relation_size(); // Continuously print relation sizes
+  // lie->stat_intermediate();
+
+  // print all variants(non-canonical index of each relation)
+//   if (mcomm.get_rank() == 0) {
+//     std::cout << "rel_name"
+//               << ",\t"
+//               << "indices\n";
+//     for (auto const &rel_p : rel_index_map) {
+//       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
+//     }
+//     std::cout << std::endl;
+//   }
+
+
+  delete pg_lie;
+
+  // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+  mcomm.destroy();
+
+  return 0;
+}
diff --git a/server_log b/server_log
index 9c9807f1..3024eca0 100644
--- a/server_log
+++ b/server_log
@@ -1,8 +1,8 @@
-[I 2022-08-22 16:33:53] concurrency model: async
-[I 2022-08-22 16:33:53] masquerade (NAT) address: None
-[I 2022-08-22 16:33:53] passive ports: None
-[I 2022-08-22 16:33:53] >>> starting FTP server on 0.0.0.0:2121, pid=160658 <<<
+[I 2022-11-24 17:09:37] concurrency model: async
+[I 2022-11-24 17:09:37] masquerade (NAT) address: None
+[I 2022-11-24 17:09:37] passive ports: None
+[I 2022-11-24 17:09:37] >>> starting FTP server on 0.0.0.0:2121, pid=1188430 <<<
 user break
   context...:
-   /home/stargazermiao/workspace/PL/slog/compiler/slog-process.rkt:45:0: loop
-   body of "/home/stargazermiao/workspace/PL/slog/compiler/slog-process.rkt"
+   /home/ubuntu/workspace/slog/compiler/slog-process.rkt:45:0: loop
+   body of "/home/ubuntu/workspace/slog/compiler/slog-process.rkt"

From a3efb3a8a41a59a10e57dccd40f4f9897f013547 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Sun, 27 Nov 2022 19:55:37 +0000
Subject: [PATCH 16/36] fix dependent index

---
 backend/src/RA/parallel_agg.cpp               |   13 +-
 backend/src/RAM/RA_tasks.cpp                  |    4 +-
 backend/src/lie/lie.cpp                       |   13 +-
 .../src/relation/balanced_hash_relation.cpp   |   64 +
 backend/src/relation/balanced_hash_relation.h |    4 +
 backend/src/relation/shmap_relation_exp.cpp   |   53 +-
 .../pagerank/compiled_pre/in/$strings.csv     |    0
 .../pagerank/compiled_pre/in/258.edge.2.table |  Bin 0 -> 37392 bytes
 .../tests/pagerank/compiled_pre/pagerank.cpp  |  242 ++-
 backend/tests/pagerank/ground_truth           |   60 +
 backend/tests/pagerank/pagerank.py            |   19 +
 backend/tests/pagerank/test-graph/edge.fasts  | 1558 +++++++++++++++++
 12 files changed, 1947 insertions(+), 83 deletions(-)
 create mode 100644 backend/tests/pagerank/compiled_pre/in/$strings.csv
 create mode 100644 backend/tests/pagerank/compiled_pre/in/258.edge.2.table
 create mode 100644 backend/tests/pagerank/ground_truth
 create mode 100644 backend/tests/pagerank/pagerank.py
 create mode 100644 backend/tests/pagerank/test-graph/edge.fasts

diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp
index 9fc5a204..fc42c114 100644
--- a/backend/src/RA/parallel_agg.cpp
+++ b/backend/src/RA/parallel_agg.cpp
@@ -93,11 +93,10 @@ void parallel_join_aggregate::local_aggregate(
             agg_target->insert_tuple_from_array(input0_buffer+k1, target->get_arity()+1);
         }
     }
-
     btree::btree_map<std::vector<u64>, u64, shmap_relation::t_comparator> res_map;
     for (u32 bucket=0; bucket < buckets; bucket ++) {
         for (auto tuple: input->get_full()[bucket]) {
-            std::vector<u64> data_v(tuple.begin(), tuple.begin()+target->get_join_column_count());
+            std::vector<u64> data_v(tuple.begin(), tuple.begin()+input->get_join_column_count());
             // std::cout << "On rank " << mcomm.get_rank() << " bucket " << *(target->get_sub_bucket_per_bucket_count()) << std::endl;
             auto joined_range = agg_target->prefix_range(data_v);
             auto agg_data = local_func(joined_range);
@@ -110,20 +109,28 @@ void parallel_join_aggregate::local_aggregate(
             }
         }
     }
+            
+    // std::cout << ">>>>>>>>>>>>>>>>>>>>> " << input->get_full()[0].size() << std::endl;
 
     for (u32 bucket=0; bucket < buckets; bucket ++) {
         for (auto input_tuple: input->get_full()[bucket]) {
             std::vector<u64> joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count());
             auto agg_res = res_map[joined_input_tuple];
-            std::vector<u64> tuple(reorder_mapping.size(), 0);
+            std::vector<u64> tuple(output->get_arity(), 0);
             int reorder_agg_index = input->get_arity() + 1;
             for (long unsigned int j = 0; j < reorder_mapping.size(); j++) {
+            //   std::cout << reorder_mapping[j] << " " << reorder_agg_index << std::endl;
               if (reorder_mapping[j] == reorder_agg_index) {
                 tuple[j] = agg_res;
               } else {
                 tuple[j] = input_tuple[reorder_mapping[j]];
               }
             }
+            // std::cout << "aggregated tuple <<<"  << reorder_mapping.size() << " >>> ";
+            // for (auto c: tuple) {
+            //     std::cout << c << " ";
+            // }
+            // std::cout << std::endl;
 
             uint64_t bucket_id = tuple_hash(tuple.data(), output->get_join_column_count()) % buckets;
             uint64_t sub_bucket_id = 0;
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 2712e343..7bbd4a71 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -1129,7 +1129,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
         intra_bucket_comm_execute();
         auto intra_end = MPI_Wtime(); 
 
-        std::cout << std::setiosflags(std::ios::fixed);
+        // std::cout << std::setiosflags(std::ios::fixed);
         bool local_join_status = false;
         while (local_join_status == false)
         {
@@ -1262,7 +1262,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         //    std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl;
 #endif
 
-        std::cout << std::setiosflags(std::ios::fixed);
+        // std::cout << std::setiosflags(std::ios::fixed);
         auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
         auto intra_end = MPI_Wtime(); 
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 59a0ec57..674a2d92 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -305,12 +305,13 @@ bool LIE::execute ()
     /// Initialize all relations
     for (u32 i = 0 ; i < lie_relations.size(); i++)
     {
-        lie_relations[i]->set_restart_flag(restart_flag);
-        lie_relations[i]->set_share_io(share_io);
-        lie_relations[i]->set_separate_io(separate_io);
-        lie_relations[i]->set_offset_io(offset_io);
-        lie_relations[i]->initialize_relation(mcomm, intern_map);
-
+        if (lie_relations[i]->need_init_huh()) {
+            lie_relations[i]->set_restart_flag(restart_flag);
+            lie_relations[i]->set_share_io(share_io);
+            lie_relations[i]->set_separate_io(separate_io);
+            lie_relations[i]->set_offset_io(offset_io);
+            lie_relations[i]->initialize_relation(mcomm, intern_map);
+        }
 #if DEBUG_OUTPUT
         //lie_relations[i]->print();
 #endif
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 7182fc24..9fcfe747 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -488,6 +488,70 @@ void relation::print()
 //    }
 }
 
+void relation::print(tuple_formator_t ft)
+{
+    u32 buckets = get_bucket_count();
+//    if (mcomm.get_rank() == 0)
+//    {
+        vector_buffer *vb_full = new vector_buffer[buckets];
+        for (u32 i=0; i < buckets; i++)
+        {
+            vb_full[i].vector_buffer_create_empty();
+            std::vector<u64> prefix = {};
+            full[i].as_vector_buffer_recursive(&(vb_full[i]), prefix);
+
+            if (vb_full[i].size != 0)
+            	std::cout << get_debug_id() << " " << mcomm.get_rank() << " FULL Rows " << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl;
+            for (u32 j=0; j < vb_full[i].size/sizeof(u64); j = j + arity+1)
+            {
+                if (j % (arity+1) == 0) {
+                    std::cout << "F [" << j/(arity+1) << "] ";
+
+                }
+                std::vector<u64> cur_tuple;
+                for (u32 k = 0; k < arity+1; k++)
+                {
+                    u64 temp;
+                    memcpy(&temp, (vb_full[i].buffer) + (j + k)*sizeof(u64), sizeof(u64));
+                    // std::cout << temp << " ";
+                    cur_tuple.push_back(temp);
+                }
+                ft(cur_tuple);
+            }
+
+            vb_full[i].vector_buffer_free();
+        }
+        delete[] vb_full;
+
+
+        // vector_buffer *vb_delta = new vector_buffer[buckets];
+        // for (u32 i=0; i < buckets; i++)
+        // {
+        //     vb_delta[i].vector_buffer_create_empty();
+        //     std::vector<u64> prefix = {};
+        //     delta[i].as_vector_buffer_recursive(&(vb_delta[i]), prefix);
+
+        //     if (vb_delta[i].size != 0)
+        //         std::cout << get_debug_id() << " " << mcomm.get_rank() << " DELTA Rows " << vb_delta[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl;
+
+        //     for (u32 j=0; j < vb_delta[i].size/sizeof(u64); j = j + arity+1)
+        //     {
+        //         if (j % (arity+1) == 0)
+        //             std::cout << "D ";
+
+        //         for (u32 k = 0; k < arity+1; k++)
+        //         {
+        //             u64 temp;
+        //             memcpy(&temp, (vb_delta[i].buffer) + (j + k)*sizeof(u64), sizeof(u64));
+        //             std::cout << temp << " ";
+        //         }
+        //         std::cout << std::endl;
+        //     }
+
+        //     vb_delta[i].vector_buffer_free();
+        // }
+        // delete[] vb_delta;
+}
 
 #if 0
 void relation::flush_full()
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index 88d63404..0757011d 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -20,6 +20,8 @@ enum {DELTA=0, FULL, FULL_AND_DELTA};
 enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE};
 enum {STATIC=0, DYNAMIC};
 
+using tuple_formator_t = std::function<void(const std::vector<u64>&)>;
+
 // this is update function for column has functional dependence
 // the size of vector arguments must have exactly same size as dependent_column_indices
 
@@ -223,6 +225,7 @@ class relation
 
     /// print all tuples of newt, delta and full
     void print();
+    void print(tuple_formator_t ft);
 
 
     void serial_IO(std::string filename_template);
@@ -290,5 +293,6 @@ class relation
     // skip initialization/loading facts
     void disable_initialization() { init_flag = false; }
     void enable_initialization() { init_flag = true; }
+    bool need_init_huh() { return init_flag; }
 
 };
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 9ceff570..6d2ea852 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -65,29 +65,50 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
             // update
             // iterator need_delete = ind.end();
             std::vector<iterator> need_deletes;
+            bool joined = false;
             for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) {
                 auto cur_tuple = *it;
-                // std::cout << "comparing  <<<<<< ";
-                // for (auto c: cur_tuple) {
-                //     std::cout << c << " ";
+                // if (tp[0] == 59 && tp[1] == 58) {
+                //     std::cout << "tppppp  <<<<<< ";
+                //     for (auto c: cur_tuple) {
+                //         std::cout << c << " ";
+                //     }
+                //     std::cout << std::endl;
                 // }
-                // std::cout << std::endl;
+                
                 std::vector<u64> old_t;
                 for (auto i: dependent_column_indices) {
                     old_t.push_back(cur_tuple[i]);
                 }
                 auto compare_res = update_compare_func(old_t, dependent_columns, tp);
-                if (compare_res.has_value() && compare_res.value()) {
-                    need_deletes.push_back(it);
-                    // std::cout << "update with <<<<<< ";
-                    // for (auto c: tp) {
-                    //     std::cout << c << " ";
+                if (!compare_res.has_value()) {
+                    continue;
+                }
+                if (compare_res.value()) {
+                    need_deletes.push_back(it);  
+                    // if (tp[0] == 59 && tp[1] == 58) {
+                    //     for (auto c: cur_tuple) {
+                    //         std::cout << c << " ";
+                    //     }
+                    //     std::cout << "update with " << compare_res.value() <<" <<<<<< ";
+                    //     for (auto c: tp) {
+                    //         std::cout << c << " ";
+                    //     }
+                    //     std::cout << std::endl;
                     // }
-                    // std::cout << std::endl;
                 }
+                joined = true;
+            }
+            if (!joined) {
+                return insert(tp);
             }
             if (!need_deletes.empty()) {
                 for (auto d: need_deletes) {
+                    // std::cout << "delete >>>>  ";
+                    // for (auto c: *d) {
+                    //     std::cout << c << " ";
+                    // }
+                    // std::cout << std::endl;
                     ind.erase(*d);
                 }
                 return insert(tp);
@@ -563,12 +584,22 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
 
     if (generator_mode) {
         std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
+        // std::cout << "Input >>>>>> ";
+        // for (auto c: input_t) {
+        //     std::cout << c << " ";
+        // }
+        // std::cout << std::endl;
         std::vector<std::vector<u64>> eq_tuple_set;
         std::vector<std::vector<u64>> generated_tuple_set;
         std::vector<u64> prev_non_dependent_columns;
         for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){
             auto cur_path = *it;
-            std::vector<u64> cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size());
+            std::vector<u64> cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+cur_path.size()-dependent_column_indices.size());
+            // std::cout << " cur prefix >>>>>>> ";
+            // for (auto c: cur_path) {
+            //     std::cout << c << " ";
+            // }
+            // std::cout << std::endl;
             if (cur_non_dependent_columns == prev_non_dependent_columns) {
                 eq_tuple_set.push_back(cur_path);
                 continue;
diff --git a/backend/tests/pagerank/compiled_pre/in/$strings.csv b/backend/tests/pagerank/compiled_pre/in/$strings.csv
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/pagerank/compiled_pre/in/258.edge.2.table b/backend/tests/pagerank/compiled_pre/in/258.edge.2.table
new file mode 100644
index 0000000000000000000000000000000000000000..dab8cf6b7808db7f2375a2e0273c5839937d6f20
GIT binary patch
literal 37392
zcmZAAcf61F!^iQjz4zV+$EFY^Dk>@sp-4tXs7Pg`BCDc|wgwFuC9<Mrl#!L)pk%a^
z(N1K$&vp5{evj*Zoqx{py!w^ipX+*^>v(iO?o{gBN2c&O{xq`L|9+J@d=+0$c$VPt
z^@L{)9$!y*w&3yggl7*PUr%_B;PLf@=L{ZSPk652@%4n~4jx}mc%I<#^@QgQ9$!y*
zzTology#<)Ur%^};PLf@7YrU>Pk5o=@%4lk4jx}mc#+`o^@JA<9$!y*vEcFbgclDU
zUr%_6;PLf@mkb_XPk5=|@%4n44jx}mc$whw^@NuV9$!y*x#02jgqIH<Ur%_2;PLf@
zrv;C%C%j_t_<F)C1&^;MymIjPdcxC#$JY~HC3t*2;Z=jj*AreXcziwK)q}^^6J8^D
zd_CbcgU8nsUMqNfJ>j*3$JY~HCwP24;pYX9uP6Nc;PLf@*9{(DPk6oH@%4n)4<27n
z_yxh^>j`fVJiearhQZ_O3BNFSd_Cb81&^;MyixG@dcrRb9$!y*<KXf2gf|HuU(fLW
z*WZk5@%4mrj+aFLyM%L&m&!TE%jBG6Q#t3@OwKu8F6SJtkaLb#$~nj8a?Y`ZoO8TN
z&N*H!=Nzw*bB-<LoY%E-&gHt`nWN8%kL#naGxo<hmmB1qODj3&a-*DcX)Whm+Q>PV
zwsOwpCOPMFvz&8jC+A$+%Q=@@<eW<fIp=b#oO9_Y=lSX+=Q-*u=e)WEkKa3@ufxY}
z(dUfw!a1+5a?Y!pob&1~=e&Bz+5dJq`}dTye=j-v_m;E&9dh=+Q_lW($=SJ&oadsi
zoSp9uo;CWM_~;jXov|~{&iBaK`Cd6Y-zR71{&Jp+0dk&;fpX4ekeog5m$TboIlDa|
zXSWCC+_w(}&lY`7d<==c&e#oSx1n-&dsxnH!{qEXT+VJI<m@(5&TgaR+|Q55xt|}E
zb3Z>OXaCW1`Y=Y${*TMq|B2w)qtA(tvC-EVJLBv;PR`Eb<?K8`&dyKD+4(6sJ3lRF
z=V#>X{H&avpOdrm^Ky2cD5o1Q$l3WtIURT@c#i0E;^XD$>x`XocAg|>=gD$*enrmC
zugcl^H932}E@#gva`t>f&Yn}{>^V)&p3~*@WQLrb-;{Gdza^(9GlS=hJ|{ljj=s*=
zA7}qra`vAsXa9HP>_11&{&VHr=kw&8%X~TKvOvzcyesEi7Rou7MRI!bo}B&Pm$UN+
z!E;5Q6CWQ&UuW!$v-3xCc3v!J=OuD>UMgqjWpZ}@SkBI$$k}<hoSj$5*?Fa$`+Sw0
z`}|Wm`>&R>{~9^>|5`ccvMzY;=yT#@ee`w4x!|13XL8PEgPi?0%GrODoc%Y;*?)_i
z{kO{5f18~9`Exn@e<5f8FXim~m7JZo%h`E{oc@0uJWupF@$pUcb;kZU`|p&q|F?3U
zi(PWgWw)ID_sH3Qublh&J2^XlFK5pm<m~pNoZWs3o;UiO_}CYHov{zjK0nLZ=NCEq
z?3Z(Y9+0!oK{@x~AvyQquX6VJP0oGzyPVzrkkf%b<=mft$=UPo;Q6A@iI0DxuQT?<
z+4HcRJ^z)n=Mg#g+fg|?|0ieXV{&#rE@$Txa_-NQa`rzZXXn#$?$0xFp5L><^GBbP
zADJ?Kk+CPvo|)wAnOV-BS>)`QRnDH-<m{PU&U2kZ&dxdI?3_!^&bj66oJY=mo>$KP
z`Q+T^`Q_Z_1>~GpK{@w-Avxz*ICz2N8D@N4M9w)Dm2-~8<eX!1Ip<hH&N-HpbB?9t
zoMUM@=U7J0IhK`kj^*T>V|h6{SCF%Fnw*_01}~_cE6LfpvYegM<?L2P&TduZ>{d<A
zZq?=NQ$x-^HRasrwdCB-wdL$qCwL+4bDo@i&X==KT{-t*JvsMbeK~txAZO18a_*~!
za(25=&OR5(d9EADx!*1hURe7yma|V2Is059XP-;u>~oo%eVWSIr<t7l?Q%K0T_I<;
zE9Km8&E@RbLQYq&l5-zkEobLz<lLVv<viEd1}~y>xlYdh*UQ=e208n;lC%Gfa&~Sl
z=ecepXaBZxcD_l@&Ns{1vz?s#xxJh{Z;`WShu}rE=dE(?^Nw=%>?CKm&T{VCE^_v~
zP0pTO<=nU3<m}d6&i&a#&OW!x*{5glV%n#doPB!BxexD<v)i3=?!&v}?Ab@oo_*!)
zdAFQB`^mWv?~$|Ty>j-vPtN_>U(WMBK+gUH<?KHwcyaB2znuLC%h~?{Ir~2-Xa9%f
z>^wxy&O_zw{IHx343o3-a5+1VkhAkhIXjP%v-2atOK8tW<?Q*GoIOX&*>jAXJs+2I
zKR+R7=dp5j9w%q#@p5*aAg3Eo%DMlal5;Lk%Q=^4<eba1a{Bz7ocsLw;3ah~6Xl%C
z3v$loMLFm4lALpSS<bmkl5;MT<($haa?a&dIs3mRXaCpb>_0`${%^?Hf2y4Qr^$IP
zrUx&j{b$J8|4li2z9nb3nR5Pp-<I?HoF(VEo-OD8c_(;j{r>03`Tft8bN|efbN|ej
z^Y6Pr&U5#!oO53&=NuQwImh?p?DM{y9X<$NMmu~cXNQmE?66qQ4ol?huvE@-v`o(P
z^0A!f<r6u3E|>FMtdO(YN;&t>DmlA-8oaD_TP<g|HF9=aD`&TLa&}uUXSdJfJVzVk
z?730So}1+CxmnKrxkb)%x>e5p+vMEOpUc_*3pxGyQqH-26}+6zWxJen*&*j#zLs+?
z-^kg2r=0!2m9zgYIs5OHv-2J~d+wF9=XY}M^Y7*K;RiYU{}{Zy_WViCp8MqN`Lmoo
zf01*a@0YXl0XaJ#l(Xj{IrsCga(4br&d$Hf+4&DSJO3$X=fC9K&wtCgpZ^J7LHi$;
zv;V(x_CF$L|D$sD|4+{T$K>pPT+V%dLe9CIlyfep<ebZCIh{EpXaBQu_K*L=#*9Dz
zXHRy{lsRMI3{TU}ndR)9Mb6Gy<?NhI&d%B8?3_c+&N=1moJ-E0x#jGcN6vkoSI&K&
zPtKnC<=oE&f>+d@1?B8nNY0*x<?LBR&Ynf(>{(3Cp2g+tSwhZzTT;%>rR3~fTF!l2
zM$Z0a<?LTh&i>`)bh|?EO4>h7&i)nU>|aUF{*~qIpDt(rDsuL(Drf&{a`vw-Xa5><
z_OB^t|5|ePuPtZ)I&wO5o}6<zU(Rz;H+W^8OFcQ~QeV!wTp;IM8pt`9hH}p3LOJJh
zk(~V-$=Uy6IXgF&vu6`IdtM@E&r9Xp=a<RZxv8AaHw&Jw{V$iZ{}pofzf#Wr&E@Rh
zLeBnI$=UyEInTv4a?YitocsS;Ip=bnocsTJIp=bNoO5X<=Ui@-b1toeSJD1$<m}&8
z&i*&a+5cua`?r&`e|tIm-y&!K4s!OtRnBwKQO>z^l5;Md<vcH4<ebZGa?YizoO9_W
z=eg(}ysFNnhn#b{UCz1mlyfe<<eW=yIp=bRoO8KT&bizr=Un>8IhVe2&gE`7=h9Ek
z^Ky@z^SW2gdEF=HT>8s7mjS`6>0AcNIhR3l&gFhN=Q3E%xjZ1}TppBjE)U5$mmzZY
zA1Y_hhvn=xEO>S8FkH?KBjoHbQqB&e<m~W>oE;vO^Y8nZocnOJoE^r<xep(gv%?c|
zx;j?Q{WC6j4ed5w&TbRr?DnLb-JX)O+tYG(dq&Rv^Q@dbpOdra^K$l_C}+<X<UH3e
z%DE3;lJi`@EazM%$?5Q9InVnma?bJ9;G74?_L@BU@izK+T^{{1`j{fma*k7P$g>8Y
zD$f>tnml{(>GB-GXUO^Q^xu^8-<7{5&mI0gGv#@Lzb(%je3m?4@Y(YG!QYV=2tG$%
zF!)?~q2Tl6g@ezR7Y)8ZUM%>#^5Vf4%1Z=aBrh5KJ$b3%@5@UE|3F?Q_=j@-ockj=
zf6lsC&Yu@9k@M#+OXX?dpSw)XpC5cIuN40JPvrbuwOr2o{t7wop)2LQ2d$FxefU#3
z-<MX)>FydiU0o}utLx-+b-kRfekP}@8{~9#qnxg8lGD}Aa=N-jPFJ_e>FPE)UHx26
zSHF<c)i33A^(#4D-7e?*<Bs5*N4Tw0U(4z3H}dG0(Z^0Xz5P~BZ+FS*?QS`}-6N;B
zd*$@@J2}1mUQTa+kki{A<@EL^IlbK{r?)@L>FqCadb?jvZx6`n?Lm2wbH|c8B&WB(
z%IWQIa(er_oZkK+r>lR;>FQr{y85@AuKpvZtB2)u^<Q~f*x`trt{#<F3V;27a=Lm<
zPFIi1>FNnNT|FtMtEc32^|YL>o{{t3bXHDR<NrhXk<C&(?{qa&^uQ*Zu4b0g)hu$l
znpI9$v&rddb~#<mA*ZW3<#aWdoUZ1U)73n3x|&x`SM$l~YJNFgEfD;FyNCO#pq#E2
zl1IlEeH4~Qzl=VL$mwcPIbAI#r>n)~bhU(>u9lS3)lzc$Sz1m%%gE_xSvmbIC#Rp~
z<#e)woKB|6>10JY@2i#M^s};@ex}RmXB9d9tSYCU)#UWEx}1L2kkikaa{5_IPCskQ
z)8e}=m8v7BpXbRdg}?rMdF9}B<@B?joPO4q)6Wa!^s|ASem0cT&kN=B^CCIDY$T_j
z7t85qV>$h7BB!61$m!>$a{76hoPIWy)6Zsd`gysWeqJG`pI6H1XLC9IY$2zgSIOz;
z)xp^}+&-ym<n*(poPJ&_kA4|_TqmcW*URbW4RZR~N=`p-l+(}Fa{Adu&U<NFIbFR;
zPCsv!)6aHt`q^GiKW~xK&kl0>d8?dGc9he}PI5ZgSxzUr$oaYcHaY$5DyN^_<n*(<
zoKE(T)5+WAY4Kf^O7)b}$zF0g*;`&Y{C)0_)5$yKbn-4ao$Mp0lYQlM@@_eu>?fy_
z_sHqwy>i}z?vvBW{&G4wKu#wI%IV}FIi0*;PA3P;>Er`)I{BcSPCg{3lSAZma;Th6
zJ}jq)!-BI{xJ^>S<@9iboF0yp)4fr0y7!2j?ma4}dymQK-e@`98zZNCkIQ*)d_qq5
z#>(m5I62)LFX!j(337V)q?{f;C8vi^%Xy!CMouT6mD9uL<n-`)IenWbr*ALFc|Uwn
zPWN7t)4iAFbZ?TJ?oF1{y;tOw!wvnaobJ6Qr+cr<>E0AM-Frh$_om9}-ZVMgn=Yq&
zGvsvdO*!v1Z^`N2OgY_qTTb_8$?4u~Io*3lPWR@>>E2v9-J2(;d-LUVZ-Jcdy({Pa
zaG{(YE(*?G^zc17J$zqI4?mF8!w=>3@FO`rTr8)1OXPHKshsXDlk>j!v78=$BBzJT
z<@9icoF1-})5BGAdibfF9<G+ty)|;Ww^q*k;W{}zTra1GpUHVY+#sii8|CzHlbjxI
zmea#6a(cK`P7k-q>EY+{%Hf9oLQW6Al+(kn<n(a6oF49w)4i|dbnhEE-P<Xrd*90G
z-Yz-sHM`|>Z;zbr?UmEL@8opvdpX_vK~DF6l+(SR<aBSJobLTBr+dH1>E3=h-8&GR
zUFhCHIo&%Xr+dH3qhCfJzsc$0?{a$hhnyb%Dd+w0FFBq3TTUnck<-b;ayt31obDZw
z)4ii|y7!-)z8#bEzIR+s_fE*^-bp##J0+)kr{#3-jGXttvvRtZiVk#SvlRCa-OD7W
zZ<*z3(M^)^kws45vdZaOHhKEFzfCH;oQ~y?)3Ka#I+jb$dtGiheaj=KZ+YePEuWme
z<(Jd90&@CRP|o{HAvt|3ET?Zp<n*nmoW2#4)3@Ss`c^_t-%85qTPZnxD=nvQW#sg&
zY;gYl!)=l(C#P@a<@BwBJbE0Wk2E<Qt0<>qmE?4+vYd{k%jsAZIUTDi=e@3)ocFrw
za{5+7PTy+E>02#19jh&;V|C<o>^wOgJ6}%6>dNU@Jvkk#FQ;P{$mv)EIqz=`<@D`B
zIUT!5PRAO_E1$cqQWwk9gEyAbw<dD>c8Q#hT`H$zm&xf^Q#l=LCZ}VU%jwt^ayoXU
zoQ^e@^L?g;oQ_>3r(;*k>DV=LI@VH7$F7yrvFqe??0PvJyFpIJTFL3yjluc%47Wq7
zwVaN%k<+oZayoXCoQ~Zrr(^BpbgaFcj@=@sV;$sl>{dA)>nNvVo#b?^vz(4~k@Iut
zZF1hzy2|NWH#xoPE~i&L<fX$w-Y%zCJ>~SOmz*y3meZv><Z0obd#9XU-6f}2edP42
zubeL3EvHNU<h*y?Bd1sQ%IVd8a(dNYPOk>Y>D53vy&5E^SNF^5)nGZjdO*(m$b)iv
z^^lxi4UyBUp>lflu$*2Elhdo=a(XpFPOnDF>D8#<{QJ<WN96SCQ8~SOOir&x%jwk^
zIlX#ZPOqMj)2p#^dNoeY&xzyZbZLT|E<GuyOHavpzj|6uubz?9t7qlBcReSkOV7)B
zkD4f_S1-uv(u;E5qh6BJtC!{UYLc8@O_tNESLF2SRe5^25nq$jtJmf9YKojLy&<Pd
zQ{{ANnw&08m(!&ga=P@UoG!g3r%N;Cbm?t5U7985y<@hVF1;hCOLOFOX|9|u&6Crm
z`Et6nKu(w5mD8n#!TEjDrA2bO^q!n9y)UOrAIPI$Mjs!_>D5PadbL<iua?N^)lxa{
zQOo4?>SH<YSD(n~*m5}?TOp@oE9G=-m7HFED(C%bwVa<5*U0JES~(qCCogsGSW@fd
zbnG)ZUD_b0KO5!rXOo=wrOop6@cY>!r%PMqbZMKM{(LT{KVQh{&zExg^Oc<bY?srY
z9di2fwVeKZBd0$*<@D!UIqw&{<n(8^oc`>Q)1SR^`tzNf{(LW|KR?Ln&yT_R{e@#s
z{UoP9`{eZJXF2`(MNUul%jwAhIqx|K<@Dr`oNoLorw_l$>A>%D-e>-h(}zFh^x-c#
z?>T?V>Bc{DeqK8)rzii)>B$i}Kd&8?)06+?ycZpl^ImjZo)&JD6LR`<QcizP$<xE%
z=d_%joRRZ$*;zTAiT@vwjck_Uex@^-qHk~sr!$%5bS8_O&SaI-nQU@8lU+_{a>(gS
zPC1>)C8smF<-AAak<*#HaypYwPG|DV=}ZARohc}%Glhcl`v}LLDlDfnMdWm*sGQCe
zlSj8l^if<+XG+NFOi4MNDJAE<rL>&>l#%n^QdUlv%E{?Wc{x3)Ag3p3a(YrxPERVy
z=}BcdKWC-O=}8qiJ*g_EC)MO>@oA+})#dc0hMb<%l=D7QOI{`X{cFqVOdUBrIZsYc
z&X?1Zx^j9_Pfkzj%jwAla(dE0PEQ)j>B)t1dUBDRo-~rvlZ)l_wXvL@G?CMjOXT$A
zQaL@jOioXl2Irp_jyKgzPERhE(~~RY(J!NqE9LxL(_BtBTFB|ZRdSy9tL40hTqCCg
zE#<s#Tq~yo*U9O?^>R9JgPabulGA}3<-BjSmeYYYayrmf&d)VB$$8#ymh-%~lc%3M
z##DQGmEgC?=|BfL?;E$u`SX^Ja=Ot;PB%Ks=|&eh-MCFoH@eE{MmIU#=q{%lJ>+!b
zb~)YXDW@B~<aDFAoNnA9=e^)gIeoZGP9OTn=|kV(bSE5p>TWq5=qKmB;vPBg75B=c
zUq&DI$@zJwznu4w0dhJsP)=tC$$3AyUrv7p%jwSpa(>=<P)=talGBYLa=I~8Uh3RD
zsfXqCWSE?8442c55pudQQl1w6xufK~S3Dx8GmpyY%wuvoGg@9H{By_1>BHl4`tXFD
zK8%&qhjDWHFkVg{Cdlc-lXCj-l$<_1EvFC9$mzqga{BO`oIX4+rw<e5^x*|LeRxq$
zA6^R1|2`a7>Sa0a50m8dVX~Y)ydsZ&8GXDervtCadEQ@_^Sn=y^Yh3Xa(?cZDyIX}
z<OR<CHdE8(^kIga_mel}yobCc=RIVmoIboQ=e=T<oS!>p%jw2Da-R1&a-QqCa-QpX
z@+x7U`EuSL7RY(t-<9*cFO>7VFOu`TzbEH;e_zh?{(+q5{X;p=`$uw~_r-Fa_a$<k
z_oZ^4_hoXP_mAZ~@1Mwd-(N20`}_(y9at%+1FM4b*M;LveJbZYVYQs+eT|&wdaa!A
z@9X3|@9X6}@1Mze-Z#kkzQ0k<^S(*W^S)Wm&m&vpJnviOyeDjv7Y&1cE~gJ)$mzqE
za^5SxlGB0ha^4em$kW2_=W99N_rH<Thn;fz@U5Jm3wFurz-~Dm*dwO{d*yWCJ2}t$
z_i~>1ALKmmKgxODf0FaO@00Vq|19Tu|3%L8zF*Gsen8Iieo)Tyen`&u^IzpW@4p4-
z*F5jP%lW?lhnx=lDd&CQFFEi3f6IBU|B>_k`mmhm_g^{B?-4oA=}|e)>3?#b(_?a;
z)8lg9|4+#IK7LZpd;Tdo-``Kmd9KgMd9KgOdH+vEgO6;M;(U32Gs&xjgU>9l8a#`f
z=Q^vL=Qo?2=Qq2Y=QoF(@8db;JlDD8JlDD9JlA>TJlA>UJlFZ;JlFZ<Jl6%}Jl6%~
aJlBQfJlBQgJl93!Jl93#eBUk>{Qm$w3|%<@

literal 0
HcmV?d00001

diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp
index 21a2aa94..34214be3 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp
@@ -1,8 +1,8 @@
 // location of `parallel_RA_inc.h` here
 #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
-#include <iostream>
 #include <bit>
+#include <iostream>
 #include <iterator>
 #include <map>
 #include <optional>
@@ -261,7 +261,8 @@ local_agg_res_t
 agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
                   joined_range) {
   local_agg_res_t sum_res = 0;
-  for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) {
+  for (shmap_relation::iterator it = joined_range.first;
+       it != joined_range.second; ++it) {
     auto tuple = (*it);
     sum_res += tuple[tuple.size() - 2];
   }
@@ -272,6 +273,33 @@ local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
   return x + y;
 }
 
+local_agg_res_t agg_sum_float_local(
+    std::pair<shmap_relation::iterator, shmap_relation::iterator>
+        joined_range) {
+  float sum_res = 0.0;
+  for (shmap_relation::iterator it = joined_range.first;
+       it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    u32 agg_column_raw = tuple[tuple.size() - 2];
+
+    sum_res += *reinterpret_cast<float *>(&agg_column_raw);
+  }
+  // std::cout << ">>>>>>>  " << sum_res << " " <<
+  // *reinterpret_cast<u32*>(&sum_res) << std::endl;
+  u32 sum_res_encoded = *reinterpret_cast<u32 *>(&sum_res);
+  return sum_res_encoded;
+}
+
+local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw,
+                                     local_agg_res_t y_raw) {
+  float x = *reinterpret_cast<float *>(&x_raw);
+  float y = *reinterpret_cast<float *>(&y_raw);
+  float res = x + y;
+  // std::cout << res << std::endl;
+  u32 res_encoded = *reinterpret_cast<u32 *>(&res);
+  return res_encoded;
+}
+
 //////////////////////////////  maximum  /////////////////////////////////////
 
 local_agg_res_t
@@ -409,7 +437,7 @@ int main(int argc, char **argv) {
   // >>>>>>>>>>>>>>> compute node size
   // (node x)
   relation *rel__node__1__1 = new relation(
-      1, true, 2, get_tag_for_rel("node", "1"),
+      1, true, 1, get_tag_for_rel("node", "1"),
       std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table",
       slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) +
           ".node.1.table",
@@ -496,7 +524,12 @@ int main(int argc, char **argv) {
   rel__edge__2__1->disable_initialization();
   rel__node__1__1->disable_initialization();
 
-//   matrix edge + successor count
+  relation *rel__edge__2__2 = new relation(
+      1, false, 2, get_tag_for_rel("edge", "2"),
+      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
+      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
+
+  //   matrix edge + successor count
   relation *rel__matrix__3__1 = new relation(
       1, true, 3, get_tag_for_rel("matrix", "1"),
       std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
@@ -508,93 +541,167 @@ int main(int argc, char **argv) {
       std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL);
 
   rel__rank__3__1->set_dependent_column_update(
-      {2, 3, 4},
+      {1, 2, 3},
       [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
          const vector<u64> &nt) -> std::optional<bool> {
+        // if (nt[0] == 59 && nt[1] == 58) {
+        //   std::cout << "dependent column size " << new_v.size() << std::endl;
+        //   std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << "
+        //   comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2]
+        //   << std::endl;
+        // }
         if (new_v[0] != old_v[0]) {
+          // std::cout << " www >>>>>>>>>" << std::endl;
           return std::nullopt;
         } else {
           // monotonic
-          assert(new_v[1] > old_v[1]);
+          // assert(new_v[1] > old_v[1]);
+          // u32 new_sum_raw = new_v[1];
+          // u32 old_sum_raw = old_v[1];
+          // float new_sum = *reinterpret_cast<float*>(&new_sum_raw);
+          // float old_sum = *reinterpret_cast<float*>(&old_sum_raw);
+          // if (new_sum > old_sum) {
+          //   std::cout << "new >> " << new_sum << " old >> " << old_sum <<
+          //   std::endl;
+          // }
+          // return new_sum > old_sum;
           return new_v[1] > old_v[1];
+          // return true;
         }
       });
 
   relation *rel__result__2__1__2 = new relation(
       2, true, 2, get_tag_for_rel("result", "1__2"),
       std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
-      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", FULL);
+      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
+      FULL);
 
-  // 
+  //
 
-  RAM* scc_compute_matrix = new RAM(false, 0);
+  RAM *scc_copy_edge = new RAM(false, 0);
+  scc_copy_edge->add_relation(rel__edge__2__1, false, false);
+  scc_copy_edge->add_relation(rel__edge__2__2, true, false);
+  scc_copy_edge->add_rule(
+      new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
+
+  RAM *scc_compute_matrix = new RAM(false, 1);
   scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
+  scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
   scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
-  scc_compute_matrix->add_rule(
-    new parallel_join_aggregate(
-      rel__matrix__3__1, rel__edge__2__1, rel__node__1__1, FULL,
+  scc_compute_matrix->add_rule(new parallel_join_aggregate(
+      rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL,
       agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
-      {1, 2, 3}));
+      {0, 1, 3}));
 
-  RAM* scc_page_rank = new RAM(true, 1);
-  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
-  scc_page_rank->add_relation(rel__rank__3__1, true, false);
-  scc_page_rank->add_rule(new parallel_copy_generate(
-    rel__rank__3__1, rel__matrix__3__1, FULL,
-    [](const u64 *const data, u64 *const output) -> int {
+  RAM *scc_init = new RAM(false, 2);
+  scc_init->add_relation(rel__matrix__3__1, false, false);
+  scc_init->add_relation(rel__rank__3__1, true, false);
+  scc_init->add_rule(new parallel_copy_generate(
+      rel__rank__3__1, rel__matrix__3__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
         output[0] = data[0];
         output[1] = data[0];
-        float init_pg_v = (1 - ALPHA) / total_node_size;
-        output[1] = *reinterpret_cast<u32*>(&init_pg_v);
+        // float init_pg_v = (1 - ALPHA) / total_node_size;
+        u64 init_pg_v = (u64)(((1 - ALPHA) / total_node_size) * 100000);
+        // std::cout << init_pg_v << std::endl;
+        // output[2] = *reinterpret_cast<u32*>(&init_pg_v);
+        output[2] = init_pg_v;
         return 1;
+      }));
+
+  RAM *scc_page_rank = new RAM(true, 3);
+  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+  scc_page_rank->add_relation(rel__rank__3__1, true, false);
+  parallel_join *rank_join =
+      new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
+                        rel__rank__3__1, DELTA, {3, 1, 2} // useless
+      );
+  rank_join->set_generator_func([](const depend_val_t &target_vs,
+                                   const std::vector<u64> &input_v,
+                                   depend_val_t &res_set) -> bool {
+    // float pg_sum = 0.0;
+    u64 pg_sum = 0;
+
+    int count = 0;
+    for (auto &tv : target_vs) {
+      // std::cout << "tagret v >>>>> ";
+      // for (auto c: tv) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << std::endl;
+      u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first
+      // std::cout << ">>>>>>>>>>>>>>> " <<
+      // *reinterpret_cast<float*>(&raw_succ_pg_v) << std::endl;
+      // auto succ_pg_v = *reinterpret_cast<float*>(&raw_succ_pg_v);
+      // if(succ_pg_v == 0) {
+      //   // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl;
+      // std::cout << "tagret v >>>>> ";
+      // for (auto c: tv) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << std::endl;
+      // }
+      if (input_v[2] != 0) {
+        // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]);
+        pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]);
+        // if (input_v[1] == 51) {
+        //   std::cout << "Sum 51 " << input_v[0] << " with ";
+        //   for (auto c: tv) {
+        //     std::cout << c << " ";
+        //   }
+        //   std::cout << " result " << pg_sum << std::endl;
+        // }
+      }
+      count++;
     }
-  ));
-  parallel_join* rank_join = new parallel_join(
-    rel__rank__3__1,
-    rel__matrix__3__1, FULL,
-    rel__rank__3__1, DELTA,
-    {3,1,2}     // useless
-  );
-  rank_join->set_generator_func(
-    [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
-        float pg_sum = 0.0;
-        for (auto& tv: target_vs) {
-            u32 raw_succ_pg_v = (u32)d2n(tv[3]);     // all columns are u64, cast to u32 first
-            auto succ_pg_v = *reinterpret_cast<float*>(raw_succ_pg_v);
-            pg_sum += ALPHA * succ_pg_v / d2n(input_v[3]);
-        }
-        // u64 encoded_sum = 
-        std::vector<u64> res_tuple(3, 0);
-        res_tuple[0] = input_v[1];
-        res_tuple[1] = input_v[0];
-        res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum);
-        res_set.push_back(res_tuple);
-        return true;
+    if (pg_sum == 0) {
+      return false;
     }
-  );
+    if (count == 0) {
+      return false;
+    }
+    std::vector<u64> res_tuple(3, 0);
+    res_tuple[0] = input_v[1];
+    res_tuple[1] = input_v[0];
+    // res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum);
+    res_tuple[2] = pg_sum;
+    // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl;
+    // for (auto c: res_tuple) {
+    //   std::cout << c << " ";
+    // }
+    // std::cout << std::endl;
+    res_set.push_back(res_tuple);
+    return true;
+  });
   scc_page_rank->add_rule(rank_join);
 
-  RAM *scc_result = new RAM(false, 2);
+  RAM *scc_result = new RAM(false, 4);
   scc_result->add_relation(rel__rank__3__1, false, false);
   scc_result->add_relation(rel__result__2__1__2, true, false);
   scc_result->add_relation(rel__node__1__1, false, false);
+  // scc_result->add_rule(new parallel_join_aggregate(
+  //     rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
+  //     agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce,
+  //     nullptr, {0, 2}));
   scc_result->add_rule(new parallel_join_aggregate(
-    rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-    agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr,
-    {0, 2}
-  ));
+      rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
+      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2}));
 
-  
   LIE *pg_lie = new LIE();
   pg_lie->add_relation(rel__edge__2__1);
   pg_lie->add_relation(rel__matrix__3__1);
   pg_lie->add_relation(rel__node__1__1);
+  pg_lie->add_relation(rel__edge__2__2);
   pg_lie->add_relation(rel__rank__3__1);
   pg_lie->add_relation(rel__result__2__1__2);
+  pg_lie->add_scc(scc_copy_edge);
   pg_lie->add_scc(scc_compute_matrix);
+  pg_lie->add_scc(scc_init);
   pg_lie->add_scc(scc_page_rank);
   pg_lie->add_scc(scc_result);
-  pg_lie->add_scc_dependance(scc_compute_matrix, scc_page_rank);
+  pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
+  pg_lie->add_scc_dependance(scc_compute_matrix, scc_init);
+  pg_lie->add_scc_dependance(scc_init, scc_page_rank);
   pg_lie->add_scc_dependance(scc_page_rank, scc_result);
 
   // Enable IO
@@ -608,18 +715,31 @@ int main(int argc, char **argv) {
   pg_lie->execute();
   pg_lie->print_all_relation_size(); // Continuously print relation sizes
   // lie->stat_intermediate();
+  // rel__matrix__3__1->print();
+  // rel__rank__3__1->print(
+  //   [](const std::vector<u64>& tp){
+  //     u32 pg_v = tp[2];
+  //     // std::cout << tp[0] << " " << tp[1] << " " <<
+  //     *reinterpret_cast<float*>(&pg_v) << std::cout << tp[0] << " " << tp[1]
+  //     << " " << pg_v << std::endl;
+  //   }
+  // );
+  rel__result__2__1__2->print([](const std::vector<u64> &tp) {
+    u32 pg_v = tp[1];
+    // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
+    std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl;
+  });
 
   // print all variants(non-canonical index of each relation)
-//   if (mcomm.get_rank() == 0) {
-//     std::cout << "rel_name"
-//               << ",\t"
-//               << "indices\n";
-//     for (auto const &rel_p : rel_index_map) {
-//       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
-//     }
-//     std::cout << std::endl;
-//   }
-
+  //   if (mcomm.get_rank() == 0) {
+  //     std::cout << "rel_name"
+  //               << ",\t"
+  //               << "indices\n";
+  //     for (auto const &rel_p : rel_index_map) {
+  //       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
+  //     }
+  //     std::cout << std::endl;
+  //   }
 
   delete pg_lie;
 
diff --git a/backend/tests/pagerank/ground_truth b/backend/tests/pagerank/ground_truth
new file mode 100644
index 00000000..e896aa19
--- /dev/null
+++ b/backend/tests/pagerank/ground_truth
@@ -0,0 +1,60 @@
+0	0.0360
+1	0.0113
+2	0.0119
+3	0.0103
+4	0.0093
+5	0.0103
+6	0.0103
+7	0.0108
+8	0.0103
+9	0.0113
+10	0.0108
+11	0.0087
+12	0.0098
+13	0.0098
+14	0.0124
+15	0.0103
+16	0.0087
+17	0.0087
+18	0.0113
+19	0.0098
+20	0.0087
+21	0.0108
+22	0.0098
+23	0.0113
+24	0.0108
+25	0.0067
+26	0.0082
+27	0.0093
+28	0.0113
+29	0.0098
+30	0.0108
+31	0.0103
+32	0.0098
+33	0.0108
+34	0.0113
+35	0.0108
+36	0.0108
+37	0.0113
+38	0.0113
+39	0.0082
+40	0.0093
+41	0.0119
+42	0.0354
+43	0.0347
+44	0.0341
+45	0.0331
+46	0.0329
+47	0.0324
+48	0.0317
+49	0.0310
+50	0.0305
+51	0.0300
+52	0.0294
+53	0.0288
+54	0.0281
+55	0.0271
+56	0.0272
+57	0.0265
+58	0.0259
+59	0.0254
diff --git a/backend/tests/pagerank/pagerank.py b/backend/tests/pagerank/pagerank.py
new file mode 100644
index 00000000..753feb8e
--- /dev/null
+++ b/backend/tests/pagerank/pagerank.py
@@ -0,0 +1,19 @@
+"""
+generate a test graph and pagerank ground truth for testing
+
+"""
+
+import networkx as nx
+
+bag = nx.barabasi_albert_graph(60, 41).to_directed()
+pr = nx.pagerank(bag)
+
+with open("test-graph/edge.fasts", "w+") as edge_f:
+    for f, t in bag.edges:
+        edge_f.write(f"{f}\t{t}\n")
+
+with open("ground_truth", "w+") as truth_f:
+    for node, val in pr.items():
+        truth_f.write(f"{node}\t{val:.4f}\n")
+
+print("done!")
diff --git a/backend/tests/pagerank/test-graph/edge.fasts b/backend/tests/pagerank/test-graph/edge.fasts
new file mode 100644
index 00000000..dc33fde7
--- /dev/null
+++ b/backend/tests/pagerank/test-graph/edge.fasts
@@ -0,0 +1,1558 @@
+0	1
+0	2
+0	3
+0	4
+0	5
+0	6
+0	7
+0	8
+0	9
+0	10
+0	11
+0	12
+0	13
+0	14
+0	15
+0	16
+0	17
+0	18
+0	19
+0	20
+0	21
+0	22
+0	23
+0	24
+0	25
+0	26
+0	27
+0	28
+0	29
+0	30
+0	31
+0	32
+0	33
+0	34
+0	35
+0	36
+0	37
+0	38
+0	39
+0	40
+0	41
+0	42
+0	43
+0	44
+0	45
+0	46
+0	47
+0	48
+0	49
+0	50
+0	51
+0	52
+0	53
+0	54
+0	55
+0	56
+0	57
+0	58
+0	59
+1	0
+1	42
+1	43
+1	44
+1	45
+1	46
+1	47
+1	48
+1	49
+1	50
+1	51
+1	52
+1	53
+1	54
+1	55
+1	57
+1	59
+2	0
+2	42
+2	43
+2	44
+2	45
+2	46
+2	47
+2	48
+2	49
+2	50
+2	51
+2	52
+2	53
+2	54
+2	55
+2	56
+2	58
+2	59
+3	0
+3	42
+3	43
+3	44
+3	45
+3	48
+3	49
+3	50
+3	51
+3	52
+3	53
+3	54
+3	56
+3	57
+3	59
+4	0
+4	42
+4	43
+4	44
+4	45
+4	46
+4	47
+4	48
+4	52
+4	54
+4	56
+4	57
+4	58
+5	0
+5	42
+5	43
+5	44
+5	45
+5	46
+5	47
+5	48
+5	49
+5	50
+5	51
+5	52
+5	55
+5	56
+5	57
+6	0
+6	42
+6	43
+6	44
+6	45
+6	46
+6	47
+6	48
+6	49
+6	50
+6	51
+6	52
+6	53
+6	56
+6	59
+7	0
+7	42
+7	43
+7	44
+7	45
+7	46
+7	48
+7	49
+7	50
+7	51
+7	52
+7	53
+7	54
+7	56
+7	57
+7	58
+8	0
+8	42
+8	43
+8	44
+8	45
+8	46
+8	47
+8	48
+8	49
+8	50
+8	51
+8	52
+8	55
+8	56
+8	58
+9	0
+9	42
+9	43
+9	44
+9	45
+9	46
+9	47
+9	48
+9	49
+9	50
+9	51
+9	53
+9	54
+9	55
+9	56
+9	57
+9	58
+10	0
+10	42
+10	43
+10	45
+10	46
+10	47
+10	48
+10	49
+10	50
+10	52
+10	53
+10	54
+10	56
+10	57
+10	58
+10	59
+11	0
+11	42
+11	43
+11	44
+11	45
+11	47
+11	48
+11	51
+11	52
+11	54
+11	57
+11	58
+12	0
+12	42
+12	43
+12	44
+12	46
+12	47
+12	49
+12	50
+12	52
+12	53
+12	54
+12	55
+12	56
+12	58
+13	0
+13	42
+13	43
+13	44
+13	45
+13	46
+13	47
+13	49
+13	50
+13	51
+13	52
+13	54
+13	57
+13	59
+14	0
+14	42
+14	43
+14	44
+14	45
+14	46
+14	47
+14	48
+14	49
+14	50
+14	51
+14	52
+14	53
+14	54
+14	55
+14	56
+14	57
+14	58
+14	59
+15	0
+15	42
+15	43
+15	44
+15	45
+15	46
+15	47
+15	48
+15	49
+15	50
+15	51
+15	52
+15	57
+15	58
+15	59
+16	0
+16	42
+16	43
+16	46
+16	47
+16	48
+16	49
+16	51
+16	52
+16	53
+16	55
+16	58
+17	0
+17	42
+17	43
+17	44
+17	45
+17	46
+17	47
+17	50
+17	51
+17	53
+17	56
+17	57
+18	0
+18	42
+18	43
+18	44
+18	45
+18	46
+18	47
+18	48
+18	50
+18	51
+18	52
+18	53
+18	54
+18	55
+18	57
+18	58
+18	59
+19	0
+19	43
+19	44
+19	45
+19	46
+19	48
+19	49
+19	50
+19	52
+19	53
+19	55
+19	56
+19	58
+19	59
+20	0
+20	42
+20	44
+20	46
+20	47
+20	48
+20	49
+20	51
+20	53
+20	54
+20	58
+20	59
+21	0
+21	42
+21	43
+21	44
+21	45
+21	46
+21	47
+21	48
+21	49
+21	52
+21	53
+21	54
+21	55
+21	57
+21	58
+21	59
+22	0
+22	42
+22	43
+22	44
+22	45
+22	47
+22	48
+22	49
+22	50
+22	53
+22	54
+22	55
+22	57
+22	58
+23	0
+23	42
+23	43
+23	44
+23	45
+23	46
+23	47
+23	48
+23	49
+23	50
+23	51
+23	52
+23	53
+23	54
+23	55
+23	56
+23	59
+24	0
+24	42
+24	43
+24	44
+24	45
+24	46
+24	47
+24	48
+24	49
+24	51
+24	52
+24	53
+24	54
+24	55
+24	56
+24	58
+25	0
+25	42
+25	45
+25	47
+25	52
+25	56
+25	57
+25	59
+26	0
+26	42
+26	43
+26	44
+26	45
+26	50
+26	51
+26	54
+26	55
+26	56
+26	59
+27	0
+27	42
+27	43
+27	44
+27	45
+27	46
+27	48
+27	49
+27	50
+27	53
+27	55
+27	57
+27	59
+28	0
+28	42
+28	43
+28	44
+28	45
+28	46
+28	47
+28	48
+28	49
+28	50
+28	51
+28	52
+28	53
+28	54
+28	57
+28	58
+28	59
+29	0
+29	42
+29	43
+29	44
+29	46
+29	47
+29	48
+29	49
+29	51
+29	53
+29	54
+29	55
+29	56
+29	57
+30	0
+30	42
+30	43
+30	44
+30	45
+30	46
+30	47
+30	49
+30	50
+30	51
+30	53
+30	54
+30	55
+30	56
+30	58
+30	59
+31	0
+31	42
+31	43
+31	44
+31	45
+31	46
+31	47
+31	48
+31	49
+31	50
+31	51
+31	52
+31	55
+31	56
+31	59
+32	0
+32	42
+32	43
+32	44
+32	45
+32	46
+32	47
+32	48
+32	50
+32	51
+32	53
+32	54
+32	56
+32	58
+33	0
+33	42
+33	43
+33	44
+33	45
+33	46
+33	47
+33	48
+33	49
+33	50
+33	51
+33	52
+33	53
+33	54
+33	56
+33	57
+34	0
+34	42
+34	43
+34	44
+34	45
+34	46
+34	47
+34	48
+34	49
+34	50
+34	51
+34	52
+34	53
+34	54
+34	55
+34	56
+34	58
+35	0
+35	42
+35	43
+35	44
+35	45
+35	46
+35	47
+35	49
+35	50
+35	52
+35	54
+35	55
+35	56
+35	57
+35	58
+35	59
+36	0
+36	42
+36	43
+36	44
+36	45
+36	46
+36	47
+36	48
+36	49
+36	51
+36	52
+36	53
+36	55
+36	56
+36	57
+36	58
+37	0
+37	42
+37	43
+37	44
+37	45
+37	46
+37	47
+37	48
+37	49
+37	50
+37	51
+37	52
+37	54
+37	55
+37	56
+37	57
+37	59
+38	0
+38	42
+38	43
+38	44
+38	45
+38	46
+38	47
+38	48
+38	49
+38	50
+38	51
+38	52
+38	53
+38	54
+38	55
+38	57
+38	59
+39	0
+39	42
+39	43
+39	44
+39	45
+39	46
+39	47
+39	48
+39	50
+39	53
+39	55
+40	0
+40	42
+40	43
+40	44
+40	45
+40	46
+40	48
+40	49
+40	50
+40	51
+40	55
+40	58
+40	59
+41	0
+41	42
+41	43
+41	44
+41	45
+41	46
+41	47
+41	48
+41	49
+41	50
+41	51
+41	52
+41	53
+41	54
+41	55
+41	56
+41	57
+41	58
+42	0
+42	1
+42	2
+42	3
+42	4
+42	5
+42	6
+42	7
+42	8
+42	9
+42	10
+42	11
+42	12
+42	13
+42	14
+42	15
+42	16
+42	17
+42	18
+42	20
+42	21
+42	22
+42	23
+42	24
+42	25
+42	26
+42	27
+42	28
+42	29
+42	30
+42	31
+42	32
+42	33
+42	34
+42	35
+42	36
+42	37
+42	38
+42	39
+42	40
+42	41
+42	43
+42	44
+42	45
+42	46
+42	47
+42	48
+42	49
+42	50
+42	51
+42	52
+42	53
+42	54
+42	55
+42	56
+42	57
+42	58
+42	59
+43	0
+43	1
+43	2
+43	3
+43	4
+43	5
+43	6
+43	7
+43	8
+43	9
+43	10
+43	11
+43	12
+43	13
+43	14
+43	15
+43	16
+43	17
+43	18
+43	19
+43	21
+43	22
+43	23
+43	24
+43	26
+43	27
+43	28
+43	29
+43	30
+43	31
+43	32
+43	33
+43	34
+43	35
+43	36
+43	37
+43	38
+43	39
+43	40
+43	41
+43	42
+43	44
+43	45
+43	46
+43	47
+43	48
+43	49
+43	50
+43	51
+43	52
+43	53
+43	54
+43	55
+43	56
+43	57
+43	58
+43	59
+44	0
+44	1
+44	2
+44	3
+44	4
+44	5
+44	6
+44	7
+44	8
+44	9
+44	11
+44	12
+44	13
+44	14
+44	15
+44	17
+44	18
+44	19
+44	20
+44	21
+44	22
+44	23
+44	24
+44	26
+44	27
+44	28
+44	29
+44	30
+44	31
+44	32
+44	33
+44	34
+44	35
+44	36
+44	37
+44	38
+44	39
+44	40
+44	41
+44	42
+44	43
+44	45
+44	46
+44	47
+44	48
+44	49
+44	50
+44	51
+44	52
+44	53
+44	54
+44	55
+44	56
+44	57
+44	58
+44	59
+45	0
+45	1
+45	2
+45	3
+45	4
+45	5
+45	6
+45	7
+45	8
+45	9
+45	10
+45	11
+45	13
+45	14
+45	15
+45	17
+45	18
+45	19
+45	21
+45	22
+45	23
+45	24
+45	25
+45	26
+45	27
+45	28
+45	30
+45	31
+45	32
+45	33
+45	34
+45	35
+45	36
+45	37
+45	38
+45	39
+45	40
+45	41
+45	42
+45	43
+45	44
+45	46
+45	47
+45	48
+45	49
+45	50
+45	51
+45	52
+45	53
+45	54
+45	55
+45	56
+45	57
+45	59
+46	0
+46	1
+46	2
+46	4
+46	5
+46	6
+46	7
+46	8
+46	9
+46	10
+46	12
+46	13
+46	14
+46	15
+46	16
+46	17
+46	18
+46	19
+46	20
+46	21
+46	23
+46	24
+46	27
+46	28
+46	29
+46	30
+46	31
+46	32
+46	33
+46	34
+46	35
+46	36
+46	37
+46	38
+46	39
+46	40
+46	41
+46	42
+46	43
+46	44
+46	45
+46	47
+46	48
+46	49
+46	50
+46	51
+46	52
+46	53
+46	54
+46	55
+46	56
+46	57
+46	58
+46	59
+47	0
+47	1
+47	2
+47	4
+47	5
+47	6
+47	8
+47	9
+47	10
+47	11
+47	12
+47	13
+47	14
+47	15
+47	16
+47	17
+47	18
+47	20
+47	21
+47	22
+47	23
+47	24
+47	25
+47	28
+47	29
+47	30
+47	31
+47	32
+47	33
+47	34
+47	35
+47	36
+47	37
+47	38
+47	39
+47	41
+47	42
+47	43
+47	44
+47	45
+47	46
+47	48
+47	49
+47	50
+47	51
+47	52
+47	53
+47	54
+47	55
+47	56
+47	57
+47	58
+47	59
+48	0
+48	1
+48	2
+48	3
+48	4
+48	5
+48	6
+48	7
+48	8
+48	9
+48	10
+48	11
+48	14
+48	15
+48	16
+48	18
+48	19
+48	20
+48	21
+48	22
+48	23
+48	24
+48	27
+48	28
+48	29
+48	31
+48	32
+48	33
+48	34
+48	36
+48	37
+48	38
+48	39
+48	40
+48	41
+48	42
+48	43
+48	44
+48	45
+48	46
+48	47
+48	49
+48	50
+48	51
+48	52
+48	53
+48	54
+48	55
+48	56
+48	57
+48	58
+48	59
+49	0
+49	1
+49	2
+49	3
+49	5
+49	6
+49	7
+49	8
+49	9
+49	10
+49	12
+49	13
+49	14
+49	15
+49	16
+49	19
+49	20
+49	21
+49	22
+49	23
+49	24
+49	27
+49	28
+49	29
+49	30
+49	31
+49	33
+49	34
+49	35
+49	36
+49	37
+49	38
+49	40
+49	41
+49	42
+49	43
+49	44
+49	45
+49	46
+49	47
+49	48
+49	50
+49	51
+49	52
+49	53
+49	54
+49	55
+49	56
+49	57
+49	58
+49	59
+50	0
+50	1
+50	2
+50	3
+50	5
+50	6
+50	7
+50	8
+50	9
+50	10
+50	12
+50	13
+50	14
+50	15
+50	17
+50	18
+50	19
+50	22
+50	23
+50	26
+50	27
+50	28
+50	30
+50	31
+50	32
+50	33
+50	34
+50	35
+50	37
+50	38
+50	39
+50	40
+50	41
+50	42
+50	43
+50	44
+50	45
+50	46
+50	47
+50	48
+50	49
+50	51
+50	52
+50	53
+50	54
+50	55
+50	56
+50	57
+50	58
+50	59
+51	0
+51	1
+51	2
+51	3
+51	5
+51	6
+51	7
+51	8
+51	9
+51	11
+51	13
+51	14
+51	15
+51	16
+51	17
+51	18
+51	20
+51	23
+51	24
+51	26
+51	28
+51	29
+51	30
+51	31
+51	32
+51	33
+51	34
+51	36
+51	37
+51	38
+51	40
+51	41
+51	42
+51	43
+51	44
+51	45
+51	46
+51	47
+51	48
+51	49
+51	50
+51	52
+51	53
+51	54
+51	55
+51	56
+51	57
+51	58
+51	59
+52	0
+52	1
+52	2
+52	3
+52	4
+52	5
+52	6
+52	7
+52	8
+52	10
+52	11
+52	12
+52	13
+52	14
+52	15
+52	16
+52	18
+52	19
+52	21
+52	23
+52	24
+52	25
+52	28
+52	31
+52	33
+52	34
+52	35
+52	36
+52	37
+52	38
+52	41
+52	42
+52	43
+52	44
+52	45
+52	46
+52	47
+52	48
+52	49
+52	50
+52	51
+52	53
+52	54
+52	55
+52	56
+52	57
+52	58
+52	59
+53	0
+53	1
+53	2
+53	3
+53	6
+53	7
+53	9
+53	10
+53	12
+53	14
+53	16
+53	17
+53	18
+53	19
+53	20
+53	21
+53	22
+53	23
+53	24
+53	27
+53	28
+53	29
+53	30
+53	32
+53	33
+53	34
+53	36
+53	38
+53	39
+53	41
+53	42
+53	43
+53	44
+53	45
+53	46
+53	47
+53	48
+53	49
+53	50
+53	51
+53	52
+53	54
+53	55
+53	56
+53	57
+53	58
+53	59
+54	0
+54	1
+54	2
+54	3
+54	4
+54	7
+54	9
+54	10
+54	11
+54	12
+54	13
+54	14
+54	18
+54	20
+54	21
+54	22
+54	23
+54	24
+54	26
+54	28
+54	29
+54	30
+54	32
+54	33
+54	34
+54	35
+54	37
+54	38
+54	41
+54	42
+54	43
+54	44
+54	45
+54	46
+54	47
+54	48
+54	49
+54	50
+54	51
+54	52
+54	53
+54	55
+54	56
+54	57
+54	58
+54	59
+55	0
+55	1
+55	2
+55	5
+55	8
+55	9
+55	12
+55	14
+55	16
+55	18
+55	19
+55	21
+55	22
+55	23
+55	24
+55	26
+55	27
+55	29
+55	30
+55	31
+55	34
+55	35
+55	36
+55	37
+55	38
+55	39
+55	40
+55	41
+55	42
+55	43
+55	44
+55	45
+55	46
+55	47
+55	48
+55	49
+55	50
+55	51
+55	52
+55	53
+55	54
+55	57
+55	58
+55	59
+56	0
+56	2
+56	3
+56	4
+56	5
+56	6
+56	7
+56	8
+56	9
+56	10
+56	12
+56	14
+56	17
+56	19
+56	23
+56	24
+56	25
+56	26
+56	29
+56	30
+56	31
+56	32
+56	33
+56	34
+56	35
+56	36
+56	37
+56	41
+56	42
+56	43
+56	44
+56	45
+56	46
+56	47
+56	48
+56	49
+56	50
+56	51
+56	52
+56	53
+56	54
+56	57
+56	58
+56	59
+57	0
+57	1
+57	3
+57	4
+57	5
+57	7
+57	9
+57	10
+57	11
+57	13
+57	14
+57	15
+57	17
+57	18
+57	21
+57	22
+57	25
+57	27
+57	28
+57	29
+57	33
+57	35
+57	36
+57	37
+57	38
+57	41
+57	42
+57	43
+57	44
+57	45
+57	46
+57	47
+57	48
+57	49
+57	50
+57	51
+57	52
+57	53
+57	54
+57	55
+57	56
+57	58
+57	59
+58	0
+58	2
+58	4
+58	7
+58	8
+58	9
+58	10
+58	11
+58	12
+58	14
+58	15
+58	16
+58	18
+58	19
+58	20
+58	21
+58	22
+58	24
+58	28
+58	30
+58	32
+58	34
+58	35
+58	36
+58	40
+58	41
+58	42
+58	43
+58	44
+58	46
+58	47
+58	48
+58	49
+58	50
+58	51
+58	52
+58	53
+58	54
+58	55
+58	56
+58	57
+58	59
+59	0
+59	1
+59	2
+59	3
+59	6
+59	10
+59	13
+59	14
+59	15
+59	18
+59	19
+59	20
+59	21
+59	23
+59	25
+59	26
+59	27
+59	28
+59	30
+59	31
+59	35
+59	37
+59	38
+59	40
+59	42
+59	43
+59	44
+59	45
+59	46
+59	47
+59	48
+59	49
+59	50
+59	51
+59	52
+59	53
+59	54
+59	55
+59	56
+59	57
+59	58

From ec9250cc34010f73ae968beb200997b6da1a75f1 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Tue, 29 Nov 2022 22:25:47 +0000
Subject: [PATCH 17/36] stage

---
 backend/src/RAM/RA_tasks.cpp                  |   4 +-
 backend/src/lie/lie.cpp                       |  14 +-
 .../src/relation/balanced_hash_relation.cpp   |   2 +
 .../tests/pagerank/compiled_pre/pagerank.cpp  | 397 ++++-----
 .../pagerank/compiled_pre/pagerank_full.cpp   | 753 ++++++++++++++++++
 5 files changed, 964 insertions(+), 206 deletions(-)
 create mode 100644 backend/tests/pagerank/compiled_pre/pagerank_full.cpp

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 7bbd4a71..2712e343 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -1129,7 +1129,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
         intra_bucket_comm_execute();
         auto intra_end = MPI_Wtime(); 
 
-        // std::cout << std::setiosflags(std::ios::fixed);
+        std::cout << std::setiosflags(std::ios::fixed);
         bool local_join_status = false;
         while (local_join_status == false)
         {
@@ -1262,7 +1262,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         //    std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl;
 #endif
 
-        // std::cout << std::setiosflags(std::ios::fixed);
+        std::cout << std::setiosflags(std::ios::fixed);
         auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
         auto intra_end = MPI_Wtime(); 
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 674a2d92..e517288f 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -588,7 +588,9 @@ bool LIE::execute ()
     if (mcomm.get_rank() == 0)
         std::cout << "Total interation count: " << full_iteration_count << std::endl;
 
-    write_final_checkpoint_dump();
+    if (enable_data_io) {
+        write_final_checkpoint_dump();
+    }
 
     // std::cout << "finish writting checkpoint!" << std::endl;
 
@@ -604,9 +606,9 @@ bool LIE::execute ()
 
 LIE::~LIE ()
 {
-    for (u32 i = 0 ; i < lie_relations.size(); i++)
-    {
-        lie_relations[i]->finalize_relation();
-        delete (lie_relations[i]);
-    }
+    // for (u32 i = 0 ; i < lie_relations.size(); i++)
+    // {
+    //     lie_relations[i]->finalize_relation();
+    //     delete (lie_relations[i]);
+    // }
 }
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 9fcfe747..660f5a7d 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -23,6 +23,7 @@ u32 relation::get_global_delta_element_count()
 
 u32 relation::get_global_full_element_count()
 {
+    // TODO: change to use size of shamp_relation rather than counter
     u32 global_full_element_count;
     MPI_Allreduce(&full_element_count, &global_full_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm());
     return global_full_element_count;
@@ -856,6 +857,7 @@ void relation::populate_full(int buffer_size, u64* buffer)
 
         if (full[bucket_id].insert_tuple_from_array(t, (arity+1)) == true)
         {
+            // TODO: check if its update, if it is keep full count same
             full_element_count++;
             full_bucket_element_count[bucket_id]++;
             counter++;
diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp
index 34214be3..ec7133ba 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp
@@ -1,5 +1,6 @@
 // location of `parallel_RA_inc.h` here
 #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "mpi.h"
 
 #include <bit>
 #include <iostream>
@@ -35,6 +36,14 @@ const u64 str_tag = 2;
 const u64 sign_flip_const = 0x0000200000000000;
 const u64 signed_num_mask = 0xFFFFE00000000000;
 
+#define FLOAT_SCALE_CONST 100000
+float ALPHA = 0.85;
+u64 total_node_size = 0;
+u64 dangling_value = 0;
+u64 current_iter = 0;
+int MAX_PG_ITERATION = 2;
+u64 dangling_node_cnt;
+
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
   // int_tag) << "\n";
@@ -264,7 +273,9 @@ agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
   for (shmap_relation::iterator it = joined_range.first;
        it != joined_range.second; ++it) {
     auto tuple = (*it);
-    sum_res += tuple[tuple.size() - 2];
+    // if (tuple[1] == MAX_PG_ITERATION) {
+      sum_res += tuple[tuple.size() - 2];
+    // }
   }
   return sum_res;
 }
@@ -410,9 +421,6 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   return max_rel;
 }
 
-float ALPHA = 0.85;
-u64 total_node_size = 0;
-
 int main(int argc, char **argv) {
   // input dir from compiler
   std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
@@ -426,6 +434,8 @@ int main(int argc, char **argv) {
   mpi_comm mcomm;
   mcomm.create(argc, argv);
 
+  MAX_PG_ITERATION = atoi(argv[3]);
+
   // (edge from to)
   relation *rel__edge__2__1 = new relation(
       1, true, 2, get_tag_for_rel("edge", "1__2"),
@@ -461,6 +471,39 @@ int main(int argc, char **argv) {
           ".$unit.1.table",
       FULL);
 
+  // relation *rel__edge__2__2 = new relation(
+  //   1, false, 2, get_tag_for_rel("edge", "2"),
+  //   std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
+  //   std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
+
+    //   matrix edge + successor count
+  relation *rel__matrix__3__1 = new relation(
+    1, true, 3, get_tag_for_rel("matrix", "1"),
+    std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
+    std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
+
+  
+  relation *rel__dangling_node = new relation(
+    1, true, 1, get_tag_for_rel("dangling_node", "1"),
+    std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table",
+    std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table",
+    FULL);
+
+  // RAM *scc_copy_edge = new RAM(false, 0);
+  // scc_copy_edge->add_relation(rel__edge__2__1, false, false);
+  // scc_copy_edge->add_relation(rel__edge__2__2, true, false);
+  // scc_copy_edge->add_rule(
+  //     new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
+
+  RAM *scc_compute_matrix = new RAM(false, 1);
+  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
+  // scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
+  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
+  scc_compute_matrix->add_rule(new parallel_join_aggregate(
+      rel__matrix__3__1, rel__edge__2__1, rel__edge__2__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {0, 1, 3}));
+
   RAM *scc_helper_fact = new RAM(false, 0);
   scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false);
   scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)}));
@@ -492,14 +535,30 @@ int main(int argc, char **argv) {
       agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
       {2}));
 
+  RAM *scc_populate_dangling = new RAM(false, 3);
+  scc_populate_dangling->add_relation(rel__edge__2__1, false);
+  scc_populate_dangling->add_relation(rel__dangling_node, true);
+  scc_populate_dangling->add_relation(rel__node__1__1, false);
+  scc_populate_dangling->add_rule(new parallel_join_negate(
+    rel__dangling_node, rel__node__1__1, FULL, rel__edge__2__1,
+    {0}
+  ));
+
   LIE *cnt_lie = new LIE();
   cnt_lie->add_relation(rel__edge__2__1);
+  // cnt_lie->add_relation(rel__edge__2__2);
   cnt_lie->add_relation(rel__node__1__1);
   cnt_lie->add_relation(rel___dollorunit__1__1);
   cnt_lie->add_relation(rel__total_node_cnt__1__1);
+  cnt_lie->add_relation(rel__matrix__3__1);
+  cnt_lie->add_relation(rel__dangling_node);
   cnt_lie->add_scc(scc_helper_fact);
   cnt_lie->add_scc(scc_compute_node);
   cnt_lie->add_scc(scc_count_nodes);
+  // cnt_lie->add_scc(scc_copy_edge);
+  cnt_lie->add_scc(scc_compute_matrix);
+  cnt_lie->add_scc(scc_populate_dangling);
+  // cnt_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
   cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
   cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
 
@@ -513,28 +572,32 @@ int main(int argc, char **argv) {
   // only 1 data in this rel so its safe
   rel__total_node_cnt__1__1->print();
 
+  u64 local_node_size = 0;
   for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
-    total_node_size = t[0];
-    std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl;
+    if (t[0] != 0) {
+      local_node_size = t[0];
+    }
   }
+  rel__matrix__3__1->print();
+  MPI_Barrier(mcomm.get_comm());
 
-  // >>>>>>>>>>>>>>> compute page rank
-  std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl;
+  MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, mcomm.get_comm());
+
+  dangling_node_cnt = rel__dangling_node->get_global_full_element_count();
+  dangling_value = FLOAT_SCALE_CONST / total_node_size;
+  std::cout << ">>>>>>>>> Number of nodes: " << total_node_size
+            << " >>>>>>>>> Dangling node count: " << dangling_node_cnt
+            << " >>>>>>>>> Dangling value: " << dangling_value * 1.0 / FLOAT_SCALE_CONST
+            << std::endl;
 
   rel__edge__2__1->disable_initialization();
   rel__node__1__1->disable_initialization();
+  rel__matrix__3__1->disable_initialization();
+  rel__dangling_node->disable_initialization();
+  
+  // rel__matrix__3__1->print();
 
-  relation *rel__edge__2__2 = new relation(
-      1, false, 2, get_tag_for_rel("edge", "2"),
-      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
-      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
-
-  //   matrix edge + successor count
-  relation *rel__matrix__3__1 = new relation(
-      1, true, 3, get_tag_for_rel("matrix", "1"),
-      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
-      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
-
+  //////////////////  compute  Page rank
   relation *rel__rank__3__1 = new relation(
       1, true, 3, get_tag_for_rel("rank", "1"),
       std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table",
@@ -544,205 +607,143 @@ int main(int argc, char **argv) {
       {1, 2, 3},
       [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
          const vector<u64> &nt) -> std::optional<bool> {
-        // if (nt[0] == 59 && nt[1] == 58) {
-        //   std::cout << "dependent column size " << new_v.size() << std::endl;
-        //   std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << "
-        //   comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2]
-        //   << std::endl;
-        // }
         if (new_v[0] != old_v[0]) {
-          // std::cout << " www >>>>>>>>>" << std::endl;
           return std::nullopt;
-        } else {
-          // monotonic
-          // assert(new_v[1] > old_v[1]);
-          // u32 new_sum_raw = new_v[1];
-          // u32 old_sum_raw = old_v[1];
-          // float new_sum = *reinterpret_cast<float*>(&new_sum_raw);
-          // float old_sum = *reinterpret_cast<float*>(&old_sum_raw);
-          // if (new_sum > old_sum) {
-          //   std::cout << "new >> " << new_sum << " old >> " << old_sum <<
-          //   std::endl;
-          // }
-          // return new_sum > old_sum;
-          return new_v[1] > old_v[1];
-          // return true;
         }
+        // if (std::abs((int)new_v[1] - (int)old_v[1]) < 10) {
+        //   return false;
+        // }
+        return true;
       });
 
-  relation *rel__result__2__1__2 = new relation(
-      2, true, 2, get_tag_for_rel("result", "1__2"),
-      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
-      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
-      FULL);
-
-  //
-
-  RAM *scc_copy_edge = new RAM(false, 0);
-  scc_copy_edge->add_relation(rel__edge__2__1, false, false);
-  scc_copy_edge->add_relation(rel__edge__2__2, true, false);
-  scc_copy_edge->add_rule(
-      new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
-
-  RAM *scc_compute_matrix = new RAM(false, 1);
-  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
-  scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
-  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
-  scc_compute_matrix->add_rule(new parallel_join_aggregate(
-      rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL,
-      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
-      {0, 1, 3}));
-
-  RAM *scc_init = new RAM(false, 2);
-  scc_init->add_relation(rel__matrix__3__1, false, false);
-  scc_init->add_relation(rel__rank__3__1, true, false);
-  scc_init->add_rule(new parallel_copy_generate(
-      rel__rank__3__1, rel__matrix__3__1, FULL,
-      [](const u64 *const data, u64 *const output) -> int {
-        output[0] = data[0];
-        output[1] = data[0];
-        // float init_pg_v = (1 - ALPHA) / total_node_size;
-        u64 init_pg_v = (u64)(((1 - ALPHA) / total_node_size) * 100000);
-        // std::cout << init_pg_v << std::endl;
-        // output[2] = *reinterpret_cast<u32*>(&init_pg_v);
-        output[2] = init_pg_v;
-        return 1;
-      }));
-
-  RAM *scc_page_rank = new RAM(true, 3);
-  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
-  scc_page_rank->add_relation(rel__rank__3__1, true, false);
-  parallel_join *rank_join =
-      new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
-                        rel__rank__3__1, DELTA, {3, 1, 2} // useless
-      );
-  rank_join->set_generator_func([](const depend_val_t &target_vs,
-                                   const std::vector<u64> &input_v,
-                                   depend_val_t &res_set) -> bool {
-    // float pg_sum = 0.0;
-    u64 pg_sum = 0;
-
-    int count = 0;
-    for (auto &tv : target_vs) {
-      // std::cout << "tagret v >>>>> ";
-      // for (auto c: tv) {
-      //   std::cout << c << " ";
-      // }
-      // std::cout << std::endl;
-      u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first
-      // std::cout << ">>>>>>>>>>>>>>> " <<
-      // *reinterpret_cast<float*>(&raw_succ_pg_v) << std::endl;
-      // auto succ_pg_v = *reinterpret_cast<float*>(&raw_succ_pg_v);
-      // if(succ_pg_v == 0) {
-      //   // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl;
-      // std::cout << "tagret v >>>>> ";
-      // for (auto c: tv) {
-      //   std::cout << c << " ";
+  std::vector<LIE*> pg_lie_list;
+
+  for (int i = 0; i < MAX_PG_ITERATION; i++) {
+    std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter
+              << std::endl;
+    LIE *pg_lie = new LIE();
+
+    // RAM *scc_
+    RAM *scc_init = new RAM(false, 0);
+    scc_init->add_relation(rel__matrix__3__1, false, false);
+    scc_init->add_relation(rel__rank__3__1, true, false);
+    scc_init->add_rule(new parallel_copy_generate(
+        rel__rank__3__1, rel__matrix__3__1, FULL,
+        [](const u64 *const data, u64 *const output) -> int {
+          output[0] = data[0];
+          output[1] = data[0];
+          output[2] = dangling_value;
+          return 1;
+        }));
+    RAM *scc_page_rank = new RAM(false, 1);
+    scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+    scc_page_rank->add_relation(rel__rank__3__1, true, false);
+    parallel_join *rank_join =
+        new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
+                          rel__rank__3__1, DELTA, {3, 1, 2} // useless
+        );
+    rank_join->set_generator_func([](const depend_val_t &target_vs,
+                                     const std::vector<u64> &input_v,
+                                     depend_val_t &res_set) -> bool {
+      // if (current_iter > MAX_PG_ITERATION) {
+      //   return false;
       // }
-      // std::cout << std::endl;
-      // }
-      if (input_v[2] != 0) {
-        // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]);
-        pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]);
-        // if (input_v[1] == 51) {
-        //   std::cout << "Sum 51 " << input_v[0] << " with ";
-        //   for (auto c: tv) {
-        //     std::cout << c << " ";
-        //   }
-        //   std::cout << " result " << pg_sum << std::endl;
-        // }
+      u64 pg_sum = dangling_node_cnt * dangling_value;
+      int count = 0;
+      for (auto &tv : target_vs) {
+        if ((tv[0] == tv[1]) && (current_iter != 0)) {
+          continue;
+        }
+        u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first
+        if (current_iter == 0) {
+          raw_succ_pg_v_sub = raw_succ_pg_v_sub / input_v[2];
+        }
+        pg_sum += (u64)(raw_succ_pg_v_sub * ALPHA);
+        count++;
       }
-      count++;
+      pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size;
+      std::vector<u64> res_tuple(3, 0);
+      res_tuple[0] = input_v[1];
+      res_tuple[1] = input_v[0];
+      res_tuple[2] = pg_sum / input_v[2];
+      res_set.push_back(res_tuple);
+      return true;
+    });
+    scc_page_rank->add_rule(rank_join);
+
+    pg_lie_list.push_back(pg_lie);
+    pg_lie->add_relation(rel__matrix__3__1);
+    pg_lie->add_relation(rel__node__1__1);
+    pg_lie->add_relation(rel__rank__3__1);
+    pg_lie->add_scc(scc_page_rank);
+    if (current_iter == 0) {
+      pg_lie->add_scc(scc_init);
+      pg_lie->add_scc_dependance(scc_init, scc_page_rank);
     }
-    if (pg_sum == 0) {
-      return false;
+    // Enable IO
+    if (i == MAX_PG_ITERATION - 1) {
+      pg_lie->enable_all_to_all_dump();
+      pg_lie->enable_data_IO();
+      pg_lie->enable_IO();
     }
-    if (count == 0) {
-      return false;
-    }
-    std::vector<u64> res_tuple(3, 0);
-    res_tuple[0] = input_v[1];
-    res_tuple[1] = input_v[0];
-    // res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum);
-    res_tuple[2] = pg_sum;
-    // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl;
-    // for (auto c: res_tuple) {
-    //   std::cout << c << " ";
-    // }
-    // std::cout << std::endl;
-    res_set.push_back(res_tuple);
-    return true;
-  });
-  scc_page_rank->add_rule(rank_join);
+    // lie->enable_share_io();
+    pg_lie->set_output_dir(slog_output_dir); // Write to this directory
+    pg_lie->set_comm(mcomm);
+    pg_lie->set_batch_size(1);
+    pg_lie->execute();
+    current_iter++;
+    rel__rank__3__1->disable_initialization();
+    pg_lie->print_all_relation_size(); // Continuously print relation sizes
+  }
+  // rel__rank__4__1->print(
+  //   [](const std::vector<u64>& tp){
+  //     u32 pg_v = tp[3];
+  //     std::cout << tp[0] << " " << tp[1] << " "
+  //     // *reinterpret_cast<float*>(&pg_v) << std::cout << tp[0] << " " << tp[1]
+  //     << tp[2] << " " << pg_v << std::endl;
+  //   }
+  // );
+  // delete pg_pre_lie;
+  // delete pg_lie;
+
+  std::cout << "Aggregating sum ..." << std::endl;
+  relation *rel__result__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("result", "1__2"),
+      std::to_string(get_tag_for_rel("result", "1__2")) +
+      ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2"))
+      + ".result.2.table", FULL);
 
   RAM *scc_result = new RAM(false, 4);
   scc_result->add_relation(rel__rank__3__1, false, false);
   scc_result->add_relation(rel__result__2__1__2, true, false);
   scc_result->add_relation(rel__node__1__1, false, false);
-  // scc_result->add_rule(new parallel_join_aggregate(
-  //     rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-  //     agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce,
-  //     nullptr, {0, 2}));
   scc_result->add_rule(new parallel_join_aggregate(
       rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2}));
-
-  LIE *pg_lie = new LIE();
-  pg_lie->add_relation(rel__edge__2__1);
-  pg_lie->add_relation(rel__matrix__3__1);
-  pg_lie->add_relation(rel__node__1__1);
-  pg_lie->add_relation(rel__edge__2__2);
-  pg_lie->add_relation(rel__rank__3__1);
-  pg_lie->add_relation(rel__result__2__1__2);
-  pg_lie->add_scc(scc_copy_edge);
-  pg_lie->add_scc(scc_compute_matrix);
-  pg_lie->add_scc(scc_init);
-  pg_lie->add_scc(scc_page_rank);
-  pg_lie->add_scc(scc_result);
-  pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
-  pg_lie->add_scc_dependance(scc_compute_matrix, scc_init);
-  pg_lie->add_scc_dependance(scc_init, scc_page_rank);
-  pg_lie->add_scc_dependance(scc_page_rank, scc_result);
-
-  // Enable IO
-  pg_lie->enable_all_to_all_dump();
-  pg_lie->enable_data_IO();
-  pg_lie->enable_IO();
-  // lie->enable_share_io();
-  pg_lie->set_output_dir(slog_output_dir); // Write to this directory
-  pg_lie->set_comm(mcomm);
-  pg_lie->set_batch_size(1);
-  pg_lie->execute();
-  pg_lie->print_all_relation_size(); // Continuously print relation sizes
-  // lie->stat_intermediate();
-  // rel__matrix__3__1->print();
-  // rel__rank__3__1->print(
-  //   [](const std::vector<u64>& tp){
-  //     u32 pg_v = tp[2];
-  //     // std::cout << tp[0] << " " << tp[1] << " " <<
-  //     *reinterpret_cast<float*>(&pg_v) << std::cout << tp[0] << " " << tp[1]
-  //     << " " << pg_v << std::endl;
-  //   }
-  // );
+      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0,
+      2}));
+
+  LIE* final_lie = new LIE();
+  final_lie->add_relation(rel__result__2__1__2);
+  final_lie->add_relation(rel__node__1__1);
+  final_lie->add_relation(rel__rank__3__1);
+  final_lie->add_scc(scc_result);
+  final_lie->enable_all_to_all_dump();
+  final_lie->enable_data_IO();
+  final_lie->enable_IO();
+
+  final_lie->set_output_dir(slog_output_dir); // Write to this directory
+  final_lie->set_comm(mcomm);
+  final_lie->set_batch_size(1);
+  final_lie->execute();
+  final_lie->print_all_relation_size(); // Continuously print relation sizes
+
+  // rel__rank__3__1->print();
+
   rel__result__2__1__2->print([](const std::vector<u64> &tp) {
     u32 pg_v = tp[1];
     // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
-    std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl;
+    std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
   });
-
-  // print all variants(non-canonical index of each relation)
-  //   if (mcomm.get_rank() == 0) {
-  //     std::cout << "rel_name"
-  //               << ",\t"
-  //               << "indices\n";
-  //     for (auto const &rel_p : rel_index_map) {
-  //       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
-  //     }
-  //     std::cout << std::endl;
-  //   }
-
-  delete pg_lie;
-
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
   mcomm.destroy();
diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
new file mode 100644
index 00000000..c0636f61
--- /dev/null
+++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
@@ -0,0 +1,753 @@
+// location of `parallel_RA_inc.h` here
+#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+
+#include <bit>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+
+// builtins.cpp goes here!
+// builtins.cpp
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace std;
+#define u64 uint64_t
+#define u32 uint32_t
+using i64 = int64_t;
+
+const u64 tag_mask = 0xffffc00000000000;
+const u64 tag_position = 46;
+const u64 int_tag = 0;
+const u64 str_tag = 2;
+const u64 sign_flip_const = 0x0000200000000000;
+const u64 signed_num_mask = 0xFFFFE00000000000;
+
+inline bool is_number(u64 datum) {
+  // A datum is a number when the bits above `tag_position` equal `int_tag`;
+  // the bits below `tag_position` are not inspected here.
+  return datum >> tag_position == int_tag;
+}
+
+inline i64 datum_to_number(u64 datum) {
+  // Negative numbers are stored as sign_flip_const + |n| (see
+  // number_to_datum), so a payload at or above the constant decodes to -|n|.
+  const u64 payload = datum & ~tag_mask; // strip the tag bits
+  if (payload < sign_flip_const) {
+    return static_cast<i64>(payload);
+  }
+  const u64 magnitude = payload - sign_flip_const;
+  return -static_cast<i64>(magnitude);
+}
+const auto d2n = datum_to_number;
+
+inline u64 number_to_datum(i64 number) {
+  // Sign-magnitude encoding: a negative n is stored as sign_flip_const + |n|,
+  // a non-negative n as itself; the result is masked to the payload width.
+  const u64 payload = (number < 0)
+                          ? static_cast<u64>(-number) + sign_flip_const
+                          : static_cast<u64>(number);
+  return (payload & ~tag_mask) | (int_tag << tag_position);
+}
+
+const auto n2d = number_to_datum;
+
+inline u64 string_to_datum(std::string str) {
+  const u32 hashed = string_hash(str); // hash is held as a u32
+  return (str_tag << tag_position) | (hashed & ~tag_mask);
+}
+const auto s2d = string_to_datum;
+
+vector<array<u64, 2>> builtin_div_rem(const u64 *const data) {
+  // Yields {quotient, remainder}; yields nothing for non-numbers or a zero
+  // divisor (integer division by zero is undefined behaviour).
+  if (!is_number(data[0]) || !is_number(data[1]) || d2n(data[1]) == 0)
+    return {};
+  const i64 lhs = d2n(data[0]);
+  const i64 rhs = d2n(data[1]);
+  return {{number_to_datum(lhs / rhs), number_to_datum(lhs % rhs)}};
+}
+
+#define BUILTIN_BINARY_NUMBER_PRED(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(TState state)) {                       \
+    if (is_number(data[0]) && is_number(data[1]) &&                            \
+        datum_to_number(data[0]) op datum_to_number(data[1])) {                \
+      return callback(init_state);                                             \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_PRED(builtin_less, <)
+BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >)
+BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=)
+BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=)
+
+#define BUILTIN_BINARY_NUMBER_FUNC(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(datum_to_number(data[0])                      \
+                                     op datum_to_number(data[1]));             \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /)
+
+#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl)                                \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0]) && is_number(data[1])) {                            \
+      auto res = number_to_datum(                                              \
+          impl(datum_to_number(data[0]), datum_to_number(data[1])));           \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; }
+BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1)
+
+#define BUILTIN_UNARY_NUMBER_FUNC(name, impl)                                  \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (is_number(data[0])) {                                                  \
+      auto res = number_to_datum(impl(datum_to_number(data[0])));              \
+      return callback(res, init_state);                                        \
+    } else                                                                     \
+      return init_state;                                                       \
+  }
+
+inline u64 add1(u64 x) { return x + 1; }
+inline u64 sub1(u64 x) { return x - 1; }
+
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1)
+
+vector<array<u64, 1>> builtin_range(const u64 *const data) {
+  vector<array<u64, 1>> res; // datums of [lb, ub); empty unless lb < ub
+  if (is_number(data[0]) && is_number(data[1])) {
+    const i64 lb = datum_to_number(data[0]), ub = datum_to_number(data[1]);
+    if (lb < ub) // a negative span would wrap into a huge reserve() request
+      res.reserve(static_cast<size_t>(ub - lb));
+    for (i64 x = lb; x < ub; x++) // signed: u64 broke ranges crossing zero
+      res.push_back({number_to_datum(x)});
+  }
+  return res;
+}
+
+template <typename TState>
+TState callback_builtin_range(const u64 *data, TState init_state,
+                              TState (*callback)(u64 res, TState state)) {
+  // Folds `callback` over the datums of [lb, ub), threading the state through.
+  auto state = init_state;
+  if (is_number(data[0]) && is_number(data[1])) {
+    const i64 lb = datum_to_number(data[0]), ub = datum_to_number(data[1]);
+    for (i64 x = lb; x < ub; x++) // signed: u64 broke ranges crossing zero
+      state = callback(number_to_datum(x), state);
+  }
+  return state;
+}
+
+#define BUILTIN_BINARY_PRED(name, op)                                          \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (data[0] op data[1])                                                    \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+BUILTIN_BINARY_PRED(builtin_eq, ==)
+BUILTIN_BINARY_PRED(builtin_neq, !=)
+
+template <typename TState>
+TState builtin_eq_1(const u64 *data, TState init_state,
+                    TState (*callback)(u64 res, TState state)) {
+  return callback(data[0], init_state);
+}
+
+#define BUILTIN_UNARY_PRED(name, pred)                                         \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    if (pred(data[0]))                                                         \
+      return callback(init_state);                                             \
+    else                                                                       \
+      return init_state;                                                       \
+  }
+
+bool is_not_number(u64 datum) { return !is_number(datum); }
+BUILTIN_UNARY_PRED(builtin_number_huh, is_number)
+BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number)
+
+// for generate-cpp-lambda-for-computational-join
+struct CL2CB_State {
+  void *original_callback; // There be dragons?
+  void *original_state;
+  const u64 *original_data;
+  u64 *cl1_output_args;
+};
+
+// for generate-cpp-lambda-for-computational-copy
+struct BCLCB_State {
+  void *original_callback;
+  void *original_state;
+  const u64 *original_data;
+};
+
+// an experiment. NOTE(review): f takes u64; negative numbers compare as huge.
+template <bool f(u64, u64)> bool builtin_binary_number_pred(const u64 *data) {
+  if (is_number(data[0]) && is_number(data[1])) {
+    return f(datum_to_number(data[0]), datum_to_number(data[1]));
+  } else {
+    return false;
+  }
+}
+bool _less(u64 x, u64 y) { return x < y; }
+auto builtin_less2 = builtin_binary_number_pred<_less>;
+
+template <typename TState>
+inline TState builtin_nop(const u64 *data, TState init_state,
+                          TState (*callback)(TState state)) {
+  return callback(init_state);
+}
+
+// //////////////////// AGGREGATORS Alternative design ////////////////////
+
+// TODO: add number type check
+//////////////////////////////  count /////////////////////////////////////
+
+local_agg_res_t
+agg_count_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                    joined_range) {
+  local_agg_res_t cnt = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    cnt++;
+  }
+  return cnt;
+}
+
+local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+//////////////////////////////  sum /////////////////////////////////////
+
+local_agg_res_t
+agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                  joined_range) {
+  local_agg_res_t sum_res = 0;
+  for (shmap_relation::iterator it = joined_range.first;
+       it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    sum_res += tuple[tuple.size() - 2];
+  }
+  return sum_res;
+}
+
+local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y;
+}
+
+local_agg_res_t agg_sum_float_local(
+    std::pair<shmap_relation::iterator, shmap_relation::iterator>
+        joined_range) {
+  float sum_res = 0.0;
+  for (shmap_relation::iterator it = joined_range.first;
+       it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    u32 agg_column_raw = tuple[tuple.size() - 2];
+
+    sum_res += *reinterpret_cast<float *>(&agg_column_raw);
+  }
+  // std::cout << ">>>>>>>  " << sum_res << " " <<
+  // *reinterpret_cast<u32*>(&sum_res) << std::endl;
+  u32 sum_res_encoded = *reinterpret_cast<u32 *>(&sum_res);
+  return sum_res_encoded;
+}
+
+local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw,
+                                     local_agg_res_t y_raw) {
+  float x = *reinterpret_cast<float *>(&x_raw);
+  float y = *reinterpret_cast<float *>(&y_raw);
+  float res = x + y;
+  // std::cout << res << std::endl;
+  u32 res_encoded = *reinterpret_cast<u32 *>(&res);
+  return res_encoded;
+}
+
+//////////////////////////////  maximum  /////////////////////////////////////
+
+local_agg_res_t
+agg_maximum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t max_res = 0;
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1];
+    if (current_v > max_res) {
+      max_res = current_v;
+    }
+  }
+  return max_res;
+}
+
+local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x > y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+//////////////////////////////  minimum  /////////////////////////////////////
+
+local_agg_res_t
+agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t min_res = std::numeric_limits<u32>::max();
+  for (auto it = joined_range.first; it != joined_range.second; ++it) {
+    auto tuple = (*it);
+    auto current_v = tuple[tuple.size() - 1];
+    if (current_v < min_res) {
+      min_res = current_v;
+    }
+  }
+  return min_res;
+}
+
+local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x < y) {
+    return x;
+  } else {
+    return y;
+  }
+}
+
+// // end of builtins.cpp
+
+// global definitions:
+
+int max_rel = 255;
+std::map<std::string, int> rel_tag_map;
+std::map<std::string, std::unordered_set<std::string>> rel_index_map;
+
+// load all relation inside input database
+void load_input_relation(std::string db_dir) {
+  // Register the tag of every `<tag>.<name>.<arity>.table` file in db_dir.
+  const std::string suffix = ".table", digits = "0123456789";
+  for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
+    const std::string filename_ss = entry.path().filename().string();
+    std::cout << "input database has file " << filename_ss << std::endl;
+    if (filename_ss.size() < suffix.size() ||
+        filename_ss.rfind(suffix) != filename_ss.size() - suffix.size())
+      continue; // not a relation table
+    const std::string stem = entry.path().stem().string();
+    const auto first_dot = stem.find('.'), last_dot = stem.rfind('.');
+    if (first_dot == std::string::npos || first_dot == last_dot)
+      continue; // need a tag and an arity field around the name
+    const std::string tag_s = stem.substr(0, first_dot);
+    const std::string arity_s = stem.substr(last_dot + 1);
+    if (tag_s.empty() || arity_s.empty() ||
+        tag_s.find_first_not_of(digits) != std::string::npos ||
+        arity_s.find_first_not_of(digits) != std::string::npos)
+      continue; // std::stoi would throw on a non-numeric tag or arity
+    const int tag = std::stoi(tag_s);
+    const int arity = std::stoi(arity_s);
+    std::string index_str =
+        stem.substr(first_dot + 1, last_dot - first_dot - 1); // relation name
+    for (int i = 1; i <= arity; i++)
+      index_str += "__" + std::to_string(i);
+    if (tag > max_rel)
+      max_rel = tag;
+    std::cout << "load " << tag << "." << index_str << " has arity " << arity
+              << std::endl;
+    rel_tag_map[index_str] = tag;
+  }
+}
+
+int get_tag_for_rel(std::string relation_name, std::string index_str) {
+  // Returns the tag registered for `<relation_name>__<index_str>`, allocating
+  // the next free tag (max_rel + 1) the first time a name is requested.
+  const std::string name_arity = relation_name + "__" + index_str;
+
+  // Remember which index variants were requested for this relation;
+  // operator[] creates the empty set on first use.
+  rel_index_map[relation_name].insert(index_str);
+
+  const auto known_tag = rel_tag_map.find(name_arity);
+  if (known_tag != rel_tag_map.end()) {
+    return known_tag->second;
+  }
+  const int fresh_tag = ++max_rel;
+  rel_tag_map[name_arity] = fresh_tag;
+  std::cout << "generate rel tag: " << name_arity << " " << fresh_tag
+            << std::endl;
+  return fresh_tag;
+}
+
+float ALPHA = 0.85;
+u64 total_node_size = 0;
+u64 dangling_value = 0;
+
+int main(int argc, char **argv) {
+  // input dir from compiler
+  std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
+  if (argc == 3) {
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  load_input_relation(slog_input_dir);
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
+
+  // (edge from to)
+  relation *rel__edge__2__1 = new relation(
+      1, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
+      FULL);
+
+  // >>>>>>>>>>>>>>> compute node size
+  // (node x)
+  relation *rel__node__1__1 = new relation(
+      1, true, 1, get_tag_for_rel("node", "1"),
+      std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) +
+          ".node.1.table",
+      FULL);
+
+  // (total_node_cnt n)
+  relation *rel__total_node_cnt__1__1 =
+      new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"),
+                   std::to_string(get_tag_for_rel("total_node_cnt", "1")) +
+                       ".total_node_cnt.1.table",
+                   slog_input_dir + "/" +
+                       std::to_string(get_tag_for_rel("total_node_cnt", "1")) +
+                       ".total_node_cnt.1.table",
+                   FULL);
+
+  // helper relation for non-join aggregation
+  relation *rel___dollorunit__1__1 = new relation(
+      0, true, 1, get_tag_for_rel("$unit", "1"),
+      std::to_string(get_tag_for_rel("$unit", "1")) + ".$unit.1.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("$unit", "1")) +
+          ".$unit.1.table",
+      FULL);
+
+  RAM *scc_helper_fact = new RAM(false, 0);
+  scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false);
+  scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)}));
+
+  // [(node a) (node b) <-- (edge a b)]
+  RAM *scc_compute_node = new RAM(false, 1);
+  scc_compute_node->add_relation(rel__edge__2__1, false, false);
+  scc_compute_node->add_relation(rel__node__1__1, true, false);
+  scc_compute_node->add_rule(new parallel_copy_generate(
+      rel__node__1__1, rel__edge__2__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[0];
+        return 1;
+      }));
+  scc_compute_node->add_rule(new parallel_copy_generate(
+      rel__node__1__1, rel__edge__2__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[1];
+        return 1;
+      }));
+
+  // (total_node_cnt {count node _})
+  RAM *scc_count_nodes = new RAM(false, 2);
+  scc_count_nodes->add_relation(rel__node__1__1, false, false);
+  scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false);
+  scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false);
+  scc_count_nodes->add_rule(new parallel_join_aggregate(
+      rel__total_node_cnt__1__1, rel__node__1__1, rel___dollorunit__1__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {2}));
+
+  LIE *cnt_lie = new LIE();
+  cnt_lie->add_relation(rel__edge__2__1);
+  cnt_lie->add_relation(rel__node__1__1);
+  cnt_lie->add_relation(rel___dollorunit__1__1);
+  cnt_lie->add_relation(rel__total_node_cnt__1__1);
+  cnt_lie->add_scc(scc_helper_fact);
+  cnt_lie->add_scc(scc_compute_node);
+  cnt_lie->add_scc(scc_count_nodes);
+  cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
+  cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
+
+  cnt_lie->enable_all_to_all_dump();
+  cnt_lie->set_output_dir(slog_output_dir); // Write to this directory
+  cnt_lie->set_comm(mcomm);
+  cnt_lie->set_batch_size(1);
+  cnt_lie->execute();
+  cnt_lie->print_all_relation_size(); // Continuously print relation sizes
+
+  // only 1 data in this rel so its safe
+  rel__total_node_cnt__1__1->print();
+
+  for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
+    total_node_size = t[0];
+    dangling_value = (u64)(((1 - ALPHA) / total_node_size) * 100000);
+    std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl;
+  }
+
+  // >>>>>>>>>>>>>>> compute page rank
+  std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl;
+
+  rel__edge__2__1->disable_initialization();
+  rel__node__1__1->disable_initialization();
+
+  relation *rel__edge__2__2 = new relation(
+      1, false, 2, get_tag_for_rel("edge", "2"),
+      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
+      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
+
+  //   matrix edge + successor count
+  relation *rel__matrix__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("matrix", "1"),
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
+
+  relation *rel__rank__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("rank", "1"),
+      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table",
+      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL);
+
+  rel__rank__3__1->set_dependent_column_update(
+      {1, 2, 3},
+      [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+        // if (nt[0] == 59 && nt[1] == 58) {
+        //   std::cout << "dependent column size " << new_v.size() << std::endl;
+        //   std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << "
+        //   comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2]
+        //   << std::endl;
+        // }
+        if (new_v[0] != old_v[0]) {
+          // std::cout << " www >>>>>>>>>" << std::endl;
+          return std::nullopt;
+        } else {
+          // monotonic
+          // assert(new_v[1] > old_v[1]);
+          // u32 new_sum_raw = new_v[1];
+          // u32 old_sum_raw = old_v[1];
+          // float new_sum = *reinterpret_cast<float*>(&new_sum_raw);
+          // float old_sum = *reinterpret_cast<float*>(&old_sum_raw);
+          // if (new_sum > old_sum) {
+          //   std::cout << "new >> " << new_sum << " old >> " << old_sum <<
+          //   std::endl;
+          // }
+          // return new_sum > old_sum;
+          return new_v[1] > old_v[1];
+          // return true;
+        }
+      });
+
+  relation *rel__result__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("result", "1__2"),
+      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
+      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
+      FULL);
+
+  //
+
+  RAM *scc_copy_edge = new RAM(false, 0);
+  scc_copy_edge->add_relation(rel__edge__2__1, false, false);
+  scc_copy_edge->add_relation(rel__edge__2__2, true, false);
+  scc_copy_edge->add_rule(
+      new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
+
+  RAM *scc_compute_matrix = new RAM(false, 1);
+  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
+  scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
+  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
+  scc_compute_matrix->add_rule(new parallel_join_aggregate(
+      rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {0, 1, 3}));
+
+  RAM *scc_init = new RAM(false, 2);
+  scc_init->add_relation(rel__matrix__3__1, false, false);
+  scc_init->add_relation(rel__rank__3__1, true, false);
+  scc_init->add_rule(new parallel_copy_generate(
+      rel__rank__3__1, rel__matrix__3__1, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[0];
+        output[1] = data[0];
+        // float init_pg_v = (1 - ALPHA) / total_node_size;
+        u64 init_pg_v = dangling_value;
+        // std::cout << init_pg_v << std::endl;
+        // output[2] = *reinterpret_cast<u32*>(&init_pg_v);
+        output[2] = init_pg_v;
+        return 1;
+      }));
+
+  RAM *scc_page_rank = new RAM(true, 3);
+  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+  scc_page_rank->add_relation(rel__rank__3__1, true, false);
+  parallel_join *rank_join =
+      new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
+                        rel__rank__3__1, DELTA, {3, 1, 2} // useless
+      );
+  rank_join->set_generator_func([](const depend_val_t &target_vs,
+                                   const std::vector<u64> &input_v,
+                                   depend_val_t &res_set) -> bool {
+    // float pg_sum = 0.0;
+    u64 pg_sum = dangling_value;
+
+    int count = 0;
+    for (auto &tv : target_vs) {
+      // std::cout << "tagret v >>>>> ";
+      // for (auto c: tv) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << std::endl;
+      u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first
+      // std::cout << ">>>>>>>>>>>>>>> " <<
+      // *reinterpret_cast<float*>(&raw_succ_pg_v) << std::endl;
+      // auto succ_pg_v = *reinterpret_cast<float*>(&raw_succ_pg_v);
+      // if(succ_pg_v == 0) {
+      //   // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl;
+      // std::cout << "tagret v >>>>> ";
+      // for (auto c: tv) {
+      //   std::cout << c << " ";
+      // }
+      // std::cout << std::endl;
+      // }
+      if (input_v[2] != 0) {
+        // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]);
+        pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]);
+        // if (input_v[1] == 51) {
+        //   std::cout << "Sum 51 " << input_v[0] << " with ";
+        //   for (auto c: tv) {
+        //     std::cout << c << " ";
+        //   }
+        //   std::cout << " result " << pg_sum << std::endl;
+        // }
+      }
+      count++;
+    }
+    if (pg_sum == 0) {
+      return false;
+    }
+    if (count == 0) {
+      return false;
+    }
+    std::vector<u64> res_tuple(3, 0);
+    res_tuple[0] = input_v[1];
+    res_tuple[1] = input_v[0];
+    // res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum);
+    res_tuple[2] = pg_sum;
+    // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl;
+    // for (auto c: res_tuple) {
+    //   std::cout << c << " ";
+    // }
+    // std::cout << std::endl;
+    res_set.push_back(res_tuple);
+    return true;
+  });
+  scc_page_rank->add_rule(rank_join);
+
+  RAM *scc_result = new RAM(false, 4);
+  scc_result->add_relation(rel__rank__3__1, false, false);
+  scc_result->add_relation(rel__result__2__1__2, true, false);
+  scc_result->add_relation(rel__node__1__1, false, false);
+  // scc_result->add_rule(new parallel_join_aggregate(
+  //     rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
+  //     agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce,
+  //     nullptr, {0, 2}));
+  scc_result->add_rule(new parallel_join_aggregate(
+      rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
+      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2}));
+
+  LIE *pg_lie = new LIE();
+  pg_lie->add_relation(rel__edge__2__1);
+  pg_lie->add_relation(rel__matrix__3__1);
+  pg_lie->add_relation(rel__node__1__1);
+  pg_lie->add_relation(rel__edge__2__2);
+  pg_lie->add_relation(rel__rank__3__1);
+  pg_lie->add_relation(rel__result__2__1__2);
+  pg_lie->add_scc(scc_copy_edge);
+  pg_lie->add_scc(scc_compute_matrix);
+  pg_lie->add_scc(scc_init);
+  pg_lie->add_scc(scc_page_rank);
+  pg_lie->add_scc(scc_result);
+  pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
+  pg_lie->add_scc_dependance(scc_compute_matrix, scc_init);
+  pg_lie->add_scc_dependance(scc_init, scc_page_rank);
+  pg_lie->add_scc_dependance(scc_page_rank, scc_result);
+
+  // Enable IO
+  pg_lie->enable_all_to_all_dump();
+  pg_lie->enable_data_IO();
+  pg_lie->enable_IO();
+  // lie->enable_share_io();
+  pg_lie->set_output_dir(slog_output_dir); // Write to this directory
+  pg_lie->set_comm(mcomm);
+  pg_lie->set_batch_size(1);
+  pg_lie->execute();
+  pg_lie->print_all_relation_size(); // Continuously print relation sizes
+  // lie->stat_intermediate();
+  // rel__matrix__3__1->print();
+  // rel__rank__3__1->print(
+  //   [](const std::vector<u64>& tp){
+  //     u32 pg_v = tp[2];
+  //     // std::cout << tp[0] << " " << tp[1] << " " <<
+  //     *reinterpret_cast<float*>(&pg_v) << std::cout << tp[0] << " " << tp[1]
+  //     << " " << pg_v << std::endl;
+  //   }
+  // );
+  rel__result__2__1__2->print([](const std::vector<u64> &tp) {
+    u32 pg_v = tp[1];
+    // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
+    std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl;
+  });
+
+  // print all variants(non-canonical index of each relation)
+  //   if (mcomm.get_rank() == 0) {
+  //     std::cout << "rel_name"
+  //               << ",\t"
+  //               << "indices\n";
+  //     for (auto const &rel_p : rel_index_map) {
+  //       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
+  //     }
+  //     std::cout << std::endl;
+  //   }
+
+  delete pg_lie;
+
+  // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+  mcomm.destroy();
+
+  return 0;
+}

From ab530e68e605f0f99bc93ccc91c3dddebd784d0e Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-84-253.ec2.internal>
Date: Fri, 2 Dec 2022 07:06:38 +0000
Subject: [PATCH 18/36] finish impl page rank

---
 backend/src/RA/parallel_agg.cpp               |   1 +
 .../src/relation/balanced_hash_relation.cpp   |  21 +-
 backend/src/relation/balanced_hash_relation.h |   3 +-
 backend/src/relation/shmap_relation.h         |   2 +-
 backend/src/relation/shmap_relation_exp.cpp   |  35 +-
 .../pagerank/compiled_pre/CMakeLists.txt      |   2 +-
 .../tests/pagerank/compiled_pre/pagerank.cpp  | 426 +++++++++++-----
 .../pagerank/compiled_pre/pagerank_full.cpp   | 462 +++++++++---------
 8 files changed, 568 insertions(+), 384 deletions(-)

diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp
index fc42c114..0b13ed7c 100644
--- a/backend/src/RA/parallel_agg.cpp
+++ b/backend/src/RA/parallel_agg.cpp
@@ -117,6 +117,7 @@ void parallel_join_aggregate::local_aggregate(
             std::vector<u64> joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count());
             auto agg_res = res_map[joined_input_tuple];
             std::vector<u64> tuple(output->get_arity(), 0);
+            // std::cout << "wwwwwwwwwwwwwwwwwwwwwwww  " << output->get_arity() << std::endl;
             int reorder_agg_index = input->get_arity() + 1;
             for (long unsigned int j = 0; j < reorder_mapping.size(); j++) {
             //   std::cout << reorder_mapping[j] << " " << reorder_agg_index << std::endl;
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 660f5a7d..b3b7a1e7 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -14,6 +14,7 @@
 
 u32 relation::get_global_delta_element_count()
 {
+    delta_element_count = delta[mcomm.get_rank()].count();
     int dec = (int)delta_element_count;
     int global_delta_element_count;
     MPI_Allreduce(&dec, &global_delta_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm());
@@ -25,6 +26,7 @@ u32 relation::get_global_full_element_count()
 {
     // TODO: change to use size of shamp_relation rather than counter
     u32 global_full_element_count;
+    full_element_count = full[mcomm.get_rank()].count();
     MPI_Allreduce(&full_element_count, &global_full_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm());
     return global_full_element_count;
 }
@@ -854,8 +856,8 @@ void relation::populate_full(int buffer_size, u64* buffer)
 
         for (u32 a = i; a < i + arity + 1; a++)
             t[a-i] = buffer[a];
-
-        if (full[bucket_id].insert_tuple_from_array(t, (arity+1)) == true)
+        int insert_res = full[bucket_id].insert_tuple_from_array(t, (arity+1));
+        if (insert_res == INSERT_SUCCESS)
         {
             // TODO: check if its update, if it is keep full count same
             full_element_count++;
@@ -880,7 +882,8 @@ void relation::populate_delta (int buffer_size, u64* buffer)
         for (u32 a = i; a < i + arity + 1; a++)
             t[a-i] = buffer[a];
 
-        if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true)
+        int insert_res = delta[bucket_id].insert_tuple_from_array(t, arity+1);
+        if (insert_res == INSERT_SUCCESS)
         {
             delta_element_count++;
             delta_bucket_element_count[bucket_id]++;
@@ -1202,13 +1205,17 @@ bool relation::insert_in_delta(u64* t)
 
     // std::cout << "inserting delta for " << intern_tag << std::endl;
     //assert((int)bucket_id == mcomm.get_local_rank());
-    if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true)
+    int insert_res = delta[bucket_id].insert_tuple_from_array(t, arity+1);
+    if (insert_res == INSERT_SUCCESS)
     {
         delta_element_count++;
         delta_bucket_element_count[bucket_id]++;
         delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
         bucket_map[bucket_id] = 1;
 
+        return true;
+    } else if (insert_res == INSERT_UPDATED) {
+        bucket_map[bucket_id] = 1;
         return true;
     }
     return false;
@@ -1225,13 +1232,17 @@ bool relation::insert_in_newt(u64* t)
 
     // std::cout << "inserting newt for " << intern_tag << std::endl;
     //assert((int)bucket_id == mcomm.get_local_rank());
-    if (newt[bucket_id].insert_tuple_from_array(t, arity+1) == true)
+    int insert_res = newt[bucket_id].insert_tuple_from_array(t, arity+1);
+    if (insert_res == INSERT_SUCCESS)
     {
         newt_element_count++;
         newt_bucket_element_count[bucket_id]++;
         newt_sub_bucket_element_count[bucket_id][sub_bucket_id]++;
         bucket_map[bucket_id] = 1;
 
+        return true;
+    } else if (insert_res == INSERT_UPDATED) {
+        bucket_map[bucket_id] = 1;
         return true;
     }
     return false;
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index 0757011d..e34d6e76 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -19,6 +19,7 @@ enum {LEFT=0, RIGHT};
 enum {DELTA=0, FULL, FULL_AND_DELTA};
 enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE};
 enum {STATIC=0, DYNAMIC};
+enum {INSERT_SUCCESS=0, INSERT_FAIL, INSERT_UPDATED};
 
 using tuple_formator_t = std::function<void(const std::vector<u64>&)>;
 
@@ -188,7 +189,7 @@ class relation
 
 
     void set_full_element_count(int val)   {full_element_count = val;}
-    int get_full_element_count()    {return full_element_count;}
+    int get_full_element_count()    {return full[mcomm.get_rank()].count();}
     u32** get_full_sub_bucket_element_count()   {return full_sub_bucket_element_count;}
     u32 get_global_full_element_count();
 
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index 19287e53..3d1fe9a5 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -121,7 +121,7 @@ struct shmap_relation {
     };
 
     int count();
-    bool insert_tuple_from_array(u64* t, int arity);
+    int insert_tuple_from_array(u64* t, int arity);
     void remove_tuple();
     bool find_tuple_from_array(u64* t, int arity);
     bool check_dependent_insertion(const std::vector<u64> &v);
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 6d2ea852..d0fbc176 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -11,6 +11,7 @@
  */
 
 #include "../parallel_RA_inc.h"
+#include "balanced_hash_relation.h"
 #include "shmap_relation.h"
 #include <cassert>
 #include <cstddef>
@@ -25,7 +26,7 @@ shmap_relation::shmap_relation(int arity, bool id_flag)
     // ind = new t_ind(t_comparator(id_flag));
 }
 
-bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
+int shmap_relation::insert_tuple_from_array(u64 *t, int width)
 {
     t_tuple tp(t, t+width);
     // check if relation has functional dependance
@@ -60,7 +61,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
             //     }
             //     std::cout << std::endl;
             // }
-            return insert(tp);
+            if (insert(tp)) {
+                return INSERT_SUCCESS;
+            } else {
+                return INSERT_FAIL;
+            }
         } else {
             // update
             // iterator need_delete = ind.end();
@@ -100,7 +105,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
                 joined = true;
             }
             if (!joined) {
-                return insert(tp);
+                if (insert(tp)) {
+                    return INSERT_SUCCESS;
+                } else {
+                    return INSERT_FAIL;
+                }
             }
             if (!need_deletes.empty()) {
                 for (auto d: need_deletes) {
@@ -111,9 +120,13 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
                     // std::cout << std::endl;
                     ind.erase(*d);
                 }
-                return insert(tp);
+                if (insert(tp)) {
+                    return INSERT_SUCCESS;
+                } else {
+                    return INSERT_UPDATED;
+                }
             } else {
-                return false;
+                return INSERT_FAIL;
             }
         }
     } else {
@@ -122,7 +135,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width)
         //     std::cout << c << " ";
         // }
         // std::cout << std::endl;
-        return insert(tp);
+        if (insert(tp)) {
+            return INSERT_SUCCESS;
+        } else {
+            return INSERT_FAIL;
+        }
     }
 }
 
@@ -452,7 +469,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
     // for (auto c: upper_bound) {
     //     std::cout << c << " ";
     // }
-    std::cout << std::endl;
+    // std::cout << std::endl;
     auto joined_range = lowerUpperRange(lower_bound, upper_bound);
 
     if (generator_mode) {
@@ -516,7 +533,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
             //     std::cout << c << " ";
             // }
             // std::cout << std::endl;
-            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
+            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) != INSERT_FAIL)
             {
                 uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
                 uint64_t sub_bucket_id=0;
@@ -650,7 +667,7 @@ void shmap_relation::as_all_to_allv_left_join_buffer(
                 projected_path[i] = reordered_cur_path[reorder_map[i]];
             
             //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl;
-            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true)
+            if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) != INSERT_FAIL)
             {
                 uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets;
                 uint64_t sub_bucket_id=0;
diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
index 44733818..38953a06 100644
--- a/backend/tests/pagerank/compiled_pre/CMakeLists.txt
+++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
@@ -19,7 +19,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla
 set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
 file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
-file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank.cpp")
+file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank_full.cpp")
 
 ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
 
diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp
index ec7133ba..a24bcf25 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp
@@ -273,6 +273,7 @@ agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
   for (shmap_relation::iterator it = joined_range.first;
        it != joined_range.second; ++it) {
     auto tuple = (*it);
+    // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl;
     // if (tuple[1] == MAX_PG_ITERATION) {
       sum_res += tuple[tuple.size() - 2];
     // }
@@ -426,10 +427,10 @@ int main(int argc, char **argv) {
   std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
   // output dir from compiler
   std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
-  if (argc == 3) {
+  // if (argc == 3) {
     slog_input_dir = argv[1];
     slog_output_dir = argv[2];
-  }
+  // }
   load_input_relation(slog_input_dir);
   mpi_comm mcomm;
   mcomm.create(argc, argv);
@@ -455,7 +456,7 @@ int main(int argc, char **argv) {
 
   // (total_node_cnt n)
   relation *rel__total_node_cnt__1__1 =
-      new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"),
+      new relation(1, true, 1, get_tag_for_rel("total_node_cnt", "1"),
                    std::to_string(get_tag_for_rel("total_node_cnt", "1")) +
                        ".total_node_cnt.1.table",
                    slog_input_dir + "/" +
@@ -476,7 +477,7 @@ int main(int argc, char **argv) {
   //   std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
   //   std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
 
-    //   matrix edge + successor count
+  // matrix columns: from, to, out-degree (successor count) of `from`
   relation *rel__matrix__3__1 = new relation(
     1, true, 3, get_tag_for_rel("matrix", "1"),
     std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
@@ -489,13 +490,13 @@ int main(int argc, char **argv) {
     std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table",
     FULL);
 
-  // RAM *scc_copy_edge = new RAM(false, 0);
-  // scc_copy_edge->add_relation(rel__edge__2__1, false, false);
-  // scc_copy_edge->add_relation(rel__edge__2__2, true, false);
-  // scc_copy_edge->add_rule(
-  //     new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
+  relation *rel__node_outage_degree = new relation(
+    1, true, 2, get_tag_for_rel("node_outage_degree", "2"),
+    std::to_string(get_tag_for_rel("node_outage_degree", "2")) + ".node_outage_degree.table",
+    std::to_string(get_tag_for_rel("node_outage_degree", "2")) + ".node_outage_degree.table",
+    FULL);
 
-  RAM *scc_compute_matrix = new RAM(false, 1);
+  RAM *scc_compute_matrix = new RAM(false, 0);
   scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
   // scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
   scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
@@ -504,12 +505,12 @@ int main(int argc, char **argv) {
       agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
       {0, 1, 3}));
 
-  RAM *scc_helper_fact = new RAM(false, 0);
+  RAM *scc_helper_fact = new RAM(false, 1);
   scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false);
   scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)}));
 
   // [(node a) (node b) <-- (edge a b)]
-  RAM *scc_compute_node = new RAM(false, 1);
+  RAM *scc_compute_node = new RAM(false, 2);
   scc_compute_node->add_relation(rel__edge__2__1, false, false);
   scc_compute_node->add_relation(rel__node__1__1, true, false);
   scc_compute_node->add_rule(new parallel_copy_generate(
@@ -526,7 +527,7 @@ int main(int argc, char **argv) {
       }));
 
   // (total_node_cnt {count node _})
-  RAM *scc_count_nodes = new RAM(false, 2);
+  RAM *scc_count_nodes = new RAM(false, 3);
   scc_count_nodes->add_relation(rel__node__1__1, false, false);
   scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false);
   scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false);
@@ -535,7 +536,7 @@ int main(int argc, char **argv) {
       agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
       {2}));
 
-  RAM *scc_populate_dangling = new RAM(false, 3);
+  RAM *scc_populate_dangling = new RAM(false, 4);
   scc_populate_dangling->add_relation(rel__edge__2__1, false);
   scc_populate_dangling->add_relation(rel__dangling_node, true);
   scc_populate_dangling->add_relation(rel__node__1__1, false);
@@ -544,43 +545,54 @@ int main(int argc, char **argv) {
     {0}
   ));
 
-  LIE *cnt_lie = new LIE();
-  cnt_lie->add_relation(rel__edge__2__1);
+  RAM *scc_degree = new RAM(false, 5);
+  scc_degree->add_relation(rel__node_outage_degree, true);
+  scc_degree->add_relation(rel__matrix__3__1, false);
+  scc_degree->add_rule(new parallel_copy(
+    rel__node_outage_degree, rel__matrix__3__1, FULL, {0, 2}
+  ));
+
+  LIE *init_lie = new LIE();
+  init_lie->add_relation(rel__edge__2__1);
   // cnt_lie->add_relation(rel__edge__2__2);
-  cnt_lie->add_relation(rel__node__1__1);
-  cnt_lie->add_relation(rel___dollorunit__1__1);
-  cnt_lie->add_relation(rel__total_node_cnt__1__1);
-  cnt_lie->add_relation(rel__matrix__3__1);
-  cnt_lie->add_relation(rel__dangling_node);
-  cnt_lie->add_scc(scc_helper_fact);
-  cnt_lie->add_scc(scc_compute_node);
-  cnt_lie->add_scc(scc_count_nodes);
+  init_lie->add_relation(rel__node__1__1);
+  init_lie->add_relation(rel___dollorunit__1__1);
+  init_lie->add_relation(rel__total_node_cnt__1__1);
+  init_lie->add_relation(rel__matrix__3__1);
+  init_lie->add_relation(rel__dangling_node);
+  init_lie->add_relation(rel__node_outage_degree);
+  // init_lie->add_relation(rel__page_rank__2__1);
+  init_lie->add_scc(scc_helper_fact);
+  init_lie->add_scc(scc_compute_node);
+  init_lie->add_scc(scc_count_nodes);
   // cnt_lie->add_scc(scc_copy_edge);
-  cnt_lie->add_scc(scc_compute_matrix);
-  cnt_lie->add_scc(scc_populate_dangling);
+  init_lie->add_scc(scc_compute_matrix);
+  init_lie->add_scc(scc_populate_dangling);
+  init_lie->add_scc(scc_degree);
   // cnt_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
-  cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
-  cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
-
-  cnt_lie->enable_all_to_all_dump();
-  cnt_lie->set_output_dir(slog_output_dir); // Write to this directory
-  cnt_lie->set_comm(mcomm);
-  cnt_lie->set_batch_size(1);
-  cnt_lie->execute();
-  cnt_lie->print_all_relation_size(); // Continuously print relation sizes
+  init_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
+  init_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
+  init_lie->add_scc_dependance(scc_compute_matrix, scc_degree);
+
+  init_lie->enable_all_to_all_dump();
+  init_lie->set_output_dir(slog_output_dir); // Write to this directory
+  init_lie->set_comm(mcomm);
+  init_lie->set_batch_size(1);
+  init_lie->execute();
+  init_lie->print_all_relation_size(); // Continuously print relation sizes
+  MPI_Barrier(mcomm.get_comm());
 
   // only 1 data in this rel so its safe
   rel__total_node_cnt__1__1->print();
+  // rel__node_outage_degree->print();
 
-  u64 local_node_size = 0;
-  for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
-    if (t[0] != 0) {
-      local_node_size = t[0];
-    }
-  }
-  rel__matrix__3__1->print();
-  MPI_Barrier(mcomm.get_comm());
-
+  u64 local_node_size = rel__node__1__1->get_full_element_count();
+  // for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
+  //   if (t[0] != 0) {
+  //     local_node_size = t[0];
+  //   }
+  // }
+  // rel__matrix__3__1->print();
   MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, mcomm.get_comm());
 
   dangling_node_cnt = rel__dangling_node->get_global_full_element_count();
@@ -594,93 +606,188 @@ int main(int argc, char **argv) {
   rel__node__1__1->disable_initialization();
   rel__matrix__3__1->disable_initialization();
   rel__dangling_node->disable_initialization();
+  rel__node_outage_degree->disable_initialization();
   
   // rel__matrix__3__1->print();
 
   //////////////////  compute  Page rank
-  relation *rel__rank__3__1 = new relation(
-      1, true, 3, get_tag_for_rel("rank", "1"),
-      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table",
-      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL);
+  relation *rel__page_rank__2__1 = new relation(
+    1, true, 2, get_tag_for_rel("page_rank", "1"),
+    std::to_string(get_tag_for_rel("page_rank", "1")) + ".page_rank.2.table",
+    std::to_string(get_tag_for_rel("page_rank", "1")) + ".page_rank.2.table", FULL);
+  rel__page_rank__2__1->set_dependent_column_update(
+    {1,2},
+    [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+      // if (std::abs((int)new_v[1] - (int)old_v[1]) < 5) {
+      //     return false;
+      // } else {
+      //   return true;
+      // }
+      return true;
+    });
 
-  rel__rank__3__1->set_dependent_column_update(
+  relation *rel__sub_rank__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("sub_rank", "1"),
+      std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table",
+      std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table", FULL);
+
+  // page rank (node N, 
+  //            <node P that the sub page rank value comes from>,
+  //            <sub page rank value>)
+  rel__sub_rank__3__1->set_dependent_column_update(
       {1, 2, 3},
       [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
          const vector<u64> &nt) -> std::optional<bool> {
         if (new_v[0] != old_v[0]) {
+          // std::cout << "New " <<   new_v[0] << " " << new_v[1] <<  " Old " << old_v[0] << " " << old_v[1] << std::endl;
           return std::nullopt;
         }
-        // if (std::abs((int)new_v[1] - (int)old_v[1]) < 10) {
-        //   return false;
-        // }
         return true;
       });
 
+  LIE *pg_defaukt_lie = new LIE();
+  RAM *scc_defaultv = new RAM(false, 0);
+  scc_defaultv->add_relation(rel__node_outage_degree, false, false);
+  scc_defaultv->add_relation(rel__page_rank__2__1, true, false);
+  scc_defaultv->add_rule(new parallel_copy_generate(
+      rel__page_rank__2__1, rel__node_outage_degree, FULL,
+      [](const u64 *const data, u64 *const output) -> int {
+        output[0] = data[0];
+        output[1] = dangling_value;
+        return 1;
+      }));
+
+  pg_defaukt_lie->add_relation(rel__node_outage_degree);
+  pg_defaukt_lie->add_relation(rel__page_rank__2__1);
+  pg_defaukt_lie->add_scc(scc_defaultv);
+  pg_defaukt_lie->set_output_dir(slog_output_dir); // Write to this directory
+  pg_defaukt_lie->set_comm(mcomm);
+  pg_defaukt_lie->set_batch_size(1);
+  pg_defaukt_lie->execute();
+
+  rel__page_rank__2__1->disable_initialization();
+
   std::vector<LIE*> pg_lie_list;
 
   for (int i = 0; i < MAX_PG_ITERATION; i++) {
     std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter
               << std::endl;
     LIE *pg_lie = new LIE();
-
-    // RAM *scc_
-    RAM *scc_init = new RAM(false, 0);
-    scc_init->add_relation(rel__matrix__3__1, false, false);
-    scc_init->add_relation(rel__rank__3__1, true, false);
-    scc_init->add_rule(new parallel_copy_generate(
-        rel__rank__3__1, rel__matrix__3__1, FULL,
-        [](const u64 *const data, u64 *const output) -> int {
-          output[0] = data[0];
-          output[1] = data[0];
-          output[2] = dangling_value;
-          return 1;
-        }));
     RAM *scc_page_rank = new RAM(false, 1);
     scc_page_rank->add_relation(rel__matrix__3__1, false, false);
-    scc_page_rank->add_relation(rel__rank__3__1, true, false);
-    parallel_join *rank_join =
-        new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
-                          rel__rank__3__1, DELTA, {3, 1, 2} // useless
+    scc_page_rank->add_relation(rel__page_rank__2__1, false, false);
+    scc_page_rank->add_relation(rel__sub_rank__3__1, false, false);
+    // scc_page_rank->add_relation(rel__node_outage_degree, false, false);
+    parallel_join *sub_rank_join =
+        new parallel_join(rel__sub_rank__3__1, 
+                          rel__page_rank__2__1, FULL,
+                          rel__matrix__3__1, FULL,
+                          {3, 1, 2} // useless
         );
-    rank_join->set_generator_func([](const depend_val_t &target_vs,
+    sub_rank_join->set_generator_func([](const depend_val_t &target_vs,
                                      const std::vector<u64> &input_v,
                                      depend_val_t &res_set) -> bool {
-      // if (current_iter > MAX_PG_ITERATION) {
-      //   return false;
-      // }
-      u64 pg_sum = dangling_node_cnt * dangling_value;
+      u64 pg_v = dangling_node_cnt * dangling_value;
+      // std::cout << input_v[0] << " " << input_v[1] << " " << input_v[2]  << std::endl;
       int count = 0;
       for (auto &tv : target_vs) {
-        if ((tv[0] == tv[1]) && (current_iter != 0)) {
-          continue;
-        }
-        u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first
-        if (current_iter == 0) {
-          raw_succ_pg_v_sub = raw_succ_pg_v_sub / input_v[2];
-        }
-        pg_sum += (u64)(raw_succ_pg_v_sub * ALPHA);
+        u32 raw_succ_pg_v_sub = tv[1]; // all columns are u64, cast to u32 first
+        pg_v += raw_succ_pg_v_sub;
         count++;
       }
-      pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size;
       std::vector<u64> res_tuple(3, 0);
       res_tuple[0] = input_v[1];
       res_tuple[1] = input_v[0];
-      res_tuple[2] = pg_sum / input_v[2];
+      res_tuple[2] = (u64)((pg_v * ALPHA) / input_v[2]);
       res_set.push_back(res_tuple);
       return true;
     });
-    scc_page_rank->add_rule(rank_join);
+    scc_page_rank->add_rule(sub_rank_join);
+
+    RAM *scc_sum = new RAM(false, 1);
+    scc_sum->add_relation(rel__page_rank__2__1, false, false);
+    scc_sum->add_relation(rel__sub_rank__3__1, false, false);
+    scc_sum->add_relation(rel__node__1__1, false, false);
+    scc_sum->add_rule(
+      new parallel_join_aggregate(
+        rel__page_rank__2__1,
+        rel__sub_rank__3__1,
+        rel__node__1__1, FULL,
+        [](std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range) -> local_agg_res_t {
+          local_agg_res_t sum_res = (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size);
+          // std::cout << sum_res << std::endl;
+          for (shmap_relation::iterator it = joined_range.first;
+               it != joined_range.second; ++it) {
+            auto tp = *it;
+            sum_res += tp[2];
+          }
+          return sum_res;
+        }, SpecialAggregator::sum, agg_sum_reduce, nullptr,
+        {0, 2}
+      ));
+
+    // RAM *scc_init = new RAM(false, 0);
+    // scc_init->add_relation(rel__matrix__3__1, false, false);
+    // scc_init->add_relation(rel__sub_rank__3__1, true, false);
+    // scc_init->add_rule(new parallel_copy_generate(
+    //     rel__sub_rank__3__1, rel__matrix__3__1, FULL,
+    //     [](const u64 *const data, u64 *const output) -> int {
+    //       output[0] = data[0];
+    //       output[1] = data[0];
+    //       output[2] = (u64)((ALPHA * dangling_value) / data[2]);
+    //       return 1;
+    //     }));
+    // RAM *scc_page_rank = new RAM(false, 1);
+    // scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+    // scc_page_rank->add_relation(rel__sub_rank__3__1, true, false);
+    // parallel_join *rank_join =
+    //     new parallel_join(rel__sub_rank__3__1, rel__matrix__3__1, FULL,
+    //                       rel__sub_rank__3__1, FULL, {3, 1, 2} // useless
+    //     );
+    // rank_join->set_generator_func([](const depend_val_t &target_vs,
+    //                                  const std::vector<u64> &input_v,
+    //                                  depend_val_t &res_set) -> bool {
+    //   // if (current_iter > MAX_PG_ITERATION) {
+    //   //   return false;
+    //   // }
+    //   u64 pg_sum = dangling_node_cnt * dangling_value;
+    //   int count = 0;
+    //   for (auto &tv : target_vs) {
+    //     if ((tv[0] == tv[1]) && (current_iter != 0)) {
+    //       continue;
+    //     }
+    //     u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first
+    //     pg_sum += raw_succ_pg_v_sub;
+    //     count++;
+    //   }
+    //   pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size;
+    //   std::vector<u64> res_tuple(3, 0);
+    //   res_tuple[0] = input_v[1];
+    //   res_tuple[1] = input_v[0];
+    //   res_tuple[2] = (u64)(pg_sum * ALPHA / input_v[2]);
+    //   res_set.push_back(res_tuple);
+    //   return true;
+    // });
+    // scc_page_rank->add_rule(rank_join);
+    // pg_lie->add_relation(rel__matrix__3__1);
+    // pg_lie->add_relation(rel__node__1__1);
+    // pg_lie->add_relation(rel__sub_rank__3__1);
+    // pg_lie->add_scc(scc_page_rank);
+    // if (current_iter == 0) {
+    //   pg_lie->add_scc(scc_init);
+    //   pg_lie->add_scc_dependance(scc_init, scc_page_rank);
+    // }
 
-    pg_lie_list.push_back(pg_lie);
-    pg_lie->add_relation(rel__matrix__3__1);
+    pg_lie->add_relation(rel__page_rank__2__1);
+    pg_lie->add_relation(rel__sub_rank__3__1);
     pg_lie->add_relation(rel__node__1__1);
-    pg_lie->add_relation(rel__rank__3__1);
+    pg_lie->add_relation(rel__matrix__3__1);
     pg_lie->add_scc(scc_page_rank);
-    if (current_iter == 0) {
-      pg_lie->add_scc(scc_init);
-      pg_lie->add_scc_dependance(scc_init, scc_page_rank);
-    }
-    // Enable IO
+    pg_lie->add_scc(scc_sum);
+    pg_lie->add_scc_dependance(scc_page_rank, scc_sum);
+    pg_lie_list.push_back(pg_lie);
+
     if (i == MAX_PG_ITERATION - 1) {
       pg_lie->enable_all_to_all_dump();
       pg_lie->enable_data_IO();
@@ -691,10 +798,19 @@ int main(int argc, char **argv) {
     pg_lie->set_comm(mcomm);
     pg_lie->set_batch_size(1);
     pg_lie->execute();
-    current_iter++;
-    rel__rank__3__1->disable_initialization();
+    rel__sub_rank__3__1->disable_initialization();
     pg_lie->print_all_relation_size(); // Continuously print relation sizes
+    current_iter++;
+  //     rel__page_rank__2__1->print([](const std::vector<u64> &tp) {
+  //   u32 pg_v = tp[1];
+  //   // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
+  //   std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
+  // });
+    // // need this?
+    // MPI_Barrier(mcomm.get_comm());
   }
+  rel__page_rank__2__1->print();
+
   // rel__rank__4__1->print(
   //   [](const std::vector<u64>& tp){
   //     u32 pg_v = tp[3];
@@ -703,47 +819,91 @@ int main(int argc, char **argv) {
   //     << tp[2] << " " << pg_v << std::endl;
   //   }
   // );
-  // delete pg_pre_lie;
-  // delete pg_lie;
-
-  std::cout << "Aggregating sum ..." << std::endl;
-  relation *rel__result__2__1__2 = new relation(
-      2, true, 2, get_tag_for_rel("result", "1__2"),
-      std::to_string(get_tag_for_rel("result", "1__2")) +
-      ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2"))
-      + ".result.2.table", FULL);
-
-  RAM *scc_result = new RAM(false, 4);
-  scc_result->add_relation(rel__rank__3__1, false, false);
-  scc_result->add_relation(rel__result__2__1__2, true, false);
-  scc_result->add_relation(rel__node__1__1, false, false);
-  scc_result->add_rule(new parallel_join_aggregate(
-      rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0,
-      2}));
-
-  LIE* final_lie = new LIE();
-  final_lie->add_relation(rel__result__2__1__2);
-  final_lie->add_relation(rel__node__1__1);
-  final_lie->add_relation(rel__rank__3__1);
-  final_lie->add_scc(scc_result);
-  final_lie->enable_all_to_all_dump();
-  final_lie->enable_data_IO();
-  final_lie->enable_IO();
-
-  final_lie->set_output_dir(slog_output_dir); // Write to this directory
-  final_lie->set_comm(mcomm);
-  final_lie->set_batch_size(1);
-  final_lie->execute();
-  final_lie->print_all_relation_size(); // Continuously print relation sizes
+
+  // std::cout << "Aggregating sum ..." << std::endl;
+  // relation *rel__result__2__1__2 = new relation(
+  //     2, true, 2, get_tag_for_rel("result", "1__2"),
+  //     std::to_string(get_tag_for_rel("result", "1__2")) +
+  //     ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2"))
+  //     + ".result.2.table", FULL);
+  // relation *rel__sum_pg__1__1__1 = new relation(
+  //   1, true, 1, get_tag_for_rel("sum_pg", "1"),
+  //   std::to_string(get_tag_for_rel("sum_pg", "1")) + "sum_pg.2.table",
+  //   std::to_string(get_tag_for_rel("sum_pg", "1")) + "sum_pg.2.table",
+  //   FULL);
+
+  // RAM *scc_result = new RAM(false, 4);
+  // scc_result->add_relation(rel__sub_rank__3__1, false, false);
+  // scc_result->add_relation(rel__result__2__1__2, true, false);
+  // scc_result->add_relation(rel__node__1__1, false, false);
+  // scc_result->add_relation(rel__sum_pg__1__1__1, true, false);
+  // scc_result->add_rule(new parallel_join_aggregate(
+  //     rel__result__2__1__2, rel__sub_rank__3__1, rel__node__1__1, FULL,
+  //     [](std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range) {
+  //       local_agg_res_t sum_res = 0;
+  //       for (shmap_relation::iterator it = joined_range.first;
+  //           it != joined_range.second; ++it) {
+  //         auto tuple = (*it);
+  //         // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl;
+  //         if (tuple[0] != tuple[1]) {
+  //           sum_res += tuple[tuple.size() - 2];
+  //         }
+  //       }
+  //       sum_res += (u64)((1 - ALPHA) / total_node_size);
+  //       return sum_res;
+  //     },
+  //     SpecialAggregator::sum,
+  //     agg_sum_reduce,
+  //     nullptr, {0, 2}));
+  // scc_result->add_rule(new parallel_join_aggregate(
+  //   rel__sum_pg__1__1__1, rel__sub_rank__3__1, rel___dollorunit__1__1, FULL,
+  //   [](std::pair<shmap_relation::iterator, shmap_relation::iterator> joined_range) {
+  //       local_agg_res_t sum_res = 0;
+  //       for (shmap_relation::iterator it = joined_range.first;
+  //           it != joined_range.second; ++it) {
+  //         auto tuple = (*it);
+  //         // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl;
+  //         if (tuple[0] != tuple[1]) {
+  //           sum_res += tuple[tuple.size() - 2];
+  //         }
+  //       }
+  //       // sum_res += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST);
+  //       return sum_res;
+  //     },
+  //   SpecialAggregator::sum, agg_sum_reduce, nullptr,
+  //   {2}));
+
+  // LIE* final_lie = new LIE();
+  // final_lie->add_relation(rel__result__2__1__2);
+  // final_lie->add_relation(rel__node__1__1);
+  // final_lie->add_relation(rel__sub_rank__3__1);
+  // final_lie->add_relation(rel__sum_pg__1__1__1);
+  // final_lie->add_scc(scc_result);
+  // final_lie->enable_all_to_all_dump();
+  // final_lie->enable_data_IO();
+  // final_lie->enable_IO();
+
+  // final_lie->set_output_dir(slog_output_dir); // Write to this directory
+  // final_lie->set_comm(mcomm);
+  // final_lie->set_batch_size(1);
+  // final_lie->execute();
+  // final_lie->print_all_relation_size(); // Continuously print relation sizes
 
   // rel__rank__3__1->print();
+  // rel__node__1__1->print();
+
+  // rel__result__2__1__2->print([](const std::vector<u64> &tp) {
+  //   u32 pg_v = tp[1];
+  //   // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
+  //   std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
+  // });
+  // rel__sum_pg__1__1__1->print([](const std::vector<u64> &tp) {
+  //   u32 pg_v = tp[0];
+  //   // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
+  //   std::cout << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
+  // });
+
 
-  rel__result__2__1__2->print([](const std::vector<u64> &tp) {
-    u32 pg_v = tp[1];
-    // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
-    std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
-  });
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
   mcomm.destroy();
diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
index c0636f61..11055725 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
@@ -1,5 +1,6 @@
 // location of `parallel_RA_inc.h` here
 #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "mpi.h"
 
 #include <bit>
 #include <iostream>
@@ -35,6 +36,14 @@ const u64 str_tag = 2;
 const u64 sign_flip_const = 0x0000200000000000;
 const u64 signed_num_mask = 0xFFFFE00000000000;
 
+#define FLOAT_SCALE_CONST 100000 // fixed-point scale: stored rank values are divided by this when printed
+float ALPHA = 0.85; // weight on propagated rank; (1 - ALPHA) scales the per-node base term (presumably the damping factor)
+u64 total_node_size = 0; // global node count; main() fills it with an MPI sum of per-rank node counts
+u64 dangling_value = 0; // main() sets this to FLOAT_SCALE_CONST / total_node_size
+u64 current_iter = 0; // page-rank iterations executed so far; main() increments it once per loop pass
+int MAX_PG_ITERATION = 2; // number of page-rank iterations; main() overwrites it with atoi(argv[3])
+u64 dangling_node_cnt; // main() sets this to the global element count of the dangling_node relation
+
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
   // int_tag) << "\n";
@@ -264,7 +273,10 @@ agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
   for (shmap_relation::iterator it = joined_range.first;
        it != joined_range.second; ++it) {
     auto tuple = (*it);
+    // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl;
+    // if (tuple[1] == MAX_PG_ITERATION) {
     sum_res += tuple[tuple.size() - 2];
+    // }
   }
   return sum_res;
 }
@@ -410,23 +422,21 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   return max_rel;
 }
 
-float ALPHA = 0.85;
-u64 total_node_size = 0;
-u64 dangling_value = 0;
-
 int main(int argc, char **argv) {
   // input dir from compiler
   std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
   // output dir from compiler
   std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
-  if (argc == 3) {
-    slog_input_dir = argv[1];
-    slog_output_dir = argv[2];
-  }
+  // if (argc == 3) {  (check disabled: argv[1..3] are read unconditionally below)
+  slog_input_dir = argv[1];
+  slog_output_dir = argv[2];
+  // }
   load_input_relation(slog_input_dir);
   mpi_comm mcomm;
   mcomm.create(argc, argv);
 
+  MAX_PG_ITERATION = atoi(argv[3]);
+
   // (edge from to)
   relation *rel__edge__2__1 = new relation(
       1, true, 2, get_tag_for_rel("edge", "1__2"),
@@ -462,12 +472,43 @@ int main(int argc, char **argv) {
           ".$unit.1.table",
       FULL);
 
-  RAM *scc_helper_fact = new RAM(false, 0);
+  // from, to, out-degree of `from` (called "outage degree" in the relation names)
+  relation *rel__matrix__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("matrix", "1"),
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
+      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
+
+  relation *rel__dangling_node =
+      new relation(1, true, 1, get_tag_for_rel("dangling_node", "1"),
+                   std::to_string(get_tag_for_rel("dangling_node", "1")) +
+                       ".dangling_node.table",
+                   std::to_string(get_tag_for_rel("dangling_node", "1")) +
+                       ".dangling_node.table",
+                   FULL);
+
+  relation *rel__node_outage_degree =
+      new relation(1, true, 2, get_tag_for_rel("node_outage_degree", "2"),
+                   std::to_string(get_tag_for_rel("node_outage_degree", "2")) +
+                       ".node_outage_degree.table",
+                   std::to_string(get_tag_for_rel("node_outage_degree", "2")) +
+                       ".node_outage_degree.table",
+                   FULL);
+
+  RAM *scc_compute_matrix = new RAM(false, 0);
+  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
+  // scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
+  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
+  scc_compute_matrix->add_rule(new parallel_join_aggregate(
+      rel__matrix__3__1, rel__edge__2__1, rel__edge__2__1, FULL,
+      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
+      {0, 1, 3}));
+
+  RAM *scc_helper_fact = new RAM(false, 1);
   scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false);
   scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)}));
 
   // [(node a) (node b) <-- (edge a b)]
-  RAM *scc_compute_node = new RAM(false, 1);
+  RAM *scc_compute_node = new RAM(false, 2);
   scc_compute_node->add_relation(rel__edge__2__1, false, false);
   scc_compute_node->add_relation(rel__node__1__1, true, false);
   scc_compute_node->add_rule(new parallel_copy_generate(
@@ -484,7 +525,7 @@ int main(int argc, char **argv) {
       }));
 
   // (total_node_cnt {count node _})
-  RAM *scc_count_nodes = new RAM(false, 2);
+  RAM *scc_count_nodes = new RAM(false, 3);
   scc_count_nodes->add_relation(rel__node__1__1, false, false);
   scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false);
   scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false);
@@ -493,258 +534,211 @@ int main(int argc, char **argv) {
       agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
       {2}));
 
-  LIE *cnt_lie = new LIE();
-  cnt_lie->add_relation(rel__edge__2__1);
-  cnt_lie->add_relation(rel__node__1__1);
-  cnt_lie->add_relation(rel___dollorunit__1__1);
-  cnt_lie->add_relation(rel__total_node_cnt__1__1);
-  cnt_lie->add_scc(scc_helper_fact);
-  cnt_lie->add_scc(scc_compute_node);
-  cnt_lie->add_scc(scc_count_nodes);
-  cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
-  cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
-
-  cnt_lie->enable_all_to_all_dump();
-  cnt_lie->set_output_dir(slog_output_dir); // Write to this directory
-  cnt_lie->set_comm(mcomm);
-  cnt_lie->set_batch_size(1);
-  cnt_lie->execute();
-  cnt_lie->print_all_relation_size(); // Continuously print relation sizes
+  RAM *scc_populate_dangling = new RAM(false, 4);
+  scc_populate_dangling->add_relation(rel__edge__2__1, false);
+  scc_populate_dangling->add_relation(rel__dangling_node, true);
+  scc_populate_dangling->add_relation(rel__node__1__1, false);
+  scc_populate_dangling->add_rule(new parallel_join_negate(
+      rel__dangling_node, rel__node__1__1, FULL, rel__edge__2__1, {0}));
+
+  RAM *scc_degree = new RAM(false, 5);
+  scc_degree->add_relation(rel__node_outage_degree, true);
+  scc_degree->add_relation(rel__matrix__3__1, false);
+  scc_degree->add_rule(new parallel_copy(rel__node_outage_degree,
+                                         rel__matrix__3__1, FULL, {0, 2}));
+
+  LIE *init_lie = new LIE();
+  init_lie->add_relation(rel__edge__2__1);
+  init_lie->add_relation(rel__node__1__1);
+  init_lie->add_relation(rel___dollorunit__1__1);
+  init_lie->add_relation(rel__total_node_cnt__1__1);
+  init_lie->add_relation(rel__matrix__3__1);
+  init_lie->add_relation(rel__dangling_node);
+  init_lie->add_relation(rel__node_outage_degree);
+  init_lie->add_scc(scc_helper_fact);
+  init_lie->add_scc(scc_compute_node);
+  init_lie->add_scc(scc_count_nodes);
+  init_lie->add_scc(scc_compute_matrix);
+  init_lie->add_scc(scc_populate_dangling);
+  init_lie->add_scc(scc_degree);
+  init_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes);
+  init_lie->add_scc_dependance(scc_compute_node, scc_count_nodes);
+  init_lie->add_scc_dependance(scc_compute_matrix, scc_degree);
+
+  init_lie->enable_all_to_all_dump();
+  init_lie->set_output_dir(slog_output_dir); // Write to this directory
+  init_lie->set_comm(mcomm);
+  init_lie->set_batch_size(1);
+  init_lie->execute();
+  init_lie->print_all_relation_size(); // Continuously print relation sizes
 
   // only 1 data in this rel so its safe
-  rel__total_node_cnt__1__1->print();
 
-  for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) {
-    total_node_size = t[0];
-    dangling_value = (u64)(((1 - ALPHA) / total_node_size) * 100000);
-    std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl;
-  }
+  u64 local_node_size = rel__node__1__1->get_full_element_count();
+  MPI_Barrier(mcomm.get_comm());
 
-  // >>>>>>>>>>>>>>> compute page rank
-  std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl;
+  MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG,
+                MPI_SUM, mcomm.get_comm());
+
+  dangling_node_cnt = rel__dangling_node->get_global_full_element_count();
+  dangling_value = FLOAT_SCALE_CONST / total_node_size;
+  std::cout << ">>>>>>>>> Number of nodes: " << total_node_size
+            << " >>>>>>>>> Dangling node count: " << dangling_node_cnt
+            << " >>>>>>>>> Dangling value: "
+            << dangling_value * 1.0 / FLOAT_SCALE_CONST << std::endl;
 
   rel__edge__2__1->disable_initialization();
   rel__node__1__1->disable_initialization();
+  rel__matrix__3__1->disable_initialization();
+  rel__dangling_node->disable_initialization();
+  rel__node_outage_degree->disable_initialization();
 
-  relation *rel__edge__2__2 = new relation(
-      1, false, 2, get_tag_for_rel("edge", "2"),
-      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table",
-      std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL);
-
-  //   matrix edge + successor count
-  relation *rel__matrix__3__1 = new relation(
-      1, true, 3, get_tag_for_rel("matrix", "1"),
-      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table",
-      std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL);
-
-  relation *rel__rank__3__1 = new relation(
-      1, true, 3, get_tag_for_rel("rank", "1"),
-      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table",
-      std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL);
+  // rel__matrix__3__1->print();
+  relation *rel__sub_rank__3__1 = new relation(
+      1, true, 3, get_tag_for_rel("sub_rank", "1"),
+      std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table",
+      std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table",
+      FULL);
 
-  rel__rank__3__1->set_dependent_column_update(
+  // sub_rank (node N,
+  //           <node P that this sub page rank value comes from>,
+  //           <sub page rank value>)
+  rel__sub_rank__3__1->set_dependent_column_update(
       {1, 2, 3},
       [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
          const vector<u64> &nt) -> std::optional<bool> {
-        // if (nt[0] == 59 && nt[1] == 58) {
-        //   std::cout << "dependent column size " << new_v.size() << std::endl;
-        //   std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << "
-        //   comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2]
-        //   << std::endl;
-        // }
         if (new_v[0] != old_v[0]) {
-          // std::cout << " www >>>>>>>>>" << std::endl;
           return std::nullopt;
-        } else {
-          // monotonic
-          // assert(new_v[1] > old_v[1]);
-          // u32 new_sum_raw = new_v[1];
-          // u32 old_sum_raw = old_v[1];
-          // float new_sum = *reinterpret_cast<float*>(&new_sum_raw);
-          // float old_sum = *reinterpret_cast<float*>(&old_sum_raw);
-          // if (new_sum > old_sum) {
-          //   std::cout << "new >> " << new_sum << " old >> " << old_sum <<
-          //   std::endl;
-          // }
-          // return new_sum > old_sum;
-          return new_v[1] > old_v[1];
-          // return true;
         }
+        return true;
       });
 
-  relation *rel__result__2__1__2 = new relation(
-      2, true, 2, get_tag_for_rel("result", "1__2"),
-      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
-      std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table",
-      FULL);
-
-  //
-
-  RAM *scc_copy_edge = new RAM(false, 0);
-  scc_copy_edge->add_relation(rel__edge__2__1, false, false);
-  scc_copy_edge->add_relation(rel__edge__2__2, true, false);
-  scc_copy_edge->add_rule(
-      new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2}));
-
-  RAM *scc_compute_matrix = new RAM(false, 1);
-  scc_compute_matrix->add_relation(rel__edge__2__1, false, false);
-  scc_compute_matrix->add_relation(rel__edge__2__2, false, false);
-  scc_compute_matrix->add_relation(rel__matrix__3__1, true, false);
-  scc_compute_matrix->add_rule(new parallel_join_aggregate(
-      rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL,
-      agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr,
-      {0, 1, 3}));
-
-  RAM *scc_init = new RAM(false, 2);
-  scc_init->add_relation(rel__matrix__3__1, false, false);
-  scc_init->add_relation(rel__rank__3__1, true, false);
-  scc_init->add_rule(new parallel_copy_generate(
-      rel__rank__3__1, rel__matrix__3__1, FULL,
-      [](const u64 *const data, u64 *const output) -> int {
-        output[0] = data[0];
-        output[1] = data[0];
-        // float init_pg_v = (1 - ALPHA) / total_node_size;
-        u64 init_pg_v = dangling_value;
-        // std::cout << init_pg_v << std::endl;
-        // output[2] = *reinterpret_cast<u32*>(&init_pg_v);
-        output[2] = init_pg_v;
-        return 1;
-      }));
-
-  RAM *scc_page_rank = new RAM(true, 3);
-  scc_page_rank->add_relation(rel__matrix__3__1, false, false);
-  scc_page_rank->add_relation(rel__rank__3__1, true, false);
-  parallel_join *rank_join =
-      new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL,
-                        rel__rank__3__1, DELTA, {3, 1, 2} // useless
-      );
-  rank_join->set_generator_func([](const depend_val_t &target_vs,
-                                   const std::vector<u64> &input_v,
-                                   depend_val_t &res_set) -> bool {
-    // float pg_sum = 0.0;
-    u64 pg_sum = dangling_value;
-
-    int count = 0;
-    for (auto &tv : target_vs) {
-      // std::cout << "tagret v >>>>> ";
-      // for (auto c: tv) {
-      //   std::cout << c << " ";
-      // }
-      // std::cout << std::endl;
-      u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first
-      // std::cout << ">>>>>>>>>>>>>>> " <<
-      // *reinterpret_cast<float*>(&raw_succ_pg_v) << std::endl;
-      // auto succ_pg_v = *reinterpret_cast<float*>(&raw_succ_pg_v);
-      // if(succ_pg_v == 0) {
-      //   // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl;
-      // std::cout << "tagret v >>>>> ";
-      // for (auto c: tv) {
-      //   std::cout << c << " ";
-      // }
-      // std::cout << std::endl;
-      // }
-      if (input_v[2] != 0) {
-        // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]);
-        pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]);
-        // if (input_v[1] == 51) {
-        //   std::cout << "Sum 51 " << input_v[0] << " with ";
-        //   for (auto c: tv) {
-        //     std::cout << c << " ";
-        //   }
-        //   std::cout << " result " << pg_sum << std::endl;
-        // }
+  std::vector<LIE *> pg_lie_list;
+
+  for (int i = 0; i < MAX_PG_ITERATION; i++) {
+    std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter
+              << std::endl;
+    LIE *pg_lie = new LIE();
+
+    RAM *scc_init = new RAM(false, 0);
+    scc_init->add_relation(rel__matrix__3__1, false, false);
+    scc_init->add_relation(rel__sub_rank__3__1, true, false);
+    scc_init->add_rule(new parallel_copy_generate(
+        rel__sub_rank__3__1, rel__matrix__3__1, FULL,
+        [](const u64 *const data, u64 *const output) -> int {
+          output[0] = data[1];
+          output[1] = data[0];
+          output[2] = (u64)((ALPHA * dangling_value) / data[2]);
+          return 1;
+        }));
+    RAM *scc_page_rank = new RAM(false, 1);
+    scc_page_rank->add_relation(rel__matrix__3__1, false, false);
+    scc_page_rank->add_relation(rel__sub_rank__3__1, true, false);
+    parallel_join *rank_join =
+        new parallel_join(rel__sub_rank__3__1,
+                          rel__matrix__3__1, FULL,
+                          rel__sub_rank__3__1, DELTA,
+                          {3, 1, 2} // useless
+        );
+    rank_join->set_generator_func([](const depend_val_t &target_vs,
+                                     const std::vector<u64> &input_v,
+                                     depend_val_t &res_set) -> bool {
+      u64 pg_sum = dangling_node_cnt * dangling_value;
+      int count = 0;
+      for (auto &tv : target_vs) {
+        u64 raw_succ_pg_v_sub = tv[2];
+        pg_sum += raw_succ_pg_v_sub;
+        count++;
       }
-      count++;
-    }
-    if (pg_sum == 0) {
-      return false;
+      pg_sum += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size);
+      std::vector<u64> res_tuple(3, 0);
+      res_tuple[0] = input_v[1];
+      res_tuple[1] = input_v[0];
+      res_tuple[2] = (u64)(pg_sum * ALPHA / input_v[2]);
+      res_set.push_back(res_tuple);
+      return true;
+    });
+    scc_page_rank->add_rule(rank_join);
+    pg_lie->add_relation(rel__matrix__3__1);
+    pg_lie->add_relation(rel__node__1__1);
+    pg_lie->add_relation(rel__sub_rank__3__1);
+    pg_lie->add_scc(scc_page_rank);
+    if (current_iter == 0) {
+      pg_lie->add_scc(scc_init);
+      pg_lie->add_scc_dependance(scc_init, scc_page_rank);
     }
-    if (count == 0) {
-      return false;
+
+    pg_lie_list.push_back(pg_lie);
+
+    if (i == MAX_PG_ITERATION - 1) {
+      pg_lie->enable_all_to_all_dump();
+      pg_lie->enable_data_IO();
+      pg_lie->enable_IO();
     }
-    std::vector<u64> res_tuple(3, 0);
-    res_tuple[0] = input_v[1];
-    res_tuple[1] = input_v[0];
-    // res_tuple[2] = *reinterpret_cast<u32*>(&pg_sum);
-    res_tuple[2] = pg_sum;
-    // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl;
-    // for (auto c: res_tuple) {
-    //   std::cout << c << " ";
-    // }
-    // std::cout << std::endl;
-    res_set.push_back(res_tuple);
-    return true;
-  });
-  scc_page_rank->add_rule(rank_join);
+    // lie->enable_share_io();
+    pg_lie->set_output_dir(slog_output_dir); // Write to this directory
+    pg_lie->set_comm(mcomm);
+    pg_lie->set_batch_size(1);
+    pg_lie->execute();
+    rel__sub_rank__3__1->disable_initialization();
+    pg_lie->print_all_relation_size(); // Continuously print relation sizes
+    current_iter++;
+    // // need this?
+    // MPI_Barrier(mcomm.get_comm());
+  }
+
+  std::cout << "Aggregating Page Rank Result ..." << std::endl;
+  relation *rel__result__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("result", "1__2"),
+      std::to_string(get_tag_for_rel("result", "1__2")) +
+      ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2"))
+      + ".result.2.table", FULL);
 
   RAM *scc_result = new RAM(false, 4);
-  scc_result->add_relation(rel__rank__3__1, false, false);
+  scc_result->add_relation(rel__sub_rank__3__1, false, false);
   scc_result->add_relation(rel__result__2__1__2, true, false);
   scc_result->add_relation(rel__node__1__1, false, false);
-  // scc_result->add_rule(new parallel_join_aggregate(
-  //     rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-  //     agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce,
-  //     nullptr, {0, 2}));
+  // scc_result->add_relation(rel__sum_pg__1__1__1, true, false);
   scc_result->add_rule(new parallel_join_aggregate(
-      rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL,
-      agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2}));
-
-  LIE *pg_lie = new LIE();
-  pg_lie->add_relation(rel__edge__2__1);
-  pg_lie->add_relation(rel__matrix__3__1);
-  pg_lie->add_relation(rel__node__1__1);
-  pg_lie->add_relation(rel__edge__2__2);
-  pg_lie->add_relation(rel__rank__3__1);
-  pg_lie->add_relation(rel__result__2__1__2);
-  pg_lie->add_scc(scc_copy_edge);
-  pg_lie->add_scc(scc_compute_matrix);
-  pg_lie->add_scc(scc_init);
-  pg_lie->add_scc(scc_page_rank);
-  pg_lie->add_scc(scc_result);
-  pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix);
-  pg_lie->add_scc_dependance(scc_compute_matrix, scc_init);
-  pg_lie->add_scc_dependance(scc_init, scc_page_rank);
-  pg_lie->add_scc_dependance(scc_page_rank, scc_result);
-
-  // Enable IO
-  pg_lie->enable_all_to_all_dump();
-  pg_lie->enable_data_IO();
-  pg_lie->enable_IO();
-  // lie->enable_share_io();
-  pg_lie->set_output_dir(slog_output_dir); // Write to this directory
-  pg_lie->set_comm(mcomm);
-  pg_lie->set_batch_size(1);
-  pg_lie->execute();
-  pg_lie->print_all_relation_size(); // Continuously print relation sizes
-  // lie->stat_intermediate();
-  // rel__matrix__3__1->print();
-  // rel__rank__3__1->print(
-  //   [](const std::vector<u64>& tp){
-  //     u32 pg_v = tp[2];
-  //     // std::cout << tp[0] << " " << tp[1] << " " <<
-  //     *reinterpret_cast<float*>(&pg_v) << std::cout << tp[0] << " " << tp[1]
-  //     << " " << pg_v << std::endl;
-  //   }
-  // );
+      rel__result__2__1__2, rel__sub_rank__3__1, rel__node__1__1, FULL,
+      [](std::pair<shmap_relation::iterator, shmap_relation::iterator>
+      joined_range) {
+        local_agg_res_t sum_res = 0;
+        for (shmap_relation::iterator it = joined_range.first;
+            it != joined_range.second; ++it) {
+          auto tuple = (*it);
+          if (tuple[0] != tuple[1]) {
+            sum_res += tuple[tuple.size() - 2];
+          }
+        }
+        sum_res += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size);
+        return sum_res;
+      },
+      SpecialAggregator::sum,
+      agg_sum_reduce,
+      nullptr, {0, 2}));
+
+  LIE* final_lie = new LIE();
+  final_lie->add_relation(rel__result__2__1__2);
+  final_lie->add_relation(rel__node__1__1);
+  final_lie->add_relation(rel__sub_rank__3__1);
+  // final_lie->add_relation(rel__sum_pg__1__1__1);
+  final_lie->add_scc(scc_result);
+  final_lie->enable_all_to_all_dump();
+  final_lie->enable_data_IO();
+  final_lie->enable_IO();
+
+  final_lie->set_output_dir(slog_output_dir); // Write to this directory
+  final_lie->set_comm(mcomm);
+  final_lie->set_batch_size(1);
+  final_lie->execute();
+  final_lie->print_all_relation_size(); // Continuously print relation sizes
+
   rel__result__2__1__2->print([](const std::vector<u64> &tp) {
     u32 pg_v = tp[1];
-    // std::cout << tp[0] << " " << *reinterpret_cast<float*>(&pg_v) <<
-    std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl;
+    std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
   });
 
-  // print all variants(non-canonical index of each relation)
-  //   if (mcomm.get_rank() == 0) {
-  //     std::cout << "rel_name"
-  //               << ",\t"
-  //               << "indices\n";
-  //     for (auto const &rel_p : rel_index_map) {
-  //       std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n";
-  //     }
-  //     std::cout << std::endl;
-  //   }
-
-  delete pg_lie;
-
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
   mcomm.destroy();

From b88f7794ec44efcd9dd74b083a925a5d4dbcad21 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Sun, 4 Dec 2022 19:02:21 -0500
Subject: [PATCH 19/36] add CC

---
 backend/src/RA/parallel_copy.cpp             |   1 +
 backend/tests/cc/README.md                   |   0
 backend/tests/cc/compiled_pre/CMakeLists.txt |  28 +
 backend/tests/cc/compiled_pre/cc.cpp         | 575 +++++++++++++++++++
 backend/tests/cc/ground_truth                |   8 +
 backend/tests/cc/input-data/edge.facts       |  15 +
 examples/datalog-example                     |   2 +-
 7 files changed, 628 insertions(+), 1 deletion(-)
 create mode 100644 backend/tests/cc/README.md
 create mode 100644 backend/tests/cc/compiled_pre/CMakeLists.txt
 create mode 100644 backend/tests/cc/compiled_pre/cc.cpp
 create mode 100644 backend/tests/cc/ground_truth
 create mode 100644 backend/tests/cc/input-data/edge.facts

diff --git a/backend/src/RA/parallel_copy.cpp b/backend/src/RA/parallel_copy.cpp
index e5f3ade1..b6b4ca4c 100644
--- a/backend/src/RA/parallel_copy.cpp
+++ b/backend/src/RA/parallel_copy.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include <iostream>
 
 #ifdef GOOGLE_MAP
 void parallel_copy::local_copy(u32 buckets, google_relation* input, u32* input_bucket_map, relation* output, std::vector<int> reorder_map, u32 arity, u32 join_column_count, all_to_allv_buffer& copy_buffer, int ra_counter)
diff --git a/backend/tests/cc/README.md b/backend/tests/cc/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/cc/compiled_pre/CMakeLists.txt b/backend/tests/cc/compiled_pre/CMakeLists.txt
new file mode 100644
index 00000000..36be513b
--- /dev/null
+++ b/backend/tests/cc/compiled_pre/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required (VERSION 3.9)
+# Builds the `cc` executable from cc.cpp and links it against a parallel_RA library compiled from the backend sources, plus MPI.
+project (cc)
+
+add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive)
+# NOTE(review): -lstdc++fs above is a linker input passed as a compile option; link_libraries(stdc++fs) below already requests it.
+link_libraries(stdc++fs)
+
+find_package(MPI REQUIRED)
+# find_package(OpenMP)
+# if (OPENMP_FOUND)
+#     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+#     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+#     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+# endif()
+# NOTE(review): the flags below repeat -std=c++17, -lstdc++fs, -Werror=class-memaccess and -fpermissive from add_compile_options.
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive")
+# set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
+set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") # resolves to backend/src from backend/tests/cc/compiled_pre
+
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_cc "${PROJECT_SOURCE_DIR}/cc.cpp")
+
+ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") # backend sources listed above, built as one library
+
+add_executable(cc ${source_files_cc})
+INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH})
+TARGET_LINK_LIBRARIES(cc parallel_RA ${MPI_LIBRARIES})
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
new file mode 100644
index 00000000..521597a3
--- /dev/null
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -0,0 +1,575 @@
+// location of `parallel_RA_inc.h` here
+#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h"
+#include "mpi.h"
+
+#include <bit>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+
+// builtins.cpp goes here!
+// builtins.cpp
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace std;
+#define u64 uint64_t
+#define u32 uint32_t
+using i64 = int64_t;
+
+const u64 tag_mask = 0xffffc00000000000;
+const u64 tag_position = 46;
+const u64 int_tag = 0;
+const u64 str_tag = 2;
+const u64 sign_flip_const = 0x0000200000000000;
+const u64 signed_num_mask = 0xFFFFE00000000000;
+
+// True when the tag field of `datum` (the bits above `tag_position`) is `int_tag`.
+inline bool is_number(u64 datum) {
+  const u64 tag = datum >> tag_position;
+  return tag == int_tag;
+}
+
+// Decodes a tagged integer datum into a signed value. The payload (bits
+// below the tag) is sign-magnitude: payloads at or above `sign_flip_const`
+// represent the negative of (payload - sign_flip_const).
+inline i64 datum_to_number(u64 datum) {
+  const u64 payload = datum & ~tag_mask;
+  if (payload < sign_flip_const) {
+    return static_cast<i64>(payload);
+  }
+  return -static_cast<i64>(payload - sign_flip_const);
+}
+const auto d2n = datum_to_number;
+
+// Encodes a signed value as an int-tagged datum. Negative inputs are stored
+// as their magnitude plus `sign_flip_const`; bits that do not fit in the
+// payload are masked off (no range check is performed).
+inline u64 number_to_datum(i64 number) {
+  const u64 payload = number < 0 ? static_cast<u64>(-number) + sign_flip_const
+                                 : static_cast<u64>(number);
+  return (payload & ~tag_mask) | (int_tag << tag_position);
+}
+
+const auto n2d = number_to_datum;
+
+// Builds a string-tagged datum; the payload is `string_hash(str)` narrowed to 32 bits.
+inline u64 string_to_datum(std::string str) {
+  return (static_cast<u32>(string_hash(str)) & ~tag_mask) | (str_tag << tag_position);
+}
+const auto s2d = string_to_datum;
+
+// Computes {quotient, remainder} of data[0] / data[1] as int-tagged datums.
+// Yields nothing if an operand is not a number or the divisor is zero
+// (integer division by zero is undefined behavior).
+vector<array<u64, 2>> builtin_div_rem(const u64 *const data) {
+  if (!is_number(data[0]) || !is_number(data[1])) return {};
+  const i64 lhs = d2n(data[0]), rhs = d2n(data[1]);
+  if (rhs == 0) return {};
+  return {{number_to_datum(lhs / rhs), number_to_datum(lhs % rhs)}};
+}
+
+// Defines `name`: invokes `callback(init_state)` when data[0] and data[1] are
+// both numbers and `data[0] op data[1]` holds on their decoded values;
+// otherwise returns `init_state` untouched.
+#define BUILTIN_BINARY_NUMBER_PRED(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(TState state)) {                       \
+    const bool holds = is_number(data[0]) && is_number(data[1]) &&             \
+                       datum_to_number(data[0]) op datum_to_number(data[1]);   \
+    return holds ? callback(init_state) : init_state;                          \
+  }
+BUILTIN_BINARY_NUMBER_PRED(builtin_less, <)
+BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >)
+BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=)
+BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=)
+
+// Defines `name`: when data[0] and data[1] are both numbers, passes the
+// encoded result of `data[0] op data[1]` to `callback`; otherwise returns
+// `init_state`. NOTE(review): builtin_divide does not guard a zero divisor.
+#define BUILTIN_BINARY_NUMBER_FUNC(name, op)                                   \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (!is_number(data[0]) || !is_number(data[1]))                            \
+      return init_state;                                                       \
+    const i64 value = datum_to_number(data[0]) op datum_to_number(data[1]);    \
+    return callback(number_to_datum(value), init_state);                       \
+  }
+BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *)
+BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /)
+
+// Defines `name`: when data[0] and data[1] are both numbers, passes the encoded
+// result of `impl(decoded data[0], decoded data[1])` to `callback`.
+#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl)                                \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (!is_number(data[0]) || !is_number(data[1]))                            \
+      return init_state;                                                       \
+    return callback(number_to_datum(impl(datum_to_number(data[0]),             \
+                                         datum_to_number(data[1]))),           \
+                    init_state);                                               \
+  }
+inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; }
+BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1)
+
+// Defines `name`: when data[0] is a number, passes the encoded result of
+// `impl(decoded data[0])` to `callback`; otherwise returns `init_state`.
+#define BUILTIN_UNARY_NUMBER_FUNC(name, impl)                                  \
+  template <typename TState>                                                   \
+  inline TState name(const u64 *data, TState init_state,                       \
+                     TState (*callback)(u64 res, TState state)) {              \
+    if (!is_number(data[0]))                                                   \
+      return init_state;                                                       \
+    return callback(number_to_datum(impl(datum_to_number(data[0]))),           \
+                    init_state);                                               \
+  }
+inline u64 add1(u64 x) { return x + 1; }
+inline u64 sub1(u64 x) { return x - 1; }
+// The `_2` variants apply the opposite step (add1_2 subtracts, sub1_2 adds).
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1)
+BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1)
+
+// Enumerates [data[0], data[1]) as int-tagged datums; empty when an operand
+// is not a number or lb >= ub. Bounds stay signed so negative values order
+// correctly and `reserve` never sees a wrapped (negative) size.
+vector<array<u64, 1>> builtin_range(const u64 *const data) {
+  vector<array<u64, 1>> res;
+  if (!is_number(data[0]) || !is_number(data[1])) return res;
+  const i64 lb = datum_to_number(data[0]), ub = datum_to_number(data[1]);
+  if (lb < ub) res.reserve(static_cast<size_t>(ub - lb));
+  for (i64 x = lb; x < ub; ++x) res.push_back({number_to_datum(x)});
+  return res;
+}
+
+// Calls `callback` once per integer in [data[0], data[1]), threading the
+// state through; returns `init_state` unchanged if an operand is not a
+// number. The counter is signed so negative bounds iterate correctly.
+template <typename TState>
+TState callback_builtin_range(const u64 *data, TState init_state,
+                              TState (*callback)(u64 res, TState state)) {
+  if (!is_number(data[0]) || !is_number(data[1])) return init_state;
+  const i64 lb = datum_to_number(data[0]), ub = datum_to_number(data[1]);
+  auto state = init_state;
+  for (i64 x = lb; x < ub; ++x) state = callback(number_to_datum(x), state);
+  return state;
+}
+
+// Defines `name`: runs `callback` when the raw datums satisfy
+// `data[0] op data[1]` (no decoding), else returns `init_state`.
+#define BUILTIN_BINARY_PRED(name, op)                                          \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    const bool holds = data[0] op data[1];                                     \
+    return holds ? callback(init_state) : init_state;                          \
+  }
+BUILTIN_BINARY_PRED(builtin_eq, ==)
+BUILTIN_BINARY_PRED(builtin_neq, !=)
+
+template <typename TState> // Passes data[0] straight to `callback`.
+TState builtin_eq_1(const u64 *data, TState init_state,
+                    TState (*callback)(u64 res, TState state)) {
+  return callback(data[0], init_state);
+}
+
+// Defines `name`: runs `callback` when `pred(data[0])` is true on the raw
+// datum; otherwise returns `init_state`. Instantiated below for the
+// number / not-number type tests.
+#define BUILTIN_UNARY_PRED(name, pred)                                         \
+  template <typename TState>                                                   \
+  TState name(const u64 *data, TState init_state,                              \
+              TState (*callback)(TState state)) {                              \
+    return pred(data[0]) ? callback(init_state) : init_state;                  \
+  }
+
+bool is_not_number(u64 datum) { return !is_number(datum); }
+BUILTIN_UNARY_PRED(builtin_number_huh, is_number)
+BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number)
+
+// for generate-cpp-lambda-for-computational-join
+struct CL2CB_State {
+  void *original_callback; // There be dragons? (type-erased: concrete callback type is not visible here)
+  void *original_state;    // type-erased state pointer
+  const u64 *original_data; // read-only datum array
+  u64 *cl1_output_args;     // writable datum array
+};
+
+// for generate-cpp-lambda-for-computational-copy
+struct BCLCB_State {
+  void *original_callback; // type-erased: concrete callback type is not visible here
+  void *original_state;    // type-erased state pointer
+  const u64 *original_data; // read-only datum array
+};
+
+// Experimental: wraps a binary predicate `f` as a builtin over raw datums;
+// false unless both data[0] and data[1] are numbers. `f` receives the decoded
+// signed values converted to u64 (two's complement).
+template <bool f(u64, u64)> bool builtin_binary_number_pred(const u64 *data) {
+  return is_number(data[0]) && is_number(data[1]) &&
+         f(datum_to_number(data[0]), datum_to_number(data[1]));
+}
+// Compare as signed: decoded negatives arrive here as huge unsigned values.
+bool _less(u64 x, u64 y) { return static_cast<i64>(x) < static_cast<i64>(y); }
+auto builtin_less2 = builtin_binary_number_pred<_less>;
+
+template <typename TState> // Ignores `data`; always invokes `callback`.
+inline TState builtin_nop(const u64 *data, TState init_state,
+                          TState (*callback)(TState state)) {
+  return callback(init_state);
+}
+
+// //////////////////// AGGREGATORS Alternative design ////////////////////
+
+// TODO: add number type check
+//////////////////////////////  count /////////////////////////////////////
+
+// Counts the tuples in the iterator range [first, second).
+local_agg_res_t
+agg_count_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                    joined_range) {
+  local_agg_res_t total = 0;
+  auto cursor = joined_range.first;
+  while (cursor != joined_range.second) { ++total; ++cursor; }
+  return total;
+}
+
+local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y; // merge two partial counts by addition
+}
+
+//////////////////////////////  sum /////////////////////////////////////
+
+// Sums the second-to-last column of every tuple in the iterator range
+// [first, second) using integer addition on the raw column values.
+local_agg_res_t
+agg_sum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                  joined_range) {
+  local_agg_res_t total = 0;
+  auto cursor = joined_range.first;
+  while (cursor != joined_range.second) {
+    auto row = *cursor;
+    total += row[row.size() - 2];
+    ++cursor;
+  }
+  return total;
+}
+
+local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  return x + y; // merge two partial sums by addition
+}
+
+// Sums the second-to-last column of each tuple, narrowing it to 32 bits and
+// reading those bits as a `float`; returns the bit pattern of the float sum.
+// NOTE(review): the reinterpret_cast punning violates strict aliasing;
+// std::memcpy would be the portable spelling.
+local_agg_res_t agg_sum_float_local(
+    std::pair<shmap_relation::iterator, shmap_relation::iterator>
+        joined_range) {
+  float total = 0.0;
+  for (auto cursor = joined_range.first; cursor != joined_range.second;
+       ++cursor) {
+    auto row = *cursor;
+    u32 raw_bits = row[row.size() - 2];
+    total += *reinterpret_cast<float *>(&raw_bits);
+  }
+  return *reinterpret_cast<u32 *>(&total);
+}
+
+// Adds two partial sums carried as `float` bit patterns and returns the
+// bit pattern of the result (read back through a u32).
+local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw,
+                                     local_agg_res_t y_raw) {
+  const float lhs = *reinterpret_cast<float *>(&x_raw);
+  const float rhs = *reinterpret_cast<float *>(&y_raw);
+  float total = lhs + rhs;
+  return *reinterpret_cast<u32 *>(&total);
+}
+
+//////////////////////////////  maximum  /////////////////////////////////////
+
+// Largest last-column value in [first, second); 0 for an empty range.
+local_agg_res_t
+agg_maximum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t best = 0;
+  for (auto cursor = joined_range.first; cursor != joined_range.second;
+       ++cursor) {
+    auto row = *cursor;
+    const auto candidate = row[row.size() - 1];
+    if (candidate > best) best = candidate;
+  }
+  return best;
+}
+
+// Reduction step of the maximum aggregator: returns the larger of `x` and
+// `y`.
+local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x > y)
+    return x;
+  return y;
+}
+
+//////////////////////////////  minimum  /////////////////////////////////////
+
+// Smallest last-column value in [first, second); the seed caps the result.
+local_agg_res_t
+agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
+                      joined_range) {
+  local_agg_res_t best = std::numeric_limits<u32>::max(); // NOTE(review): u32 cap — confirm
+  for (auto cursor = joined_range.first; cursor != joined_range.second;
+       ++cursor) {
+    auto row = *cursor;
+    const auto candidate = row[row.size() - 1];
+    if (candidate < best) best = candidate;
+  }
+  return best;
+}
+
+// Reduction step of the minimum aggregator: returns the smaller of `x` and
+// `y`.
+local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) {
+  if (x < y)
+    return x;
+  return y;
+}
+
+// // end of builtins.cpp
+
+// global definitions:
+
+int max_rel = 255;
+std::map<std::string, int> rel_tag_map;
+std::map<std::string, std::unordered_set<std::string>> rel_index_map;
+
+// Scans `db_dir` for `<tag>.<name>.<arity>.table` files, recording each tag in
+// `rel_tag_map` under `<name>__1__...__<arity>` and raising `max_rel` to the
+// largest tag seen. Other files and malformed names are skipped.
+void load_input_relation(std::string db_dir) {
+  const std::string suffix = ".table";
+  for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
+    const std::string filename = entry.path().filename().string();
+    std::cout << "input database has file " << filename << std::endl;
+    if (filename.size() < suffix.size() ||
+        filename.compare(filename.size() - suffix.size(), suffix.size(),
+                         suffix) != 0)
+      continue;
+    const std::string stem = entry.path().stem().string();
+    const auto first_dot = stem.find('.');
+    const auto last_dot = stem.rfind('.');
+    if (first_dot == std::string::npos || first_dot == last_dot)
+      continue; // need both a tag and an arity component
+    int tag = 0, arity = 0;
+    try {
+      tag = std::stoi(stem.substr(0, first_dot));
+      arity = std::stoi(stem.substr(last_dot + 1));
+    } catch (const std::exception &) {
+      continue; // non-numeric tag or arity: std::stoi throws
+    }
+    std::string index_name = stem.substr(first_dot + 1, last_dot - first_dot - 1);
+    for (int i = 1; i <= arity; i++)
+      index_name += "__" + std::to_string(i);
+    if (tag > max_rel)
+      max_rel = tag;
+    std::cout << "load " << tag << "." << index_name << " has arity " << arity
+              << std::endl;
+    rel_tag_map[index_name] = tag;
+  }
+}
+
+// Returns the tag registered for `<relation_name>__<index_str>`, allocating
+// the next tag (max_rel + 1) on first use. Also records `index_str` in the
+// set of indices known for `relation_name`.
+int get_tag_for_rel(std::string relation_name, std::string index_str) {
+  const std::string key = relation_name + "__" + index_str;
+  // operator[] creates the empty set on first sight of the relation.
+  rel_index_map[relation_name].insert(index_str);
+
+  const auto known = rel_tag_map.find(key);
+  if (known != rel_tag_map.end()) {
+    return known->second;
+  }
+
+  const int fresh_tag = ++max_rel;
+  rel_tag_map[key] = fresh_tag;
+  std::cout << "generate rel tag: " << key << " " << fresh_tag
+            << std::endl;
+  return fresh_tag;
+}
+
+// Connected-components driver: argv[1] / argv[2], when both are given,
+// override the default input-data and checkpoint directories.
+int main(int argc, char **argv) {
+  // input dir from compiler
+  std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
+  // output dir from compiler
+  std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints";
+  if (argc >= 3) { // unguarded, a missing argument would read a null argv entry
+    slog_input_dir = argv[1];
+    slog_output_dir = argv[2];
+  }
+  load_input_relation(slog_input_dir);
+  mpi_comm mcomm;
+  mcomm.create(argc, argv);
+
+  // (edge from to)
+  relation *rel__edge__2__1 = new relation(
+      1, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
+      FULL);
+
+  relation *rel__cc__2__1 = new  relation(
+    1, true, 2, get_tag_for_rel("cc", "1"),
+    std::to_string(get_tag_for_rel("cc", "1")) + ".cc.2.table",
+    FULL);
+  rel__cc__2__1->set_dependent_column_update(
+    {1,2},
+    [](const std::vector<u64> &old_v, const std::vector<u64> &new_v,
+         const vector<u64> &nt) -> std::optional<bool> {
+      if (new_v[0] < old_v[0]) {
+        return true;
+      } else {
+        return false;
+      }
+    });
+  
+  relation *rel__node__1__1 = new relation(
+    1, true, 1, get_tag_for_rel("node", "1"),
+    std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table",
+    FULL);
+
+  relation *rel__cc_final__2__1 = new relation(
+    1, true, 2, get_tag_for_rel("cc_final", "2"), // NOTE(review): index "2" here vs "1" in the file names below — confirm
+    std::to_string(get_tag_for_rel("cc_final", "1")) + ".cc_final.2.table",
+    slog_input_dir + "/" + std::to_string(get_tag_for_rel("cc_final", "1")) +
+          ".cc_final.2.table",
+    FULL);
+
+  relation *rel__cc_represent__1__1 = new relation(
+    1, true, 1, get_tag_for_rel("cc_represent", "1"),
+    std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table",
+    FULL);
+
+  RAM *to_undirected_scc = new RAM(false, 0);
+  to_undirected_scc->add_relation(rel__edge__2__1, false);
+  to_undirected_scc->add_rule(new parallel_copy_generate(
+    rel__edge__2__1, rel__edge__2__1, FULL,
+    [](const u64 *const data, u64 *const output) -> int {
+      output[0] = data[1];
+      output[1] = data[0];
+      return 1;
+    }
+  ));
+
+  RAM *cc_init_scc = new RAM(false, 1);
+  cc_init_scc->add_relation(rel__edge__2__1, false);
+  cc_init_scc->add_relation(rel__cc__2__1, true);
+  cc_init_scc->add_relation(rel__node__1__1, true);
+  cc_init_scc->add_rule(new parallel_copy_generate(
+    rel__cc__2__1, rel__edge__2__1, FULL,
+    [](const u64 *const data, u64 *const output) -> int {
+      output[0] = data[0];
+      output[1] = data[0];
+      return 1;
+    }
+  ));
+  cc_init_scc->add_rule(new parallel_copy_generate(
+    rel__node__1__1, rel__edge__2__1, FULL,
+    [](const u64 *const data, u64 *const output) -> int {
+      output[0] = data[0];
+      return 1;
+    }
+  ));
+
+  RAM* cc_compute_scc = new RAM(true, 2);
+  cc_compute_scc->add_relation(rel__edge__2__1, false);
+  cc_compute_scc->add_relation(rel__cc__2__1, true);
+  parallel_join *cc_pg = new parallel_join(
+    rel__cc__2__1, rel__edge__2__1,
+    FULL, rel__cc__2__1, DELTA,
+    {1, 0}  // useless
+  );
+  cc_pg->set_generator_func(
+    [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
+      // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl;
+      auto target_v = target_vs[0];
+      std::vector<u64> res(2, 0);
+      res[0] = input_v[1];
+      res[1] = target_v[1];
+      res_set.push_back(res);
+      return true;
+    }
+  );
+  cc_compute_scc->add_rule(cc_pg);
+
+  RAM* cc_agg_scc = new RAM(false, 3);
+  cc_agg_scc->add_relation(rel__cc__2__1, false);
+  cc_agg_scc->add_relation(rel__node__1__1, false);
+  cc_agg_scc->add_relation(rel__cc_final__2__1, true);
+  cc_agg_scc->add_rule(new parallel_join_aggregate(
+    rel__cc_final__2__1, rel__cc__2__1,
+    rel__node__1__1, FULL,
+    agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce,
+    nullptr, {0,2}));
+  
+  RAM* cc_rep_scc = new RAM(false, 3); // NOTE(review): same id (3) as cc_agg_scc — confirm
+  cc_rep_scc->add_relation(rel__cc_final__2__1, false);
+  cc_rep_scc->add_relation(rel__cc_represent__1__1, true);
+  cc_rep_scc->add_rule(new parallel_copy(
+    rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1}
+  ));
+
+  LIE *cc_lie = new LIE();
+  cc_lie->add_relation(rel__edge__2__1);
+  cc_lie->add_relation(rel__node__1__1);
+  cc_lie->add_relation(rel__cc__2__1);
+  cc_lie->add_relation(rel__cc_final__2__1);
+  cc_lie->add_relation(rel__cc_represent__1__1);
+
+  cc_lie->add_scc(to_undirected_scc);
+  cc_lie->add_scc(cc_init_scc);
+  cc_lie->add_scc(cc_compute_scc);
+  cc_lie->add_scc(cc_agg_scc);
+  cc_lie->add_scc(cc_rep_scc);
+
+  cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc);
+  cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc);
+  cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc);
+  cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc);
+
+  cc_lie->enable_all_to_all_dump();
+  cc_lie->set_output_dir(slog_output_dir); // Write to this directory
+  cc_lie->set_comm(mcomm);
+  cc_lie->set_batch_size(1);
+  cc_lie->execute();
+  cc_lie->print_all_relation_size(); // Continuously print relation sizes
+
+  // rel__node__1__1->print();
+  // rel__edge__2__1->print();
+  rel__cc__2__1->print();
+  // rel__cc_final__2__1->print();
+  // rel__cc_represent__1__1->print();
+
+  mcomm.destroy();
+
+  return 0;
+}
diff --git a/backend/tests/cc/ground_truth b/backend/tests/cc/ground_truth
new file mode 100644
index 00000000..cc2f793f
--- /dev/null
+++ b/backend/tests/cc/ground_truth
@@ -0,0 +1,8 @@
+[{'0', '19', '25', '3'},
+ {'13', '2'},
+ {'14', '4'},
+ {'11', '18', '27', '28', '5'},
+ {'26', '6'},
+ {'21', '22', '23', '8'},
+ {'12', '24'},
+ {'17', '20'}]
diff --git a/backend/tests/cc/input-data/edge.facts b/backend/tests/cc/input-data/edge.facts
new file mode 100644
index 00000000..1530cad3
--- /dev/null
+++ b/backend/tests/cc/input-data/edge.facts
@@ -0,0 +1,15 @@
+0	19
+0	25
+2	13
+3	19
+4	14
+5	27
+5	28
+6	26
+8	23
+11	27
+12	24
+17	20
+18	28
+21	23
+21	22
\ No newline at end of file
diff --git a/examples/datalog-example b/examples/datalog-example
index 87266643..be103a21 160000
--- a/examples/datalog-example
+++ b/examples/datalog-example
@@ -1 +1 @@
-Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891
+Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa

From 5ea49614413c7ec87ccfed26fa16d3f48efbeee2 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Thu, 8 Dec 2022 19:43:03 -0500
Subject: [PATCH 20/36] add debug info

---
 backend/src/RAM/RA_tasks.cpp                  | 44 +++++++++++----
 backend/src/RAM/RA_tasks.h                    |  2 +-
 backend/src/btree/btree_container.h           |  8 +--
 backend/src/compat.h                          |  2 +-
 backend/src/lie/lie.cpp                       | 13 ++++-
 .../src/relation/balanced_hash_relation.cpp   | 21 +++----
 backend/src/relation/balanced_hash_relation.h |  2 +-
 backend/src/relation/shmap_relation.h         |  2 +-
 backend/src/relation/shmap_relation_exp.cpp   | 42 +-------------
 backend/tests/cc/compiled_pre/cc.cpp          | 55 +++++++++++--------
 .../tests/pagerank/compiled_pre/pagerank.cpp  |  2 +-
 .../pagerank/compiled_pre/pagerank_full.cpp   |  2 +-
 backend/tests/sssp/compiled_pre/sssp_opt.cpp  | 10 +++-
 13 files changed, 105 insertions(+), 100 deletions(-)

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 2712e343..3c8963f8 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -785,6 +785,8 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
     int nprocs = mcomm.get_local_nprocs();
     int RA_count = RA_list.size();
     u64 relation_id=0, bucket_id=0, intern_key=0, intern_value=0;
+    double check_time = 0;
+    double insert_time = 0;
 
     for (int k = 0; k < RA_count * nprocs; k++)
     {
@@ -847,19 +849,22 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
             }
 #endif
             u32 elements_to_read = tuples_to_read * width;
+
             for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
             {
                 u32 x = starting + tuple_ind * width;
                 bool insert_flag = true;
                 if (output->get_dependent_column().size() > 1) {
-                    std::vector<u64> tt;
-                    for (int i = 0; i < width; i++) {
-                        tt.push_back(cumulative_all_to_allv_buffer[x+i]);
-                    }
+                    std::vector<u64> tt(cumulative_all_to_allv_buffer+x, cumulative_all_to_allv_buffer+x+width);
+                    // for (int i = 0; i < width; i++) {
+                    //     tt.push_back(cumulative_all_to_allv_buffer[x+i]);
+                    // }
                     // temporary index column just to match size of column
                     tt.push_back(0);
+                    auto _before_i = MPI_Wtime();
                     insert_flag = output->check_dependent_value_insert_avalible(tt);
-
+                    auto _after_i = MPI_Wtime();
+                    check_time += _after_i - _before_i;
                 } else {
                     insert_flag = output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false &&
                         output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false &&
@@ -885,9 +890,11 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
                     intern_map[intern_key] = intern_value;
                     tuple[width] = intern_key | intern_value;    /// Intern here
 
-
+                    auto _before_ins = MPI_Wtime();
                     if (output->insert_in_newt(tuple) == true)
                         successful_insert++;
+                    auto _after_ins = MPI_Wtime();
+                    insert_time += _after_ins - _before_ins;
                 } 
             }
             starting = starting + elements_to_read;
@@ -923,7 +930,8 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
 
         // std::cout << output->get_debug_id() << " successful insert: " << successful_insert << " ; failed insert : " << failed_insert <<  std::endl;
     }
-
+    if (mcomm.get_rank() == 0)
+        std::cout << "CHECK TIME: " << check_time << "   INSERT_TIME: " << insert_time << " NEW TUPLES: " << successful_insert << std::endl;
     delete[] cumulative_all_to_allv_recv_process_count_array;
     delete[] cumulative_all_to_allv_buffer;
 }
@@ -1246,7 +1254,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
 
 
 
-void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num)
+void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector<double>& runtime_vector)
 {
     int inner_loop = 0;
     u32 RA_count = RA_list.size();
@@ -1254,6 +1262,11 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
     int *offset = new int[RA_count];
     for (u32 i =0; i < RA_count; i++)
         offset[i] = 0;
+    
+    double all_local_compute = 0;
+    double all_insert_newt = 0;
+    double all_comm = 0;
+    double all_time = 0;
 
     while (batch_size != 0)
     {
@@ -1265,7 +1278,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         std::cout << std::setiosflags(std::ios::fixed);
         auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
-        auto intra_end = MPI_Wtime(); 
+        auto intra_end = MPI_Wtime();
 
         bool local_join_status = false;
         while (local_join_status == false)
@@ -1277,10 +1290,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
             auto compute_end = MPI_Wtime();
+            all_local_compute += compute_end - compute_start;
 
             auto all_to_all_start = MPI_Wtime();
             comm_compaction_all_to_all(compute_buffer, &cumulative_all_to_allv_recv_process_count_array, &cumulative_all_to_allv_buffer, mcomm.get_local_comm(), *loop_counter, task_id, output_dir, all_to_all_record, sloav_mode, rotate_index_array, send_indexes, sendb_num);
             auto all_to_all_end = MPI_Wtime();
+            all_comm += all_to_all_end - all_to_all_start;
 
             auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
@@ -1289,6 +1304,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt_comm_compaction(intern_map);
             auto insert_in_newt_end = MPI_Wtime();
+            all_insert_newt += insert_in_newt_end - insert_in_newt_start;
 
 
 #if 1
@@ -1327,7 +1343,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
 
         auto insert_in_full_start = MPI_Wtime(); 
         local_insert_in_full();
-        auto insert_in_full_end = MPI_Wtime(); 
+        auto insert_in_full_end = MPI_Wtime();
 
 #if 1
         if (mcomm.get_rank() == 0)
@@ -1349,6 +1365,8 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             std::cout << (intra_end - intra_start) << std::setw(12)
                       << (insert_in_full_end - insert_in_full_start)  << std::setw(12)
                       << (insert_in_full_end - intra_start) << std::endl;
+        
+            all_time += insert_in_full_end - intra_start;
 
         }
 #endif
@@ -1360,6 +1378,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         if (iteration_count == 1)
             break;
     }
+    if (mcomm.get_rank() == 0) {
+        runtime_vector[0] = runtime_vector[0] + all_comm;
+        runtime_vector[1] = runtime_vector[1] + all_local_compute;
+        runtime_vector[2] = runtime_vector[2] + all_insert_newt;
+        runtime_vector[3] = runtime_vector[3] + all_time;
+    }
 
     delete[] offset;
 
diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h
index 8b5d8e0d..ab9ac4a3 100644
--- a/backend/src/RAM/RA_tasks.h
+++ b/backend/src/RAM/RA_tasks.h
@@ -165,7 +165,7 @@ class RAM
     /// Start running this SCC (task) for "batck_size" iterations
     void execute_in_batches(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num);
 
-    void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num);
+    void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector<double>& runtime_vector);
 };
 
 #endif
diff --git a/backend/src/btree/btree_container.h b/backend/src/btree/btree_container.h
index fb617abe..9b918ba7 100644
--- a/backend/src/btree/btree_container.h
+++ b/backend/src/btree/btree_container.h
@@ -58,9 +58,9 @@ class btree_container {
 
   // Iterator routines.
   iterator begin() { return tree_.begin(); }
-  const_iterator begin() const { return tree_.begin(); }
+  const_iterator cbegin() const { return tree_.begin(); }
   iterator end() { return tree_.end(); }
-  const_iterator end() const { return tree_.end(); }
+  const_iterator cend() const { return tree_.end(); }
   reverse_iterator rbegin() { return tree_.rbegin(); }
   const_reverse_iterator rbegin() const { return tree_.rbegin(); }
   reverse_iterator rend() { return tree_.rend(); }
@@ -70,13 +70,13 @@ class btree_container {
   iterator lower_bound(const key_type &key) {
     return tree_.lower_bound(key);
   }
-  const_iterator lower_bound(const key_type &key) const {
+  const_iterator clower_bound(const key_type &key) const {
     return tree_.lower_bound(key);
   }
   iterator upper_bound(const key_type &key) {
     return tree_.upper_bound(key);
   }
-  const_iterator upper_bound(const key_type &key) const {
+  const_iterator cupper_bound(const key_type &key) const {
     return tree_.upper_bound(key);
   }
   std::pair<iterator,iterator> equal_range(const key_type &key) {
diff --git a/backend/src/compat.h b/backend/src/compat.h
index dbc42cf4..e40be509 100644
--- a/backend/src/compat.h
+++ b/backend/src/compat.h
@@ -25,7 +25,7 @@
 #include "btree/btree_set.h"
 #include <filesystem>
 #include <optional>
-#include <bit>
+// #include <bit>
 
 
 #ifdef __GNUC__
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index e517288f..ad478a20 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -385,6 +385,7 @@ bool LIE::execute ()
             }
         }
     }
+    std::vector<double> run_time_vector(4,0);
 
     //int c = 0;
     /// Running one task at a time
@@ -475,7 +476,7 @@ bool LIE::execute ()
             if (comm_compaction == 0)
                 executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
             else
-                executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
+                executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
 
             // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
             // for (u32 i = 0 ; i < scc_relation_count; i++)
@@ -523,11 +524,14 @@ bool LIE::execute ()
                 if (comm_compaction == 0)
                     executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
                 else
-                    executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
+                    executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
 
                 //executable_task->print_all_relation();
 
                 delta_in_scc = history[history.size()-2];
+                if(mcomm.get_rank() == 0) {
+                    std::cout << "DELTA " << delta_in_scc << std::endl;
+                }
                 //if (delta_in_scc == 0)
                 //    executed_scc_id.push_back(executable_task->get_id());
 #if 0
@@ -569,6 +573,11 @@ bool LIE::execute ()
         if (mcomm.get_rank() == 0)
         {
             std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." << std::endl;
+            std::cout << "TOTAL STAT >>>>>>>> " << executable_task->get_id() << " >>>>>>>> "
+            << "COMM TIME: " << run_time_vector[0] << "  LCOMPUTE TIME: " << run_time_vector[1] << "  INSERT TIME: " << run_time_vector[2]
+            << "  OTHER TIME: " << run_time_vector[3] - run_time_vector[0] - run_time_vector[1] - run_time_vector[2]
+            << "  ALL TIME: " << run_time_vector[3]
+            << std::endl;
             // print_all_relation_size();
         }
         full_iteration_count += loop_counter;
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index b3b7a1e7..28b0adf8 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -11,6 +11,7 @@
 #include <cstddef>
 #include <filesystem>
 #include <iostream>
+#include <vector>
 
 u32 relation::get_global_delta_element_count()
 {
@@ -1273,6 +1274,8 @@ bool relation::insert_in_full(u64* t)
 
     // TODO: use normal insert here!
     if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true)
+    // std::vector<u64> tp(t, t+arity+1);
+    // if (full[bucket_id].insert(tp))
     {
         // TODO: change how to deal with element counts
         full_element_count++;
@@ -1410,20 +1413,10 @@ void relation::local_insert_in_delta()
 }
 
 bool relation::check_dependent_value_insert_avalible(const std::vector<u64>& tuple) {
-    uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count();
-    // return newt[bucket_id].check_dependent_insertion(tuple);
-    // if (!(full[bucket_id].check_dependent_insertion(tuple) && delta[bucket_id].check_dependent_insertion(tuple))) {
-    //     for (auto c: tuple) {
-    //         std::cout << c << " ";
-    //     }
-    //     std::cout << std::endl;
-    //     std::cout << "current tree >>" << std::endl;
-    //     for (auto t: delta[bucket_id]) {
-    //         for (auto c: t) {
-    //             std::cout << c << " ";
-    //         }
-    //         std::cout << std::endl;
-    //     }
+    // uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count();
+    // if (bucket_id != mcomm.get_rank()) {
+    //     std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; 
     // }
+    int bucket_id = mcomm.get_rank();
     return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
 }
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index e34d6e76..d80d3b08 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -215,7 +215,7 @@ class relation
 #endif
 
     void set_delta_element_count(int val)   {delta_element_count = val;}
-    int get_delta_element_count()   {return delta_element_count;}
+    int get_delta_element_count()   {return delta[mcomm.get_rank()].count();}
     u32** get_delta_sub_bucket_element_count()  {return delta_sub_bucket_element_count;}
     u32 get_global_delta_element_count();
 
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index 3d1fe9a5..4ed96934 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -45,7 +45,7 @@ struct shmap_relation {
     // souffle use multi set for some relation
     using t_ind = btree::btree_set<t_tuple, t_comparator>;
     t_ind ind;
-    using iterator = t_ind::const_iterator;
+    using iterator = t_ind::iterator;
 
     bool insert(const t_tuple &t) {
         return ind.insert(t).second;
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index d0fbc176..91934d9e 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -73,13 +73,6 @@ int shmap_relation::insert_tuple_from_array(u64 *t, int width)
             bool joined = false;
             for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) {
                 auto cur_tuple = *it;
-                // if (tp[0] == 59 && tp[1] == 58) {
-                //     std::cout << "tppppp  <<<<<< ";
-                //     for (auto c: cur_tuple) {
-                //         std::cout << c << " ";
-                //     }
-                //     std::cout << std::endl;
-                // }
                 
                 std::vector<u64> old_t;
                 for (auto i: dependent_column_indices) {
@@ -91,16 +84,6 @@ int shmap_relation::insert_tuple_from_array(u64 *t, int width)
                 }
                 if (compare_res.value()) {
                     need_deletes.push_back(it);  
-                    // if (tp[0] == 59 && tp[1] == 58) {
-                    //     for (auto c: cur_tuple) {
-                    //         std::cout << c << " ";
-                    //     }
-                    //     std::cout << "update with " << compare_res.value() <<" <<<<<< ";
-                    //     for (auto c: tp) {
-                    //         std::cout << c << " ";
-                    //     }
-                    //     std::cout << std::endl;
-                    // }
                 }
                 joined = true;
             }
@@ -113,13 +96,9 @@ int shmap_relation::insert_tuple_from_array(u64 *t, int width)
             }
             if (!need_deletes.empty()) {
                 for (auto d: need_deletes) {
-                    // std::cout << "delete >>>>  ";
-                    // for (auto c: *d) {
-                    //     std::cout << c << " ";
-                    // }
-                    // std::cout << std::endl;
-                    ind.erase(*d);
+                    ind.erase(d);
                 }
+
                 if (insert(tp)) {
                     return INSERT_SUCCESS;
                 } else {
@@ -179,23 +158,6 @@ shmap_relation::check_dependent_insertion(const std::vector<u64> &tp) {
                     joined = true;
                 }
             }
-            // std::cout << " not adding to lattice with <<<<<< ";
-            // for (auto c: tp) {
-            //     std::cout << c << " ";
-            // }
-            // std::cout << " while lower bound ... ";
-            // for (auto c: lower_bound) {
-            //     std::cout << c << " ";
-            // }
-            // std::cout << std::endl;
-            // std::cout << "The current btree: " << std::endl;
-            // for (auto& t: ind) {
-            //     std::cout << "Tuple : ";
-            //     for (auto c: t) {
-            //         std::cout << c << " ";
-            //     }
-            //     std::cout << std::endl;
-            // }
             if (!joined) {
                 return true;
             } else {
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 521597a3..232244f7 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -1,8 +1,8 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h"
+#include "/home/ysun67/workspace/slog/backend/src/parallel_RA_inc.h"
 #include "mpi.h"
 
-#include <bit>
+// #include <bit>
 #include <iostream>
 #include <iterator>
 #include <map>
@@ -415,6 +415,7 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
 }
 
 int main(int argc, char **argv) {
+  double start_time = MPI_Wtime();
   // input dir from compiler
   std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data";
   // output dir from compiler
@@ -467,16 +468,16 @@ int main(int argc, char **argv) {
     std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table",
     FULL);
 
-  RAM *to_undirected_scc = new RAM(false, 0);
-  to_undirected_scc->add_relation(rel__edge__2__1, false);
-  to_undirected_scc->add_rule(new parallel_copy_generate(
-    rel__edge__2__1, rel__edge__2__1, FULL,
-    [](const u64 *const data, u64 *const output) -> int {
-      output[0] = data[1];
-      output[1] = data[0];
-      return 1;
-    }
-  ));
+  // RAM *to_undirected_scc = new RAM(false, 0);
+  // to_undirected_scc->add_relation(rel__edge__2__1, false);
+  // to_undirected_scc->add_rule(new parallel_copy_generate(
+  //   rel__edge__2__1, rel__edge__2__1, FULL,
+  //   [](const u64 *const data, u64 *const output) -> int {
+  //     output[0] = data[1];
+  //     output[1] = data[0];
+  //     return 1;
+  //   }
+  // ));
 
   RAM *cc_init_scc = new RAM(false, 1);
   cc_init_scc->add_relation(rel__edge__2__1, false);
@@ -529,12 +530,12 @@ int main(int argc, char **argv) {
     agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce,
     nullptr, {0,2}));
   
-  RAM* cc_rep_scc = new RAM(false, 3);
-  cc_rep_scc->add_relation(rel__cc_final__2__1, false);
-  cc_rep_scc->add_relation(rel__cc_represent__1__1, true);
-  cc_rep_scc->add_rule(new parallel_copy(
-    rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1}
-  ));
+  // RAM* cc_rep_scc = new RAM(false, 3);
+  // cc_rep_scc->add_relation(rel__cc_final__2__1, false);
+  // cc_rep_scc->add_relation(rel__cc_represent__1__1, true);
+  // cc_rep_scc->add_rule(new parallel_copy(
+  //   rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1}
+  // ));
 
 
   LIE *cc_lie = new LIE();
@@ -544,27 +545,35 @@ int main(int argc, char **argv) {
   cc_lie->add_relation(rel__cc_final__2__1);
   cc_lie->add_relation(rel__cc_represent__1__1);
 
-  cc_lie->add_scc(to_undirected_scc);
+  // cc_lie->add_scc(to_undirected_scc);
   cc_lie->add_scc(cc_init_scc);
   cc_lie->add_scc(cc_compute_scc);
   cc_lie->add_scc(cc_agg_scc);
-  cc_lie->add_scc(cc_rep_scc);
+  // cc_lie->add_scc(cc_rep_scc);
 
-  cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc);
+  // cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc);
   cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc);
   cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc);
-  cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc);
+  // cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc);
 
   cc_lie->enable_all_to_all_dump();
   cc_lie->set_output_dir(slog_output_dir); // Write to this directory
   cc_lie->set_comm(mcomm);
   cc_lie->set_batch_size(1);
   cc_lie->execute();
+
+  double end_time = MPI_Wtime();
+  double rank_running_time = end_time - start_time;
+  double final_time;  // only meaningful on rank 0 (root of the reduction)
+  MPI_Reduce(&rank_running_time, &final_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_comm());  // C double => MPI_DOUBLE; MPI_DOUBLE_PRECISION is the Fortran handle
+  if (mcomm.get_rank() == 0) {
+    std::cout << "RUNNING TIME: >>>>>>>>>>>>>>>>>>>>>> " << final_time << std::endl;
+  }
   cc_lie->print_all_relation_size(); // Continuously print relation sizes
 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();
-  rel__cc__2__1->print();
+  // rel__cc__2__1->print();
   // rel__cc_final__2__1->print();
   // rel__cc_represent__1__1->print();
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp
index a24bcf25..267a4abc 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp
@@ -2,7 +2,7 @@
 #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 #include "mpi.h"
 
-#include <bit>
+// #include <bit>
 #include <iostream>
 #include <iterator>
 #include <map>
diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
index 11055725..7513aa3c 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
@@ -2,7 +2,7 @@
 #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 #include "mpi.h"
 
-#include <bit>
+// #include <bit>
 #include <iostream>
 #include <iterator>
 #include <map>
diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
index 6d41428b..b2a788c3 100644
--- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 
 #include <optional>
 #include <iterator>
@@ -385,6 +385,7 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
 
 void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
                        std::string output_dir, int argc, char **argv) {
+  double start_time = MPI_Wtime();  // wall-clock start; subtracted from end_time after lie->execute()
   start_node = sp;
   load_input_relation(input_dir);
 
@@ -483,6 +484,13 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
   lie->set_comm(mcomm);
   lie->set_batch_size(1);
   lie->execute();
+  double end_time = MPI_Wtime();
+  double rank_running_time = end_time - start_time;
+  double final_time;  // only meaningful on rank 0 (root of the reduction)
+  MPI_Reduce(&rank_running_time, &final_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_comm());  // C double => MPI_DOUBLE; MPI_DOUBLE_PRECISION is the Fortran handle
+  if (mcomm.get_rank() == 0) {
+    std::cout << "RUNNING TIME: >>>>>>>>>>>>>>>>>>>>>> " << final_time << std::endl;
+  }
   lie->print_all_relation_size(); // Continuously print relation sizes
                                   //   lie->stat_intermediate();
 

From fa9941b64633f88523d5093b871a042875a2e7cb Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Mon, 12 Dec 2022 01:24:02 -0500
Subject: [PATCH 21/36] freez

---
 backend/tests/cc/compiled_pre/cc.cpp         |  4 +-
 backend/tests/sssp/compiled_pre/sssp_opt.cpp | 12 +----
 backend/tests/sssp/sssp.py                   | 15 +++---
 cluster.yaml                                 | 50 ++++++++++++++++++++
 sbatch.sh                                    |  6 +++
 5 files changed, 67 insertions(+), 20 deletions(-)
 create mode 100644 cluster.yaml
 create mode 100644 sbatch.sh

diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 232244f7..a038fc59 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -573,8 +573,8 @@ int main(int argc, char **argv) {
 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();
-  // rel__cc__2__1->print();
-  // rel__cc_final__2__1->print();
+ // rel__cc__2__1->print();
+ // rel__cc_final__2__1->print();
   // rel__cc_represent__1__1->print();
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
index b2a788c3..3b9d630a 100644
--- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -420,7 +420,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
           auto [data, output] = state;
           auto head_tuple = output;
 
-          bool compatible = true && res_0 < n2d(start_node);
+          bool compatible = true && res_0 == n2d(start_node);
           if (!compatible)
             return state;
 
@@ -444,21 +444,11 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
       );
   update_spath_j->set_generator_func(
       [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
-        // std::cout << "Joining  >>> ";
-        // for (auto c : input_v) {
-        //   std::cout << c << " ";
-        // }
-        // std::cout << " and >>>>>>>";
-        // for (auto c : target_v) {
-        //     std::cout << c << " ";
-        // }
-        // std::cout << std::endl;
         auto target_v = target_vs[0];
         std::vector<u64> res(3, 0);
         res[0] = input_v[1];
         res[1] = target_v[1];
         if (res[0] == res[1]) {
-          // std::cout << "Warning detect a loop for node " << res[0] << std::endl;
           res[2] = 0;
         } else {
           res[2] = target_v[2] + 1;
diff --git a/backend/tests/sssp/sssp.py b/backend/tests/sssp/sssp.py
index bb7862a1..0259a7d7 100644
--- a/backend/tests/sssp/sssp.py
+++ b/backend/tests/sssp/sssp.py
@@ -1,7 +1,7 @@
 
 import networkx as nx
 
-data_f = open("/home/ubuntu/workspace/dataset/soc-LiveJournal1.txt")
+data_f = open("/home/ysun67/workspace/dataset/soc-LiveJournal1.txt")
 # data_f = open("/home/ubuntu/workspace/slog/backend/tests/sssp/test-input-graph/edge.csv")
 
 g = nx.DiGraph()
@@ -9,10 +9,11 @@
     g.add_edge(*map(int, l.strip().split("\t")))
 
 sssp_nodes = 0
-for i in range(1,10):
-    reached_map = nx.shortest_path(g, i)
-    sssp_nodes = sssp_nodes + len(reached_map.keys())
-    for k, v in reached_map.items():
-        print(f"{k} {i} {len(v)-1}") 
+#for i in range(1,10):
+#    reached_map = nx.shortest_path(g, i)
+#    sssp_nodes = sssp_nodes + len(reached_map.keys())
+#    for k, v in reached_map.items():
+#        print(f"{k} {i} {len(v)-1}") 
 
-print(sssp_nodes)
+reached_map = nx.shortest_path(g, 1)
+print(len(reached_map.items()))
diff --git a/cluster.yaml b/cluster.yaml
new file mode 100644
index 00000000..97fd09f0
--- /dev/null
+++ b/cluster.yaml
@@ -0,0 +1,50 @@
+Region: us-east-2
+Image:
+  Os: ubuntu2004
+HeadNode:
+  InstanceType: c6a.xlarge
+  Networking:
+    SubnetId: subnet-0b2659c4d572b0d41
+  Ssh:
+    KeyName: us-east-2
+  LocalStorage:
+    RootVolume:
+      Size: 256
+Scheduling:
+  Scheduler: slurm
+  SlurmQueues:
+  - Name: queue1
+    ComputeResources:
+    - Name: m5nmetal
+      Instances:
+      - InstanceType: m5n.metal
+      MinCount: 0
+      MaxCount: 4
+      Efa:
+        Enabled: true
+    - Name: c6a32x
+      Instances:
+      - InstanceType: c6a.32xlarge
+      MinCount: 0
+      MaxCount: 4
+      Efa:
+        Enabled: true
+    - Name: c6imetal
+      Instances:
+      - InstanceType: c6i.metal
+      MinCount: 0
+      MaxCount: 4
+      Efa:
+        Enabled: true
+    - Name: m6imetal
+      Instances:
+      - InstanceType: m6i.metal
+      MinCount: 0
+      MaxCount: 4
+      Efa:
+        Enabled: true
+    Networking:
+      PlacementGroup:
+        Enabled: true
+      SubnetIds:
+      - subnet-03f9e3c05f7ec22c3
diff --git a/sbatch.sh b/sbatch.sh
new file mode 100644
index 00000000..06bafd3e
--- /dev/null
+++ b/sbatch.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+#SBATCH --nodes=2
+#SBATCH --ntasks=256
+#SBATCH --ntasks-per-node=128
+#SBATCH --cpus-per-task=1 
+srun /home/ubuntu/slog/backend/tests/cc/compiled_pre/build/cc /home/ubuntu/dataset/twitter /home/ubuntu/srun-out

From ba89394c94c81e8662a2cabec497b5d15082ea15 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Thu, 5 Jan 2023 12:49:48 -0500
Subject: [PATCH 22/36] local

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 37d3dfa0..16082615 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ temp-out/
 test-input
 souffle-out
 local/
+evaluation

From f043516321ac6113c86fcb1cb42860596e574c6d Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 28 Dec 2022 14:17:19 -0500
Subject: [PATCH 23/36] theta gcc + mpich

---
 backend/src/RA/parallel_join.cpp             |  72 ++++++++++-
 backend/src/RA/parallel_join.h               |   1 +
 backend/src/RAM/RA_tasks.cpp                 | 126 +++++++++++++------
 backend/src/RAM/RA_tasks.h                   |   2 +-
 backend/src/lie/lie.cpp                      |   4 +-
 backend/src/relation/shmap_relation.h        |   4 +-
 backend/src/relation/shmap_relation_exp.cpp  |  62 ++++-----
 backend/tests/cc/compiled_pre/cc.cpp         |  16 ++-
 backend/tests/sssp/compiled_pre/sssp_opt.cpp |  17 +--
 9 files changed, 209 insertions(+), 95 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 59b2fd48..23b2e7b6 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -8,11 +8,13 @@
 #include "../parallel_RA_inc.h"
 #include <cstddef>
 #include <iostream>
+#include <vector>
 
 
 bool parallel_join::local_join(int threshold, int* offset,
                                int join_order,
                                u32 buckets,
+                               shmap_relation *input0,
                                int input0_buffer_size, int input0_buffer_width, u64 *input0_buffer,
                                shmap_relation *input1, u32 i1_size, int input1_buffer_width,
                                std::vector<int> reorder_map_array,
@@ -78,6 +80,68 @@ bool parallel_join::local_join(int threshold, int* offset,
 
     else if (join_order == RIGHT)
     {
+        if (input0->dependent_column_indices.size() > 0 && generator_mode) {
+            // right lattice join
+            std::vector<std::vector<u64>> input_ts;
+            std::vector<u64> prev_non_dependent_columns;
+            for (int k1 = *offset; k1 < input0_buffer_size; k1 = k1 + input0_buffer_width) {
+                std::vector<u64> cur_non_dependent_columns(
+                    input0_buffer+k1,
+                    input0_buffer+k1+input0_buffer_width-input0->dependent_column_indices.size()
+                );
+                // std::vector<u64> prefix;
+                // for (int jc=0; jc < join_column_count; jc++)
+                //     prefix.push_back(input0_buffer[k1 + jc]);
+                
+                std::vector<u64> input_t(input0_buffer+k1, input0_buffer+k1+input0_buffer_width);
+                // std::cout << "LT >>> ";
+                // for (auto c: input_t) {
+                //     std::cout << c << " ";
+                // }
+                // std::cout << std::endl;
+                if (cur_non_dependent_columns == prev_non_dependent_columns) {
+                    input_ts.push_back(input_t);
+                } else {
+                    if (input_ts.size() != 0) {
+                        u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;  // bucket of the buffered group, not of the current tuple at k1
+                        input1[bucket_id].as_all_to_allv_right_join_buffer(
+                            std::vector<u64>(prev_non_dependent_columns.begin(),
+                                             prev_non_dependent_columns.begin()+join_column_count),
+                            join_buffer,
+                            input_ts,
+                            input1_buffer_width, counter,
+                            buckets, output_sub_bucket_count,
+                            output_sub_bucket_rank, reorder_map_array,
+                            join_column_count, deduplicate,
+                            &local_join_count, global_join_duplicates,
+                            global_join_inserts,
+                            output->get_join_column_count(),output->get_is_canonical(),
+                            generator_mode, generator_func);
+                        input_ts.clear();
+                    }
+                    prev_non_dependent_columns = cur_non_dependent_columns;
+                    input_ts.push_back(input_t);
+                }
+            }
+            if (input_ts.size() != 0) {
+                u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
+                input1[bucket_id].as_all_to_allv_right_join_buffer(
+                    std::vector<u64>(prev_non_dependent_columns.begin(),
+                                    prev_non_dependent_columns.begin()+join_column_count),
+                    join_buffer,
+                    input_ts,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts,
+                    output->get_join_column_count(),output->get_is_canonical(),
+                    generator_mode, generator_func);
+                input_ts.clear();
+            }
+        } else {
+        // original code    
         for (int k1 = *offset; k1 < input0_buffer_size; k1 = k1 + input0_buffer_width)
         {
             std::vector<u64> prefix;
@@ -85,10 +149,12 @@ bool parallel_join::local_join(int threshold, int* offset,
                 prefix.push_back(input0_buffer[k1 + jc]);
 
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-
+            std::vector<std::vector<u64>> input_ts;
+            input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
             input1[bucket_id].as_all_to_allv_right_join_buffer(
                 prefix, join_buffer,
-                input0_buffer + k1, input0_buffer_width,
+                // input0_buffer + k1, input0_buffer_width,
+                input_ts,
                 input1_buffer_width, counter,
                 buckets, output_sub_bucket_count,
                 output_sub_bucket_rank, reorder_map_array,
@@ -107,6 +173,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 return false;
             }
         }
+
+        }
     }
 
     deduplicate.remove_tuple();
diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h
index 30e15000..900b3b4d 100644
--- a/backend/src/RA/parallel_join.h
+++ b/backend/src/RA/parallel_join.h
@@ -90,6 +90,7 @@ class parallel_join: public parallel_RA {
     bool local_join(int threshold, int* offset,
                     int join_order,
                     u32 buckets,
+                    shmap_relation *input0,
                     int input0_buffer_size, int input0_buffer_width, u64 *input0_buffer,
                     shmap_relation *input1, u32 i1_size, int input1_buffer_width,
                     std::vector<int> reorder_map_array,
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 3c8963f8..88b7fa2e 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -259,21 +259,21 @@ u64 RAM::intra_bucket_comm_execute()
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
             {
                 // std::cout << "here>>>>>>>>>>>>>"  << std::endl;
-                if (input1->get_dependent_column().size() > 0) {
-                    intra_bucket_comm(get_bucket_count(),
-                                  input0->get_full(),
-                                  input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                                  &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                                  mcomm.get_local_comm());
-                } else {
+                // if (input1->get_dependent_column().size() > 0) {
+                //     intra_bucket_comm(get_bucket_count(),
+                //                   input0->get_full(),
+                //                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                //                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                //                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
+                //                   mcomm.get_local_comm());
+                // } else {
                     intra_bucket_comm(get_bucket_count(),
                                     input1->get_delta(),
                                     input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                     input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
                                     &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
                                     mcomm.get_local_comm());
-                }
+                // }
                 total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
             }
 
@@ -384,7 +384,9 @@ bool RAM::local_compute(int* offset)
     u32 total_join_tuples = 0;
     u32 counter = 0;
     int threshold = 20000000;
-
+    auto before_compute_time = MPI_Wtime();
+    auto ibf_size = 0;
+    u64 jtarget_size = 0;
     for (std::vector<parallel_RA*>::iterator it = RA_list.begin() ; it != RA_list.end(); ++it)
     {
         // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl;
@@ -592,7 +594,7 @@ bool RAM::local_compute(int* offset)
 
         else if ((*it)->get_RA_type() == JOIN)
         {
-            // auto before_time = MPI_Wtime();
+            auto before_join_time = MPI_Wtime();
             parallel_join* current_ra = (parallel_join*) *it;
             relation* output_relation = current_ra->get_join_output();
 
@@ -607,6 +609,7 @@ bool RAM::local_compute(int* offset)
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
                                                                          get_bucket_count(),
+                                                                         input0->get_delta(),
                                                                          intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
                                                                          reorder_map_array,
@@ -617,7 +620,7 @@ bool RAM::local_compute(int* offset)
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
                 total_join_tuples = total_join_tuples + join_tuples;
-
+                jtarget_size += input1->get_delta_element_count();
             }
             else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
             {
@@ -625,6 +628,7 @@ bool RAM::local_compute(int* offset)
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
                                                                          get_bucket_count(),
+                                                                         input0->get_delta(),
                                                                          intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1,
                                                                          reorder_map_array,
@@ -635,26 +639,30 @@ bool RAM::local_compute(int* offset)
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
                 total_join_tuples = total_join_tuples + join_tuples;
+                jtarget_size += input1->get_full_element_count();
             }
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
             {
-                if (input1->get_dependent_column().size() > 0) {
-                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                            LEFT,
-                                                                            get_bucket_count(),
-                                                                            intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                            input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
-                                                                            reorder_map_array,
-                                                                            output_relation,
-                                                                            compute_buffer,
-                                                                            counter,
-                                                                            join_column_count,
-                                                                            &join_tuples_duplicates,
-                                                                            &join_tuples); 
-                } else {
+                // if (input1->get_dependent_column().size() > 0) {
+                //     join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                //                                                             LEFT,
+                //                                                             get_bucket_count(),
+                //                                                             input0->get_delta(),
+                //                                                             intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
+                //                                                             input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
+                //                                                             reorder_map_array,
+                //                                                             output_relation,
+                //                                                             compute_buffer,
+                //                                                             counter,
+                //                                                             join_column_count,
+                //                                                             &join_tuples_duplicates,
+                //                                                             &join_tuples); 
+                //     jtarget_size += input1->get_delta_element_count();
+                // } else {
                     join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                             RIGHT,
                                                                             get_bucket_count(),
+                                                                            input1->get_delta(),
                                                                             intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
                                                                             input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
                                                                             reorder_map_array,
@@ -664,14 +672,17 @@ bool RAM::local_compute(int* offset)
                                                                             join_column_count,
                                                                             &join_tuples_duplicates,
                                                                             &join_tuples);
-                }
+                    jtarget_size += input0->get_full_element_count();
+                // }
                 total_join_tuples = total_join_tuples + join_tuples;
+                
             }
             else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
             {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
                                                                          get_bucket_count(),
+                                                                         input1->get_full(),
                                                                          intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
                                                                          reorder_map_array,
@@ -682,14 +693,14 @@ bool RAM::local_compute(int* offset)
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
                 total_join_tuples = total_join_tuples + join_tuples;
+                jtarget_size += input0->get_full_element_count();
             }
-            // auto after_time = MPI_Wtime();
-            // if (mcomm.get_local_rank() == 0) {
-            //     std::cout << "local join on rank " << mcomm.get_local_rank() << " takes " << after_time - before_time << std::endl;
-            // }
+            
+            ibf_size += intra_bucket_buf_output_size[counter];
         }
         counter++;      
     }
+    auto after_compute_time = MPI_Wtime();
 
 #if 0
     int global_total_join_tuples = 0;
@@ -700,12 +711,27 @@ bool RAM::local_compute(int* offset)
         std::cout << "Joins: " << global_total_join_tuples << " Duplicates " << global_join_tuples_duplicates << " " << std::endl;
 #endif
 
+    auto before_sync_time = MPI_Wtime();
     int global_synchronizer = 0;
     int synchronizer = 0;
     if (join_completed == true)
         synchronizer = 1;
 
     MPI_Allreduce(&synchronizer, &global_synchronizer, 1, MPI_INT, MPI_BAND, mcomm.get_comm());
+    auto after_sync_time = MPI_Wtime();
+    auto lc_all_time = after_compute_time - before_compute_time;
+    double slowest_rank_time = lc_all_time;
+    MPI_Allreduce(&lc_all_time, &slowest_rank_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm());
+    if (lc_all_time == slowest_rank_time) {
+        std::cout << "Slowest Rank >>> " << mcomm.get_rank()
+                  << "   Comp Time >>> " << after_compute_time - before_compute_time
+                  << "   Sync Time >>> " << after_sync_time - before_sync_time
+                  << "  Input Size >>> " << ibf_size
+                  << "  Target Count >>> " << jtarget_size
+                  << std::endl;
+    }
+
+    bool res = false;
     if (global_synchronizer == 1)
     {
         counter = 0;
@@ -734,10 +760,19 @@ bool RAM::local_compute(int* offset)
 
         delete[] intra_bucket_buf_output_size;
         delete[] intra_bucket_buf_output;
-        return true;
+        res = true;
     }
-    else
-        return false;
+    
+    
+    if (mcomm.get_rank() == 0) {
+        std::cout << "Rank 0 compute time >>> " << after_compute_time - before_compute_time
+                  << "    Sync time >>> " << after_sync_time - before_sync_time
+                  << "  Input Size >>> " << ibf_size
+                  << "  Target Count >>> " << jtarget_size
+                  << std::endl;
+    }
+
+    return res;
 }
 
 
@@ -850,7 +885,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
 #endif
             u32 elements_to_read = tuples_to_read * width;
 
-            for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
+            for (u32 tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
             {
                 u32 x = starting + tuple_ind * width;
                 bool insert_flag = true;
@@ -906,7 +941,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
             u64 tuple[width];
             successful_insert = 0;
             u32 elements_to_read = tuples_to_read * width;
-            for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
+            for (u32 tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++)
             {
                 u32 x = starting + tuple_ind * width;
                 if (output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false && output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false)
@@ -1117,7 +1152,7 @@ void RAM::io_all_relation(int status)
 }
 
 
-void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num)
+void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector<double>& runtime_vector)
 {
     int inner_loop = 0;
     u32 RA_count = RA_list.size();
@@ -1126,6 +1161,11 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
     for (u32 i =0; i < RA_count; i++)
         offset[i] = 0;
 
+    double all_local_compute = 0;
+    double all_insert_newt = 0;
+    double all_comm = 0;
+    double all_time = 0;
+
     while (batch_size != 0)
     {
 #if DEBUG_OUTPUT
@@ -1148,10 +1188,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
             auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
             auto compute_end = MPI_Wtime();
+            all_local_compute += compute_end - compute_start;
 
             auto all_to_all_start = MPI_Wtime();
             local_comm();
             auto all_to_all_end = MPI_Wtime();
+            all_comm += all_to_all_end - all_to_all_start;
 
             auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
@@ -1160,6 +1202,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
             auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt(intern_map);
             auto insert_in_newt_end = MPI_Wtime();
+            all_insert_newt += insert_in_newt_end - insert_in_newt_start;
 
 #if 1
             if (mcomm.get_rank() == 0)
@@ -1230,6 +1273,13 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
             break;
     }
 
+    if (mcomm.get_rank() == 0) {
+        runtime_vector[0] = runtime_vector[0] + all_comm;
+        runtime_vector[1] = runtime_vector[1] + all_local_compute;
+        runtime_vector[2] = runtime_vector[2] + all_insert_newt;
+        runtime_vector[3] = runtime_vector[3] + all_time;
+    }
+
     delete[] offset;
 
 
@@ -1408,7 +1458,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
 
 bool RAM::contains_relation(int tag) {
     for (auto rel : ram_relations) {
-        if (rel->get_intern_tag() == tag) {
+        if (rel->get_intern_tag() == (u32)tag) {
             return true;
         }
     }
diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h
index ab9ac4a3..be90384a 100644
--- a/backend/src/RAM/RA_tasks.h
+++ b/backend/src/RAM/RA_tasks.h
@@ -163,7 +163,7 @@ class RAM
     bool contains_relation(int tag);
 
     /// Start running this SCC (task) for "batck_size" iterations
-    void execute_in_batches(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num);
+    void execute_in_batches(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector<double>& runtime_vector);
 
     void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector<u32>& history, std::map<u64, u64>& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector<double>& runtime_vector);
 };
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index ad478a20..320e9592 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -474,7 +474,7 @@ bool LIE::execute ()
                 create_checkpoint_dump(loop_counter, executable_task->get_id());
 
             if (comm_compaction == 0)
-                executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
+                executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
             else
                 executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
 
@@ -522,7 +522,7 @@ bool LIE::execute ()
                     create_checkpoint_dump(loop_counter, executable_task->get_id());
 
                 if (comm_compaction == 0)
-                    executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num);
+                    executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
                 else
                     executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector);
 
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index 4ed96934..09ba11ac 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -11,6 +11,7 @@
 #include "../btree/btree_set.h"
 #include <cstdint>
 #include <utility>
+#include <vector>
 
 struct shmap_relation {
 
@@ -136,7 +137,8 @@ struct shmap_relation {
     void as_all_to_allv_acopy_buffer(all_to_allv_buffer& buffer, std::vector<u64> prefix, std::vector<int> reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int head_rel_hash_col_count, bool canonical);
     void as_all_to_allv_right_join_buffer(
         std::vector<u64> prefix, all_to_allv_buffer& join_buffer,
-        u64 *input0_buffer, int input0_buffer_width,
+        // u64 *input0_buffer, int input0_buffer_width,
+        std::vector<std::vector<u64>> &input_ts,
         int input1_buffer_width, int ra_id,
         u32 buckets, u32* output_sub_bucket_count,
         u32** output_sub_bucket_rank, std::vector<int> reorder_map,
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 91934d9e..19decd31 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -393,8 +393,9 @@ void shmap_relation::as_all_to_allv_copy_generate_buffer(
 void shmap_relation::as_all_to_allv_right_join_buffer(
     std::vector<u64> prefix,
     all_to_allv_buffer &join_buffer,
-    u64 *input0_buffer,
-    int input0_buffer_width,
+    // u64 *input0_buffer,
+    // int input0_buffer_width,
+    std::vector<std::vector<u64>> &input_ts,
     int input1_buffer_width,
     int ra_id, u32 buckets,
     u32 *output_sub_bucket_count,
@@ -435,48 +436,33 @@ void shmap_relation::as_all_to_allv_right_join_buffer(
     auto joined_range = lowerUpperRange(lower_bound, upper_bound);
 
     if (generator_mode) {
-        std::vector<u64> input_t(input0_buffer, input0_buffer+input0_buffer_width);
-        std::vector<std::vector<u64>> eq_tuple_set;
-        std::vector<std::vector<u64>> generated_tuple_set;
-        std::vector<u64> prev_non_dependent_columns;
-        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){
+        for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) {
             auto cur_path = *it;
-            std::vector<u64> cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size());
-            if (cur_non_dependent_columns == prev_non_dependent_columns) {
-                eq_tuple_set.push_back(cur_path);
-                continue;
-            } else {
-                if (eq_tuple_set.size() != 0) {
-                    gen_func(eq_tuple_set, input_t, generated_tuple_set);
-                    eq_tuple_set.clear();
-                }
-                prev_non_dependent_columns = cur_non_dependent_columns;
-                eq_tuple_set.push_back(cur_path);
-            }
-        }
-        if (eq_tuple_set.size() != 0) {
-            gen_func(eq_tuple_set, input_t, generated_tuple_set);
-        }
-        for (auto& tp: generated_tuple_set) {
-            uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets;
-            uint64_t sub_bucket_id=0;
-            if (canonical == false)
-                sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
+            std::vector<std::vector<u64>> generated_tuple_set;
+            gen_func(input_ts, cur_path, generated_tuple_set);
+            for (auto& tp: generated_tuple_set) {
+                uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets;
+                uint64_t sub_bucket_id=0;
+                if (canonical == false)
+                    sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id];
 
-            int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
+                int index = output_sub_bucket_rank[bucket_id][sub_bucket_id];
 
-            join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id];
-            join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id];
-            join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id];
-            join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++;
+                join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++;
 
-            join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
-            join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
-            join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]);
-            (*local_join_inserts)++;
-            (*local_join_count)++;
+                join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id];
+                join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id];
+                join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]);
+                (*local_join_inserts)++;
+                (*local_join_count)++;
+            }
         }
     } else {
+        u64* input0_buffer = input_ts[0].data();
+        int input0_buffer_width = input_ts[0].size();
         for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it)
         {
             auto cur_path = *it;
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index a038fc59..8964510d 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -365,7 +365,7 @@ void load_input_relation(std::string db_dir) {
   for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
     // check if ends with table
     std::string filename_ss = entry.path().filename().string();
-    std::cout << "input database has file " << filename_ss << std::endl;
+    // std::cout << "input database has file " << filename_ss << std::endl;
     std::string suffix = ".table";
     int ft = filename_ss.size() - suffix.size();
     if (ft < 0)
@@ -388,8 +388,8 @@ void load_input_relation(std::string db_dir) {
     }
     if (tag > max_rel)
       max_rel = tag;
-    std::cout << "load " << tag << "." << index_stream.str() << "has arity "
-              << arity << std::endl;
+    // std::cout << "load " << tag << "." << index_stream.str() << "has arity "
+    //           << arity << std::endl;
     rel_tag_map[index_stream.str()] = tag;
   }
 }
@@ -409,8 +409,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   }
   max_rel++;
   rel_tag_map[name_arity] = max_rel;
-  std::cout << "generate rel tag: " << name_arity << " " << max_rel
-            << std::endl;
+  // std::cout << "generate rel tag: " << name_arity << " " << max_rel
+  //           << std::endl;
   return max_rel;
 }
 
@@ -509,11 +509,15 @@ int main(int argc, char **argv) {
   );
   cc_pg->set_generator_func(
     [](const depend_val_t& target_vs, const std::vector<u64>& input_v, depend_val_t& res_set) -> bool {
-      // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl;
       auto target_v = target_vs[0];
       std::vector<u64> res(2, 0);
       res[0] = input_v[1];
       res[1] = target_v[1];
+      // if (target_v[0] == 21) {
+      // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl;
+      // std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl;
+      // std::cout << "res " << res[0] << " " << res[1] << std::endl;
+      // }
       res_set.push_back(res);
       return true;
     }
diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
index 3b9d630a..b39254bb 100644
--- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp
+++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp
@@ -1,5 +1,6 @@
 // location of `parallel_RA_inc.h` here
 #include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "mpi.h"
 
 #include <optional>
 #include <iterator>
@@ -35,6 +36,7 @@ const u64 str_tag = 2;
 const u64 sign_flip_const = 0x0000200000000000;
 const u64 signed_num_mask = 0xFFFFE00000000000;
 int start_node = 1;
+int end_node = 2;
 
 inline bool is_number(u64 datum) {
   // cout << "is_number(" << datum << "): " << (datum >> tag_position ==
@@ -383,10 +385,10 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   return max_rel;
 }
 
-void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
+void compute_sssp_from(mpi_comm &mcomm, int sp, int ep, std::string input_dir,
                        std::string output_dir, int argc, char **argv) {
-  double start_time = 0;
   start_node = sp;
+  end_node = ep;
   load_input_relation(input_dir);
 
   relation *rel__edge__2__1__2 = new relation(
@@ -420,7 +422,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
           auto [data, output] = state;
           auto head_tuple = output;
 
-          bool compatible = true && res_0 == n2d(start_node);
+          bool compatible = true && (res_0 < n2d(end_node)) && (res_0 >= n2d(start_node));
           if (!compatible)
             return state;
 
@@ -458,6 +460,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
       });
   scc1->add_rule(update_spath_j);
 
+  double start_time = MPI_Wtime();
   LIE *lie = new LIE();
   lie->add_relation(rel__edge__2__1__2);
   lie->add_relation(rel__spath__3__2);
@@ -467,9 +470,9 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
 
   // Enable IO
   lie->enable_all_to_all_dump();
-  lie->enable_data_IO();
+  //lie->enable_data_IO();
   //   lie->enable_share_io();
-  lie->enable_IO();
+  //lie->enable_IO();
   lie->set_output_dir(output_dir); // Write to this directory
   lie->set_comm(mcomm);
   lie->set_batch_size(1);
@@ -488,7 +491,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir,
 
   // rel__spath__2__1__2->print();
   // rel__edge__2__1__2->print();
-  // rel__spath__3__2->print();
+ // rel__spath__3__2->print();
   // rel__edge__3__1->print();
   // rel__edge__3__1__2__3->print();
 
@@ -522,7 +525,7 @@ int main(int argc, char **argv) {
   mpi_comm mcomm;
   mcomm.create(argc, argv);
 
-  compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc,
+  compute_sssp_from(mcomm, atoi(argv[3]), atoi(argv[4]), slog_input_dir, slog_output_dir, argc,
                     argv);
 
   mcomm.destroy();

From 13a0a419a35765046995379915f8defaa88f55bd Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 28 Dec 2022 18:05:23 -0500
Subject: [PATCH 24/36] add more log

---
 backend/src/RAM/RA_tasks.cpp | 239 ++++++++++++++++-------------------
 1 file changed, 107 insertions(+), 132 deletions(-)

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 88b7fa2e..54149941 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -227,68 +227,42 @@ u64 RAM::intra_bucket_comm_execute()
             parallel_join* current_ra = (parallel_join*) *it;
             relation* input0 = current_ra->get_join_input0();
             relation* input1 = current_ra->get_join_input1();
-
-            /// Join between delta and delta
-            if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-
-                intra_bucket_comm(get_bucket_count(),
-                                  input0->get_delta(),
-                                  input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                                  &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                                  mcomm.get_local_comm());
-
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
+            shmap_relation* input0_trees = input0->get_full();
+            u64 input0_size = input0->get_full_element_count();
+            shmap_relation* input1_trees = input1->get_full();
+            u64 input1_size = input1->get_full_element_count();
+            if (current_ra->get_join_input0_graph_type() == DELTA) {
+                input0_trees = input0->get_delta();
+                input0_size = input0->get_delta_element_count();
+            }
+            if (current_ra->get_join_input1_graph_type() == DELTA) {
+                input1_trees = input1->get_delta();
+                input1_size = input1->get_delta_element_count();
+            }
+            int join_direction = LEFT;
+            int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
+            int global_join_direction_count = local_join_direction_count;
+            MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm());
+            if (global_join_direction_count > mcomm.get_nprocs() / 2) {
+                join_direction = RIGHT;
             }
 
-            /// Join between delta and full
-            else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
-            {
-
+            if (join_direction == LEFT) {
                 intra_bucket_comm(get_bucket_count(),
-                                  input0->get_delta(),
+                                  input0_trees,
                                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
                                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
                                   mcomm.get_local_comm());
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
-            }
-
-            /// Join between full and delta
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-                // std::cout << "here>>>>>>>>>>>>>"  << std::endl;
-                // if (input1->get_dependent_column().size() > 0) {
-                //     intra_bucket_comm(get_bucket_count(),
-                //                   input0->get_full(),
-                //                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                //                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                //                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                //                   mcomm.get_local_comm());
-                // } else {
-                    intra_bucket_comm(get_bucket_count(),
-                                    input1->get_delta(),
-                                    input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                                    input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                                    &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                                    mcomm.get_local_comm());
-                // }
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
-            }
-
-            /// Join between full and full
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
-            {
-
+            } else {
                 intra_bucket_comm(get_bucket_count(),
-                                  input1->get_full(),
+                                  input1_trees,
                                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
                                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
                                   mcomm.get_local_comm());
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
             }
+            total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
         }
         counter++;
     }
@@ -604,33 +578,33 @@ bool RAM::local_compute(int* offset)
             relation* input1 = current_ra->get_join_input1();
             int join_column_count = input0->get_join_column_count();
 
-            if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-                join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                         LEFT,
-                                                                         get_bucket_count(),
-                                                                         input0->get_delta(),
-                                                                         intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                         input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
-                                                                         reorder_map_array,
-                                                                         output_relation,
-                                                                         compute_buffer,
-                                                                         counter,
-                                                                         join_column_count,
-                                                                         &join_tuples_duplicates,
-                                                                         &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input1->get_delta_element_count();
+            shmap_relation* input0_trees = input0->get_full();
+            u64 input0_size = input0->get_full_element_count();
+            shmap_relation* input1_trees = input1->get_full();
+            u64 input1_size = input1->get_full_element_count();
+            if (current_ra->get_join_input0_graph_type() == DELTA) {
+                input0_trees = input0->get_delta();
+                input0_size = input0->get_delta_element_count();
+            }
+            if (current_ra->get_join_input1_graph_type() == DELTA) {
+                input1_trees = input1->get_delta();
+                input1_size = input1->get_delta_element_count();
+            }
+            int join_direction = LEFT;
+            int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
+            int global_join_direction_count = local_join_direction_count;
+            MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm());
+            if (global_join_direction_count > mcomm.get_nprocs() / 2) {
+                join_direction = RIGHT;
             }
-            else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
-            {
 
+            if (join_direction == LEFT) {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
                                                                          get_bucket_count(),
-                                                                         input0->get_delta(),
+                                                                         input0_trees,
                                                                          intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                         input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1,
+                                                                         input1_trees, input1_size, input1->get_arity()+1,
                                                                          reorder_map_array,
                                                                          output_relation,
                                                                          compute_buffer,
@@ -638,53 +612,14 @@ bool RAM::local_compute(int* offset)
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input1->get_full_element_count();
-            }
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-                // if (input1->get_dependent_column().size() > 0) {
-                //     join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                //                                                             LEFT,
-                //                                                             get_bucket_count(),
-                //                                                             input0->get_delta(),
-                //                                                             intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                //                                                             input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
-                //                                                             reorder_map_array,
-                //                                                             output_relation,
-                //                                                             compute_buffer,
-                //                                                             counter,
-                //                                                             join_column_count,
-                //                                                             &join_tuples_duplicates,
-                //                                                             &join_tuples); 
-                //     jtarget_size += input1->get_delta_element_count();
-                // } else {
-                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                            RIGHT,
-                                                                            get_bucket_count(),
-                                                                            input1->get_delta(),
-                                                                            intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                            input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
-                                                                            reorder_map_array,
-                                                                            output_relation,
-                                                                            compute_buffer,
-                                                                            counter,
-                                                                            join_column_count,
-                                                                            &join_tuples_duplicates,
-                                                                            &join_tuples);
-                    jtarget_size += input0->get_full_element_count();
-                // }
-                total_join_tuples = total_join_tuples + join_tuples;
-                
-            }
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
-            {
+                           
+            } else {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
                                                                          get_bucket_count(),
-                                                                         input1->get_full(),
+                                                                         input1_trees,
                                                                          intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                         input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
+                                                                         input0_trees, input0_size, input0->get_arity()+1,
                                                                          reorder_map_array,
                                                                          output_relation,
                                                                          compute_buffer,
@@ -692,9 +627,9 @@ bool RAM::local_compute(int* offset)
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input0->get_full_element_count();
             }
+            total_join_tuples = total_join_tuples + join_tuples;
+            jtarget_size += input1->get_delta_element_count();  
             
             ibf_size += intra_bucket_buf_output_size[counter];
         }
@@ -1165,7 +1100,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
     double all_insert_newt = 0;
     double all_comm = 0;
     double all_time = 0;
+    double all_insert_in_full = 0;
+    double all_allocate_buf = 0;
+    double all_intra = 0;
+    double all_free_buf =0;
 
+    // auto before_batch = MPI_Wtime();
     while (batch_size != 0)
     {
 #if DEBUG_OUTPUT
@@ -1175,7 +1115,8 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
 
         auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
-        auto intra_end = MPI_Wtime(); 
+        auto intra_end = MPI_Wtime();
+        all_intra += intra_end - intra_start; 
 
         std::cout << std::setiosflags(std::ios::fixed);
         bool local_join_status = false;
@@ -1184,6 +1125,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
             auto allocate_buffers_start = MPI_Wtime();
             allocate_compute_buffers();
             auto allocate_buffers_end = MPI_Wtime();
+            all_allocate_buf += allocate_buffers_end - allocate_buffers_start;
 
             auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
@@ -1198,6 +1140,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
             auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
             auto free_buffers_end = MPI_Wtime();
+            all_free_buf += free_buffers_end - free_buffers_start;
 
             auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt(intern_map);
@@ -1240,9 +1183,13 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
         auto insert_in_full_start = MPI_Wtime(); 
         local_insert_in_full();
         auto insert_in_full_end = MPI_Wtime(); 
-        
+        all_insert_in_full += insert_in_full_end - insert_in_full_start;
+
 #if 1
-        if (mcomm.get_rank() == 0)
+        double all_l_time = insert_in_full_end - intra_start;
+        double slowest_all_time = all_l_time;
+        MPI_Allreduce(&all_l_time, &slowest_all_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm());
+        if (mcomm.get_rank() == 0 || slowest_all_time == all_l_time)
         {
 #if 0
             std::cout  << name << " " << mcomm.get_local_nprocs()<< " Current time OUTER LOOP [" << loop_count_tracker << " ] "
@@ -1258,13 +1205,21 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
                        << " full " << *running_insert_in_full
                        << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl;
 #endif
-            std::cout << (intra_end - intra_start) << std::setw(12)
-                      << (insert_in_full_end - insert_in_full_start)  << std::setw(12)
-                      << (insert_in_full_end - intra_start) << std::endl;
-
+            std::cout << "rank" << std::setw(12) << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12)
+                        << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12)
+                        << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ;
+            std::cout << mcomm.get_rank() << std::setw(12) << loop_count_tracker << std::setprecision(4) << std::setw(12)
+                        << all_allocate_buf << std::setprecision(4) << std::setw(12)
+                        << all_local_compute << std::setprecision(4) << std::setw(12)
+                        << all_comm << std::setprecision(4) << std::setw(12)
+                        << all_free_buf << std::setprecision(4) << std::setw(12)
+                        << all_insert_newt << std::setprecision(4) << std::setw(12);
+            std::cout << all_intra << std::setw(12)
+                      << all_insert_in_full << std::setw(12)
+                      << all_l_time << std::endl;
         }
 #endif
-
+        all_time += all_l_time;
         batch_size--;
         loop_count_tracker++;
 
@@ -1317,6 +1272,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
     double all_insert_newt = 0;
     double all_comm = 0;
     double all_time = 0;
+    double all_insert_in_full = 0;
+    double all_allocate_buf = 0;
+    double all_intra = 0;
+    double all_free_buf =0;
+
+    // auto before_batch = MPI_Wtime();
 
     while (batch_size != 0)
     {
@@ -1329,6 +1290,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         auto intra_start = MPI_Wtime(); 
         intra_bucket_comm_execute();
         auto intra_end = MPI_Wtime();
+        all_intra += intra_end - intra_start;
 
         bool local_join_status = false;
         while (local_join_status == false)
@@ -1336,6 +1298,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             auto allocate_buffers_start = MPI_Wtime();
             allocate_compute_buffers();
             auto allocate_buffers_end = MPI_Wtime();
+            all_allocate_buf += allocate_buffers_end - allocate_buffers_start;
 
             auto compute_start = MPI_Wtime();
             local_join_status = local_compute(offset);
@@ -1350,6 +1313,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             auto free_buffers_start = MPI_Wtime();
             free_compute_buffers();
             auto free_buffers_end = MPI_Wtime();
+            all_free_buf += free_buffers_end - free_buffers_start;
 
             auto insert_in_newt_start = MPI_Wtime();
             local_insert_in_newt_comm_compaction(intern_map);
@@ -1357,7 +1321,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
             all_insert_newt += insert_in_newt_end - insert_in_newt_start;
 
 
-#if 1
+#if 0
             if (mcomm.get_rank() == 0)
             {
 #if 0
@@ -1394,9 +1358,13 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
         auto insert_in_full_start = MPI_Wtime(); 
         local_insert_in_full();
         auto insert_in_full_end = MPI_Wtime();
+        all_insert_in_full += insert_in_full_end - insert_in_full_start;
 
 #if 1
-        if (mcomm.get_rank() == 0)
+        double all_l_time = insert_in_full_end - intra_start;
+        double slowest_all_time = all_l_time;
+        MPI_Allreduce(&all_l_time, &slowest_all_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm());
+        if (mcomm.get_rank() == 0 || slowest_all_time == all_l_time)
         {
 #if 0
             std::cout  << name << " " << mcomm.get_local_nprocs()<< " Current time OUTER LOOP [" << loop_count_tracker << " ] "
@@ -1412,15 +1380,22 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s
                        << " full " << *running_insert_in_full
                        << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl;
 #endif
-            std::cout << (intra_end - intra_start) << std::setw(12)
-                      << (insert_in_full_end - insert_in_full_start)  << std::setw(12)
-                      << (insert_in_full_end - intra_start) << std::endl;
-        
-            all_time += insert_in_full_end - intra_start;
-
+            std::cout << "rank" << std::setw(12) << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12)
+                        << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12)
+                        << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ;
+            std::cout << mcomm.get_rank()<< std::setw(12) << loop_count_tracker << std::setprecision(4) << std::setw(12)
+                        << all_allocate_buf << std::setprecision(4) << std::setw(12)
+                        << all_local_compute << std::setprecision(4) << std::setw(12)
+                        << all_comm << std::setprecision(4) << std::setw(12)
+                        << all_free_buf << std::setprecision(4) << std::setw(12)
+                        << all_insert_newt << std::setprecision(4) << std::setw(12);
+            std::cout << all_intra << std::setw(12)
+                      << all_insert_in_full  << std::setw(12)
+                      << all_l_time << std::endl;
         }
+        
 #endif
-
+        all_time += all_l_time;
         batch_size--;
         loop_count_tracker++;
 

From 3c6221e845affc470264928c9eb4c633397306b2 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Thu, 29 Dec 2022 23:45:09 -0500
Subject: [PATCH 25/36] add per-join and size-sync timing stats

---
 backend/src/RA/parallel_join.cpp | 22 ++++++++++++++++++++--
 backend/src/RA/parallel_join.h   |  3 ++-
 backend/src/RAM/RA_tasks.cpp     | 25 ++++++++++++++++++-------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 23b2e7b6..95ba9660 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include "mpi.h"
 #include <cstddef>
 #include <iostream>
 #include <vector>
@@ -23,7 +24,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                                int counter,
                                int join_column_count,
                                u32* global_join_duplicates,
-                               u32* global_join_inserts)
+                               u32* global_join_inserts,
+                               std::vector<double>& time_stat)
 {
     join_buffer.width[counter] = reorder_map_array.size();
 
@@ -39,9 +41,12 @@ bool parallel_join::local_join(int threshold, int* offset,
     u32** output_sub_bucket_rank = output->get_sub_bucket_rank();
     // std::cout << "wwwwwwwww " << input0_buffer_size << " " << input0_buffer_size << " " << i1_size << std::endl;
 
-    if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0)
+    if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0) {
+        time_stat.push_back(0);
         return true;
+    }
 
+    double join_time_total = 0;
     int local_join_count=0;
     if (join_order == LEFT)
     {
@@ -56,6 +61,7 @@ bool parallel_join::local_join(int threshold, int* offset,
 
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
 
+            auto before_actual_join = MPI_Wtime();
             input1[bucket_id].as_all_to_allv_left_join_buffer(
                 prefix, join_buffer,
                 input0_buffer + k1,input0_buffer_width,
@@ -67,6 +73,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 global_join_inserts, output->get_join_column_count(),
                 output->get_is_canonical(),
                 generator_mode, generator_func);
+            auto after_actual_join = MPI_Wtime();
+            join_time_total += after_actual_join - before_actual_join;
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
@@ -103,6 +111,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                     input_ts.push_back(input_t);
                 } else {
                     if (input_ts.size() != 0) {
+                        auto before_actual_join = MPI_Wtime();
                         u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
                         input1[bucket_id].as_all_to_allv_right_join_buffer(
                             std::vector<u64>(prev_non_dependent_columns.begin(),
@@ -117,6 +126,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                             global_join_inserts,
                             output->get_join_column_count(),output->get_is_canonical(),
                             generator_mode, generator_func);
+                        auto after_actual_join = MPI_Wtime();
+                        join_time_total += after_actual_join - before_actual_join;
                         input_ts.clear();
                     }
                     prev_non_dependent_columns = cur_non_dependent_columns;
@@ -125,6 +136,7 @@ bool parallel_join::local_join(int threshold, int* offset,
             }
             if (input_ts.size() != 0) {
                 u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
+                auto before_actual_join = MPI_Wtime();
                 input1[bucket_id].as_all_to_allv_right_join_buffer(
                     std::vector<u64>(prev_non_dependent_columns.begin(),
                                     prev_non_dependent_columns.begin()+join_column_count),
@@ -138,6 +150,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                     global_join_inserts,
                     output->get_join_column_count(),output->get_is_canonical(),
                     generator_mode, generator_func);
+                auto after_actual_join = MPI_Wtime();
+                join_time_total += after_actual_join - before_actual_join;
                 input_ts.clear();
             }
         } else {
@@ -151,6 +165,7 @@ bool parallel_join::local_join(int threshold, int* offset,
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
+            auto before_actual_join = MPI_Wtime();
             input1[bucket_id].as_all_to_allv_right_join_buffer(
                 prefix, join_buffer,
                 // input0_buffer + k1, input0_buffer_width,
@@ -163,6 +178,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 global_join_inserts,
                 output->get_join_column_count(),output->get_is_canonical(),
                 generator_mode, generator_func);
+            auto after_actual_join = MPI_Wtime();
+            join_time_total += after_actual_join - before_actual_join;
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
@@ -177,6 +194,7 @@ bool parallel_join::local_join(int threshold, int* offset,
         }
     }
 
+    time_stat.push_back(join_time_total);
     deduplicate.remove_tuple();
     return true;
 }
diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h
index 900b3b4d..c30120c5 100644
--- a/backend/src/RA/parallel_join.h
+++ b/backend/src/RA/parallel_join.h
@@ -99,7 +99,8 @@ class parallel_join: public parallel_RA {
                     int counter,
                     int join_column_count,
                     u32* local_join_duplicates,
-                    u32* local_join_inserts);
+                    u32* local_join_inserts,
+                    std::vector<double>& time_stat);
 
 #endif
 };
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 54149941..0684d2c6 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -361,6 +361,8 @@ bool RAM::local_compute(int* offset)
     auto before_compute_time = MPI_Wtime();
     auto ibf_size = 0;
     u64 jtarget_size = 0;
+    double size_sync_time = 0;
+    double real_join_time = 0;
     for (std::vector<parallel_RA*>::iterator it = RA_list.begin() ; it != RA_list.end(); ++it)
     {
         // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl;
@@ -593,11 +595,15 @@ bool RAM::local_compute(int* offset)
             int join_direction = LEFT;
             int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
             int global_join_direction_count = local_join_direction_count;
+
+            auto before_size_sync = MPI_Wtime();
             MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm());
             if (global_join_direction_count > mcomm.get_nprocs() / 2) {
                 join_direction = RIGHT;
             }
-
+            auto after_size_sync = MPI_Wtime();
+            size_sync_time += after_size_sync - before_size_sync;
+            std::vector<double> real_j_time_stat;
             if (join_direction == LEFT) {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
@@ -611,8 +617,10 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples);
-                           
+                                                                         &join_tuples,
+                                                                         real_j_time_stat);
+                jtarget_size += input1_size;             
+                ibf_size += input0_size;           
             } else {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
@@ -626,12 +634,13 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples);
+                                                                         &join_tuples,
+                                                                         real_j_time_stat);
+                jtarget_size += input0_size;             
+                ibf_size += input1_size;  
             }
             total_join_tuples = total_join_tuples + join_tuples;
-            jtarget_size += input1->get_delta_element_count();  
-            
-            ibf_size += intra_bucket_buf_output_size[counter];
+            if (!real_j_time_stat.empty()) real_join_time += real_j_time_stat[0]; // local_join appends its timing only right before its final `return true`; skip the read when nothing was recorded
         }
         counter++;      
     }
@@ -660,7 +669,9 @@ bool RAM::local_compute(int* offset)
     if (lc_all_time == slowest_rank_time) {
         std::cout << "Slowest Rank >>> " << mcomm.get_rank()
                   << "   Comp Time >>> " << after_compute_time - before_compute_time
+                  << "   Real Join >>> " << real_join_time
                   << "   Sync Time >>> " << after_sync_time - before_sync_time
+                  << "   Size Sync Time >>> " << size_sync_time
                   << "  Input Size >>> " << ibf_size
                   << "  Target Count >>> " << jtarget_size
                   << std::endl;

From 6a68658c20af50e04612e37363d5196ad833920f Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 4 Jan 2023 14:53:44 -0500
Subject: [PATCH 26/36] more hash function

stage change
---
 backend/CMakeLists.txt                        |    2 +-
 backend/src/RAM/RA_tasks.cpp                  |    4 +
 backend/src/hash/fasthash.cpp                 |   50 +
 backend/src/hash/fasthash.h                   |   52 +
 backend/src/hash/hash.cpp                     |   29 +-
 backend/src/hash/hash.h                       |  128 +-
 backend/src/hash/spooky-c.cpp                 |  598 ++
 backend/src/hash/spooky-c.h                   |   94 +
 backend/src/hash/xxhash.cpp                   |    5 +
 backend/src/hash/xxhash.h                     | 6290 +++++++++++++++++
 backend/src/lie/lie.cpp                       |   12 +
 .../src/relation/balanced_hash_relation.cpp   |   50 +-
 backend/src/relation/balanced_hash_relation.h |   19 +-
 .../src/relation/relation_load_balancer.cpp   |   10 +
 backend/tests/cc/compiled_pre/CMakeLists.txt  |    2 +-
 backend/tests/cc/compiled_pre/cc.cpp          |   15 +
 backend/utility/tsv_to_bin.cpp                |  127 +-
 cluster.yaml                                  |    2 +-
 18 files changed, 7452 insertions(+), 37 deletions(-)
 create mode 100644 backend/src/hash/fasthash.cpp
 create mode 100644 backend/src/hash/fasthash.h
 create mode 100644 backend/src/hash/spooky-c.cpp
 create mode 100644 backend/src/hash/spooky-c.h
 create mode 100644 backend/src/hash/xxhash.cpp
 create mode 100644 backend/src/hash/xxhash.h

diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index a348cc28..1d331260 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -18,7 +18,7 @@ set (tests_dir "${PROJECT_SOURCE_DIR}/tests")
 set (data_dir "${PROJECT_SOURCE_DIR}/data")
 set (utility_dir "${PROJECT_SOURCE_DIR}/utility")
 
-file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") # every hash/*.cpp must be spelled exactly: a GLOB pattern that matches no file is dropped silently
 file (GLOB source_files_ata "${tests_dir}/all_to_all_benchmark.cpp")
 file (GLOB source_files_tc "${tests_dir}/transitive_closure.cpp")
 #file (GLOB source_files_builtin "${tests_dir}/builtin.cpp")
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 0684d2c6..3b62572b 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -77,6 +77,9 @@ void RAM::load_balance()
     for (u32 i=0; i < ram_relation_count; i++)
     {
         relation* current_relation = ram_relations[i];
+        if (!current_relation->balance_flag) {
+            continue;
+        }
         if (current_relation->load_balance_merge_full_and_delta(refinement_factor) == false)
             current_relation->load_balance_split_full_and_delta(refinement_factor);
 
@@ -674,6 +677,7 @@ bool RAM::local_compute(int* offset)
                   << "   Size Sync Time >>> " << size_sync_time
                   << "  Input Size >>> " << ibf_size
                   << "  Target Count >>> " << jtarget_size
+                  << "  Join Count >>> " << total_join_tuples
                   << std::endl;
     }
 
diff --git a/backend/src/hash/fasthash.cpp b/backend/src/hash/fasthash.cpp
new file mode 100644
index 00000000..c60c9501
--- /dev/null
+++ b/backend/src/hash/fasthash.cpp
@@ -0,0 +1,50 @@
+#include "fasthash.h"
+
+// Compression function for Merkle-Damgard construction.
+// This function is generated using the framework provided.
+#define mix(h) ({					\
+			(h) ^= (h) >> 23;		\
+			(h) *= 0x2127599bf4325c37ULL;	\
+			(h) ^= (h) >> 47; })
+
+uint64_t fasthash64(const void *buf, size_t len, uint64_t seed) // 64-bit hash of the len bytes at buf, parameterised by seed
+{
+	const uint64_t    m = 0x880355f21e6d1965ULL; // multiplier applied after every absorbed word
+	const uint64_t *pos = (const uint64_t *)buf;
+	const uint64_t *end = pos + (len / 8); // one past the last whole 8-byte word
+	const unsigned char *pos2;
+	uint64_t h = seed ^ (len * m); // initial state depends on both the seed and the length
+	uint64_t v;
+
+	while (pos != end) { // absorb the whole 64-bit words
+		v  = *pos++;
+		h ^= mix(v);
+		h *= m;
+	}
+
+	pos2 = (const unsigned char*)pos; // start of the remaining (len & 7) tail bytes
+	v = 0;
+
+	switch (len & 7) { // each case deliberately falls through to the ones below it
+	case 7: v ^= (uint64_t)pos2[6] << 48;
+	case 6: v ^= (uint64_t)pos2[5] << 40;
+	case 5: v ^= (uint64_t)pos2[4] << 32;
+	case 4: v ^= (uint64_t)pos2[3] << 24;
+	case 3: v ^= (uint64_t)pos2[2] << 16;
+	case 2: v ^= (uint64_t)pos2[1] << 8;
+	case 1: v ^= (uint64_t)pos2[0];
+		h ^= mix(v); // the assembled tail word is absorbed like a whole word
+		h *= m;
+	}
+
+	return mix(h); // final mixing step; skipped tail handling when len is a multiple of 8
+} 
+
+uint32_t fasthash32(const void *buf, size_t len, uint32_t seed)
+{
+	// Compute the 64-bit hash, then fold it to 32 bits by subtracting the
+	// upper half from the whole value (a Fermat-residue style reduction), so
+	// both the higher and the lower parts of the hashcode affect the result.
+        uint64_t h = fasthash64(buf, len, seed);
+	return h - (h >> 32);
+}
\ No newline at end of file
diff --git a/backend/src/hash/fasthash.h b/backend/src/hash/fasthash.h
new file mode 100644
index 00000000..042387a9
--- /dev/null
+++ b/backend/src/hash/fasthash.h
@@ -0,0 +1,52 @@
+/* The MIT License
+   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
+   Permission is hereby granted, free of charge, to any person
+   obtaining a copy of this software and associated documentation
+   files (the "Software"), to deal in the Software without
+   restriction, including without limitation the rights to use, copy,
+   modify, merge, publish, distribute, sublicense, and/or sell copies
+   of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef _FASTHASH_H
+#define _FASTHASH_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * fasthash32 - 32-bit implementation of fasthash
+ * @buf:  data buffer
+ * @len:  data size
+ * @seed: the seed
+ */
+	uint32_t fasthash32(const void *buf, size_t len, uint32_t seed);
+
+/**
+ * fasthash64 - 64-bit implementation of fasthash
+ * @buf:  data buffer
+ * @len:  data size
+ * @seed: the seed
+ */
+	uint64_t fasthash64(const void *buf, size_t len, uint64_t seed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/backend/src/hash/hash.cpp b/backend/src/hash/hash.cpp
index 9b9dcf1d..364d0efd 100644
--- a/backend/src/hash/hash.cpp
+++ b/backend/src/hash/hash.cpp
@@ -1 +1,28 @@
-#include "parallel_RA_inc.h"
+#include "hash.h"
+
+#include "fasthash.h"
+#include "spooky-c.h"
+#include "xxhash.h"
+#include <vector>
+
+uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len) // hashes the first prefix_len 64-bit words at start_ptr
+{
+    return fnv1a(start_ptr, prefix_len); // currently selected hash; fnv1a takes a word count, not a byte count
+    // return MurmurHash64A(start_ptr, prefix_len*8, MURMUR_SEED);  (disabled alternatives; these take a byte length, hence prefix_len*8)
+    // return spooky_hash64(start_ptr, prefix_len*8, MURMUR_SEED);
+    // return fasthash64(start_ptr, prefix_len*8, 10);
+    // return XXH64(start_ptr, prefix_len*8, 10);
+}
+
+std::vector<uint64_t> tuple_hash_test_all(const uint64_t* start_ptr, uint64_t prefix_len) { // returns one value per candidate hash, in the fixed order pushed below
+    std::vector<uint64_t> all_hash_v;
+    all_hash_v.push_back(start_ptr[0]); // slot 0: raw first word, unhashed
+    all_hash_v.push_back(fnv1a(start_ptr, prefix_len)); // slot 1: fnv1a over prefix_len words
+    all_hash_v.push_back(hash64shift(start_ptr)); // slot 2: hash64shift reads only start_ptr[0]
+    all_hash_v.push_back(spooky_hash64(start_ptr, prefix_len*8, 1)); // slot 3: length given in bytes, seed 1
+    all_hash_v.push_back(fasthash64(start_ptr, prefix_len*8, 1)); // slot 4: length given in bytes, seed 1
+    // all_hash_v.push_back(XXH64(start_ptr, prefix_len*8, 1));
+    all_hash_v.push_back(XXH32(start_ptr, prefix_len*4, 1)); // slot 5 -- NOTE(review): passes prefix_len*4 where the other byte-length calls pass prefix_len*8; confirm this is intended
+    return all_hash_v;
+}
+
diff --git a/backend/src/hash/hash.h b/backend/src/hash/hash.h
index ab7f1951..aa9a6b8a 100644
--- a/backend/src/hash/hash.h
+++ b/backend/src/hash/hash.h
@@ -8,33 +8,140 @@
 
 
 /// Based on the FNV-1a hash function
-inline u64 tuple_hash(const u64* start_ptr, u64 prefix_len)
+
+#include <cstdint>
+#include <string>
+#include <vector>
+// #include <endian.h>
+#define MURMUR_SEED 7917
+
+/// FNV-1a style hash over prefix_len 64-bit words, consuming each word least-significant byte first.
+inline uint64_t fnv1a(const uint64_t* start_ptr, uint64_t prefix_len)
 {
-    const u64 base = 14695981039346656037ULL;
-    const u64 prime = 1099511628211ULL;
+    const uint64_t base = 14695981039346656037ULL;
+    const uint64_t prime = 1099511628211ULL;
 
-    u64 hash = base;
-    for (u64 i = 0; i < prefix_len; ++i)
+    uint64_t hash = base;
+    for (uint64_t i = 0; i < prefix_len; ++i)
     {
-        u64 chunk = start_ptr[i];
+        uint64_t chunk = start_ptr[i];
         hash ^= chunk & 255ULL;
         hash *= prime;
         for (char j = 0; j < 7; ++j)
         {
             chunk = chunk >> 8;
             hash ^= chunk & 255ULL;
+            if ((chunk & 255ULL) == 0)
+              continue; // NOTE(review): zero bytes above the lowest one skip the multiply, unlike standard FNV-1a; e.g. the words 0x100 and 0x10000 hash identically -- confirm this is intended
             hash *= prime;
         }
     }
     return hash;
 }
 
+inline uint64_t nonhash1(const uint64_t* start_ptr, uint64_t prefix_len)
+{
+    // No hashing: returns the first 64-bit word unchanged (prefix_len is ignored); meant for a range-based split on the first column.
+    return start_ptr[0]; 
+}
+
+
+// murmurhash
+#if defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else	// defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+static inline uint64_t getblock ( const uint64_t * p ) // loads one 64-bit word for MurmurHash64A
+{
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  return *p; // compiler reports a little-endian target: a plain load is used
+#else
+  const uint8_t *c = (const uint8_t *)p; // otherwise assemble the word byte by byte, with c[0] as the least significant byte
+  return (uint64_t)c[0] |
+	 (uint64_t)c[1] <<  8 |
+	 (uint64_t)c[2] << 16 |
+	 (uint64_t)c[3] << 24 |
+	 (uint64_t)c[4] << 32 |
+	 (uint64_t)c[5] << 40 |
+	 (uint64_t)c[6] << 48 |
+	 (uint64_t)c[7] << 56;
+#endif
+}
+
+inline uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ) // 64-bit hash of the len bytes at key
+{
+  const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
+  const int r = 47;
+
+  uint64_t h = seed ^ (len * m); // initial state depends on both the seed and the length
+
+  const uint64_t * data = (const uint64_t *)key;
+  const uint64_t * end = data + (len/8); // one past the last whole 8-byte word
+
+  while(data != end) // absorb the whole 64-bit words
+  {
+    uint64_t k = getblock(data++);
+
+    k *= m; 
+    k ^= k >> r; 
+    k *= m; 
+    
+    h ^= k;
+    h *= m; 
+  }
+
+  const unsigned char * data2 = (const unsigned char*)data; // start of the remaining (len & 7) tail bytes
+
+  switch(len & 7) // each case deliberately falls through to the ones below it
+  {
+  case 7: h ^= uint64_t(data2[6]) << 48;
+  case 6: h ^= uint64_t(data2[5]) << 40;
+  case 5: h ^= uint64_t(data2[4]) << 32;
+  case 4: h ^= uint64_t(data2[3]) << 24;
+  case 3: h ^= uint64_t(data2[2]) << 16;
+  case 2: h ^= uint64_t(data2[1]) << 8;
+  case 1: h ^= uint64_t(data2[0]);
+          h *= m;
+  };
+ 
+  h ^= h >> r; // final mixing of the accumulated state
+  h *= m;
+  h ^= h >> r;
+
+  return h;
+} 
+
+inline uint64_t hash64shift(const uint64_t* keys) // shift/add/xor mixer of a single 64-bit value
+{
+  uint64_t key = keys[0]; // only the first word is read; there is no length parameter
+  key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+  key = key ^ (key >> 24);
+  key = (key + (key << 3)) + (key << 8); // key * 265
+  key = key ^ (key >> 14);
+  key = (key + (key << 2)) + (key << 4); // key * 21
+  key = key ^ (key >> 28);
+  key = key + (key << 31);
+  return key;
+}
+
+
+uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len);
+std::vector<uint64_t> tuple_hash_test_all(const uint64_t* start_ptr, uint64_t prefix_len);
+
 // change this to compile time?
-inline u32 string_hash(const std::string& str) {
-    const u32 base = 2166136261u;
-    const u32 prime = 16777619u;
+inline uint32_t string_hash(const std::string& str) {
+    const uint32_t base = 2166136261u;
+    const uint32_t prime = 16777619u;
 
-    u32 hash = base;
+    uint32_t hash = base;
     for (char c: str)
     {
         if ((int)c == 0) continue;
@@ -43,3 +150,4 @@ inline u32 string_hash(const std::string& str) {
     }
     return hash;
 }
+
diff --git a/backend/src/hash/spooky-c.cpp b/backend/src/hash/spooky-c.cpp
new file mode 100644
index 00000000..6e8f8c2c
--- /dev/null
+++ b/backend/src/hash/spooky-c.cpp
@@ -0,0 +1,598 @@
+
+// A C version of Bob Jenkins' spooky hash
+// Spooky Hash
+// A 128-bit noncryptographic hash, for checksums and table lookup
+// By Bob Jenkins. Bob's version was under Public Domain
+// The C version is under the BSD license
+// * Copyright (c) 2014, Spooky Contributors
+// * All rights reserved.
+// *
+// * Redistribution and use in source and binary forms, with or without
+// * modification, are permitted provided that the following conditions are met:
+// *
+// * 1. Redistributions of source code must retain the above copyright notice,
+// * this list of conditions and the following disclaimer.
+// *
+// * 2. Redistributions in binary form must reproduce the above copyright
+// * notice, this list of conditions and the following disclaimer in the
+// * documentation and/or other materials provided with the distribution.
+// *
+// * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+// * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+// * OF THE POSSIBILITY OF SUCH DAMAGE.
+//   Oct 31 2010: published framework, disclaimer ShortHash isn't right
+//   Nov 7 2010: disabled ShortHash
+//   Oct 11 2011: C version ported by Andi Kleen (andikleen@github)
+//   Oct 31 2011: replace End, ShortMix, ShortEnd, enable ShortHash again
+//   Apr 10 2012: buffer overflow on platforms without unaligned reads
+//   Apr 27 2012: C version updated by Ziga Zupanec ziga.zupanec@gmail.com (agiz@github)
+//   Update to spooky V2: d = should be d += in short hash, and remove extra mix from long hash
+//   (note results have changed from this change)
+
+//   Assumes little-endianness. Caller has to check this case.
+//   According to Bob it should work on big-endian (BE) machines too, but just gives different results.
+
+
+
+/*
+ * If this is an autoconf build, then use the unaligned access autoconf test to
+ * determine this. Otherwise, fall back on using the arch macros provided by
+ * the compiler.
+ */
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#  ifndef HAVE_ALIGNED_ACCESS_REQUIRED
+#     define ALLOW_UNALIGNED_READS 1
+#  else
+#     define ALLOW_UNALIGNED_READS 0
+#  endif
+#else
+#  if defined(__i386__) || defined(__x86_64__) // add more architectures here
+#     define ALLOW_UNALIGNED_READS 1
+#  else
+#     define ALLOW_UNALIGNED_READS 0
+#  endif
+#endif /* HAVE_CONFIG_H */
+
+#include <memory.h>
+
+#include "spooky-c.h"
+
+// SC_CONST: a constant which:
+//  * is not zero
+//  * is odd
+//  * is a not-very-regular mix of 1's and 0's
+//  * does not need any other special mathematical properties
+#define SC_CONST 0xdeadbeefdeadbeefLL
+
+static inline uint64_t rot64(uint64_t x, int k)
+{
+	return (x << k) | (x >> (64 - k)); // rotate x left by k bits; every call in this file passes a constant in 1..63 (k == 0 would shift by 64)
+}
+
+//
+// This is used if the input is 96 bytes long or longer.
+//
+// The internal state is fully overwritten every 96 bytes.
+// Every input bit appears to cause at least 128 bits of entropy
+// before 96 other bytes are combined, when run forward or backward
+//   For every input bit,
+//   Two inputs differing in just that input bit
+//   Where "differ" means xor or subtraction
+//   And the base value is random
+//   When run forward or backwards one Mix
+// I tried 3 pairs of each; they all differed by at least 212 bits.
+//
+static inline void mix
+(
+	const uint64_t *data,
+	uint64_t *s0, uint64_t *s1, uint64_t *s2,  uint64_t *s3,
+	uint64_t *s4, uint64_t *s5, uint64_t *s6,  uint64_t *s7,
+	uint64_t *s8, uint64_t *s9, uint64_t *s10, uint64_t *s11
+)
+{
+	*s0 += data[0];		*s2 ^= *s10;	*s11 ^= *s0;	*s0 = rot64(*s0, 11);	*s11 += *s1;
+	*s1 += data[1];		*s3 ^= *s11;	*s0 ^= *s1;		*s1 = rot64(*s1, 32);	*s0 += *s2;
+	*s2 += data[2];		*s4 ^= *s0;		*s1 ^= *s2;		*s2 = rot64(*s2, 43);	*s1 += *s3;
+	*s3 += data[3];		*s5 ^= *s1;		*s2 ^= *s3;		*s3 = rot64(*s3, 31);	*s2 += *s4;
+	*s4 += data[4];		*s6 ^= *s2;		*s3 ^= *s4;		*s4 = rot64(*s4, 17);	*s3 += *s5;
+	*s5 += data[5];		*s7 ^= *s3;		*s4 ^= *s5;		*s5 = rot64(*s5, 28);	*s4 += *s6;
+	*s6 += data[6];		*s8 ^= *s4;		*s5 ^= *s6;		*s6 = rot64(*s6, 39);	*s5 += *s7;
+	*s7 += data[7];		*s9 ^= *s5;		*s6 ^= *s7;		*s7 = rot64(*s7, 57);	*s6 += *s8;
+	*s8 += data[8];		*s10 ^= *s6;	*s7 ^= *s8;		*s8 = rot64(*s8, 55);	*s7 += *s9;
+	*s9 += data[9];		*s11 ^= *s7;	*s8 ^= *s9;		*s9 = rot64(*s9, 54);	*s8 += *s10;
+	*s10 += data[10];	*s0 ^= *s8;		*s9 ^= *s10;	*s10 = rot64(*s10, 22);	*s9 += *s11;
+	*s11 += data[11];	*s1 ^= *s9;		*s10 ^= *s11;	*s11 = rot64(*s11, 46);	*s10 += *s0;
+}
+
+//
+// Mix all 12 inputs together so that h0, h1 are a hash of them all.
+//
+// For two inputs differing in just the input bits
+// Where "differ" means xor or subtraction
+// And the base value is random, or a counting value starting at that bit
+// The final result will have each bit of h0, h1 flip
+// For every input bit,
+// with probability 50 +- .3%
+// For every pair of input bits,
+// with probability 50 +- 3%
+//
+// This does not rely on the last Mix() call having already mixed some.
+// Two iterations was almost good enough for a 64-bit result, but a
+// 128-bit result is reported, so End() does three iterations.
+//
+static inline void endPartial
+(
+	uint64_t *h0, uint64_t *h1, uint64_t *h2,  uint64_t *h3,
+	uint64_t *h4, uint64_t *h5, uint64_t *h6,  uint64_t *h7,
+	uint64_t *h8, uint64_t *h9, uint64_t *h10, uint64_t *h11
+)
+{
+	*h11+= *h1;		*h2 ^= *h11;	*h1 = rot64(*h1, 44);
+	*h0 += *h2;		*h3 ^= *h0;		*h2 = rot64(*h2, 15);
+	*h1 += *h3;		*h4 ^= *h1;		*h3 = rot64(*h3, 34);
+	*h2 += *h4;		*h5 ^= *h2;		*h4 = rot64(*h4, 21);
+	*h3 += *h5;		*h6 ^= *h3;		*h5 = rot64(*h5, 38);
+	*h4 += *h6;		*h7 ^= *h4;		*h6 = rot64(*h6, 33);
+	*h5 += *h7;		*h8 ^= *h5;		*h7 = rot64(*h7, 10);
+	*h6 += *h8;		*h9 ^= *h6;		*h8 = rot64(*h8, 13);
+	*h7 += *h9;		*h10^= *h7;		*h9 = rot64(*h9, 38);
+	*h8 += *h10;	*h11^= *h8;		*h10= rot64(*h10, 53);
+	*h9 += *h11;	*h0 ^= *h9;		*h11= rot64(*h11, 42);
+	*h10+= *h0;		*h1 ^= *h10;	*h0 = rot64(*h0, 54);
+}
+
+static inline void end
+(
+	uint64_t *h0,	uint64_t *h1,	uint64_t *h2,	uint64_t *h3,
+	uint64_t *h4,	uint64_t *h5,	uint64_t *h6,	uint64_t *h7,
+	uint64_t *h8,	uint64_t *h9,	uint64_t *h10,	uint64_t *h11
+)
+{
+	endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11);
+	endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11);
+	endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11);
+}
+
+//
+// The goal is for each bit of the input to expand into 128 bits of
+//   apparent entropy before it is fully overwritten.
+// n trials both set and cleared at least m bits of h0 h1 h2 h3
+//   n: 2   m: 29
+//   n: 3   m: 46
+//   n: 4   m: 57
+//   n: 5   m: 107
+//   n: 6   m: 146
+//   n: 7   m: 152
+// when run forwards or backwards
+// for all 1-bit and 2-bit diffs
+// with diffs defined by either xor or subtraction
+// with a base of all zeros plus a counter, or plus another bit, or random
+//
+static inline void short_mix
+(
+	uint64_t *h0,
+	uint64_t *h1,
+	uint64_t *h2,
+	uint64_t *h3
+)
+{
+	*h2 = rot64(*h2, 50);	*h2 += *h3;  *h0 ^= *h2;
+	*h3 = rot64(*h3, 52);	*h3 += *h0;  *h1 ^= *h3;
+	*h0 = rot64(*h0, 30);	*h0 += *h1;  *h2 ^= *h0;
+	*h1 = rot64(*h1, 41);	*h1 += *h2;  *h3 ^= *h1;
+	*h2 = rot64(*h2, 54);	*h2 += *h3;  *h0 ^= *h2;
+	*h3 = rot64(*h3, 48);	*h3 += *h0;  *h1 ^= *h3;
+	*h0 = rot64(*h0, 38);	*h0 += *h1;  *h2 ^= *h0;
+	*h1 = rot64(*h1, 37);	*h1 += *h2;  *h3 ^= *h1;
+	*h2 = rot64(*h2, 62);	*h2 += *h3;  *h0 ^= *h2;
+	*h3 = rot64(*h3, 34);	*h3 += *h0;  *h1 ^= *h3;
+	*h0 = rot64(*h0, 5);	*h0 += *h1;  *h2 ^= *h0;
+	*h1 = rot64(*h1, 36);	*h1 += *h2;  *h3 ^= *h1;
+}
+
+//
+// Mix all 4 inputs together so that h0, h1 are a hash of them all.
+//
+// For two inputs differing in just the input bits
+// Where "differ" means xor or subtraction
+// And the base value is random, or a counting value starting at that bit
+// The final result will have each bit of h0, h1 flip
+// For every input bit,
+// with probability 50 +- .3% (it is probably better than that)
+// For every pair of input bits,
+// with probability 50 +- .75% (the worst case is approximately that)
+//
+static inline void short_end
+(
+	uint64_t *h0,
+	uint64_t *h1,
+	uint64_t *h2,
+	uint64_t *h3
+)
+{
+	*h3 ^= *h2;  *h2 = rot64(*h2, 15);  *h3 += *h2;
+	*h0 ^= *h3;  *h3 = rot64(*h3, 52);  *h0 += *h3;
+	*h1 ^= *h0;  *h0 = rot64(*h0, 26);  *h1 += *h0;
+	*h2 ^= *h1;  *h1 = rot64(*h1, 51);  *h2 += *h1;
+	*h3 ^= *h2;  *h2 = rot64(*h2, 28);  *h3 += *h2;
+	*h0 ^= *h3;  *h3 = rot64(*h3, 9);   *h0 += *h3;
+	*h1 ^= *h0;  *h0 = rot64(*h0, 47);  *h1 += *h0;
+	*h2 ^= *h1;  *h1 = rot64(*h1, 54);  *h2 += *h1;
+	*h3 ^= *h2;  *h2 = rot64(*h2, 32);  *h3 += *h2;
+	*h0 ^= *h3;  *h3 = rot64(*h3, 25);  *h0 += *h3;
+	*h1 ^= *h0;  *h0 = rot64(*h0, 63);  *h1 += *h0;
+}
+
+void spooky_shorthash
+(
+	const void *message,
+	size_t length,
+	uint64_t *hash1,
+	uint64_t *hash2
+)
+{
+	uint64_t buf[2 * SC_NUMVARS];
+	union
+	{
+		const uint8_t *p8;
+		uint32_t *p32;
+		uint64_t *p64;
+		size_t i;
+	} u;
+	size_t remainder;
+	uint64_t a, b, c, d;
+	u.p8 = (const uint8_t *)message;
+
+	if (!ALLOW_UNALIGNED_READS && (u.i & 0x7))
+	{
+		memcpy(buf, message, length);
+		u.p64 = buf;
+	}
+
+	remainder = length % 32;
+	a = *hash1;
+	b = *hash2;
+	c = SC_CONST;
+	d = SC_CONST;
+
+	if (length > 15)
+	{
+		const uint64_t *endp = u.p64 + (length/32)*4;
+
+		// handle all complete sets of 32 bytes
+		for (; u.p64 < endp; u.p64 += 4)
+		{
+			c += u.p64[0];
+			d += u.p64[1];
+			short_mix(&a, &b, &c, &d);
+			a += u.p64[2];
+			b += u.p64[3];
+		}
+
+		// Handle the case of 16+ remaining bytes.
+		if (remainder >= 16)
+		{
+			c += u.p64[0];
+			d += u.p64[1];
+			short_mix(&a, &b, &c, &d);
+			u.p64 += 2;
+			remainder -= 16;
+		}
+	}
+
+	// Handle the last 0..15 bytes, and its length
+	d += ((uint64_t)length) << 56;
+	switch (remainder)
+	{
+		case 15:
+			d += ((uint64_t)u.p8[14]) << 48;
+		case 14:
+			d += ((uint64_t)u.p8[13]) << 40;
+		case 13:
+			d += ((uint64_t)u.p8[12]) << 32;
+		case 12:
+			d += u.p32[2];
+			c += u.p64[0];
+			break;
+		case 11:
+			d += ((uint64_t)u.p8[10]) << 16;
+		case 10:
+			d += ((uint64_t)u.p8[9]) << 8;
+		case 9:
+			d += (uint64_t)u.p8[8];
+		case 8:
+			c += u.p64[0];
+			break;
+		case 7:
+			c += ((uint64_t)u.p8[6]) << 48;
+		case 6:
+			c += ((uint64_t)u.p8[5]) << 40;
+		case 5:
+			c += ((uint64_t)u.p8[4]) << 32;
+		case 4:
+			c += u.p32[0];
+			break;
+		case 3:
+			c += ((uint64_t)u.p8[2]) << 16;
+		case 2:
+			c += ((uint64_t)u.p8[1]) << 8;
+		case 1:
+			c += (uint64_t)u.p8[0];
+			break;
+		case 0:
+			c += SC_CONST;
+			d += SC_CONST;
+	}
+	short_end(&a, &b, &c, &d);
+	*hash1 = a;
+	*hash2 = b;
+}
+
+void spooky_init // resets an incremental hashing state and stores the two seeds
+(
+	struct spooky_state *state,
+	uint64_t seed1,
+	uint64_t seed2
+)
+{
+	state->m_length = 0; // no bytes consumed yet
+	state->m_remainder = 0; // nothing buffered in m_data yet
+	state->m_state[0] = seed1; // only slots 0 and 1 are written here; m_state[2..11] and m_data are left untouched
+	state->m_state[1] = seed2;
+}
+
+void spooky_update
+(
+	struct spooky_state *state,
+	const void *message,
+	size_t length
+)
+{
+	uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+	size_t newLength = length + state->m_remainder;
+	uint8_t remainder;
+	union
+	{
+		const uint8_t *p8;
+		uint64_t *p64;
+		size_t i;
+	} u;
+	const uint64_t *endp;
+
+	// Is this message fragment too short?  If it is, stuff it away.
+	if (newLength < SC_BUFSIZE)
+	{
+		memcpy(&((uint8_t *)state->m_data)[state->m_remainder], message, length);
+		state->m_length = length + state->m_length;
+		state->m_remainder = (uint8_t)newLength;
+		return;
+	}
+
+	// init the variables
+	if (state->m_length < SC_BUFSIZE)
+	{
+		h0 = h3 = h6 = h9  = state->m_state[0];
+		h1 = h4 = h7 = h10 = state->m_state[1];
+		h2 = h5 = h8 = h11 = SC_CONST;
+	}
+	else
+	{
+		h0 = state->m_state[0];
+		h1 = state->m_state[1];
+		h2 = state->m_state[2];
+		h3 = state->m_state[3];
+		h4 = state->m_state[4];
+		h5 = state->m_state[5];
+		h6 = state->m_state[6];
+		h7 = state->m_state[7];
+		h8 = state->m_state[8];
+		h9 = state->m_state[9];
+		h10 = state->m_state[10];
+		h11 = state->m_state[11];
+	}
+	state->m_length = length + state->m_length;
+
+	// if we've got anything stuffed away, use it now
+	if (state->m_remainder)
+	{
+		uint8_t prefix = SC_BUFSIZE-state->m_remainder;
+		memcpy(&(((uint8_t *)state->m_data)[state->m_remainder]), message, prefix);
+		u.p64 = state->m_data;
+		mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+		mix(&u.p64[SC_NUMVARS], &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+		u.p8 = ((const uint8_t *)message) + prefix;
+		length -= prefix;
+	}
+	else
+	{
+		u.p8 = (const uint8_t *)message;
+	}
+
+	// handle all whole blocks of SC_BLOCKSIZE bytes
+	endp = u.p64 + (length/SC_BLOCKSIZE)*SC_NUMVARS;
+	remainder = (uint8_t)(length-((const uint8_t *)endp - u.p8));
+	if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0)
+	{
+		while (u.p64 < endp)
+		{
+			mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+			u.p64 += SC_NUMVARS;
+		}
+	}
+	else
+	{
+		while (u.p64 < endp)
+		{
+			memcpy(state->m_data, u.p8, SC_BLOCKSIZE);
+			mix(state->m_data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+			u.p64 += SC_NUMVARS;
+		}
+	}
+
+	// stuff away the last few bytes
+	state->m_remainder = remainder;
+	memcpy(state->m_data, endp, remainder);
+
+	// stuff away the variables
+	state->m_state[0] = h0;
+	state->m_state[1] = h1;
+	state->m_state[2] = h2;
+	state->m_state[3] = h3;
+	state->m_state[4] = h4;
+	state->m_state[5] = h5;
+	state->m_state[6] = h6;
+	state->m_state[7] = h7;
+	state->m_state[8] = h8;
+	state->m_state[9] = h9;
+	state->m_state[10] = h10;
+	state->m_state[11] = h11;
+}
+
+void spooky_final
+(
+	struct spooky_state *state,
+	uint64_t *hash1,
+	uint64_t *hash2
+)
+{
+	uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+	const uint64_t *data = (const uint64_t *)state->m_data;
+	uint8_t remainder = state->m_remainder;
+
+	// init the variables
+	if (state->m_length < SC_BUFSIZE)
+	{
+		spooky_shorthash(state->m_data, state->m_length, hash1, hash2);
+		return;
+	}
+
+	h0 = state->m_state[0];
+	h1 = state->m_state[1];
+	h2 = state->m_state[2];
+	h3 = state->m_state[3];
+	h4 = state->m_state[4];
+	h5 = state->m_state[5];
+	h6 = state->m_state[6];
+	h7 = state->m_state[7];
+	h8 = state->m_state[8];
+	h9 = state->m_state[9];
+	h10 = state->m_state[10];
+	h11 = state->m_state[11];
+
+	if (remainder >= SC_BLOCKSIZE)
+	{
+		// m_data can contain two blocks; handle any whole first block
+		mix(data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+		data += SC_NUMVARS;
+		remainder -= SC_BLOCKSIZE;
+	}
+
+	// mix in the last partial block, and the length mod SC_BLOCKSIZE
+	memset(&((uint8_t *)data)[remainder], 0, (SC_BLOCKSIZE-remainder));
+
+	((uint8_t *)data)[SC_BLOCKSIZE-1] = remainder;
+	mix(data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+
+	// do some final mixing
+	end(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+
+	*hash1 = h0;
+	*hash2 = h1;
+}
+
+void spooky_hash128 // one-shot 128-bit hash: *hash1 / *hash2 carry the two seeds in and the two result halves out
+(
+	const void *message,
+	size_t length,
+	uint64_t *hash1,
+	uint64_t *hash2
+)
+{
+	uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+	uint64_t buf[SC_NUMVARS];
+	uint64_t *endp;
+	union
+	{
+		const uint8_t *p8;
+		uint64_t *p64;
+		uintptr_t i;
+	} u;
+	size_t remainder;
+
+	if (length < SC_BUFSIZE) // inputs shorter than SC_BUFSIZE bytes go through the short hash
+	{
+		spooky_shorthash(message, length, hash1, hash2);
+		return;
+	}
+
+	h0 = h3 = h6 = h9  = *hash1;
+	h1 = h4 = h7 = h10 = *hash2;
+	h2 = h5 = h8 = h11 = SC_CONST;
+
+	u.p8 = (const uint8_t *)message;
+	endp = u.p64 + (length/SC_BLOCKSIZE)*SC_NUMVARS;
+
+	// handle all whole blocks of SC_BLOCKSIZE bytes
+	if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0)
+	{
+		while (u.p64 < endp)
+		{
+			mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+			u.p64 += SC_NUMVARS;
+		}
+	}
+	else
+	{
+		while (u.p64 < endp)
+		{
+			memcpy(buf, u.p64, SC_BLOCKSIZE);
+			mix(buf, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+			u.p64 += SC_NUMVARS;
+		}
+	}
+
+	// handle the last partial block of SC_BLOCKSIZE bytes: zero-pad it and store the tail length in its final byte
+	remainder = (length - ((const uint8_t *)endp-(const uint8_t *)message));
+	memcpy(buf, endp, remainder);
+	memset(((uint8_t *)buf)+remainder, 0, SC_BLOCKSIZE-remainder);
+	((uint8_t *)buf)[SC_BLOCKSIZE-1] = remainder;
+	mix(buf, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); // fold the padded tail into the state, as spooky_final does; previously buf was built but never used, so the trailing bytes did not affect the hash
+	// do some final mixing
+	end(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11);
+	*hash1 = h0;
+	*hash2 = h1;
+}
+
+uint64_t spooky_hash64 // 64-bit convenience wrapper around spooky_hash128
+(
+	const void *message,
+	size_t length,
+	uint64_t seed
+)
+{
+	uint64_t hash1 = seed; // both 64-bit seeds of the 128-bit hash are set to seed
+	spooky_hash128(message, length, &hash1, &seed); // the by-value parameter seed doubles as storage for the second result half
+	return hash1; // only the first 64 bits of the 128-bit result are returned
+}
+
+uint32_t spooky_hash32 // 32-bit convenience wrapper around spooky_hash128
+(
+	const void *message,
+	size_t length,
+	uint32_t seed
+)
+{
+	uint64_t hash1 = seed, hash2 = seed; // the 32-bit seed is widened and used for both 64-bit seeds
+	spooky_hash128(message, length, &hash1, &hash2);
+	return (uint32_t)hash1; // low 32 bits of the first result half
+}
\ No newline at end of file
diff --git a/backend/src/hash/spooky-c.h b/backend/src/hash/spooky-c.h
new file mode 100644
index 00000000..9cd60e05
--- /dev/null
+++ b/backend/src/hash/spooky-c.h
@@ -0,0 +1,94 @@
+// SpookyHash: a 128-bit noncryptographic hash function
+// By Bob Jenkins, public domain
+//   Oct 31 2010: alpha, framework + SpookyHash::Mix appears right
+//   Oct 11 2011: C version ported by Andi Kleen (andikleen@github)
+//   Oct 31 2011: alpha again, Mix only good to 2^^69 but rest appears right
+//   Dec 31 2011: beta, improved Mix, tested it for 2-bit deltas
+//   Feb  2 2012: production, same bits as beta
+//   Feb  5 2012: adjusted definitions of uint* to be more portable
+//   Mar 30 2012: 3 bytes/cycle, not 4.  Alpha was 4 but wasn't thorough enough.
+//   Apr 27 2012: C version updated by Ziga Zupanec ziga.zupanec@gmail.com (agiz@github)
+//
+// Up to 3 bytes/cycle for long messages.  Reasonably fast for short messages.
+// All 1 or 2 bit deltas achieve avalanche within 1% bias per output bit.
+//
+// This was developed for and tested on 64-bit x86-compatible processors.
+// It assumes the processor is little-endian.  There is a macro
+// controlling whether unaligned reads are allowed (by default they are).
+// This should be an equally good hash on big-endian machines, but it will
+// compute different results on them than on little-endian machines.
+//
+// Google's CityHash has similar specs to SpookyHash, and CityHash is faster
+// on some platforms.  MD4 and MD5 also have similar specs, but they are orders
+// of magnitude slower.  CRCs are two or more times slower, but unlike
+// SpookyHash, they have nice math for combining the CRCs of pieces to form
+// the CRCs of wholes.  There are also cryptographic hashes, but those are even
+// slower than MD5.
+//
+#pragma once	// guard: a second inclusion would redefine struct spooky_state
+#include <stdint.h>
+#include <stddef.h>
+
+#define SC_NUMVARS		12			// 64-bit words mixed per block
+#define SC_BLOCKSIZE	(8 * SC_NUMVARS)	// bytes per block
+#define SC_BUFSIZE		(2 * SC_BLOCKSIZE)	// spooky_hash128() uses spooky_shorthash() below this length
+
+// State carried between the streaming calls (spooky_init/update/final).
+struct spooky_state {
+	uint64_t m_data[2 * SC_NUMVARS];	// SC_BUFSIZE bytes; presumably input not yet mixed -- confirm in spooky-c.c
+	uint64_t m_state[SC_NUMVARS];	// SC_NUMVARS words; presumably the running h0..h11
+	size_t m_length;		// presumably total bytes fed so far -- confirm
+	unsigned char m_remainder;	// presumably byte count pending in m_data -- confirm
+};
+
+// Hash used for short inputs: spooky_hash128() hands the message over to
+// this function unchanged when length < SC_BUFSIZE.
+//   message, length  bytes to hash and their count
+//   hash1, hash2     forwarded as-is from spooky_hash128(); presumably seed
+//                    on entry and result on exit -- definition not shown here
+void spooky_shorthash(const void *message, size_t length,
+                      uint64_t *hash1, uint64_t *hash2);
+
+// Streaming interface operating on a caller-provided struct spooky_state.
+// NOTE(review): the definitions are not part of this header; everything
+// marked "presumably" below is inferred from the signatures alone and
+// should be confirmed against spooky-c.c.
+
+// Presumably begins a new hash in `state`.
+//   state         state object supplied by the caller
+//   hash1, hash2  passed by value; presumably the two 64-bit seeds
+void spooky_init(struct spooky_state *state, uint64_t hash1, uint64_t hash2);
+
+// Presumably feeds more input into `state`.
+//   state     state object supplied by the caller
+//   msg, len  presumably the bytes to add and their count
+void spooky_update(struct spooky_state *state, const void *msg, size_t len);
+
+// Presumably produces the hash of all input fed so far.
+//   state         state object supplied by the caller
+//   hash1, hash2  output pointers; presumably receive the two 64-bit
+//                 halves of the result
+void spooky_final(struct spooky_state *state, uint64_t *hash1, uint64_t *hash2);
+
+// One-shot 128-bit hash of `length` bytes at `message`.
+//   message       bytes to hash
+//   length        number of bytes at `message`
+//   hash1, hash2  in/out: read as the two 64-bit seeds on entry and
+//                 overwritten with the two halves of the result.
+// Inputs shorter than SC_BUFSIZE are delegated to spooky_shorthash().
+void spooky_hash128(const void *message, size_t length,
+                    uint64_t *hash1, uint64_t *hash2);
+
+// 64-bit convenience wrapper around spooky_hash128(): `seed` initialises
+// both 64-bit seed slots and the first result slot is returned.
+//   message  bytes to hash
+//   length   number of bytes at `message`
+//   seed     caller-chosen seed
+uint64_t spooky_hash64(const void *message, size_t length, uint64_t seed);
+
+// 32-bit variant: `seed` is widened to 64 bits, used the same way, and the
+// low 32 bits of the first result slot are returned.
+//   message  bytes to hash
+//   length   number of bytes at `message`
+//   seed     caller-chosen seed
+uint32_t spooky_hash32(const void *message, size_t length, uint32_t seed);
\ No newline at end of file
diff --git a/backend/src/hash/xxhash.cpp b/backend/src/hash/xxhash.cpp
new file mode 100644
index 00000000..267cbe79
--- /dev/null
+++ b/backend/src/hash/xxhash.cpp
@@ -0,0 +1,5 @@
+/* Translation unit that compiles the xxHash implementation from xxhash.h. */
+#define XXH_STATIC_LINKING_ONLY   /* expose the advanced declarations of xxhash.h */
+#define XXH_IMPLEMENTATION   /* request the function definitions, not only the prototypes */
+
+#include "xxhash.h"
diff --git a/backend/src/hash/xxhash.h b/backend/src/hash/xxhash.h
new file mode 100644
index 00000000..2a70a8bc
--- /dev/null
+++ b/backend/src/hash/xxhash.h
@@ -0,0 +1,6290 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2021 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*!
+ * @mainpage xxHash
+ *
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
+ * limits.
+ *
+ * It is proposed in four flavors, in three families:
+ * 1. @ref XXH32_family
+ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
+ *     32-bit and 64-bit systems.
+ * 2. @ref XXH64_family
+ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
+ *     64-bit systems (but _not_ 32-bit systems).
+ * 3. @ref XXH3_family
+ *   - Modern 64-bit and 128-bit hash function family which features improved
+ *     strength and performance across the board, especially on smaller data.
+ *     It benefits greatly from SIMD and 64-bit without requiring it.
+ *
+ * Benchmarks
+ * ---
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
+ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
+ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
+ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
+ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
+ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
+ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
+ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
+ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
+ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
+ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
+ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
+ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
+ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
+ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
+ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
+ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
+ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
+ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
+ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
+ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
+ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
+ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
+ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
+ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
+ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
+ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *         https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ *   XXH64_hash_t hashFile(FILE* f)
+ *   {
+ *       // Allocate a state struct. Do not just use malloc() or new.
+ *       XXH3_state_t* state = XXH3_createState();
+ *       assert(state != NULL && "Out of memory!");
+ *       // Reset the state to start a new hashing session.
+ *       XXH3_64bits_reset(state);
+ *       char buffer[4096];
+ *       size_t count;
+ *       // Read the file in chunks
+ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
+ *           // Run update() as many times as necessary to process the data
+ *           XXH3_64bits_update(state, buffer, count);
+ *       }
+ *       // Retrieve the finalized hash. This will not change the state.
+ *       XXH64_hash_t result = XXH3_64bits_digest(state);
+ *       // Free the state. Do not use free().
+ *       XXH3_freeState(state);
+ *       return result;
+ *   }
+ * @endcode
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Exposes the implementation and marks all functions as `inline`.
+ *
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#  define XXH_INLINE_ALL
+#  undef XXH_INLINE_ALL
+/*!
+ * @brief Exposes the implementation without marking functions as inline.
+ */
+#  define XXH_PRIVATE_API
+#  undef XXH_PRIVATE_API
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+    * such as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+   /* Before that, we unconditionally #undef all symbols,
+    * in case they were already defined with XXH_NAMESPACE.
+    * They will then be redefined for XXH_INLINE_ALL
+    */
+#  undef XXH_versionNumber
+    /* XXH32 */
+#  undef XXH32
+#  undef XXH32_createState
+#  undef XXH32_freeState
+#  undef XXH32_reset
+#  undef XXH32_update
+#  undef XXH32_digest
+#  undef XXH32_copyState
+#  undef XXH32_canonicalFromHash
+#  undef XXH32_hashFromCanonical
+    /* XXH64 */
+#  undef XXH64
+#  undef XXH64_createState
+#  undef XXH64_freeState
+#  undef XXH64_reset
+#  undef XXH64_update
+#  undef XXH64_digest
+#  undef XXH64_copyState
+#  undef XXH64_canonicalFromHash
+#  undef XXH64_hashFromCanonical
+    /* XXH3_64bits */
+#  undef XXH3_64bits
+#  undef XXH3_64bits_withSecret
+#  undef XXH3_64bits_withSeed
+#  undef XXH3_64bits_withSecretandSeed
+#  undef XXH3_createState
+#  undef XXH3_freeState
+#  undef XXH3_copyState
+#  undef XXH3_64bits_reset
+#  undef XXH3_64bits_reset_withSeed
+#  undef XXH3_64bits_reset_withSecret
+#  undef XXH3_64bits_update
+#  undef XXH3_64bits_digest
+#  undef XXH3_generateSecret
+    /* XXH3_128bits */
+#  undef XXH128
+#  undef XXH3_128bits
+#  undef XXH3_128bits_withSeed
+#  undef XXH3_128bits_withSecret
+#  undef XXH3_128bits_reset
+#  undef XXH3_128bits_reset_withSeed
+#  undef XXH3_128bits_reset_withSecret
+#  undef XXH3_128bits_reset_withSecretandSeed
+#  undef XXH3_128bits_update
+#  undef XXH3_128bits_digest
+#  undef XXH128_isEqual
+#  undef XXH128_cmp
+#  undef XXH128_canonicalFromHash
+#  undef XXH128_hashFromCanonical
+    /* Finally, free the namespace itself */
+#  undef XXH_NAMESPACE
+
+    /* employ the namespace for XXH_INLINE_ALL */
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols,
+    * but they must nonetheless be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and has a more dispersed impact.
+    * Meanwhile, renaming can be achieved in a single place.
+    */
+#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows (repeats, token for token, the XXH_PUBLIC_API selection made at the start of the stable API section) */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT   /* NOTE(review): tests the macro's value, not defined(XXH_IMPORT) -- confirm an empty/0 definition is not expected */
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF  __attribute__((const))   /* GCC `const`: result depends only on the argument values */
+# define XXH_PUREF   __attribute__((pure))    /* GCC `pure`: no side effects, but may read memory */
+# define XXH_MALLOCF __attribute__((malloc))  /* GCC `malloc`: returned pointer aliases no other object */
+#else
+# define XXH_CONSTF  /* disable */
+# define XXH_PUREF   /* disable */
+# define XXH_MALLOCF /* disable */
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  1
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * See @ref single_shot_example "Single Shot Example" for an example.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ * @see streaming_example at the top of @ref xxhash.h for an example.
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup XXH3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
+ * at competitive speeds, even without vector support. Further details are
+ * explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR
+ * macro. For the x86 family, an automatic dispatcher is included separately
+ * in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief 64-bit unseeded variant of XXH3.
+ *
+ * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ *    XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see
+ *    XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length);
+
+/*!
+ * @brief 64-bit seeded variant of XXH3
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the `seed` value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * @param input The data to hash
+ * @param length The length
+ * @param seed The 64-bit seed to alter the state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief 64-bit variant of XXH3 with a custom "secret".
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing "XXH3_generateSecret()" instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with default parameters.
+ * digest will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*!
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Unseeded 128-bit variant of XXH3
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ *    XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
+ * @see
+ *    XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see
+ *    XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* The following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @return: >0 if *h128_1  > *h128_2
+ *          =0 if *h128_1 == *h128_2
+ *          <0 if *h128_1  < *h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway. Do not read or write to it. */
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64bits_update(), XXH3_128bits_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< NOTE(review): presumably a "seeded mode" flag (judging by the name), not a reserved field - confirm. Also pads on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number of stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+
+
+/*!
+ * simple alias to pre-selected XXH3_128bits variant
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * XXH3_generateSecret():
+ *
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argc != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, s);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The seed to seed the state.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(const void* data, size_t len,
+                              const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(const void* input, size_t length,
+                               const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
+                                    const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
+                                     const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *  .
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on exact architecture, a jmp may be preferable.
+ *
+ * This setting is only possibly making a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* Prefer the aligned(1) typedef read (method 1) on GNU-compatible compilers.
+    * Exception: ARM < v7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting
+    * with it, so the macro stays undefined there and the memcpy path (unaligned loads) is used. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 when GCC/Clang signal -Os or -Oz through __OPTIMIZE_SIZE__, otherwise 0 */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif /* XXH_SIZE_OPT */
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* skip the alignment check when size-optimizing, or on x86, aarch64, or arm with unaligned access */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* MSVC target macros */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else   /* any other target keeps the aligned fast path */
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif /* XXH_FORCE_ALIGN_CHECK */
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* size-optimized build, -O0, or -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif /* XXH_NO_INLINE_HINTS */
+
+#ifndef XXH32_ENDJMP
+/* 0 keeps the compact loop-based tail in XXH32_finalize(); generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif /* XXH32_ENDJMP */
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* streaming API disabled: the allocation wrappers are not defined at all */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }   /* stub: never allocates */
+static void XXH_free(void* p) { (void)p; }   /* stub: nothing to release */
+
+#else   /* default: forward to the C library allocator */
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STREAM / XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Thin wrapper over memcpy(); modify it to use a different copy routine.
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);   /* hands back memcpy()'s own return value */
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C++ or C99: plain `inline` exists */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else   /* none of the above: both macros degrade to plain `static` */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif /* XXH_NO_INLINE_HINTS / compiler selection */
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat: honor the older, unprefixed macro name */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else   /* neither macro supplied: debugging off */
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif /* XXH_DEBUGLEVEL */
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else   /* debugging off: the condition is not evaluated at all */
+#  define XXH_ASSERT(c)   ((void)0)
+#endif /* XXH_DEBUGLEVEL>=1 */
+
+/* note: expands to a do { } while(0) statement, so use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else   /* fallback: a false condition yields an array of size -1, which does not compile; m is unused */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif /* XXH_STATIC_ASSERT */
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#else   /* no GNU-style inline asm: the guard expands to a no-op */
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif /* XXH_COMPILER_GUARD */
+
+#if defined(__GNUC__) || defined(__clang__)   /* same empty-asm guard, but using the "+w" operand constraint */
+#  define XXH_COMPILER_GUARD_W(var) __asm__ __volatile__("" : "+w" (var))
+#else   /* no GNU-style inline asm: the guard expands to a no-op */
+#  define XXH_COMPILER_GUARD_W(var) ((void)0)
+#endif /* XXH_COMPILER_GUARD_W */
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else   /* VMS, or C older than C99: do not rely on <stdint.h> */
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;   /* internal shorthand for the public 32-bit hash type */
+
+#ifdef XXH_OLD_NAMES   /* legacy unprefixed aliases for the internal integer types */
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif /* XXH_OLD_NAMES */
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access (method 2): cast and dereference. Only works on
+ * CPUs which support unaligned memory access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES   /* legacy type name, not used by XXH_read32() below */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;   /* 32-bit type declared with 1-byte alignment */
+    return *((const xxh_unalign32*)ptr);   /* load through the reduced-alignment type */
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 word;   /* filled through XXH_memcpy(), so memPtr needs no particular alignment */
+    XXH_memcpy(&word, memPtr, sizeof(xxh_u32));
+    return word;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * The initializer sets the union's first member to 1; the lowest-addressed
+     * byte is then non-zero only on little endian. Deliberately not static: that is detrimental to performance.
+     */
+    const union { xxh_u32 word; xxh_u8 bytes[4]; } probe = { 1 };
+    return probe.bytes[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)   /* major*100 + minor, e.g. 403 for GCC 4.3 */
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else   /* compiler cannot be queried: report every builtin as unavailable */
+#  define XXH_HAS_BUILTIN(x) 0
+#endif /* __has_builtin */
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else   /* portable shift-or form: expands x and r twice each */
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif /* rotate-left selection */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x >> 24) & 0x000000ff ) |   /* top byte moves to the bottom */
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x << 24) & 0xff000000 );    /* bottom byte moves to the top */
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Tells the read helpers whether a pointer is known to be aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned: XXH_readLE32_align() dereferences the pointer directly */
+    XXH_unaligned /*!< Possibly unaligned: reads go through XXH_readLE32() */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* const src = (const xxh_u8 *)memPtr;
+    return ((xxh_u32)src[3] << 24)   /* last byte is the most significant */
+         | ((xxh_u32)src[2] << 16)
+         | ((xxh_u32)src[1] << 8)
+         |  (xxh_u32)src[0];
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* const src = (const xxh_u8 *)memPtr;
+    return ((xxh_u32)src[0] << 24)   /* first byte is the most significant */
+         | ((xxh_u32)src[1] << 16)
+         | ((xxh_u32)src[2] << 8)
+         |  (xxh_u32)src[3];
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{   const xxh_u32 native = XXH_read32(ptr);   /* word in host byte order */
+    return XXH_CPU_LITTLE_ENDIAN ? native : XXH_swap32(native);
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{   const xxh_u32 native = XXH_read32(ptr);   /* word in host byte order */
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(native) : native;
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align != XXH_unaligned) {   /* aligned: a direct 32-bit load is used */
+        const xxh_u32 raw = *(const xxh_u32*)ptr;
+        return XXH_CPU_LITTLE_ENDIAN ? raw : XXH_swap32(raw);
+    }
+    return XXH_readLE32(ptr);
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }   /* value of the XXH_VERSION_NUMBER macro */
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* macros rather than static const objects, so that they can be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES   /* legacy unprefixed aliases for the 32-bit constants */
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif /* XXH_OLD_NAMES */
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;   /* fold the scaled input word into the lane */
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+     * and it is pointless writing a NEON implementation that is basically the
+     * same speed as scalar for XXH32.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif /* anti-autovectorization guard */
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    xxh_u32 mixed = hash ^ (hash >> 15);   /* xor-shift, multiply, repeat */
+    mixed *= XXH_PRIME32_2;
+    mixed  = mixed ^ (mixed >> 13);
+    mixed *= XXH_PRIME32_3;
+    mixed  = mixed ^ (mixed >> 16);
+    return mixed;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)   /* picks up a variable named `align` from the scope where it is expanded */
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                 \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
+    ptr += 4;                                         \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);   /* a NULL tail is only acceptable when nothing is left to read */
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;   /* only the low four bits of len are honored */
+        while (len >= 4) {   /* whole 32-bit words first */
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {   /* then the remaining 0-3 single bytes */
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {   /* unrolled variant: one switch on the tail length, cases fall through */
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);   /* every value of len&15 has a case above */
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES   /* expose the finalizer helper macros under their legacy names */
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else   /* otherwise drop the helpers so they do not outlive XXH32_finalize() */
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif /* XXH_OLD_NAMES */
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);   /* NULL input is only acceptable for an empty message */
+
+    if (len>=16) {   /* at least one full 16-byte stripe: run the four-lane loop */
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 15;   /* input < limit means 16 more bytes are available */
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {   /* each pass consumes 16 bytes, one 32-bit word per lane */
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);   /* merge the four lanes */
+    } else {   /* short input: no lanes, start from seed + XXH_PRIME32_5 */
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;   /* total length is mixed in, truncated to 32 bits */
+
+    return XXH32_finalize(h32, input, len&15, align);   /* input now points at the unconsumed tail */
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;   /* stack-allocated: reuses the streaming API in one shot */
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else   /* default: dedicated one-shot implementation */
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);   /* generic path: no alignment assumed */
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{   void* const storage = XXH_malloc(sizeof(XXH32_state_t));   /* returned as-is, not initialized here */
+    return (XXH32_state_t*)storage;
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);   /* hands the state back through the XXH_free() wrapper */
+    return XXH_OK;        /* unconditional: no failure is reported from here */
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));   /* byte-wise copy of the whole state struct */
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(XXH32_state_t));   /* zero every field before seeding the lanes */
+    statePtr->v[3] = seed - XXH_PRIME32_1;
+    statePtr->v[2] = seed;
+    statePtr->v[1] = seed + XXH_PRIME32_2;
+    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {   /* NULL input is treated as "nothing to add" */
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;   /* running length, kept in 32 bits */
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));   /* OR-ed in, so it stays set */
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;   /* still less than one stripe buffered: nothing to mix yet */
+        }
+
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);   /* top the buffer up to 16 bytes */
+            {   const xxh_u32* p32 = state->mem32;
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;   /* skip the input bytes that completed the buffer */
+            state->memsize = 0;
+        }
+
+        if (p <= bEnd-16) {   /* at least one whole stripe can be read straight from the input */
+            const xxh_u8* const limit = bEnd - 16;
+
+            do {
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* stash the leftover bytes for the next update or the digest */
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{   /* Hash of everything fed so far; `state` is only read, so streaming may continue afterwards. */
+    xxh_u32 acc = state->total_len_32;
+
+    if (!state->large_len) {
+        /* Fewer than 16 bytes in total: no stripe was consumed, so lane 2 still equals the seed. */
+        acc += state->v[2] + XXH_PRIME32_5;
+    } else {
+        /* At least 16 bytes were seen: fold the four lane accumulators together. */
+        acc += XXH_rotl32(state->v[0], 1);
+        acc += XXH_rotl32(state->v[1], 7);
+        acc += XXH_rotl32(state->v[2], 12);
+        acc += XXH_rotl32(state->v[3], 18);
+    }
+
+    return XXH32_finalize(acc, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @ingroup XXH32_family
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ *
+ * The canonical representation uses big endian convention, the same convention
+ * as human-readable numbers (large digits first).
+ *
+ * This way, hash values can be written into a file or buffer, remaining
+ * comparable across different systems.
+ *
+ * The following functions allow transformation of hash values to and from their
+ * canonical format.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{   /* Store `hash` into `dst` most-significant byte first, whatever the host byte order. */
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    hash = XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(hash) : hash;
+    XXH_memcpy(dst, &hash, sizeof(XXH32_canonical_t));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{   XXH32_hash_t const decoded = XXH_readBE32(src);   /* the canonical form is read as big-endian */
+    return decoded;
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;   /* internal shorthand for the 64-bit hash type */
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64   /* legacy alias, only provided when XXH_OLD_NAMES is defined */
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * No XXH_read64() is defined in this mode: XXH_readLE64 and XXH_readBE64
+ * are implemented directly with byte shifts.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPUs that support unaligned memory access in hardware. */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;   /* plain dereference, native byte order */
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;   /* 64-bit type with alignment lowered to 1 */
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));   /* byte copy, so no alignment is assumed for memPtr */
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403   /* GCC-compatible: use the byte-swap builtin */
+#  define XXH_swap64 __builtin_bswap64
+#else   /* portable fallback: move each of the 8 bytes to its mirrored position */
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |   /* byte 0 -> byte 7 */
+            ((x << 40) & 0x00ff000000000000ULL) |   /* byte 1 -> byte 6 */
+            ((x << 24) & 0x0000ff0000000000ULL) |   /* byte 2 -> byte 5 */
+            ((x << 8)  & 0x000000ff00000000ULL) |   /* byte 3 -> byte 4 */
+            ((x >> 8)  & 0x00000000ff000000ULL) |   /* byte 4 -> byte 3 */
+            ((x >> 24) & 0x0000000000ff0000ULL) |   /* byte 5 -> byte 2 */
+            ((x >> 40) & 0x000000000000ff00ULL) |   /* byte 6 -> byte 1 */
+            ((x >> 56) & 0x00000000000000ffULL);    /* byte 7 -> byte 0 */
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/* Byteshift mode: both readers assemble the value one byte at a time. */
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]                       /* first byte in memory is the least significant */
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]                       /* last byte in memory is the least significant */
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else   /* other modes: native XXH_read64() plus a byte swap when the host order differs */
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{   /* Little-endian 64-bit load; `ptr` is dereferenced directly unless `align` is XXH_unaligned. */
+    if (align==XXH_unaligned) return XXH_readLE64(ptr);
+    {   xxh_u64 const word = *(const xxh_u64*)ptr;
+        return XXH_CPU_LITTLE_ENDIAN ? word : XXH_swap64(word);
+    }
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, so the values can be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES   /* unprefixed legacy spellings of the five constants above */
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif   /* XXH_OLD_NAMES */
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{   /* One lane step: add the scaled input to `acc`, rotate left by 31, then multiply. */
+    xxh_u64 const scaled  = input * XXH_PRIME64_2;
+    xxh_u64 const summed  = acc + scaled;
+    xxh_u64 const rotated = XXH_rotl64(summed, 31);
+    return rotated * XXH_PRIME64_1;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{   /* Fold one lane value into the running hash: XOR in its round, then multiply-add. */
+    xxh_u64 const laneMix = XXH64_round(0, val);
+    xxh_u64 const blended = acc ^ laneMix;
+
+    return blended * XXH_PRIME64_1 + XXH_PRIME64_4;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{   /* Final scramble: xor-shift by 33, 29 and 32 with a multiply after each of the first two. */
+    xxh_u64 const fold33 = hash ^ (hash >> 33);
+    xxh_u64 const mul2   = fold33 * XXH_PRIME64_2;
+    xxh_u64 const fold29 = mul2 ^ (mul2 >> 29);
+    xxh_u64 const mul3   = fold29 * XXH_PRIME64_3;
+
+    return mul3 ^ (mul3 >> 32);
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)   /* expands using a variable named `align` from the calling scope */
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{   /* Only the low 5 bits of `len` are used: at most 31 trailing bytes are read from `ptr`. */
+    size_t left = len & 31;
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    /* Whole 8-byte words first. */
+    for (; left >= 8; left -= 8, ptr += 8) {
+        xxh_u64 const lane = XXH64_round(0, XXH_get64bits(ptr));
+        hash ^= lane;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+    }
+    /* Then at most one 4-byte word. */
+    if (left >= 4) {
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        hash  = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        ptr  += 4;
+        left -= 4;
+    }
+    /* Then the last 0-3 bytes, one at a time. */
+    for (; left > 0; --left, ++ptr) {
+        hash ^= (*ptr) * XXH_PRIME64_5;
+        hash  = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+    }
+    return XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES   /* map the unprefixed legacy names onto the XXH_PROCESS*_64 names */
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else   /* otherwise make sure the XXH_PROCESS*_64 names are not left defined */
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{   /* One-shot XXH64 core: whole 32-byte stripes feed four lanes, the tail goes to XXH64_finalize(). */
+    xxh_u64 acc;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len < 32) {
+        /* Too short for a single stripe: start from the seed alone. */
+        acc = seed + XXH_PRIME64_5;
+    } else {
+        const xxh_u8* const stripesEnd = input + (len & ~(size_t)31);
+        xxh_u64 laneA = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 laneB = seed + XXH_PRIME64_2;
+        xxh_u64 laneC = seed;
+        xxh_u64 laneD = seed - XXH_PRIME64_1;
+
+        do {   /* each pass consumes one 32-byte stripe, 8 bytes per lane */
+            laneA = XXH64_round(laneA, XXH_get64bits(input)); input += 8;
+            laneB = XXH64_round(laneB, XXH_get64bits(input)); input += 8;
+            laneC = XXH64_round(laneC, XXH_get64bits(input)); input += 8;
+            laneD = XXH64_round(laneD, XXH_get64bits(input)); input += 8;
+        } while (input < stripesEnd);
+
+        /* Combine the four lanes into a single accumulator. */
+        acc = XXH_rotl64(laneA, 1) + XXH_rotl64(laneB, 7) + XXH_rotl64(laneC, 12) + XXH_rotl64(laneD, 18);
+        acc = XXH64_mergeRound(acc, laneA);
+        acc = XXH64_mergeRound(acc, laneB);
+        acc = XXH64_mergeRound(acc, laneC);
+        acc = XXH64_mergeRound(acc, laneD);
+    }
+
+    acc += (xxh_u64) len;
+
+    return XXH64_finalize(acc, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+{   /* One-shot 64-bit hash of `len` bytes at `input`, keyed by `seed`. */
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Streaming is compiled in and XXH_SIZE_OPT >= 2: reuse the streaming API on a local state. */
+    XXH64_state_t scratch;
+    XXH64_reset(&scratch, seed);
+    XXH64_update(&scratch, (const xxh_u8*)input, len);
+    return XXH64_digest(&scratch);
+#else
+    const xxh_u8* const bytes = (const xxh_u8*)input;
+
+    /* With alignment checking enabled, 8-byte aligned input takes the aligned-read variant. */
+    if (XXH_FORCE_ALIGN_CHECK && (((size_t)input) & 7) == 0)
+        return XXH64_endian_align(bytes, len, seed, XXH_aligned);
+
+    return XXH64_endian_align(bytes, len, seed, XXH_unaligned);
+#endif
+}
+
+/*******   Hash Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{   size_t const stateBytes = sizeof(XXH64_state_t);   /* storage is returned as allocated; no reset is done here */
+    return (XXH64_state_t*)XXH_malloc(stateBytes);
+}
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{   XXH_errorcode const status = XXH_OK;   /* no failure is ever reported */
+    XXH_free(statePtr);                    /* pointer is passed through to XXH_free() unchecked */
+    return status;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{   size_t const stateBytes = sizeof(XXH64_state_t);   /* byte-for-byte clone of the whole struct */
+    XXH_memcpy(dstState, srcState, stateBytes);
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+{   /* Begin a new streaming hash keyed by `seed`: clear everything, then prime the four lanes. */
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(XXH64_state_t));
+    statePtr->v[2] = seed;                     /* read back as the seed by XXH64_digest() */
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+{   /* Streaming step: absorb `len` bytes of `input` into `state`. Every path returns XXH_OK. */
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);   /* NULL input is only accepted together with a zero length */
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* still short of one 32-byte stripe: just buffer the input */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* top up the buffered bytes to a full stripe and consume it */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;
+            state->memsize = 0;
+        }
+        /* Test the remaining byte count: forming p+32 first could step beyond the end of `input`. */
+        if ((size_t)(bEnd - p) >= 32) {
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* fewer than 32 bytes remain: park them for the next update or the digest */
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+{   /* Hash of all bytes absorbed so far; `state` is only read, never modified. */
+    xxh_u64 acc;
+    int lane;
+
+    if (state->total_len < 32) {
+        /* No full stripe was ever consumed, so v[2] still holds the seed. */
+        acc = state->v[2] + XXH_PRIME64_5;
+    } else {
+        acc = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+        for (lane = 0; lane < 4; lane++)
+            acc = XXH64_mergeRound(acc, state->v[lane]);
+    }
+
+    acc += (xxh_u64) state->total_len;
+    /* The full length is passed on; XXH64_finalize() keeps only its low 5 bits. */
+    return XXH64_finalize(acc, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{   /* Writes `hash` to `dst` in big-endian byte order regardless of the host CPU. */
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    hash = XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hash) : hash;
+    XXH_memcpy(dst, &hash, sizeof(XXH64_canonical_t));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{   XXH64_hash_t const decoded = XXH_readBE64(src);   /* the canonical form is read as big-endian */
+    return decoded;
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#else   /* pre-C99, or C++: expands to nothing */
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXH_RESTRICT   /* disable */
+#endif   /* XXH_RESTRICT selection */
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)     /* hint: x is expected to be true */
+#    define XXH_unlikely(x) __builtin_expect(x, 0)   /* hint: x is expected to be false */
+#else   /* no branch-hint builtin: both macros are the identity */
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)   /* pull in the intrinsics header for the first matching target */
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || defined(__aarch64__)  || defined(_M_ARM) \
+   || defined(_M_ARM64)     || defined(_M_ARM64EC)
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline   /* the override is only in effect while arm_neon.h is included */
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)   /* Visual Studio */
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)   /* Thumb-1 code on a target that also has ARM instructions */
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif   /* Thumb-1 sanity check */
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN   /* documentation-only stubs; the definitions used when XXH_DOXYGEN is not defined follow below */
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * @ref XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+    XXH_SCALAR = 0,  /*!< Portable scalar version */
+    XXH_SSE2   = 1,  /*!<
+                      * SSE2 for Pentium 4, Opteron, all x86_64.
+                      *
+                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                      * Android x86.
+                      */
+    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+    XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
+    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+};
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment required for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif   /* XXH_DOXYGEN */
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN   /* numeric identifiers compared against XXH_VECTOR in #if tests */
+#  define XXH_SCALAR 0
+#  define XXH_SSE2   1
+#  define XXH_AVX2   2
+#  define XXH_AVX512 3
+#  define XXH_NEON   4
+#  define XXH_VSX    5
+#  define XXH_SVE    6
+#endif   /* !XXH_DOXYGEN */
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__ARM_FEATURE_SVE)   /* branches are tested in order; the first match wins */
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
+        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
+   ) && ( \
+        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+   )
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL. NOTE(review): && binds tighter than ||, so __GNUC__ only gates the s390x clause - confirm intent */
+#    define XXH_VECTOR XXH_VSX
+#  else   /* nothing matched: portable scalar code */
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif   /* XXH_VECTOR */
+
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)   /* SVE was requested but the compiler does not advertise it */
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR   /* downgrade to the scalar implementation */
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN   /* a value supplied by the user is left untouched */
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
+#  endif   /* NOTE(review): no #else branch; an XXH_VECTOR value outside this list leaves XXH_ACC_ALIGN undefined */
+#endif   /* XXH_ACC_ALIGN */
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN   /* x86 SIMD paths: same alignment as the accumulators */
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN   /* SVE: same alignment as the accumulators */
+#else   /* every other configuration uses 8 */
+#  define XXH_SEC_ALIGN 8
+#endif   /* XXH_SEC_ALIGN */
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif   /* NOTE(review): a matching '#pragma GCC pop_options' is presumably issued later in the file - not visible in this excerpt */
+
+
+#if XXH_VECTOR == XXH_NEON
+/*
+ * NEON's setup for vmlal_u32 is a little more complicated than it is on
+ * SSE2, AVX2, and VSX.
+ *
+ * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
+ *
+ * To do the same operation, the 128-bit 'Q' register needs to be split into
+ * two 64-bit 'D' registers, performing this operation::
+ *
+ *   [                a                 |                 b                ]
+ *            |              '---------. .--------'                |
+ *            |                         x                          |
+ *            |              .---------' '--------.                |
+ *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
+ *
+ * Due to significant changes in aarch64, the fastest method for aarch64 is
+ * completely different than the fastest method for ARMv7-A.
+ *
+ * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
+ * D11 will modify the high half of Q5. This is similar to how modifying AH
+ * will only affect bits 8-15 of AX on x86.
+ *
+ * VZIP takes two registers, and puts even lanes in one register and odd lanes
+ * in the other.
+ *
+ * On ARMv7-A, this strangely modifies both parameters in place instead of
+ * taking the usual 3-operand form.
+ *
+ * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
+ * lower and upper halves of the Q register to end up with the high and low
+ * halves where we want - all in one instruction.
+ *
+ *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
+ *
+ * Unfortunately we need inline assembly for this: instructions that modify two
+ * registers at once cannot be expressed in GCC's or Clang's IR, so the
+ * compilers have to create a copy.
+ *
+ * aarch64 requires a different approach.
+ *
+ * In order to make it easier to write a decent compiler for aarch64, many
+ * quirks were removed, such as conditional execution.
+ *
+ * NEON was also affected by this.
+ *
+ * aarch64 cannot access the high bits of a Q-form register, and writes to a
+ * D-form register zero the high bits, similar to how writes to W-form scalar
+ * registers (or DWORD registers on x86_64) work.
+ *
+ * The formerly free vget_high intrinsics now require a vext (with a few
+ * exceptions)
+ *
+ * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
+ * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
+ * operand.
+ *
+ * The equivalent of the VZIP.32 on the lower and upper halves would be this
+ * mess:
+ *
+ *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+ *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+ *   zip2    v0.2s, v0.2s, v2.2s     // v0 = { v0[1], v2[1] }
+ *
+ * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+ *
+ *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+ *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+ *
+ * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ */
+
+/*!
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+ * {
+ *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ *     outHi = (uint32x2_t)(in >> 32);
+ *     in = UNDEFINED;
+ * }
+ */
+# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+   && (defined(__GNUC__) || defined(__clang__)) \
+   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
+    do {                                                                                    \
+      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
+      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+      __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
+      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
+      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
+   } while (0)
+# else   /* intrinsic-only form: vmovn_u64 for the low halves, vshrn_n_u64 by 32 for the high halves; `in` is left unchanged */
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                            \
+    do {                                                                                  \
+      (outLo) = vmovn_u64    (in);                                                        \
+      (outHi) = vshrn_n_u64  ((in), 32);                                                  \
+    } while (0)
+# endif   /* XXH_SPLIT_IN_PLACE variants */
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)   /* GCC (not Clang) targeting AArch64 */
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{
+    return *(uint64x2_t const*)ptr;   /* direct dereference of `ptr` as a 128-bit vector */
+}
+#else   /* every other compiler/target: byte-wise vector load, reinterpreted as two 64-bit lanes */
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
+{
+    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
+}
+#endif
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
+ * 2 lanes on scalar by default (except on Apple platforms, as Apple CPUs benefit
+ * from only using NEON).
+ *
+ * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
+ * emulated 64-bit arithmetic is too slow.
+ *
+ * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
+ * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
+ * you are only using 2/3 of the CPU bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
+ * remaining lanes will use scalar instructions. This improves the bandwidth
+ * and also gives the integer pipelines something to do besides twiddling loop
+ * counters and pointers.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES   /* only choose a default when the user has not set one */
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6   /* 64-bit ARM, non-Apple, not optimizing for size: 6 lanes on NEON */
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB   /* otherwise every accumulator lane uses NEON */
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;   /* vector of unsigned 64-bit lanes */
+typedef __vector unsigned char xxh_u8x16;        /* vector of unsigned 8-bit lanes */
+typedef __vector unsigned xxh_u32x4;             /* vector of unsigned 32-bit lanes */
+
+# ifndef XXH_VSX_BE   /* may be predefined to override the detection below */
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1   /* big endian target */
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1   /* big endian vector element order was requested */
+#  else
+#    define XXH_VSX_BE 0   /* vector loads need no byte swap */
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+     /* The native intrinsic is available: use it directly. */
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * Fallback for POWER9's vec_revb(): reverses the byte order inside each of
+ * the two 64-bit lanes of @p val using a byte permutation.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    /* Permutation indices: bytes 7..0 for the first lane, 15..8 for the second. */
+    xxh_u8x16 const lane_reversal = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                      0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    xxh_u64x2 const reversed = vec_perm(val, val, lane_reversal);
+    return reversed;
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Copies sizeof(xxh_u64x2) bytes from a possibly unaligned address into a
+ * vector. When XXH_VSX_BE is set, the result is passed through XXH_vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 loaded;
+    XXH_memcpy(&loaded, ptr, sizeof(loaded));
+# if XXH_VSX_BE
+    return XXH_vec_revb(loaded);
+# else
+    return loaded;
+# endif
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on the
+ * compiler version.
+ */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly: the instruction is emitted by name, so the result
+ * does not depend on how a given compiler version maps the vec_* intrinsics. */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); /* result is the only output, a and b are inputs */
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); /* same operand layout as XXH_vec_mulo */
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) /* one SVE accumulate step; uses `mask`, `xinput`, `xsecret` and `kSwap` from the expansion site */ \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         /* input lanes */ \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       /* matching secret lanes */ \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     /* input ^ secret */ \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                /* input lanes permuted by kSwap */ \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 /* low 32 bits of each lane */ \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            /* high 32 bits of each lane */ \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); /* mixed_lo * mixed_hi + swapped */ \
+    acc = svadd_u64_x(mask, acc, mul);                               /* acc is updated in place */ \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro.
+ * XXH_PREFETCH(ptr) issues a read prefetch hint for `ptr` where a suitable
+ * intrinsic is known; in every disabled configuration it only evaluates
+ * `ptr` and discards the value. */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled when optimizing for size */
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* size in bytes of XXH3_kSecret; must be at least XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)   /* compile-time guard for the requirement above */
+#  error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH.
+ *  Holds XXH_SECRET_DEFAULT_SIZE bytes and is declared with XXH_ALIGN(64). */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret   /* alternate spelling, only provided under XXH_OLD_NAMES */
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro. The function form shown here is only compiled when
+ * XXH_DOXYGEN is defined; other builds get one of the macro definitions that
+ * follow.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)   /* MSVC targeting 32-bit x86 */
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ *
+ * Both arguments are truncated to xxh_u32 before the multiply, so only their
+ * low 32 bits contribute to the result.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t`, `_umul128` or `__umulh` if available, otherwise uses a
+ * portable scalar version built from four 32x32->64 multiplies.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t
+ *         (`low64` = low half of the product, `high64` = high half).
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * On wasm, Clang (and emscripten) define this type even though the target
+     * does not have the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one, which is why __wasm__
+     * is excluded in the condition below.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *       6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69
+     *     ---------
+     *       6 9 7 5 // D4 res = (69 * 100) + ((27 % 10) * 10) + (15 % 10) = 6975
+     *
+     * The high digit of cross (2) is carried into upper, so only its low
+     * digit (7) lands in the lower half of the result. In the code this is
+     * the `cross >> 32` term of upper and the truncating `cross << 32` of
+     * lower.
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Multiplies two 64-bit values to 128 bits and folds the product to 64 bits.
+ *
+ * Kept as its own function so the 128-bit struct does not have to be passed
+ * around by value by every caller. Inlining of the multiply is hoped for but
+ * not forced.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd with the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t const wide = XXH_mult64to128(lhs, rhs);
+    xxh_u64 const low_half  = wide.low64;
+    xxh_u64 const high_half = wide.high64;
+    return low_half ^ high_half;
+}
+
+/*!
+ * XORs @p v64 with a copy of itself shifted right by @p shift bits.
+ * @p shift must be in the range [0, 64).
+ * Written as a helper because it seems to produce slightly better code on GCC.
+ */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    {   xxh_u64 const shifted = v64 >> shift;
+        return v64 ^ shifted;
+    }
+}
+
+/*
+ * Fast avalanche stage: xorshift by 37, multiply, xorshift by 32.
+ * Suitable when the input bits are already partially mixed.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    xxh_u64 const folded  = XXH_xorshift64(h64, 37);
+    xxh_u64 const spread  = folded * 0x165667919E3779F9ULL;
+    return XXH_xorshift64(spread, 32);
+}
+
+/*
+ * Stronger avalanche, inspired by Pelle Evensen's rrmxmx.
+ * Preferable when the input has not been mixed beforehand.
+ * `len` is added into the state between the two multiplies.
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* rotate-rotate-xor, then multiply */
+    xxh_u64 const rotations = XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    xxh_u64 const round1    = (h64 ^ rotations) * 0x9FB21C651E98DF25ULL;
+    /* shift-add-xor, then multiply again */
+    xxh_u64 const round2    = (round1 ^ ((round1 >> 35) + len)) * 0x9FB21C651E98DF25ULL;
+    return XXH_xorshift64(round2, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. It used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+/* Hashes an input of 1 to 3 bytes without looping over it. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * The first, middle and last bytes are packed together with the length
+     * into one 32-bit word. Listed from least to most significant byte:
+     * len = 1: packed = { input[0], 0x01, input[0], input[0] }
+     * len = 2: packed = { input[1], 0x02, input[0], input[1] }
+     * len = 3: packed = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packed = last | ((xxh_u32)len << 8) | (first << 16) | (middle << 24);
+        /* Two 32-bit secret samples are XORed together, then offset by the seed. */
+        xxh_u32 const secret_mix = XXH_readLE32(secret) ^ XXH_readLE32(secret+4);
+        xxh_u64 const bitflip = (xxh_u64)secret_mix + seed;
+        return XXH64_avalanche((xxh_u64)packed ^ bitflip);
+    }
+}
+
+/*
+ * Hashes an input of 4 to 8 bytes. The first and last 4 bytes (which overlap
+ * when len < 8) are combined into one 64-bit word, keyed with the secret and
+ * the seed, and finished with XXH3_rrmxmx().
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   /* Byte-swapped low half of the seed is XORed into its high half. */
+        xxh_u64 const swapped_hi = (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+        xxh_u64 const mixed_seed = seed ^ swapped_hi;
+        xxh_u64 const head = XXH_readLE32(input);
+        xxh_u64 const tail = XXH_readLE32(input + len - 4);
+        xxh_u64 const secret_mix = XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16);
+        xxh_u64 const bitflip = secret_mix - mixed_seed;
+        /* head occupies the upper 32 bits, tail the lower 32 bits */
+        xxh_u64 const input64 = (head << 32) + tail;
+        return XXH3_rrmxmx(input64 ^ bitflip, len);
+    }
+}
+
+/*
+ * Hashes an input of 9 to 16 bytes. The first and last 8 bytes (which overlap
+ * when len < 16) are each keyed with their own secret samples, then combined
+ * with a folded 128-bit multiply and avalanched.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const flip_lo = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const flip_hi = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const keyed_lo = XXH_readLE64(input) ^ flip_lo;
+        xxh_u64 const keyed_hi = XXH_readLE64(input + len - 8) ^ flip_hi;
+        xxh_u64 const folded = XXH3_mul128_fold64(keyed_lo, keyed_hi);
+        xxh_u64 const acc = len + XXH_swap64(keyed_lo) + keyed_hi + folded;
+        return XXH3_avalanche(acc);
+    }
+}
+
+/*
+ * Dispatches inputs of at most 16 bytes to the matching fixed-size routine.
+ * An empty input is hashed from the seed and two secret samples alone.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    if (XXH_likely(len >  8)) {
+        return XXH3_len_9to16_64b(input, len, secret, seed);
+    } else if (XXH_likely(len >= 4)) {
+        return XXH3_len_4to8_64b(input, len, secret, seed);
+    } else if (len != 0) {
+        return XXH3_len_1to3_64b(input, len, secret, seed);
+    } else {
+        xxh_u64 const secret_mix = XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64);
+        return XXH64_avalanche(seed ^ secret_mix);
+    }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+/*
+ * Mixes 16 bytes of input with 16 bytes of secret and the seed:
+ * each 8-byte half of the input is XORed with a seed-adjusted secret word,
+ * and the two results are multiplied to 128 bits and folded to 64 bits.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * On x86, GCC tends to autovectorize the 128-bit multiply, and the
+     * vectorized code is slower.
+     *
+     * Forcing seed64 into a register disrupts the cost model so that the
+     * multiply is scalarized. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is an interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const key_lo = XXH_readLE64(secret)   + seed64;
+        xxh_u64 const key_hi = XXH_readLE64(secret+8) - seed64;
+        xxh_u64 const data_lo = XXH_readLE64(input);
+        xxh_u64 const data_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(data_lo ^ key_lo, data_hi ^ key_hi);
+    }
+}
+
+/* For mid range keys, XXH3 uses a Mum-hash variant.
+ * Handles 16 < len <= 128. 16-byte chunks are taken in pairs, one counted
+ * from the start of the input and one counted from its end; each chunk is
+ * mixed with its own 16 bytes of the secret through XXH3_mix16B(). The sum,
+ * which starts at len * XXH_PRIME64_1, is then avalanched.
+ * secretSize is only used by the assert. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
+#if XXH_SIZE_OPT >= 1
+        /* Smaller and cleaner, but slightly slower. */
+        unsigned int i = (unsigned int)(len - 1) / 32; /* index of the last chunk pair; the loop counts down to 0 */
+        do {
+            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);           /* chunk counted from the start */
+            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);  /* chunk counted from the end */
+        } while (i-- != 0);
+        acc_end = 0; /* everything was summed into acc in this variant */
+#else
+        acc += XXH3_mix16B(input+0, secret+0, seed);             /* first 16 bytes */
+        acc_end = XXH3_mix16B(input+len-16, secret+16, seed);    /* last 16 bytes */
+        if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
+            if (len > 64) {
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+            }
+        }
+#endif
+        return XXH3_avalanche(acc + acc_end); /* both running sums are combined before the final mix */
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240   /* largest len accepted by XXH3_len_129to240_64b() */
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,      /* 128 < len <= XXH3_MIDSIZE_MAX */
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize, /* secretSize is only used by the assert */
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3   /* secret offset at which the rounds after the first 8 start */
+    #define XXH3_MIDSIZE_LASTOFFSET  17  /* distance from XXH3_SECRET_SIZE_MIN of the secret used for the last 16 bytes */
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        unsigned int const nbRounds = (unsigned int)len / 16; /* number of whole 16-byte chunks */
+        unsigned int i;
+        for (i=0; i<8; i++) { /* first 128 bytes, secret read from offset 0 */
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        acc = XXH3_avalanche(acc); /* intermediate mix before the remaining chunks are added */
+        XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) { /* remaining whole chunks, secret restarted at XXH3_MIDSIZE_STARTOFFSET */
+            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes: the final 16 bytes of the input, which may overlap the last whole chunk */
+        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        return XXH3_avalanche(acc);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64   /* nb of input bytes consumed at each accumulation */
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))   /* nb of xxh_u64 values that fit in one stripe */
+
+#ifdef XXH_OLD_NAMES   /* alternate spellings, only provided under XXH_OLD_NAMES */
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST   /* byte offset ahead of the current stripe passed to XXH_PREFETCH; user-overridable */
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * This macro generates an XXH3_accumulate() function.
+ * It takes a single argument, the name suffix. The storage class and any
+ * target attribute are written in front of the macro invocation.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>() once per stripe. For stripe n the input is
+ * advanced by n*XXH_STRIPE_LEN bytes and the secret by
+ * n*XXH_SECRET_CONSUME_RATE bytes; XXH_PREFETCH is issued XXH_PREFETCH_DIST
+ * bytes past the start of the stripe before it is processed.
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+/* Stores v64 at dst in little-endian byte order; dst may be unaligned (copied with XXH_memcpy). */
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    xxh_u64 const le_value = XXH_CPU_LITTLE_ENDIAN ? v64 : XXH_swap64(v64);
+    XXH_memcpy(dst, &le_value, sizeof(le_value));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ * xxh_i64 is the signed 64-bit type used in its place.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;   /* C++ or C99 and later, except on VMS */
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* disable attribute target: expands to nothing unless defined beforehand */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,          /* in/out: accumulators, asserted to be 64-byte aligned */
+                     const void* XXH_RESTRICT input,        /* one stripe of input, read with an unaligned load */
+                     const void* XXH_RESTRICT secret)       /* secret for this stripe, read with an unaligned load */
+{
+    __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); /* a whole stripe fits in a single 512-bit register */
+
+    {
+        /* data_vec    = input[0]; */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32; (this holds the upper 32 bits of each 64-bit lane) */
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec); (the raw input is added too, with 64-bit neighbours exchanged) */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* AVX-512 variant: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1, applied to every 64-bit lane at once. */
+    XXH_ASSERT((((size_t)acc) & 63) == 0);   /* acc must be 64-byte aligned */
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));   /* the whole accumulator block is one 512-bit vector */
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        /* xacc[0] ^= secret;  (both XORs are fused into the single three-input operation below) */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
+
+        /* xacc[0] *= XXH_PRIME32_1;  built from two 32x32->64 products: lo * prime + ((hi * prime) << 32) */
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* AVX-512 variant: fills customSecret with XXH3_kSecret plus seed64 on even 64-bit lanes and minus seed64 on odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);   /* secret size is a whole number of 64-byte vectors */
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);   /* destination must be 64-byte aligned */
+    (void)(&XXH_writeLE64);   /* references a helper this variant does not otherwise use; presumably silences an unused-function warning */
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);   /* mask 0xAA: odd lanes become 0 - seed64, even lanes keep seed64 */
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment: the aligned load below relies on it */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {   /* one 64-byte vector of the default secret per round */
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* disable attribute target: expands to nothing unless defined earlier */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* AVX2 variant: mixes one stripe of input, keyed by secret, into the accumulators at acc, 256 bits at a time. */
+    XXH_ASSERT((((size_t)acc) & 31) == 0);   /* acc must be 32-byte aligned */
+    {   __m256i* const xacc    =       (__m256i *) acc;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m256i * parameter of _mm256_loadu_si256; it is never dereferenced directly. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m256i * parameter of _mm256_loadu_si256; it is never dereferenced directly. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {   /* one 256-bit slice of the stripe per iteration */
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  (upper half of each 64-bit lane moved into the lower half) */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff);  (32x32->64 per 64-bit lane) */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  (the shuffle exchanges the two 64-bit halves of every 128-bit group) */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)   /* macro defined outside this view; presumably emits the multi-stripe XXH3_accumulate_avx2() - confirm */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* AVX2 variant: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1 for every 64-bit lane, 256 bits at a time. */
+    XXH_ASSERT((((size_t)acc) & 31) == 0);   /* acc must be 32-byte aligned */
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m256i * parameter of _mm256_loadu_si256; it is never dereferenced directly. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  built from two 32x32->64 products: lo * prime + ((hi * prime) << 32) */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* AVX2 variant: fills customSecret with XXH3_kSecret plus seed64 on even 64-bit lanes and minus seed64 on odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);   /* the unrolled stores below cover exactly 6 vectors */
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);   /* references a helper this variant does not otherwise use; presumably silences an unused-function warning */
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);   /* arguments run from highest lane to lowest: lanes 0..3 = +seed, -seed, +seed, -seed */
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified makes the compiler:
+         *   - not extract the secret from SSE registers in the inner loop
+         *   - use less common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment: the aligned loads below rely on it */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* Manually unrolled because GCC -O2 does not unroll this loop by itself */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* disable attribute target: expands to nothing unless defined earlier */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* SSE2 variant: mixes one stripe of input, keyed by secret, into the accumulators at acc, 128 bits at a time. */
+    /* SSE2 is just a half-scale version of the AVX2 version. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);   /* acc must be 16-byte aligned */
+    {   __m128i* const xacc    =       (__m128i *) acc;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m128i * parameter of _mm_loadu_si128; it is never dereferenced directly. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m128i * parameter of _mm_loadu_si128; it is never dereferenced directly. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {   /* one 128-bit slice of the stripe per iteration */
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  (done with a shuffle that drops each upper dword into the lower slot) */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff);  (32x32->64 per 64-bit lane) */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  (the shuffle exchanges the two 64-bit halves) */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)   /* macro defined outside this view; presumably emits the multi-stripe XXH3_accumulate_sse2() - confirm */
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* SSE2 variant: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1 for every 64-bit lane, 128 bits at a time. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);   /* acc must be 16-byte aligned */
+    {   __m128i* const xacc = (__m128i*) acc;
+        /* Unaligned. The vector-typed pointer only serves pointer arithmetic and the
+         * const __m128i * parameter of _mm_loadu_si128; it is never dereferenced directly. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  built from two 32x32->64 products: lo * prime + ((hi * prime) << 32) */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{   /* SSE2 variant: fills customSecret with XXH3_kSecret plus seed64 on even 64-bit lanes and minus seed64 on odd ones. */
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);   /* secret size is a whole number of 16-byte vectors */
+    (void)(&XXH_writeLE64);   /* references a helper this variant does not otherwise use; presumably silences an unused-function warning */
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015, so build the same { +seed, -seed } pair in memory and load it */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);   /* upper lane = 0 - seed64, lower lane = seed64 */
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dst16' as modified makes the compiler:
+         *   - not extract the secret from SSE registers in the inner loop
+         *   - use less common registers, and avoid pushing these registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment: the aligned load below relies on it */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {   /* one 16-byte vector of the default secret per round */
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* Forward declarations for the scalar routines: the NEON functions below call them for the lanes they do not vectorize. */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);   /* acc must be 16-byte aligned */
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);   /* NEON lanes come in pairs: one uint64x2_t holds two */
+    {
+        uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* const xinput = (const uint8_t *) input;
+        uint8_t const* const xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+        /* AArch64 uses both scalar and neon at the same time: lanes from XXH3_NEON_LANES up to XXH_ACC_NB - 1 take the scalar round */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;   /* from here on i counts 128-bit vectors (two lanes each), not lanes */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {   /* two vectors per iteration */
+            uint64x2_t acc_vec1 = xacc[i];
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec1 = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec1  = XXH_vld1q_u64(xsecret + (i * 16));
+            /* acc_vec_2 = swap(data_vec) */
+            uint64x2_t acc_vec_21 = vextq_u64(data_vec1, data_vec1, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key1 = veorq_u64(data_vec1, key_vec1);
+
+            uint64x2_t acc_vec2 = xacc[i+1];
+            /* data_vec = xinput[i+1]; */
+            uint64x2_t data_vec2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i+1];  */
+            uint64x2_t key_vec2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* acc_vec_2 = swap(data_vec) */
+            uint64x2_t acc_vec_22 = vextq_u64(data_vec2, data_vec2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key2 = veorq_u64(data_vec2, key_vec2);
+
+            /* data_key_lo = {(data_key1 & 0xFFFFFFFF), (data_key2 & 0xFFFFFFFF)};
+             * data_key_hi = {(data_key1 >> 32), (data_key2 >> 32)};
+             * (vuzpq_u32 de-interleaves: val[0] takes the even 32-bit elements, val[1] the odd ones) */
+            uint32x4x2_t zipped = vuzpq_u32(vreinterpretq_u32_u64(data_key1), vreinterpretq_u32_u64(data_key2));
+            uint32x4_t data_key_lo = zipped.val[0];
+            uint32x4_t data_key_hi = zipped.val[1];
+
+            /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi;  (first vector: low halves of the de-interleaved keys) */
+            acc_vec_21 = vmlal_u32 (acc_vec_21, vget_low_u32(data_key_lo), vget_low_u32(data_key_hi));
+            XXH_COMPILER_GUARD_W(acc_vec_21);
+            /* xacc[i] += acc_vec_2; */
+            acc_vec1 = vaddq_u64 (acc_vec1, acc_vec_21);
+            xacc[i] = acc_vec1;
+            /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi;  (second vector: high halves of the de-interleaved keys) */
+            acc_vec_22 = vmlal_u32 (acc_vec_22, vget_high_u32(data_key_lo), vget_high_u32(data_key_hi));
+            XXH_COMPILER_GUARD_W(acc_vec_22);
+            /* xacc[i+1] += acc_vec_2; */
+            acc_vec2 = vaddq_u64 (acc_vec2, acc_vec_22);
+            xacc[i+1] = acc_vec2;
+        }
+        for (; i < XXH3_NEON_LANES / 2; i++) {   /* leftover vector when XXH3_NEON_LANES / 2 is odd */
+            uint64x2_t acc_vec = xacc[i];
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key;
+            uint32x2_t data_key_lo, data_key_hi;
+            /* acc_vec_2 = swap(data_vec) */
+            uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
+            /* data_key = data_vec ^ key_vec; */
+            data_key = veorq_u64(data_vec, key_vec);
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
+             * data_key_hi = (uint32x2_t) (data_key >> 32);
+             * data_key = UNDEFINED; */
+            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+            /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+            acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
+            XXH_COMPILER_GUARD_W(acc_vec_2);
+            /* xacc[i] += acc_vec_2; */
+            acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
+            xacc[i] = acc_vec;
+        }
+
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)   /* macro defined outside this view; presumably emits the multi-stripe XXH3_accumulate_neon() - confirm */
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* NEON variant: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1 per 64-bit lane; lanes past XXH3_NEON_LANES use the scalar round. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);   /* acc must be 16-byte aligned */
+
+    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+        uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);
+
+        size_t i;
+        /* AArch64 uses both scalar and neon at the same time: lanes from XXH3_NEON_LANES up to XXH_ACC_NB - 1 take the scalar round */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarScrambleRound(acc, secret, i);
+        }
+        for (i=0; i < XXH3_NEON_LANES / 2; i++) {   /* here i counts 128-bit vectors (two lanes each) */
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64   (acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64     (acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint64x2_t key_vec  = XXH_vld1q_u64 (xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            uint32x2_t data_key_lo, data_key_hi;
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
+             * data_key_hi = (uint32x2_t) (data_key >> 32);
+             * data_key = UNDEFINED; */
+            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+            {   /*
+                 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+                 *
+                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+                 * incorrectly "optimize" this:
+                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+                 *   shifted = vshll_n_u32(tmp, 32);
+                 * to this:
+                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
+                 *   shifted = vshlq_n_u64(tmp, 32);
+                 *
+                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
+                 * for NEON, and it scalarizes two 64-bit multiplies instead.
+                 *
+                 * vmull_u32 has the same timing as vmul_u32, and it avoids
+                 * this bug completely.
+                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
+                 */
+                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+                /* prod_hi <<= 32; */
+                prod_hi = vshlq_n_u64(prod_hi, 32);
+                /* xacc[i] = prod_hi + (data_key & 0xFFFFFFFF) * XXH_PRIME32_1; */
+                xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
+            }
+        }
+    }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{   /* VSX variant: mixes one stripe of input, keyed by secret, into the accumulators at acc, 128 bits at a time. */
+    /* presumed aligned; kept as unsigned int* and accessed only through vec_xl / vec_xst below */
+    unsigned int* const xacc = (unsigned int*) acc;
+    xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
+    xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };   /* per-lane rotate amount */
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32);  (rotating by 32 swaps the two halves of each 64-bit lane) */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        /* acc_vec = xacc[i];  (xacc is an unsigned int*, so 4 * i advances one 16-byte vector per iteration) */
+        xxh_u64x2 acc_vec        = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
+        acc_vec += product;
+
+        /* swap high and low halves of data_vec and add them (s390x spells the permute intrinsic differently) */
+#ifdef __s390x__
+        acc_vec += vec_permi(data_vec, data_vec, 2);
+#else
+        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+        /* xacc[i] = acc_vec; */
+        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)   /* macro defined outside this view; presumably emits the multi-stripe XXH3_accumulate_vsx() - confirm */
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{   /* VSX variant: acc = ((acc ^ (acc >> 47)) ^ secret) * XXH_PRIME32_1 for every 64-bit lane, 128 bits at a time. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);   /* acc must be 16-byte aligned */
+
+    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
+        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
+        /* constants: per-lane shift amounts and the multiplier replicated across all 32-bit elements */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);   /* NOTE(review): prod_even is the term shifted up by 32, so the prod_lo/prod_hi labels above look swapped - confirm against XXH_vec_mule/XXH_vec_mulo */
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{   /* SVE variant for a single stripe; picks a code path from the hardware vector length. */
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;   /* not referenced directly below; presumably consumed by ACCRND (macro defined outside this view) */
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;   /* same remark as xinput */
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);   /* indices 0,1,2,3,... XOR 1 = 1,0,3,2,...: adjacent-lane swap pattern */
+    uint64_t element_count = svcntd();   /* number of 64-bit elements in one SVE vector */
+    if (element_count >= 8) {   /* vector of 512 bits or more: all 8 accumulators in one register */
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128: four registers of 2 accumulators each */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {   /* remaining vector lengths: two registers of 4 accumulators each */
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{   /* SVE variant for nbStripes consecutive stripes: accumulators stay in registers across the loop and are stored once at the end. */
+    if (nbStripes != 0) {   /* the do/while loops below always run at least once, so zero stripes must be filtered out here */
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);   /* indices 0,1,2,3,... XOR 1 = 1,0,3,2,...: adjacent-lane swap pattern */
+        uint64_t element_count = svcntd();   /* number of 64-bit elements in one SVE vector */
+        if (element_count >= 8) {   /* vector of 512 bits or more: all 8 accumulators in one register */
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop);  prefetches 128 uint64_t elements ahead of the current input position */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;    /* next stripe: 8 x 64-bit words of input */
+                xsecret += 1;   /* the secret advances by one 64-bit word per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128: four registers of 2 accumulators each */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;    /* next stripe: 8 x 64-bit words of input */
+                xsecret += 1;   /* the secret advances by one 64-bit word per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 2, acc1);
+           svst1_u64(mask, xacc + 4, acc2);
+           svst1_u64(mask, xacc + 6, acc3);
+        } else {   /* remaining vector lengths: two registers of 4 accumulators each */
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;    /* next stripe: 8 x 64-bit words of input */
+                xsecret += 1;   /* the secret advances by one 64-bit word per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 4, acc1);
+       }
+    }
+}
+
+#endif
+
+/* scalar variants - universal */
+
+/*!
+ * @internal
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar(): processes the single 64-bit lane @p lane.
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* xacc = (xxh_u64*) acc;
+    xxh_u8 const* xinput  = (xxh_u8 const*) input;
+    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);   /* acc must be aligned to XXH_ACC_ALIGN */
+    {
+        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);   /* this lane's 8 input bytes, read as little-endian */
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);   /* input keyed with the matching 8 secret bytes */
+        xacc[lane ^ 1] += data_val; /* swap adjacent lanes: the raw input goes to the neighbouring accumulator */
+        xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);   /* low half times high half of the keyed value */
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes a 64 byte block of data using the scalar path, one XXH3_scalarRound() per accumulator lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6, hence the explicit unroll pragma below. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (i=0; i < XXH_ACC_NB; i++) {   /* every accumulator lane in turn */
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)   /* macro defined outside this view; presumably emits the multi-stripe XXH3_accumulate_scalar() - confirm */
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(): scrambles the single 64-bit lane @p lane.
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);   /* this lane's 8 secret bytes, read as little-endian */
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);   /* the SIMD variants spell this step acc ^= acc >> 47 */
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read, one XXH3_scalarScrambleRound() per lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {   /* every accumulator lane in turn */
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+/* Builds a seeded secret: each 16-byte chunk of XXH3_kSecret is copied to customSecret with seed64 added to its low 8 bytes and subtracted from its high 8 bytes. */
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__clang__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes Clang to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    /*
+     * Note: in debug mode, this overrides the asm optimization
+     * and Clang will emit MOVK chains again.
+     */
+    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;   /* one round per 16-byte chunk of the secret */
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes Clang to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+/* Function-pointer types used to hand the selected accumulate / scramble / secret-init implementation to the generic long-hash routines. */
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+
+/* Bind the generic kernel names to the implementation matching XXH_VECTOR; NEON, VSX and SVE reuse the scalar secret initializer, and SVE also reuses the scalar scrambler. */
+#if (XXH_VECTOR == XXH_AVX512)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization: when optimizing for size, always use the scalar secret initializer */
+#  undef XXH3_initCustomSecret
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif
+/* Core long-input loop: accumulates every full block (scrambling after each one), then the stripes of the last partial block, then the final XXH_STRIPE_LEN bytes of input. */
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;   /* (len - 1): an input that is an exact multiple of block_len keeps its last block for the tail handling below */
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);   /* scramble with the last XXH_STRIPE_LEN bytes of the secret */
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe: the final XXH_STRIPE_LEN bytes of input (may overlap stripes already accumulated) */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    xxh_u64 const keyedLo = acc[0] ^ XXH_readLE64(secret);     /* first lane, keyed with secret[0..7] */
+    xxh_u64 const keyedHi = acc[1] ^ XXH_readLE64(secret+8);   /* second lane, keyed with secret[8..15] */
+    return XXH3_mul128_fold64(keyedLo, keyedHi);
+}
+/* Folds the 8 accumulators, two at a time via XXH3_mix2Accs() with 16 secret bytes per pair, into `start`, then avalanches the sum. */
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {   /* 4 pairs of accumulators */
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+/* Initial values of the 8 accumulators, in the same order XXH3_reset_internal() stores them into the streaming state. */
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+/* One-shot long-input hash: runs the accumulation loop over fresh accumulators, then merges them using the secret at offset XXH_SECRET_MERGEACCS_START and len * XXH_PRIME64_1 as starting value. */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * Long-input hash with a caller-provided secret (`seed64` is ignored).
+ * It's important for performance to transmit secret's size (when it's static) so that the compiler
+ * can properly optimize the vectorized loop: big difference for "medium" keys (<1 KB) when using AVX.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;   /* only present so the signature matches XXH3_hashLong64_f */
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * Long-input hash with the default secret XXH3_kSecret; `seed64`, `secret` and `secretLen` are ignored.
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier to the instruction cache.
+ * Inside this no_inline function the internal loop is still inlined, with a statically defined secret size to allow optimization of the vector loop.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;   /* unused: kept so the signature matches XXH3_hashLong64_f */
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0
+    if (seed == 0)   /* shortcut (skipped when optimizing for size): hash directly with the default secret */
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);   /* fill the stack buffer with a seed-specific secret */
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * Seeded long-input hash (`secret` and `secretLen` are ignored). It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;   /* unused: kept so the signature matches XXH3_hashLong64_f */
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+/* Common signature of the XXH3_hashLong_64b_* variants: (input, len, seed64, secret, secretLen). XXH3_64bits_internal() receives one of them as its long-input callback. */
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    const xxh_u8* const data = (const xxh_u8*)input;
+    const xxh_u8* const key  = (const xxh_u8*)secret;
+    /* `secretLen` is a contract pre-condition and is only asserted:
+     * a runtime check would cost performance at every hash, and the
+     * function signature offers no room to return an error. */
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /* Dispatch on length: three short-input kernels, otherwise the long-input callback. */
+    if (len <= 16) {
+        return XXH3_len_0to16_64b(data, len, key, seed64);
+    } else if (len <= 128) {
+        return XXH3_len_17to128_64b(data, len, key, secretLen, seed64);
+    } else if (len <= XXH3_MIDSIZE_MAX) {
+        return XXH3_len_129to240_64b(data, len, key, secretLen, seed64);
+    }
+    return f_hashLong(input, len, seed64, key, secretLen);
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
+{   /* Unseeded 64-bit hash: seed 0 with the default secret XXH3_kSecret. */
+    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
+{   /* 64-bit hash keyed by a caller-supplied secret (seed fixed to 0). secretSize is not validated here: XXH3_64bits_internal() only asserts it. */
+    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
+{   /* 64-bit hash with a caller-supplied seed, built on the default secret XXH3_kSecret. */
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)   /* short input: hashed with the seed and the default secret; the long-input callback is never reached for these lengths, hence NULL */
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);   /* long input: hashed with the custom secret (the callee ignores its seed argument) */
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    xxh_u8* rawBlock;
+    xxh_u8* alignedBlock;
+    size_t  shift;
+
+    XXH_ASSERT(align <= 128 && align >= 8); /* supported alignment range */
+    XXH_ASSERT((align & (align-1)) == 0);   /* must be a power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* reject empty requests and size overflow */
+    /* Reserve `align` extra bytes: room for the realignment plus the offset byte. */
+    rawBlock = (xxh_u8*)XXH_malloc(s + align);
+    if (rawBlock == NULL) {
+        return NULL;
+    }
+
+    /*
+     * Distance to the next aligned address. It is never 0: an already
+     * aligned block is shifted by a full `align`, so one byte is always
+     * available in front of the returned pointer.
+     */
+    shift = align - ((size_t)rawBlock & (align - 1));
+    alignedBlock = rawBlock + shift;
+    XXH_ASSERT((size_t)alignedBlock % align == 0);
+
+    alignedBlock[-1] = (xxh_u8)shift;   /* lets XXH_alignedFree() recover rawBlock */
+    return alignedBlock;
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    xxh_u8* const aligned = (xxh_u8*)p;
+    if (aligned == NULL)
+        return;   /* freeing NULL is a no-op */
+    {   /* The byte just before the aligned pointer, written by XXH_alignedMalloc(),
+         * stores the distance back to the block originally returned by XXH_malloc(). */
+        xxh_u8 const shift = aligned[-1];
+        XXH_free(aligned - shift);
+    }
+}
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    /* The state is allocated on a 64-byte boundary; NULL is returned when allocation fails. */
+    XXH3_state_t* const newState = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (newState != NULL) { XXH3_INITSTATE(newState); }
+    return newState;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{   /* Releases a state obtained from XXH3_createState(); NULL is accepted. Always returns XXH_OK. */
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{   /* Plain byte copy of the whole state structure; neither pointer is checked for NULL. */
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+/* Common reset: zeroes the members from bufferedSize up to (excluding) nbStripesPerBlock, reloads the initial accumulators, and records seed, external secret and the secret-derived limits. */
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;   /* same eight constants, in the same order, as XXH3_INIT_ACC */
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;   /* may be NULL; the update/digest code then falls back to customSecret */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;   /* offset of the last XXH_STRIPE_LEN bytes of the secret */
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{   /* Resets the state for unseeded hashing with the default secret; returns XXH_ERROR only for a NULL state. */
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    /* Resets the state to hash with an external secret and seed 0. Returns XXH_ERROR, leaving the state untouched, when an argument is invalid. */
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;   /* validate BEFORE resetting, as XXH3_64bits_reset_withSecretandSeed() does */
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;   /* XXH3_reset_internal() asserts this and computes secretSize - XXH_STRIPE_LEN */
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{   /* Resets the state for seeded hashing; the seeded secret lives in statePtr->customSecret. */
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);   /* seed 0 is handled as the plain default-secret reset */
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))   /* regenerate customSecret only when the seed changed or an external secret was in use */
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);   /* NULL extSecret: update/digest will read customSecret */
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{   /* Resets the state with both an external secret and a seed; all arguments are validated before the state is modified. */
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 (XXH3_reset_internal() would have set it to seed64 != 0) */
+    return XXH_OK;
+}
+
+/* Note : when XXH3_consumeStripes() is invoked,
+ * there must be a guarantee that at least one more byte must be consumed from input
+ * so that the function can blindly consume all stripes using the "normal" secret segment */
+XXH_FORCE_INLINE void
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const stripesLeftInBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
+    XXH_ASSERT(nbStripes <= nbStripesPerBlock);  /* can handle max 1 scramble per invocation */
+    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+    if (nbStripes < stripesLeftInBlock) {   /* everything fits in the current block: no scramble needed */
+        f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes);
+        *nbStripesSoFarPtr += nbStripes;
+        return;
+    }
+    {   size_t const stripesInNextBlock = nbStripes - stripesLeftInBlock;   /* block boundary reached: finish it, scramble, start the next block */
+        f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, stripesLeftInBlock);
+        f_scramble(acc, secret + secretLimit);
+        f_acc(acc, input + stripesLeftInBlock * XXH_STRIPE_LEN, secret, stripesInNextBlock);
+        *nbStripesSoFarPtr = stripesInNextBlock;
+    }
+}
+/* Unless the user pre-defined XXH3_STREAM_USE_STACK, enable stack-staged accumulators in XXH3_update() for speed-optimized, non-clang builds. */
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Shared streaming update (used by XXH3_64bits_update and XXH3_128bits_update): buffers small inputs, consumes whole stripes from larger ones, and keeps the trailing bytes buffered for the digest.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {   /* a NULL input is accepted as an empty update */
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer (accumulators are untouched, so nothing to write back) */
+        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+
+        /* large input to consume : ingest per full block */
+        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;   /* (bEnd - 1): at least one byte is always left for buffering */
+            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
+            /* join to current block's end */
+            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
+                XXH_ASSERT(nbStripesToEnd <= nbStripes);
+                f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd);
+                f_scramble(acc, secret + state->secretLimit);
+                state->nbStripesSoFar = 0;
+                input += nbStripesToEnd * XXH_STRIPE_LEN;
+                nbStripes -= nbStripesToEnd;
+            }
+            /* consume per entire blocks */
+            while(nbStripes >= state->nbStripesPerBlock) {
+                f_acc(acc, input, secret, state->nbStripesPerBlock);
+                f_scramble(acc, secret + state->secretLimit);
+                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
+                nbStripes -= state->nbStripesPerBlock;
+            }
+            /* consume last partial block */
+            f_acc(acc, input, secret, nbStripes);
+            input += nbStripes * XXH_STRIPE_LEN;
+            XXH_ASSERT(input < bEnd);  /* at least some bytes left */
+            state->nbStripesSoFar = nbStripes;
+            /* buffer predecessor of last partial stripe (XXH3_digest_long() reads it back from the end of state->buffer) */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
+        } else {
+            /* content to consume <= block size */
+            /* Consume input by a multiple of internal buffer size */
+            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+                do {
+                    XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                        input, XXH3_INTERNALBUFFER_STRIPES,
+                                        secret, state->secretLimit,
+                                        f_acc, f_scramble);
+                    input += XXH3_INTERNALBUFFER_SIZE;
+                } while (input<limit);
+                /* buffer predecessor of last partial stripe (XXH3_digest_long() reads it back from the end of state->buffer) */
+                XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+            }
+        }
+
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{   /* Streaming update for the 64-bit variant: forwards to XXH3_update() with the selected accumulate/scramble kernels. */
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/* Finishes a long-input hash into `acc` without modifying `state`: consumes the buffered full stripes, then accumulates the final XXH_STRIPE_LEN bytes of the stream. */
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;   /* local copy: the state's own counter must not advance */
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        /* last stripe */
+        XXH3_accumulate_512(acc,
+                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        xxh_u8 lastStripe[XXH_STRIPE_LEN];
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);   /* missing leading bytes come from the end of the buffer, where XXH3_update() saved the previous stripe */
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        XXH3_accumulate_512(acc,
+                            lastStripe,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    }
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+{   /* Returns the 64-bit hash of all input fed so far; `state` is only read, never modified. */
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {   /* long input: finish from the streamed accumulators */
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input (hashed one-shot from state->buffer; presumably the whole input still sits there) */
+    if (state->useSeed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);   /* secretLimit + XXH_STRIPE_LEN restores the secret size recorded at reset */
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* Same construction as the 64-bit 1to3 variant, run twice with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * Byte layout of `packedLo`, least significant byte first:
+     *   len = 1: { input[0], 0x01, input[0], input[0] }
+     *   len = 2: { input[1], 0x02, input[0], input[1] }
+     *   len = 3: { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packedLo = last | ((xxh_u32)len << 8) | (first << 16) | (middle << 24);
+        xxh_u32 const packedHi = XXH_rotl32(XXH_swap32(packedLo), 13);
+        xxh_u64 const secretLo = XXH_readLE32(secret)   ^ XXH_readLE32(secret+4);
+        xxh_u64 const secretHi = XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12);
+        xxh_u64 const mixedLo  = (xxh_u64)packedLo ^ (secretLo + seed);
+        xxh_u64 const mixedHi  = (xxh_u64)packedHi ^ (secretHi - seed);
+        XXH128_hash_t result;
+        result.low64  = XXH64_avalanche(mixedLo);
+        result.high64 = XXH64_avalanche(mixedHi);
+        return result;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{   /* Hashes a 4 to 8 byte input, keyed with secret[16..31] and seed. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;   /* fold the byte-swapped low 32 bits of seed into its high half */
+    {   xxh_u32 const input_lo = XXH_readLE32(input);              /* first 4 bytes */
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);    /* last 4 bytes; overlaps input_lo when len < 8 */
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);   /* final mixing of the low half: xorshift, multiply, xorshift */
+        m128.low64  *= 0x9FB21C651E98DF25ULL;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);      /* the high half uses the regular XXH3 avalanche */
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{   /* Hashes a 9 to 16 byte input, keyed with secret[32..63] and seed. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);              /* first 8 bytes */
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);    /* last 8 bytes; overlaps input_lo when len < 16 */
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    b + (a * c)      (this is what the 32-bit branch above adds)
+             * Inverse Property: x + y - x == y
+             *    b + (a * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    b + (a * 1) + (a * (c - 1))
+             * Identity Property: x * 1 == x
+             *    b + a + (a * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);     /* both halves get a final avalanche */
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Dispatches inputs of 0 to 16 bytes by length. Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;   /* len == 0: the result depends only on seed and secret[64..95] */
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/* Mixes two 16-byte chunks (input_1, input_2) into acc, using secret[0..31] and seed.
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);   /* cross-mix: the low half also sees the raw words of input_2 */
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);   /* cross-mix: the high half also sees the raw words of input_1 */
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{   /* Hashes a 17 to 128 byte input by pairing 16-byte chunks from the front and the back of the input. */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;   /* the length seeds the low accumulator */
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. Same pairs as the unrolled version below, innermost pair first. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {   /* each extra 32 bytes of length adds one more front/back pair */
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);   /* always mixed: first and last 16 bytes */
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);   /* the high half is negated after the avalanche */
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{   /* Hashes a 129 to XXH3_MIDSIZE_MAX byte input in 32-byte rounds, plus one round over the last 32 bytes. */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned int const nbRounds = (unsigned int)len / 32;   /* number of complete 32-byte rounds */
+        unsigned int i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        for (i=0; i<4; i++) {   /* first 128 bytes, with secret[0..127] */
+            acc = XXH128_mix32B(acc,
+                                input  + (32 * i),
+                                input  + (32 * i) + 16,
+                                secret + (32 * i),
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);     /* avalanche both halves between the two groups of rounds */
+        acc.high64 = XXH3_avalanche(acc.high64);
+        XXH_ASSERT(nbRounds >= 4);
+        for (i=4 ; i < nbRounds; i++) {   /* remaining complete rounds; the secret restarts at XXH3_MIDSIZE_STARTOFFSET */
+            acc = XXH128_mix32B(acc,
+                                input + (32 * i),
+                                input + (32 * i) + 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
+                                seed);
+        }
+        /* last bytes: the final 32 bytes, passed with the two halves swapped and the seed negated */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            0ULL - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);   /* the high half is negated after the avalanche */
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{   /* Long-input core: runs the accumulation loop with the given callbacks, then merges the accumulators twice. */
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        h128.low64  = XXH3_mergeAccs(acc,   /* low half: secret read from the start offset, start value len * XXH_PRIME64_1 */
+                                     secret + XXH_SECRET_MERGEACCS_START,
+                                     (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,   /* high half: secret read from the mirrored offset at its end, start value ~(len * XXH_PRIME64_2) */
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;   /* unused: this variant always hashes with the built-in XXH3_kSecret */
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;   /* unused: only the caller-provided secret keys the long hash */
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{   /* Long-input hash keyed by a seed: the seed is first expanded into a secret on the stack. */
+    if (seed64 == 0)   /* a zero seed uses the built-in secret directly, skipping the expansion */
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);   /* fills `secret` from seed64 */
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;   /* unused: the secret is derived from seed64 instead */
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{   /* Common entry point of the one-shot 128-bit hashes: picks the routine matching the input length. */
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);   /* f_hl128 is only called when len > XXH3_MIDSIZE_MAX */
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{   /* One-shot 128-bit hash with seed 0 and the built-in secret. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{   /* One-shot 128-bit hash keyed by a caller-provided secret (seed 0). secretSize must be >= XXH3_SECRET_SIZE_MIN; this is only asserted, not checked. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{   /* One-shot 128-bit hash keyed by a 64-bit seed; short inputs combine the seed with the built-in secret. */
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Inputs up to XXH3_MIDSIZE_MAX bytes use only the seed (with the built-in secret); longer inputs use only the secret. */
+    if (len <= XXH3_MIDSIZE_MAX)   /* NULL is safe here: the long-hash callback is only called for len > XXH3_MIDSIZE_MAX */
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{   /* Alias of XXH3_128bits_withSeed(). */
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{   /* The streaming state is shared with the 64-bit variant, so the reset is delegated to it. */
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{   /* Delegates to the 64-bit reset with the same arguments; its error code is returned unchanged. */
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{   /* Delegates to the 64-bit reset with the same arguments; its error code is returned unchanged. */
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Delegates to the 64-bit reset with the same arguments; its error code is returned unchanged. */
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{   /* Feeds `len` bytes of `input` into the state through the shared XXH3_update() routine. */
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{   /* Returns the 128-bit hash of everything fed to `state` so far; the state itself is not modified. */
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;   /* same two merges as the one-shot long hash */
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code. Test useSeed (as the 64-bit digest does), not seed != 0, so a secret+seed state with seed 0 matches the one-shot hash */
+    if (state->useSeed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte, so a bytewise memcmp compares exactly the two halves */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * @return : >0 if *h128_1  > *h128_2
+ *           <0 if *h128_1  < *h128_2
+ *           =0 if *h128_1 == *h128_2  */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{   /* Both pointers must point to XXH128_hash_t values; high64 is the most significant half of the ordering. */
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);   /* -1, 0 or +1 */
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);   /* tie on high64: decided by low64 */
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{   /* Writes `hash` to `dst` as 16 big-endian bytes: high64 first, then low64. */
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {   /* byte-swap so that the copies below store big-endian values */
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{   /* Inverse of XXH128_canonicalFromHash(): reads two big-endian 64-bit words from `src`. */
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);               /* bytes 0..7 */
+    h.low64  = XXH_readBE64(src->digest + 8);   /* bytes 8..15 */
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{   /* XORs h128 into the 16 bytes at dst (little-endian): low64 into bytes 0..7, high64 into bytes 8..15. */
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+{   /* Fills secretBuffer[0..secretSize) with a secret derived from customSeed. Returns XXH_OK, or XXH_ERROR on a rejected argument. */
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled: invalid arguments are reported through the return value instead */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {   /* no seed material given: fall back to the built-in secret */
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;   /* number of complete 16-byte segments */
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {   /* XOR segment n with the hash of the scrambler seeded by n */
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment: the final 16 bytes are also XORed with the scrambler hash itself */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
+{   /* Writes XXH_SECRET_DEFAULT_SIZE bytes to secretBuffer: the secret that XXH3_initCustomSecret() builds for `seed`. */
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* aligned scratch copy; secretBuffer has no alignment requirement here */
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);
+    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 320e9592..88d85b4e 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -311,6 +311,9 @@ bool LIE::execute ()
             lie_relations[i]->set_separate_io(separate_io);
             lie_relations[i]->set_offset_io(offset_io);
             lie_relations[i]->initialize_relation(mcomm, intern_map);
+            // if (lie_relations[i]->get_intern_tag() == 258) {
+            //     std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << lie_relations[i]->get_full_element_count() << std::endl; 
+            // }
         }
 #if DEBUG_OUTPUT
         //lie_relations[i]->print();
@@ -320,6 +323,11 @@ bool LIE::execute ()
 
     print_all_relation_size();
 
+    // balance all relation before program run
+    // for (u32 i = 0 ; i < lie_relations.size(); i++) {
+    
+    // }
+
     //if (mcomm.get_local_rank() == 0)
     //    std::cout << "Done initializing " << lie_relation_count <<  std::endl;
 
@@ -423,6 +431,10 @@ bool LIE::execute ()
             print_relation_size(scc_relation[i]);
         std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< BEFORE COMPUTATION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
 #endif
+
+        // Run load balancing before each SCC is executed.
+        executable_task->load_balance();
+
         if (restart_flag == false)
         {
             for (u32 i=0; i < scc_relation_count; i++)
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 28b0adf8..da398ffa 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -7,8 +7,10 @@
 
 #include "../parallel_RA_inc.h"
 #include "balanced_hash_relation.h"
+#include "mpi.h"
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <filesystem>
 #include <iostream>
 #include <vector>
@@ -1417,6 +1419,50 @@ bool relation::check_dependent_value_insert_avalible(const std::vector<u64>& tup
     // if (bucket_id != mcomm.get_rank()) {
     //     std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; 
     // }
-    int bucket_id = mcomm.get_rank();
-    return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
+    // int bucket_id = mcomm.get_rank();
+    bool res = true;
+    for (int i = 0 ; i < mcomm.get_nprocs(); i ++) {
+        res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple);
+    }
+    // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
+    return res;
+}
+
+// Experiment helper: measures how evenly each candidate hash function would
+// spread this relation's tuples over rank_n target ranks.
+//
+// Each process walks the tuples of its own bucket, full[mcomm.get_rank()], and
+// counts them per (hash function, target rank). The counters are then summed
+// onto rank 0 with MPI_Reduce, and rank 0 prints one line
+// "<hash name>, <target rank>, <global count>" per pair.
+//
+// Collective over mcomm.get_comm(): every process must call it with the same rank_n.
+void relation::test_calc_hash_rank(u64 rank_n) {
+    // No target ranks: nothing to count, and the modulo below would divide by zero.
+    if (rank_n == 0) return;
+    int hash_types = 6;
+    std::vector<std::vector<u64>> tuple_cnts(hash_types, std::vector<u64>(rank_n, 0));
+    std::vector<std::string> hash_names{"nohash", "fnv1a", "murmur", "spooky", "fasthash", "xxhash"};
+
+    for (auto t: full[mcomm.get_rank()]) {
+        // NOTE(review): assumes tuple_hash_test_all returns one hash per entry of hash_names, in that order.
+        auto hashes = tuple_hash_test_all(t.data(), get_join_column_count());
+        for (int i = 0; i < hash_types; i++) {
+            // Reduce modulo rank_n, not rank_n - 1: tuple_cnts[i] has one counter per target
+            // rank, and rank_n - 1 never used the last one and divided by zero for rank_n == 1.
+            tuple_cnts[i][hashes[i] % rank_n]++;
+        }
+    }
+    // Sum every (hash function, target rank) counter over all processes; only rank 0 prints.
+    for (int i = 0; i < hash_types; i++) {
+        for (u64 rk = 0; rk < rank_n; rk++) {
+            u64 local_cnt = tuple_cnts[i][rk];
+            u64 global_cnt = local_cnt;
+            // The reduction root is rank 0, so global_cnt holds the global sum only there.
+            MPI_Reduce(&local_cnt, &global_cnt, 1, MPI_UINT64_T, MPI_SUM, 0, mcomm.get_comm());
+            if (mcomm.get_rank() == 0) {
+                std::cout << hash_names[i] << ", " << rk << ", " << global_cnt << std::endl;
+            }
+        }
+    }
 }
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index d80d3b08..1be454a2 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -97,6 +97,8 @@ class relation
 
 public:
 
+    bool balance_flag = false;
+
     /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL);
     /// 2: arity (Internally one extra id (intern id) column is added to every relation)
     /// true: arity == join column count
@@ -189,7 +191,13 @@ class relation
 
 
     void set_full_element_count(int val)   {full_element_count = val;}
-    int get_full_element_count()    {return full[mcomm.get_rank()].count();}
+    int get_full_element_count()    {
+        // Adds up the size of every bucket of `full`; the u64 sum is narrowed to int on return.
+        u64 total = 0;
+        for (int bucket = 0; bucket < get_bucket_count(); ++bucket)
+            total += full[bucket].size();
+        return total;
+    }
     u32** get_full_sub_bucket_element_count()   {return full_sub_bucket_element_count;}
     u32 get_global_full_element_count();
 
@@ -215,7 +223,13 @@ class relation
 #endif
 
     void set_delta_element_count(int val)   {delta_element_count = val;}
-    int get_delta_element_count()   {return delta[mcomm.get_rank()].count();}
+    int get_delta_element_count()   {
+        // Adds up the size of every bucket of `delta`; the u64 sum is narrowed to int on return.
+        u64 total = 0;
+        for (int bucket = 0; bucket < get_bucket_count(); ++bucket)
+            total += delta[bucket].size();
+        return total;
+    }
     u32** get_delta_sub_bucket_element_count()  {return delta_sub_bucket_element_count;}
     u32 get_global_delta_element_count();
 
@@ -296,4 +310,5 @@ class relation
     void enable_initialization() { init_flag = true; }
     bool need_init_huh() { return init_flag; }
 
+    void test_calc_hash_rank(u64 rank_n);
 };
diff --git a/backend/src/relation/relation_load_balancer.cpp b/backend/src/relation/relation_load_balancer.cpp
index e278ec36..5918b658 100644
--- a/backend/src/relation/relation_load_balancer.cpp
+++ b/backend/src/relation/relation_load_balancer.cpp
@@ -6,6 +6,7 @@
 
 
 #include "../parallel_RA_inc.h"
+#include <iostream>
 
 
 
@@ -270,8 +271,17 @@ bool relation::load_balance_split_full_and_delta(float rf)
     MPI_Allreduce(&min_sub_bucket_size, &global_min, 1, MPI_INT, MPI_MIN, mcomm.get_local_comm());
     MPI_Allreduce(&total_sub_bucket_size, &global_total_sub_bucket_size, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm());
     delete[] max_sub_bucket_size;
+    if (mcomm.get_rank() == 0) {
+    std::cout << "Max sub buckets ";
+    for (u32 i = 0; i < buckets; i++) {
+        std::cout << max_sub_bucket_size[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "Total Sub buckect size : " << total_sub_bucket_size  << std::endl; 
+    }
 
     average_sub_bucket_size = global_total_sub_bucket_size / total_sub_bucket_count;
+    // std::cout << "Total Sub buckect size : " << global_total_sub_bucket_size  << std::endl; 
 
     u32 global_new_sub_bucket[buckets];
     memcpy(global_new_sub_bucket, sub_bucket_per_bucket_count, buckets * sizeof(u32));
diff --git a/backend/tests/cc/compiled_pre/CMakeLists.txt b/backend/tests/cc/compiled_pre/CMakeLists.txt
index 36be513b..79276742 100644
--- a/backend/tests/cc/compiled_pre/CMakeLists.txt
+++ b/backend/tests/cc/compiled_pre/CMakeLists.txt
@@ -18,7 +18,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla
 # set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
 set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
-file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
 file (GLOB source_files_cc "${PROJECT_SOURCE_DIR}/cc.cpp")
 
 ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 8964510d..43c04d90 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -436,6 +436,15 @@ int main(int argc, char **argv) {
           ".edge.2.table",
       FULL);
 
+  rel__edge__2__1->balance_flag = true;
+
+  // relation *rel__edge__2__1__2 = new relation(
+  //     2, true, 2, get_tag_for_rel("edge", "1__2"),
+  //     std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+  //     slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+  //         ".edge.2.table",
+  //     FULL);
+
   relation *rel__cc__2__1 = new  relation(
     1, true, 2, get_tag_for_rel("cc", "1"),
     std::to_string(get_tag_for_rel("cc", "1")) + ".cc.2.table",
@@ -480,6 +489,7 @@ int main(int argc, char **argv) {
   // ));
 
   RAM *cc_init_scc = new RAM(false, 1);
+  // cc_init_scc->add_relation(rel__edge__2__1__2, false);
   cc_init_scc->add_relation(rel__edge__2__1, false);
   cc_init_scc->add_relation(rel__cc__2__1, true);
   cc_init_scc->add_relation(rel__node__1__1, true);
@@ -548,6 +558,7 @@ int main(int argc, char **argv) {
   cc_lie->add_relation(rel__cc__2__1);
   cc_lie->add_relation(rel__cc_final__2__1);
   cc_lie->add_relation(rel__cc_represent__1__1);
+  // cc_lie->add_relation(rel__edge__2__1__2);
 
   // cc_lie->add_scc(to_undirected_scc);
   cc_lie->add_scc(cc_init_scc);
@@ -575,6 +586,10 @@ int main(int argc, char **argv) {
   }
   cc_lie->print_all_relation_size(); // Continuously print relation sizes
 
+
+  // rel__edge__2__1__2->test_calc_hash_rank(4096);
+  // rel__edge__2__1->test_calc_hash_rank(4096);
+  // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();
  // rel__cc__2__1->print();
diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp
index 25de07b6..9f88afb5 100644
--- a/backend/utility/tsv_to_bin.cpp
+++ b/backend/utility/tsv_to_bin.cpp
@@ -56,25 +56,114 @@ unsigned buckets;
 string string_intern_file_path;
 string mode = "slog";
 
-// hash a tuple n values long using our hashing algorithm
-u64 hash_tuple(u64 *fact, unsigned num)
+/// Based on the FNV-1a hash function
+#include <cstdint>
+// #include <endian.h>
+#define MURMUR_SEED 7917
+
+///FNV-1a
+uint64_t fnv1a(const uint64_t* start_ptr, uint64_t prefix_len)  // hashes prefix_len 64-bit words starting at start_ptr
 {
-	u64 prime = 1099511628211ull;
-	u64 hash = 14695981039346656037ull;
-	u64 chunk, h0;
-	for (unsigned i = 0; i < num; i++)
-	{
-		chunk = fact[i];
-		h0 = hash ^ (chunk & 255);
-		hash = h0 * prime;
-		for (unsigned j = 0; j < 7; j++)
-		{
-			chunk = chunk >> 8;
-			h0 = hash ^ (chunk & 255);
-			hash = h0 * prime;
-		}
-	}
-	return hash;
+    const uint64_t base = 14695981039346656037ULL;  // starting value of the running hash
+    const uint64_t prime = 1099511628211ULL;  // multiplier applied after every byte is folded in
+
+    uint64_t hash = base;
+    for (uint64_t i = 0; i < prefix_len; ++i)
+    {
+        uint64_t chunk = start_ptr[i];
+        hash ^= chunk & 255ULL;  // fold in the lowest byte of the word first
+        hash *= prime;
+        for (char j = 0; j < 7; ++j)  // then the remaining seven bytes, from low to high
+        {
+            chunk = chunk >> 8;
+            hash ^= chunk & 255ULL;
+            hash *= prime;
+        }
+    }
+    return hash;
+}
+
+
+
+// murmurhash
+#if defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else	// defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+static inline uint64_t getblock ( const uint64_t * p )  // loads the 64-bit block that p points at
+{
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  return *p;  // little-endian build: the stored word is returned as-is
+#else
+  const uint8_t *c = (const uint8_t *)p;  // otherwise rebuild the word byte by byte, c[0] being the least significant
+  return (uint64_t)c[0] |
+	 (uint64_t)c[1] <<  8 |
+	 (uint64_t)c[2] << 16 |
+	 (uint64_t)c[3] << 24 |
+	 (uint64_t)c[4] << 32 |
+	 (uint64_t)c[5] << 40 |
+	 (uint64_t)c[6] << 48 |
+	 (uint64_t)c[7] << 56;
+#endif
+}
+
+uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed )  // len is a byte count: len/8 whole blocks, then len&7 tail bytes
+{
+  const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
+  const int r = 47;
+
+  uint64_t h = seed ^ (len * m);  // the running hash starts from the seed mixed with the length
+
+  const uint64_t * data = (const uint64_t *)key;
+  const uint64_t * end = data + (len/8);  // one past the last complete 8-byte block
+
+  while(data != end)  // mix every complete block into h
+  {
+    uint64_t k = getblock(data++);
+
+    k *= m; 
+    k ^= k >> r; 
+    k *= m; 
+    
+    h ^= k;
+    h *= m; 
+  }
+
+  const unsigned char * data2 = (const unsigned char*)data;  // data now points at the leftover tail bytes
+
+  switch(len & 7)  // cases fall through on purpose: each one adds its tail byte, then continues downward
+  {
+  case 7: h ^= uint64_t(data2[6]) << 48;
+  case 6: h ^= uint64_t(data2[5]) << 40;
+  case 5: h ^= uint64_t(data2[4]) << 32;
+  case 4: h ^= uint64_t(data2[3]) << 24;
+  case 3: h ^= uint64_t(data2[2]) << 16;
+  case 2: h ^= uint64_t(data2[1]) << 8;
+  case 1: h ^= uint64_t(data2[0]);
+          h *= m;
+  };
+ 
+  h ^= h >> r;  // final mixing rounds applied to the accumulated state
+  h *= m;
+  h ^= h >> r;
+
+  return h;
+} 
+
+
+
+uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len)  // prefix_len = number of 64-bit columns to hash
+{
+    // MurmurHash64A expects a length in bytes, so scale the column count by the word size (alternative: fnv1a(start_ptr, prefix_len)).
+    return MurmurHash64A(start_ptr, (int)(prefix_len * sizeof(uint64_t)), MURMUR_SEED);
 }
 
 u32 string_hash(const std::string& str) {
@@ -284,7 +373,7 @@ void file_to_slog(char *input_file, char *output_file,
 			col_count++;
 		}
 
-		u64 t_hash = hash_tuple(tuple_buffer, arity);
+		u64 t_hash = tuple_hash(tuple_buffer, arity);
 		if (tuple_hash_set.find(t_hash) == tuple_hash_set.end()){
 			tuple_hash_set.insert(t_hash);
 			u64 tid = rel_tag;
diff --git a/cluster.yaml b/cluster.yaml
index 97fd09f0..7fbe091b 100644
--- a/cluster.yaml
+++ b/cluster.yaml
@@ -42,7 +42,7 @@ Scheduling:
       MinCount: 0
       MaxCount: 4
       Efa:
-        Enabled: truev
+        Enabled: true
     Networking:
       PlacementGroup:
         Enabled: true

From 61a4d7f6c8ad7e674b150af35388bb7133ea30f6 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Thu, 5 Jan 2023 21:08:03 -0500
Subject: [PATCH 27/36] add manual sub rank split

fix insert check/sub rank

split dyn

try opt
---
 backend/src/RA/parallel_join.cpp              | 112 ++++++++++--------
 backend/src/RA/parallel_join.h                |   2 +
 backend/src/RAM/RA_tasks.cpp                  |   4 +
 backend/src/RAM/RA_tasks.h                    |   2 +-
 backend/src/lie/lie.cpp                       |   2 +
 .../src/relation/balanced_hash_relation.cpp   |  22 ++--
 backend/src/relation/balanced_hash_relation.h |   3 +-
 backend/src/relation/shmap_relation.h         |   4 +-
 backend/tests/cc/compiled_pre/cc.cpp          |   4 +-
 9 files changed, 88 insertions(+), 67 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 95ba9660..61630f4d 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -20,6 +20,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                                shmap_relation *input1, u32 i1_size, int input1_buffer_width,
                                std::vector<int> reorder_map_array,
                                relation* output,
+                               relation* input0_rel,
+                               relation* input1_rel,
                                all_to_allv_buffer& join_buffer,
                                int counter,
                                int join_column_count,
@@ -60,19 +62,21 @@ bool parallel_join::local_join(int threshold, int* offset,
             }
 
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-
+            
             auto before_actual_join = MPI_Wtime();
-            input1[bucket_id].as_all_to_allv_left_join_buffer(
-                prefix, join_buffer,
-                input0_buffer + k1,input0_buffer_width,
-                input1_buffer_width, counter,
-                buckets, output_sub_bucket_count,
-                output_sub_bucket_rank, reorder_map_array,
-                join_column_count, deduplicate,
-                &local_join_count, global_join_duplicates,
-                global_join_inserts, output->get_join_column_count(),
-                output->get_is_canonical(),
-                generator_mode, generator_func);
+            for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                input1[input1_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_left_join_buffer(
+                    prefix, join_buffer,
+                    input0_buffer + k1,input0_buffer_width,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts, output->get_join_column_count(),
+                    output->get_is_canonical(),
+                    generator_mode, generator_func);
+            }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
@@ -112,20 +116,22 @@ bool parallel_join::local_join(int threshold, int* offset,
                 } else {
                     if (input_ts.size() != 0) {
                         auto before_actual_join = MPI_Wtime();
-                        u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-                        input1[bucket_id].as_all_to_allv_right_join_buffer(
-                            std::vector<u64>(prev_non_dependent_columns.begin(),
-                                             prev_non_dependent_columns.begin()+join_column_count),
-                            join_buffer,
-                            input_ts,
-                            input1_buffer_width, counter,
-                            buckets, output_sub_bucket_count,
-                            output_sub_bucket_rank, reorder_map_array,
-                            join_column_count, deduplicate,
-                            &local_join_count, global_join_duplicates,
-                            global_join_inserts,
-                            output->get_join_column_count(),output->get_is_canonical(),
-                            generator_mode, generator_func);
+                        // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
+                        for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                            input1[bucket_id].as_all_to_allv_right_join_buffer(
+                                std::vector<u64>(prev_non_dependent_columns.begin(),
+                                                prev_non_dependent_columns.begin()+join_column_count),
+                                join_buffer,
+                                input_ts,
+                                input1_buffer_width, counter,
+                                buckets, output_sub_bucket_count,
+                                output_sub_bucket_rank, reorder_map_array,
+                                join_column_count, deduplicate,
+                                &local_join_count, global_join_duplicates,
+                                global_join_inserts,
+                                output->get_join_column_count(),output->get_is_canonical(),
+                                generator_mode, generator_func);
+                        }
                         auto after_actual_join = MPI_Wtime();
                         join_time_total += after_actual_join - before_actual_join;
                         input_ts.clear();
@@ -137,19 +143,21 @@ bool parallel_join::local_join(int threshold, int* offset,
             if (input_ts.size() != 0) {
                 u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
                 auto before_actual_join = MPI_Wtime();
-                input1[bucket_id].as_all_to_allv_right_join_buffer(
-                    std::vector<u64>(prev_non_dependent_columns.begin(),
-                                    prev_non_dependent_columns.begin()+join_column_count),
-                    join_buffer,
-                    input_ts,
-                    input1_buffer_width, counter,
-                    buckets, output_sub_bucket_count,
-                    output_sub_bucket_rank, reorder_map_array,
-                    join_column_count, deduplicate,
-                    &local_join_count, global_join_duplicates,
-                    global_join_inserts,
-                    output->get_join_column_count(),output->get_is_canonical(),
-                    generator_mode, generator_func);
+                for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                    input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer(
+                        std::vector<u64>(prev_non_dependent_columns.begin(),
+                                        prev_non_dependent_columns.begin()+join_column_count),
+                        join_buffer,
+                        input_ts,
+                        input1_buffer_width, counter,
+                        buckets, output_sub_bucket_count,
+                        output_sub_bucket_rank, reorder_map_array,
+                        join_column_count, deduplicate,
+                        &local_join_count, global_join_duplicates,
+                        global_join_inserts,
+                        output->get_join_column_count(),output->get_is_canonical(),
+                        generator_mode, generator_func);
+                }
                 auto after_actual_join = MPI_Wtime();
                 join_time_total += after_actual_join - before_actual_join;
                 input_ts.clear();
@@ -166,18 +174,20 @@ bool parallel_join::local_join(int threshold, int* offset,
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
             auto before_actual_join = MPI_Wtime();
-            input1[bucket_id].as_all_to_allv_right_join_buffer(
-                prefix, join_buffer,
-                // input0_buffer + k1, input0_buffer_width,
-                input_ts,
-                input1_buffer_width, counter,
-                buckets, output_sub_bucket_count,
-                output_sub_bucket_rank, reorder_map_array,
-                join_column_count, deduplicate,
-                &local_join_count, global_join_duplicates,
-                global_join_inserts,
-                output->get_join_column_count(),output->get_is_canonical(),
-                generator_mode, generator_func);
+            for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer(
+                    prefix, join_buffer,
+                    // input0_buffer + k1, input0_buffer_width,
+                    input_ts,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts,
+                    output->get_join_column_count(),output->get_is_canonical(),
+                    generator_mode, generator_func);
+            }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h
index c30120c5..25aafeae 100644
--- a/backend/src/RA/parallel_join.h
+++ b/backend/src/RA/parallel_join.h
@@ -95,6 +95,8 @@ class parallel_join: public parallel_RA {
                     shmap_relation *input1, u32 i1_size, int input1_buffer_width,
                     std::vector<int> reorder_map_array,
                     relation* output,
+                    relation* input0_rel,
+                    relation* input1_rel,
                     all_to_allv_buffer& join_buffer,
                     int counter,
                     int join_column_count,
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 3b62572b..546efd21 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -616,6 +616,8 @@ bool RAM::local_compute(int* offset)
                                                                          input1_trees, input1_size, input1->get_arity()+1,
                                                                          reorder_map_array,
                                                                          output_relation,
+                                                                         input0,
+                                                                         input1,
                                                                          compute_buffer,
                                                                          counter,
                                                                          join_column_count,
@@ -633,6 +635,8 @@ bool RAM::local_compute(int* offset)
                                                                          input0_trees, input0_size, input0->get_arity()+1,
                                                                          reorder_map_array,
                                                                          output_relation,
+                                                                         input0,
+                                                                         input1,
                                                                          compute_buffer,
                                                                          counter,
                                                                          join_column_count,
diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h
index be90384a..3f9718c9 100644
--- a/backend/src/RAM/RA_tasks.h
+++ b/backend/src/RAM/RA_tasks.h
@@ -55,7 +55,7 @@ class RAM
     u32 loop_count_tracker;
 
 public:
-
+    bool balance_flag = false;
     double all_to_all_time = 0;
 
     ~RAM();
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 88d85b4e..6e4de91a 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -433,7 +433,9 @@ bool LIE::execute ()
 #endif
 
         // load balance before a SCC executed
+        if (executable_task->balance_flag) {
         executable_task->load_balance();
+        }
 
         if (restart_flag == false)
         {
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index da398ffa..88af9322 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -416,11 +416,13 @@ void relation::print()
             full[i].as_vector_buffer_recursive(&(vb_full[i]), prefix);
 
             if (vb_full[i].size != 0)
-            	std::cout << get_debug_id() << " " << mcomm.get_rank() << " FULL Rows " << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl;
+            	std::cout << get_debug_id() << " " << mcomm.get_rank() << " " << i << " FULL Rows "
+                          << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1
+                          << std::endl;
             for (u32 j=0; j < vb_full[i].size/sizeof(u64); j = j + arity+1)
             {
                 if (j % (arity+1) == 0)
-                    std::cout << "F [" << j/(arity+1) << "] ";
+                    std::cout << "F [" << mcomm.get_rank() << " " << i << " " << j/(arity+1) << "] ";
                 for (u32 k = 0; k < arity+1; k++)
                 {
                     u64 temp;
@@ -709,7 +711,7 @@ void relation::initialize_relation(mpi_comm& mcomm, std::map<u64, u64>& intern_m
 
     u32 buckets = mcomm.get_local_nprocs();
 
-    default_sub_bucket_per_bucket_count = 1;
+    // default_sub_bucket_per_bucket_count = 1;
     int rank = mcomm.get_local_rank();
     int nprocs = mcomm.get_local_nprocs();
 
@@ -1415,17 +1417,17 @@ void relation::local_insert_in_delta()
 }
 
 bool relation::check_dependent_value_insert_avalible(const std::vector<u64>& tuple) {
+    // bool res = true;
+    // for (int i = 0 ; i < mcomm.get_nprocs(); i ++) {
+    //     res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple);
+    // }
+    // return res;
     // uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count();
     // if (bucket_id != mcomm.get_rank()) {
     //     std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; 
     // }
-    // int bucket_id = mcomm.get_rank();
-    bool res = true;
-    for (int i = 0 ; i < mcomm.get_nprocs(); i ++) {
-        res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple);
-    }
-    // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
-    return res;
+    int bucket_id = mcomm.get_rank();  // only the bucket indexed by this process's rank is consulted now
+    return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;  // delta is asked first; full only if delta accepts
 }
 
 void relation::test_calc_hash_rank(u64 rank_n) {
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index 1be454a2..4b9f5215 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -71,7 +71,6 @@ class relation
     u32 **delta_sub_bucket_element_count;
     u32 *delta_bucket_element_count;
 
-    u32 default_sub_bucket_per_bucket_count;    /// 1
     u32 *sub_bucket_per_bucket_count;           /// sub_bucket_per_bucket_count[i] holds the total number of sub-buckets at bucket index i
     u32** sub_bucket_rank;                      /// target rank of a subbucket
 
@@ -96,7 +95,7 @@ class relation
     bool init_flag = true;
 
 public:
-
+    u32 default_sub_bucket_per_bucket_count = 1;    /// 1
     bool balance_flag = false;
 
     /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL);
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index 09ba11ac..47ff2eb8 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -54,7 +54,7 @@ struct shmap_relation {
 
     std::size_t size() const { return ind.size(); }
 
-    bool contains(const t_tuple &t) const {
+    bool contains(const t_tuple &t) {  // NOTE(review): const qualifier dropped by this patch — confirm no caller needs it on a const shmap_relation
         auto res = ind.find(t);
         return res != ind.end();
     }
@@ -68,7 +68,7 @@ struct shmap_relation {
     // I keep this weird  name from souffle, actually join helper function
     // in souffle its index selection function, in slog we don't need select
     // so only one version of this function
-    std::pair<iterator, iterator> lowerUpperRange(const t_tuple &lower, const t_tuple &upper) const
+    std::pair<iterator, iterator> lowerUpperRange(const t_tuple &lower, const t_tuple &upper)
     {
         auto lower_it = ind.lower_bound(lower);
         auto upper_it = ind.upper_bound(upper);
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 43c04d90..94c13b26 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -437,6 +437,7 @@ int main(int argc, char **argv) {
       FULL);
 
   rel__edge__2__1->balance_flag = true;
+  // rel__edge__2__1->default_sub_bucket_per_bucket_count = 2;
 
   // relation *rel__edge__2__1__2 = new relation(
   //     2, true, 2, get_tag_for_rel("edge", "1__2"),
@@ -489,6 +490,7 @@ int main(int argc, char **argv) {
   // ));
 
   RAM *cc_init_scc = new RAM(false, 1);
+  cc_init_scc->balance_flag = true;
   // cc_init_scc->add_relation(rel__edge__2__1__2, false);
   cc_init_scc->add_relation(rel__edge__2__1, false);
   cc_init_scc->add_relation(rel__cc__2__1, true);
@@ -592,7 +594,7 @@ int main(int argc, char **argv) {
   // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();
- // rel__cc__2__1->print();
+//  rel__cc__2__1->print();
  // rel__cc_final__2__1->print();
   // rel__cc_represent__1__1->print();
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

From 0370c184e85837b2d5626f6b3c5a173fceecd7d7 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Tue, 17 Jan 2023 11:55:46 -0500
Subject: [PATCH 28/36] fix bucket in join

---
 backend/src/RA/parallel_join.cpp     | 24 ++++++++++++------------
 backend/tests/cc/compiled_pre/cc.cpp |  6 +++---
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 61630f4d..18c7acf7 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -64,8 +64,8 @@ bool parallel_join::local_join(int threshold, int* offset,
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
             
             auto before_actual_join = MPI_Wtime();
-            for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                input1[input1_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_left_join_buffer(
+            // for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                input1[bucket_id].as_all_to_allv_left_join_buffer(
                     prefix, join_buffer,
                     input0_buffer + k1,input0_buffer_width,
                     input1_buffer_width, counter,
@@ -76,7 +76,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                     global_join_inserts, output->get_join_column_count(),
                     output->get_is_canonical(),
                     generator_mode, generator_func);
-            }
+            // }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
@@ -116,8 +116,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 } else {
                     if (input_ts.size() != 0) {
                         auto before_actual_join = MPI_Wtime();
-                        // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-                        for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                        u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
+                        // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
                             input1[bucket_id].as_all_to_allv_right_join_buffer(
                                 std::vector<u64>(prev_non_dependent_columns.begin(),
                                                 prev_non_dependent_columns.begin()+join_column_count),
@@ -131,7 +131,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                                 global_join_inserts,
                                 output->get_join_column_count(),output->get_is_canonical(),
                                 generator_mode, generator_func);
-                        }
+                        // }
                         auto after_actual_join = MPI_Wtime();
                         join_time_total += after_actual_join - before_actual_join;
                         input_ts.clear();
@@ -143,8 +143,8 @@ bool parallel_join::local_join(int threshold, int* offset,
             if (input_ts.size() != 0) {
                 u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
                 auto before_actual_join = MPI_Wtime();
-                for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                    input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer(
+                // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                    input1[bucket_id].as_all_to_allv_right_join_buffer(
                         std::vector<u64>(prev_non_dependent_columns.begin(),
                                         prev_non_dependent_columns.begin()+join_column_count),
                         join_buffer,
@@ -157,7 +157,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                         global_join_inserts,
                         output->get_join_column_count(),output->get_is_canonical(),
                         generator_mode, generator_func);
-                }
+                // }
                 auto after_actual_join = MPI_Wtime();
                 join_time_total += after_actual_join - before_actual_join;
                 input_ts.clear();
@@ -174,8 +174,8 @@ bool parallel_join::local_join(int threshold, int* offset,
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
             auto before_actual_join = MPI_Wtime();
-            for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer(
+            // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
+                input1[bucket_id].as_all_to_allv_right_join_buffer(
                     prefix, join_buffer,
                     // input0_buffer + k1, input0_buffer_width,
                     input_ts,
@@ -187,7 +187,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                     global_join_inserts,
                     output->get_join_column_count(),output->get_is_canonical(),
                     generator_mode, generator_func);
-            }
+            // }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 94c13b26..1da0f008 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -436,8 +436,8 @@ int main(int argc, char **argv) {
           ".edge.2.table",
       FULL);
 
-  rel__edge__2__1->balance_flag = true;
-  // rel__edge__2__1->default_sub_bucket_per_bucket_count = 2;
+  // rel__edge__2__1->balance_flag = true;
+  rel__edge__2__1->default_sub_bucket_per_bucket_count = 2;
 
   // relation *rel__edge__2__1__2 = new relation(
   //     2, true, 2, get_tag_for_rel("edge", "1__2"),
@@ -490,7 +490,7 @@ int main(int argc, char **argv) {
   // ));
 
   RAM *cc_init_scc = new RAM(false, 1);
-  cc_init_scc->balance_flag = true;
+  // cc_init_scc->balance_flag = true;
   // cc_init_scc->add_relation(rel__edge__2__1__2, false);
   cc_init_scc->add_relation(rel__edge__2__1, false);
   cc_init_scc->add_relation(rel__cc__2__1, true);

From 62b721c2be9035b3b7f219f3d44b29d03de2c198 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Tue, 17 Jan 2023 15:36:50 -0500
Subject: [PATCH 29/36] add sssp opt

---
 backend/tests/sssp/compiled_pre/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/tests/sssp/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt
index 89ee3ea4..eac399d0 100644
--- a/backend/tests/sssp/compiled_pre/CMakeLists.txt
+++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt
@@ -18,7 +18,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla
 # set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
 set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
-file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
 file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp_opt.cpp")
 
 ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")

From 94043c5137c28c24963457aa9a3a3f166ee21ec1 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Thu, 26 Jan 2023 13:55:44 -0500
Subject: [PATCH 30/36] try changing insert

---
 backend/src/RAM/RA_tasks.cpp                  | 10 +++-
 .../src/relation/balanced_hash_relation.cpp   | 18 +++++--
 backend/src/relation/shmap_relation.h         |  1 +
 backend/src/relation/shmap_relation_exp.cpp   |  4 ++
 backend/tests/cc/compiled_pre/cc.cpp          | 50 ++++++++++---------
 .../pagerank/compiled_pre/CMakeLists.txt      |  3 +-
 .../pagerank/compiled_pre/pagerank_full.cpp   | 26 ++++++----
 backend/utility/tsv_to_bin.cpp                |  4 +-
 8 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 546efd21..c57ad061 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -242,6 +242,7 @@ u64 RAM::intra_bucket_comm_execute()
                 input1_trees = input1->get_delta();
                 input1_size = input1->get_delta_element_count();
             }
+            double before_reduce_time = MPI_Wtime();
             int join_direction = LEFT;
             int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
             int global_join_direction_count = local_join_direction_count;
@@ -249,6 +250,10 @@ u64 RAM::intra_bucket_comm_execute()
             if (global_join_direction_count > mcomm.get_nprocs() / 2) {
                 join_direction = RIGHT;
             }
+            double after_reduce_time = MPI_Wtime();
+            if (mcomm.get_rank() == 0) {
+                std::cout << "Reduced time : " << after_reduce_time - before_reduce_time << std::endl;
+            }
 
             if (join_direction == LEFT) {
                 intra_bucket_comm(get_bucket_count(),
@@ -851,7 +856,8 @@ void RAM::local_insert_in_newt_comm_compaction(std::map<u64, u64>& intern_map)
                     // temporary index column just to match size of column
                     tt.push_back(0);
                     auto _before_i = MPI_Wtime();
-                    insert_flag = output->check_dependent_value_insert_avalible(tt);
+                    // insert_flag = output->chmeck_dependent_value_insert_avalible(tt);
+                    insert_flag = true;
                     auto _after_i = MPI_Wtime();
                     check_time += _after_i - _before_i;
                 } else {
@@ -1039,8 +1045,8 @@ void RAM::local_insert_in_full()
     for (u32 i=0; i < ram_relation_count; i++)
     {
         relation* current_r = ram_relations[i];
-        current_r->insert_delta_in_full();
         current_r->local_insert_in_delta();
+        current_r->insert_delta_in_full();
     }
     return;
 }
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index 88af9322..ea9671e3 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -8,6 +8,7 @@
 #include "../parallel_RA_inc.h"
 #include "balanced_hash_relation.h"
 #include "mpi.h"
+#include "shmap_relation.h"
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -1276,8 +1277,7 @@ bool relation::insert_in_full(u64* t)
 #endif
     // std::cout << "inserting full for " << intern_tag << std::endl;
 
-    // TODO: use normal insert here!
-    if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true)
+    if (full[bucket_id].insert_tuple_from_array(t, arity+1) != INSERT_FAIL)
     // std::vector<u64> tp(t, t+arity+1);
     // if (full[bucket_id].insert(tp))
     {
@@ -1315,24 +1315,31 @@ int relation::insert_delta_in_full()
             //     if (insert_in_full ( (u64*)( (input_buffer[i].buffer) + (j*sizeof(u64)) )) == true)
             //         insert_success++;
             // }
+            std::vector<std::vector<u64>> tuples_to_del;
             for(auto it=delta[i].begin(); it != delta[i].end(); ++it)
             {
                 auto tuple_d = *it;
-                // std::cout << "inserting into delta ";
+                // std::cout << "inserting into full ";
                 // for (auto v: tuple_d) {
                 //     std::cout << v << " ";
                 // }
                 // std::cout << std::endl;
                 if (insert_in_full(tuple_d.data()) == true)
                     insert_success++;
+                else {
+                    tuples_to_del.push_back(tuple_d);
+                }
+            }
+            for (auto t: tuples_to_del) {
+                delta[i].delete_tuple(t);
             }
-            delta[i].remove_tuple();
+            // delta[i].remove_tuple();
 
             // input_buffer[i].vector_buffer_free();
         }
     }
 
-    set_delta_element_count(0);
+    // set_delta_element_count(0);
     // delete[] input_buffer;
 
     return insert_success;
@@ -1396,6 +1403,7 @@ void relation::local_insert_in_delta()
     //         newt_element_count = 0;
     //     }
     // } else {
+
         delete[] delta;
         delta = newt;
         delta_element_count = newt_element_count;
diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h
index 47ff2eb8..ea7af922 100644
--- a/backend/src/relation/shmap_relation.h
+++ b/backend/src/relation/shmap_relation.h
@@ -126,6 +126,7 @@ struct shmap_relation {
     void remove_tuple();
     bool find_tuple_from_array(u64* t, int arity);
     bool check_dependent_insertion(const std::vector<u64> &v);
+    void delete_tuple(std::vector<u64>& t);
 
     void as_vector_buffer_recursive(vector_buffer* vb, std::vector<u64> prefix);
 
diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp
index 19decd31..8e1ff0f6 100644
--- a/backend/src/relation/shmap_relation_exp.cpp
+++ b/backend/src/relation/shmap_relation_exp.cpp
@@ -20,6 +20,10 @@
 #include <ostream>
 #include <vector>
 
+void shmap_relation::delete_tuple(std::vector<u64>& t) {
+    ind.erase(t);
+}
+
 shmap_relation::shmap_relation(int arity, bool id_flag)
 {
     this->arity = arity;
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 1da0f008..65f8e0c8 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -336,11 +336,13 @@ agg_minimum_local(std::pair<shmap_relation::iterator, shmap_relation::iterator>
   local_agg_res_t min_res = std::numeric_limits<u32>::max();
   for (auto it = joined_range.first; it != joined_range.second; ++it) {
     auto tuple = (*it);
-    auto current_v = tuple[tuple.size() - 1];
+    auto current_v = tuple[tuple.size() - 2];
+    // std::cout << tuple[0] << " " << tuple[1] << " " << tuple.size() << std::endl;
     if (current_v < min_res) {
       min_res = current_v;
     }
   }
+  // std::cout << "Min : " << min_res << std::endl;
   return min_res;
 }
 
@@ -476,18 +478,20 @@ int main(int argc, char **argv) {
   relation *rel__cc_represent__1__1 = new relation(
     1, true, 1, get_tag_for_rel("cc_represent", "1"),
     std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table",
+    slog_input_dir + "/" + std::to_string(get_tag_for_rel("cc_represent", "1")) +
+          ".cc_represent.1.table",
     FULL);
 
-  // RAM *to_undirected_scc = new RAM(false, 0);
-  // to_undirected_scc->add_relation(rel__edge__2__1, false);
-  // to_undirected_scc->add_rule(new parallel_copy_generate(
-  //   rel__edge__2__1, rel__edge__2__1, FULL,
-  //   [](const u64 *const data, u64 *const output) -> int {
-  //     output[0] = data[1];
-  //     output[1] = data[0];
-  //     return 1;
-  //   }
-  // ));
+  RAM *to_undirected_scc = new RAM(false, 0);
+  to_undirected_scc->add_relation(rel__edge__2__1, false);
+  to_undirected_scc->add_rule(new parallel_copy_generate(
+    rel__edge__2__1, rel__edge__2__1, FULL,
+    [](const u64 *const data, u64 *const output) -> int {
+      output[0] = data[1];
+      output[1] = data[0];
+      return 1;
+    }
+  ));
 
   RAM *cc_init_scc = new RAM(false, 1);
   // cc_init_scc->balance_flag = true;
@@ -546,12 +550,12 @@ int main(int argc, char **argv) {
     agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce,
     nullptr, {0,2}));
   
-  // RAM* cc_rep_scc = new RAM(false, 3);
-  // cc_rep_scc->add_relation(rel__cc_final__2__1, false);
-  // cc_rep_scc->add_relation(rel__cc_represent__1__1, true);
-  // cc_rep_scc->add_rule(new parallel_copy(
-  //   rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1}
-  // ));
+  RAM* cc_rep_scc = new RAM(false, 3);
+  cc_rep_scc->add_relation(rel__cc_final__2__1, false);
+  cc_rep_scc->add_relation(rel__cc_represent__1__1, true);
+  cc_rep_scc->add_rule(new parallel_copy(
+    rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1}
+  ));
 
 
   LIE *cc_lie = new LIE();
@@ -562,16 +566,16 @@ int main(int argc, char **argv) {
   cc_lie->add_relation(rel__cc_represent__1__1);
   // cc_lie->add_relation(rel__edge__2__1__2);
 
-  // cc_lie->add_scc(to_undirected_scc);
+  cc_lie->add_scc(to_undirected_scc);
   cc_lie->add_scc(cc_init_scc);
   cc_lie->add_scc(cc_compute_scc);
   cc_lie->add_scc(cc_agg_scc);
-  // cc_lie->add_scc(cc_rep_scc);
+  cc_lie->add_scc(cc_rep_scc);
 
-  // cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc);
+  cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc);
   cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc);
   cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc);
-  // cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc);
+  cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc);
 
   cc_lie->enable_all_to_all_dump();
   cc_lie->set_output_dir(slog_output_dir); // Write to this directory
@@ -595,8 +599,8 @@ int main(int argc, char **argv) {
   // rel__node__1__1->print();
   // rel__edge__2__1->print();
 //  rel__cc__2__1->print();
- // rel__cc_final__2__1->print();
-  // rel__cc_represent__1__1->print();
+ rel__cc_final__2__1->print();
+  rel__cc_represent__1__1->print();
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
   mcomm.destroy();
diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
index 38953a06..3b12bed5 100644
--- a/backend/tests/pagerank/compiled_pre/CMakeLists.txt
+++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt
@@ -18,7 +18,8 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla
 # set (base_dir "${PROJECT_SOURCE_DIR}/../backend")
 set (source_dir "${PROJECT_SOURCE_DIR}/../../../src")
 
-file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+# file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
+file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp")
 file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank_full.cpp")
 
 ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}")
diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
index 7513aa3c..e2087ca8 100644
--- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
+++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
+#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h"
 #include "mpi.h"
 
 // #include <bit>
@@ -373,7 +373,7 @@ void load_input_relation(std::string db_dir) {
   for (const auto &entry : std::filesystem::directory_iterator(db_dir)) {
     // check if ends with table
     std::string filename_ss = entry.path().filename().string();
-    std::cout << "input database has file " << filename_ss << std::endl;
+    // std::cout << "input database has file " << filename_ss << std::endl;
     std::string suffix = ".table";
     int ft = filename_ss.size() - suffix.size();
     if (ft < 0)
@@ -396,8 +396,8 @@ void load_input_relation(std::string db_dir) {
     }
     if (tag > max_rel)
       max_rel = tag;
-    std::cout << "load " << tag << "." << index_stream.str() << "has arity "
-              << arity << std::endl;
+    // std::cout << "load " << tag << "." << index_stream.str() << "has arity "
+    //           << arity << std::endl;
     rel_tag_map[index_stream.str()] = tag;
   }
 }
@@ -417,8 +417,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) {
   }
   max_rel++;
   rel_tag_map[name_arity] = max_rel;
-  std::cout << "generate rel tag: " << name_arity << " " << max_rel
-            << std::endl;
+  // std::cout << "generate rel tag: " << name_arity << " " << max_rel
+  //           << std::endl;
   return max_rel;
 }
 
@@ -582,10 +582,12 @@ int main(int argc, char **argv) {
 
   dangling_node_cnt = rel__dangling_node->get_global_full_element_count();
   dangling_value = FLOAT_SCALE_CONST / total_node_size;
+  if (mcomm.get_rank() == 0) {
   std::cout << ">>>>>>>>> Number of nodes: " << total_node_size
             << " >>>>>>>>> Dangling node count: " << dangling_node_cnt
             << " >>>>>>>>> Dangling value: "
             << dangling_value * 1.0 / FLOAT_SCALE_CONST << std::endl;
+  }
 
   rel__edge__2__1->disable_initialization();
   rel__node__1__1->disable_initialization();
@@ -616,8 +618,10 @@ int main(int argc, char **argv) {
   std::vector<LIE *> pg_lie_list;
 
   for (int i = 0; i < MAX_PG_ITERATION; i++) {
+    if (mcomm.get_rank() == 0) {
     std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter
               << std::endl;
+    }
     LIE *pg_lie = new LIE();
 
     RAM *scc_init = new RAM(false, 0);
@@ -687,7 +691,9 @@ int main(int argc, char **argv) {
     // MPI_Barrier(mcomm.get_comm());
   }
 
+  if (mcomm.get_rank() == 0) {
   std::cout << "Aggregating Page Rank Result ..." << std::endl;
+  }
   relation *rel__result__2__1__2 = new relation(
       2, true, 2, get_tag_for_rel("result", "1__2"),
       std::to_string(get_tag_for_rel("result", "1__2")) +
@@ -734,10 +740,10 @@ int main(int argc, char **argv) {
   final_lie->execute();
   final_lie->print_all_relation_size(); // Continuously print relation sizes
 
-  rel__result__2__1__2->print([](const std::vector<u64> &tp) {
-    u32 pg_v = tp[1];
-    std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
-  });
+  // rel__result__2__1__2->print([](const std::vector<u64> &tp) {
+  //   u32 pg_v = tp[1];
+  //   std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl;
+  // });
 
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp
index 9f88afb5..49a22b93 100644
--- a/backend/utility/tsv_to_bin.cpp
+++ b/backend/utility/tsv_to_bin.cpp
@@ -162,8 +162,8 @@ uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed )
 
 uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len)
 {
-    // return fnv1a(start_ptr, prefix_len);
-    return MurmurHash64A(start_ptr, (int)prefix_len, MURMUR_SEED);
+    return fnv1a(start_ptr, prefix_len);
+    // return MurmurHash64A(start_ptr, (int)prefix_len, MURMUR_SEED);
 }
 
 u32 string_hash(const std::string& str) {

From 3b929d118910397b820ff56529ac496ca0d2d419 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 28 Dec 2022 14:17:19 -0500
Subject: [PATCH 31/36] theta gcc + mpich

---
 backend/src/RA/parallel_join.cpp |  93 ++++++++++------------
 backend/src/RAM/RA_tasks.cpp     | 129 +++++++++++++++++++++++++------
 2 files changed, 143 insertions(+), 79 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index 18c7acf7..cdea6c8f 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -115,25 +115,20 @@ bool parallel_join::local_join(int threshold, int* offset,
                     input_ts.push_back(input_t);
                 } else {
                     if (input_ts.size() != 0) {
-                        auto before_actual_join = MPI_Wtime();
-                        u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
-                        // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                            input1[bucket_id].as_all_to_allv_right_join_buffer(
-                                std::vector<u64>(prev_non_dependent_columns.begin(),
-                                                prev_non_dependent_columns.begin()+join_column_count),
-                                join_buffer,
-                                input_ts,
-                                input1_buffer_width, counter,
-                                buckets, output_sub_bucket_count,
-                                output_sub_bucket_rank, reorder_map_array,
-                                join_column_count, deduplicate,
-                                &local_join_count, global_join_duplicates,
-                                global_join_inserts,
-                                output->get_join_column_count(),output->get_is_canonical(),
-                                generator_mode, generator_func);
-                        // }
-                        auto after_actual_join = MPI_Wtime();
-                        join_time_total += after_actual_join - before_actual_join;
+                        u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
+                        input1[bucket_id].as_all_to_allv_right_join_buffer(
+                            std::vector<u64>(prev_non_dependent_columns.begin(),
+                                             prev_non_dependent_columns.begin()+join_column_count),
+                            join_buffer,
+                            input_ts,
+                            input1_buffer_width, counter,
+                            buckets, output_sub_bucket_count,
+                            output_sub_bucket_rank, reorder_map_array,
+                            join_column_count, deduplicate,
+                            &local_join_count, global_join_duplicates,
+                            global_join_inserts,
+                            output->get_join_column_count(),output->get_is_canonical(),
+                            generator_mode, generator_func);
                         input_ts.clear();
                     }
                     prev_non_dependent_columns = cur_non_dependent_columns;
@@ -142,24 +137,19 @@ bool parallel_join::local_join(int threshold, int* offset,
             }
             if (input_ts.size() != 0) {
                 u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
-                auto before_actual_join = MPI_Wtime();
-                // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                    input1[bucket_id].as_all_to_allv_right_join_buffer(
-                        std::vector<u64>(prev_non_dependent_columns.begin(),
-                                        prev_non_dependent_columns.begin()+join_column_count),
-                        join_buffer,
-                        input_ts,
-                        input1_buffer_width, counter,
-                        buckets, output_sub_bucket_count,
-                        output_sub_bucket_rank, reorder_map_array,
-                        join_column_count, deduplicate,
-                        &local_join_count, global_join_duplicates,
-                        global_join_inserts,
-                        output->get_join_column_count(),output->get_is_canonical(),
-                        generator_mode, generator_func);
-                // }
-                auto after_actual_join = MPI_Wtime();
-                join_time_total += after_actual_join - before_actual_join;
+                input1[bucket_id].as_all_to_allv_right_join_buffer(
+                    std::vector<u64>(prev_non_dependent_columns.begin(),
+                                    prev_non_dependent_columns.begin()+join_column_count),
+                    join_buffer,
+                    input_ts,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts,
+                    output->get_join_column_count(),output->get_is_canonical(),
+                    generator_mode, generator_func);
                 input_ts.clear();
             }
         } else {
@@ -173,23 +163,18 @@ bool parallel_join::local_join(int threshold, int* offset,
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
-            auto before_actual_join = MPI_Wtime();
-            // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                input1[bucket_id].as_all_to_allv_right_join_buffer(
-                    prefix, join_buffer,
-                    // input0_buffer + k1, input0_buffer_width,
-                    input_ts,
-                    input1_buffer_width, counter,
-                    buckets, output_sub_bucket_count,
-                    output_sub_bucket_rank, reorder_map_array,
-                    join_column_count, deduplicate,
-                    &local_join_count, global_join_duplicates,
-                    global_join_inserts,
-                    output->get_join_column_count(),output->get_is_canonical(),
-                    generator_mode, generator_func);
-            // }
-            auto after_actual_join = MPI_Wtime();
-            join_time_total += after_actual_join - before_actual_join;
+            input1[bucket_id].as_all_to_allv_right_join_buffer(
+                prefix, join_buffer,
+                // input0_buffer + k1, input0_buffer_width,
+                input_ts,
+                input1_buffer_width, counter,
+                buckets, output_sub_bucket_count,
+                output_sub_bucket_rank, reorder_map_array,
+                join_column_count, deduplicate,
+                &local_join_count, global_join_duplicates,
+                global_join_inserts,
+                output->get_join_column_count(),output->get_is_canonical(),
+                generator_mode, generator_func);
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index c57ad061..418614b5 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -264,7 +264,42 @@ u64 RAM::intra_bucket_comm_execute()
                                   mcomm.get_local_comm());
             } else {
                 intra_bucket_comm(get_bucket_count(),
-                                  input1_trees,
+                                  input0->get_delta(),
+                                  input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                                  &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
+                                  mcomm.get_local_comm());
+                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
+            }
+
+            /// Join between full and delta
+            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
+            {
+                // std::cout << "here>>>>>>>>>>>>>"  << std::endl;
+                // if (input1->get_dependent_column().size() > 0) {
+                //     intra_bucket_comm(get_bucket_count(),
+                //                   input0->get_full(),
+                //                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                //                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                //                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
+                //                   mcomm.get_local_comm());
+                // } else {
+                    intra_bucket_comm(get_bucket_count(),
+                                    input1->get_delta(),
+                                    input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
+                                    input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
+                                    &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
+                                    mcomm.get_local_comm());
+                // }
+                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
+            }
+
+            /// Join between full and full
+            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
+            {
+
+                intra_bucket_comm(get_bucket_count(),
+                                  input1->get_full(),
                                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
                                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
@@ -369,8 +404,6 @@ bool RAM::local_compute(int* offset)
     auto before_compute_time = MPI_Wtime();
     auto ibf_size = 0;
     u64 jtarget_size = 0;
-    double size_sync_time = 0;
-    double real_join_time = 0;
     for (std::vector<parallel_RA*>::iterator it = RA_list.begin() ; it != RA_list.end(); ++it)
     {
         // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl;
@@ -616,7 +649,7 @@ bool RAM::local_compute(int* offset)
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
                                                                          get_bucket_count(),
-                                                                         input0_trees,
+                                                                         input0->get_delta(),
                                                                          intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input1_trees, input1_size, input1->get_arity()+1,
                                                                          reorder_map_array,
@@ -627,15 +660,71 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples,
-                                                                         real_j_time_stat);
-                jtarget_size += input1_size;             
-                ibf_size += input0_size;           
-            } else {
+                                                                         &join_tuples);
+                total_join_tuples = total_join_tuples + join_tuples;
+                jtarget_size += input1->get_delta_element_count();
+            }
+            else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
+            {
+
+                join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                                                                         LEFT,
+                                                                         get_bucket_count(),
+                                                                         input0->get_delta(),
+                                                                         intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
+                                                                         input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1,
+                                                                         reorder_map_array,
+                                                                         output_relation,
+                                                                         compute_buffer,
+                                                                         counter,
+                                                                         join_column_count,
+                                                                         &join_tuples_duplicates,
+                                                                         &join_tuples);
+                total_join_tuples = total_join_tuples + join_tuples;
+                jtarget_size += input1->get_full_element_count();
+            }
+            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
+            {
+                // if (input1->get_dependent_column().size() > 0) {
+                //     join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                //                                                             LEFT,
+                //                                                             get_bucket_count(),
+                //                                                             input0->get_delta(),
+                //                                                             intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
+                //                                                             input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
+                //                                                             reorder_map_array,
+                //                                                             output_relation,
+                //                                                             compute_buffer,
+                //                                                             counter,
+                //                                                             join_column_count,
+                //                                                             &join_tuples_duplicates,
+                //                                                             &join_tuples); 
+                //     jtarget_size += input1->get_delta_element_count();
+                // } else {
+                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
+                                                                            RIGHT,
+                                                                            get_bucket_count(),
+                                                                            input1->get_delta(),
+                                                                            intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
+                                                                            input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
+                                                                            reorder_map_array,
+                                                                            output_relation,
+                                                                            compute_buffer,
+                                                                            counter,
+                                                                            join_column_count,
+                                                                            &join_tuples_duplicates,
+                                                                            &join_tuples);
+                    jtarget_size += input0->get_full_element_count();
+                // }
+                total_join_tuples = total_join_tuples + join_tuples;
+                
+            }
+            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
+            {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
                                                                          get_bucket_count(),
-                                                                         input1_trees,
+                                                                         input1->get_full(),
                                                                          intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input0_trees, input0_size, input0->get_arity()+1,
                                                                          reorder_map_array,
@@ -646,13 +735,12 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples,
-                                                                         real_j_time_stat);
-                jtarget_size += input0_size;             
-                ibf_size += input1_size;  
+                                                                         &join_tuples);
+                total_join_tuples = total_join_tuples + join_tuples;
+                jtarget_size += input0->get_full_element_count();
             }
-            total_join_tuples = total_join_tuples + join_tuples;
-            real_join_time += real_j_time_stat[0];
+            
+            ibf_size += intra_bucket_buf_output_size[counter];
         }
         counter++;      
     }
@@ -681,12 +769,9 @@ bool RAM::local_compute(int* offset)
     if (lc_all_time == slowest_rank_time) {
         std::cout << "Slowest Rank >>> " << mcomm.get_rank()
                   << "   Comp Time >>> " << after_compute_time - before_compute_time
-                  << "   Real Join >>> " << real_join_time
                   << "   Sync Time >>> " << after_sync_time - before_sync_time
-                  << "   Size Sync Time >>> " << size_sync_time
                   << "  Input Size >>> " << ibf_size
                   << "  Target Count >>> " << jtarget_size
-                  << "  Join Count >>> " << total_join_tuples
                   << std::endl;
     }
 
@@ -1125,12 +1210,6 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
     double all_insert_newt = 0;
     double all_comm = 0;
     double all_time = 0;
-    double all_insert_in_full = 0;
-    double all_allocate_buf = 0;
-    double all_intra = 0;
-    double all_free_buf =0;
-
-    // auto before_batch = MPI_Wtime();
     while (batch_size != 0)
     {
 #if DEBUG_OUTPUT

From 08dce3c512701bb7ff8e20eb2b65875217f40303 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 28 Dec 2022 18:05:23 -0500
Subject: [PATCH 32/36] add more log

---
 backend/src/RAM/RA_tasks.cpp | 126 ++++-------------------------------
 1 file changed, 14 insertions(+), 112 deletions(-)

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 418614b5..3c3060a9 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -242,7 +242,6 @@ u64 RAM::intra_bucket_comm_execute()
                 input1_trees = input1->get_delta();
                 input1_size = input1->get_delta_element_count();
             }
-            double before_reduce_time = MPI_Wtime();
             int join_direction = LEFT;
             int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
             int global_join_direction_count = local_join_direction_count;
@@ -250,10 +249,6 @@ u64 RAM::intra_bucket_comm_execute()
             if (global_join_direction_count > mcomm.get_nprocs() / 2) {
                 join_direction = RIGHT;
             }
-            double after_reduce_time = MPI_Wtime();
-            if (mcomm.get_rank() == 0) {
-                std::cout << "Reduced time : " << after_reduce_time - before_reduce_time << std::endl;
-            }
 
             if (join_direction == LEFT) {
                 intra_bucket_comm(get_bucket_count(),
@@ -264,42 +259,7 @@ u64 RAM::intra_bucket_comm_execute()
                                   mcomm.get_local_comm());
             } else {
                 intra_bucket_comm(get_bucket_count(),
-                                  input0->get_delta(),
-                                  input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                                  input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                                  &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                                  mcomm.get_local_comm());
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
-            }
-
-            /// Join between full and delta
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-                // std::cout << "here>>>>>>>>>>>>>"  << std::endl;
-                // if (input1->get_dependent_column().size() > 0) {
-                //     intra_bucket_comm(get_bucket_count(),
-                //                   input0->get_full(),
-                //                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                //                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                //                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                //                   mcomm.get_local_comm());
-                // } else {
-                    intra_bucket_comm(get_bucket_count(),
-                                    input1->get_delta(),
-                                    input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
-                                    input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
-                                    &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
-                                    mcomm.get_local_comm());
-                // }
-                total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter];
-            }
-
-            /// Join between full and full
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
-            {
-
-                intra_bucket_comm(get_bucket_count(),
-                                  input1->get_full(),
+                                  input1_trees,
                                   input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(),
                                   input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(),
                                   &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter],
@@ -636,95 +596,31 @@ bool RAM::local_compute(int* offset)
             int join_direction = LEFT;
             int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
             int global_join_direction_count = local_join_direction_count;
-
-            auto before_size_sync = MPI_Wtime();
             MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm());
             if (global_join_direction_count > mcomm.get_nprocs() / 2) {
                 join_direction = RIGHT;
             }
-            auto after_size_sync = MPI_Wtime();
-            size_sync_time += after_size_sync - before_size_sync;
-            std::vector<double> real_j_time_stat;
+
             if (join_direction == LEFT) {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
                                                                          get_bucket_count(),
-                                                                         input0->get_delta(),
+                                                                         input0_trees,
                                                                          intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input1_trees, input1_size, input1->get_arity()+1,
                                                                          reorder_map_array,
                                                                          output_relation,
-                                                                         input0,
-                                                                         input1,
-                                                                         compute_buffer,
-                                                                         counter,
-                                                                         join_column_count,
-                                                                         &join_tuples_duplicates,
-                                                                         &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input1->get_delta_element_count();
-            }
-            else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL)
-            {
-
-                join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                         LEFT,
-                                                                         get_bucket_count(),
-                                                                         input0->get_delta(),
-                                                                         intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                         input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1,
-                                                                         reorder_map_array,
-                                                                         output_relation,
                                                                          compute_buffer,
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input1->get_full_element_count();
-            }
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA)
-            {
-                // if (input1->get_dependent_column().size() > 0) {
-                //     join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                //                                                             LEFT,
-                //                                                             get_bucket_count(),
-                //                                                             input0->get_delta(),
-                //                                                             intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter],
-                //                                                             input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1,
-                //                                                             reorder_map_array,
-                //                                                             output_relation,
-                //                                                             compute_buffer,
-                //                                                             counter,
-                //                                                             join_column_count,
-                //                                                             &join_tuples_duplicates,
-                //                                                             &join_tuples); 
-                //     jtarget_size += input1->get_delta_element_count();
-                // } else {
-                    join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
-                                                                            RIGHT,
-                                                                            get_bucket_count(),
-                                                                            input1->get_delta(),
-                                                                            intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
-                                                                            input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1,
-                                                                            reorder_map_array,
-                                                                            output_relation,
-                                                                            compute_buffer,
-                                                                            counter,
-                                                                            join_column_count,
-                                                                            &join_tuples_duplicates,
-                                                                            &join_tuples);
-                    jtarget_size += input0->get_full_element_count();
-                // }
-                total_join_tuples = total_join_tuples + join_tuples;
-                
-            }
-            else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL)
-            {
+                           
+            } else {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
                                                                          get_bucket_count(),
-                                                                         input1->get_full(),
+                                                                         input1_trees,
                                                                          intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter],
                                                                          input0_trees, input0_size, input0->get_arity()+1,
                                                                          reorder_map_array,
@@ -736,9 +632,9 @@ bool RAM::local_compute(int* offset)
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
                                                                          &join_tuples);
-                total_join_tuples = total_join_tuples + join_tuples;
-                jtarget_size += input0->get_full_element_count();
             }
+            total_join_tuples = total_join_tuples + join_tuples;
+            jtarget_size += input1->get_delta_element_count();  
             
             ibf_size += intra_bucket_buf_output_size[counter];
         }
@@ -1210,6 +1106,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector<u32>&
     double all_insert_newt = 0;
     double all_comm = 0;
     double all_time = 0;
+    double all_insert_in_full = 0;
+    double all_allocate_buf = 0;
+    double all_intra = 0;
+    double all_free_buf =0;
+
+    // auto before_batch = MPI_Wtime();
     while (batch_size != 0)
     {
 #if DEBUG_OUTPUT

From 2b34dcba09a6d29d09d73696b44c7c19966cee61 Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Thu, 29 Dec 2022 23:45:09 -0500
Subject: [PATCH 33/36] more stat

---
 backend/src/RA/parallel_join.cpp | 35 +++++++++++++++++++-------------
 backend/src/RAM/RA_tasks.cpp     | 25 ++++++++++++++++-------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index cdea6c8f..c04806c8 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -62,21 +62,19 @@ bool parallel_join::local_join(int threshold, int* offset,
             }
 
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-            
+
             auto before_actual_join = MPI_Wtime();
-            // for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) {
-                input1[bucket_id].as_all_to_allv_left_join_buffer(
-                    prefix, join_buffer,
-                    input0_buffer + k1,input0_buffer_width,
-                    input1_buffer_width, counter,
-                    buckets, output_sub_bucket_count,
-                    output_sub_bucket_rank, reorder_map_array,
-                    join_column_count, deduplicate,
-                    &local_join_count, global_join_duplicates,
-                    global_join_inserts, output->get_join_column_count(),
-                    output->get_is_canonical(),
-                    generator_mode, generator_func);
-            // }
+            input1[bucket_id].as_all_to_allv_left_join_buffer(
+                prefix, join_buffer,
+                input0_buffer + k1,input0_buffer_width,
+                input1_buffer_width, counter,
+                buckets, output_sub_bucket_count,
+                output_sub_bucket_rank, reorder_map_array,
+                join_column_count, deduplicate,
+                &local_join_count, global_join_duplicates,
+                global_join_inserts, output->get_join_column_count(),
+                output->get_is_canonical(),
+                generator_mode, generator_func);
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
@@ -115,6 +113,7 @@ bool parallel_join::local_join(int threshold, int* offset,
                     input_ts.push_back(input_t);
                 } else {
                     if (input_ts.size() != 0) {
+                        auto before_actual_join = MPI_Wtime();
                         u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
                         input1[bucket_id].as_all_to_allv_right_join_buffer(
                             std::vector<u64>(prev_non_dependent_columns.begin(),
@@ -129,6 +128,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                             global_join_inserts,
                             output->get_join_column_count(),output->get_is_canonical(),
                             generator_mode, generator_func);
+                        auto after_actual_join = MPI_Wtime();
+                        join_time_total += after_actual_join - before_actual_join;
                         input_ts.clear();
                     }
                     prev_non_dependent_columns = cur_non_dependent_columns;
@@ -137,6 +138,7 @@ bool parallel_join::local_join(int threshold, int* offset,
             }
             if (input_ts.size() != 0) {
                 u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
+                auto before_actual_join = MPI_Wtime();
                 input1[bucket_id].as_all_to_allv_right_join_buffer(
                     std::vector<u64>(prev_non_dependent_columns.begin(),
                                     prev_non_dependent_columns.begin()+join_column_count),
@@ -150,6 +152,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                     global_join_inserts,
                     output->get_join_column_count(),output->get_is_canonical(),
                     generator_mode, generator_func);
+                auto after_actual_join = MPI_Wtime();
+                join_time_total += after_actual_join - before_actual_join;
                 input_ts.clear();
             }
         } else {
@@ -163,6 +167,7 @@ bool parallel_join::local_join(int threshold, int* offset,
             u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
+            auto before_actual_join = MPI_Wtime();
             input1[bucket_id].as_all_to_allv_right_join_buffer(
                 prefix, join_buffer,
                 // input0_buffer + k1, input0_buffer_width,
@@ -175,6 +180,8 @@ bool parallel_join::local_join(int threshold, int* offset,
                 global_join_inserts,
                 output->get_join_column_count(),output->get_is_canonical(),
                 generator_mode, generator_func);
+            auto after_actual_join = MPI_Wtime();
+            join_time_total += after_actual_join - before_actual_join;
 
             // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl;
             if (local_join_count > threshold)
diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 3c3060a9..805b8423 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -364,6 +364,8 @@ bool RAM::local_compute(int* offset)
     auto before_compute_time = MPI_Wtime();
     auto ibf_size = 0;
     u64 jtarget_size = 0;
+    double size_sync_time = 0;
+    double real_join_time = 0;
     for (std::vector<parallel_RA*>::iterator it = RA_list.begin() ; it != RA_list.end(); ++it)
     {
         // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl;
@@ -596,11 +598,15 @@ bool RAM::local_compute(int* offset)
             int join_direction = LEFT;
             int local_join_direction_count = input0_size < input1_size ? 0 : 1;   // true if size of input0 > input1
             int global_join_direction_count = local_join_direction_count;
+
+            auto before_size_sync = MPI_Wtime();
             MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm());
             if (global_join_direction_count > mcomm.get_nprocs() / 2) {
                 join_direction = RIGHT;
             }
-
+            auto after_size_sync = MPI_Wtime();
+            size_sync_time += after_size_sync - before_size_sync;
+            std::vector<double> real_j_time_stat;
             if (join_direction == LEFT) {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          LEFT,
@@ -614,8 +620,10 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples);
-                           
+                                                                         &join_tuples,
+                                                                         real_j_time_stat);
+                jtarget_size += input1_size;             
+                ibf_size += input0_size;           
             } else {
                 join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]),
                                                                          RIGHT,
@@ -631,12 +639,13 @@ bool RAM::local_compute(int* offset)
                                                                          counter,
                                                                          join_column_count,
                                                                          &join_tuples_duplicates,
-                                                                         &join_tuples);
+                                                                         &join_tuples,
+                                                                         real_j_time_stat);
+                jtarget_size += input0_size;             
+                ibf_size += input1_size;  
             }
             total_join_tuples = total_join_tuples + join_tuples;
-            jtarget_size += input1->get_delta_element_count();  
-            
-            ibf_size += intra_bucket_buf_output_size[counter];
+            real_join_time += real_j_time_stat[0];
         }
         counter++;      
     }
@@ -665,7 +674,9 @@ bool RAM::local_compute(int* offset)
     if (lc_all_time == slowest_rank_time) {
         std::cout << "Slowest Rank >>> " << mcomm.get_rank()
                   << "   Comp Time >>> " << after_compute_time - before_compute_time
+                  << "   Real Join >>> " << real_join_time
                   << "   Sync Time >>> " << after_sync_time - before_sync_time
+                  << "   Size Sync Time >>> " << size_sync_time
                   << "  Input Size >>> " << ibf_size
                   << "  Target Count >>> " << jtarget_size
                   << std::endl;

From 271e9c4cc756a76b38b0e8e3c1990a8732e1652b Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Wed, 4 Jan 2023 14:53:44 -0500
Subject: [PATCH 34/36] more hash function

---
 backend/src/RAM/RA_tasks.cpp                  |  1 +
 .../src/relation/balanced_hash_relation.cpp   |  1 -
 backend/tests/cc/compiled_pre/cc.cpp          | 22 ++++++++-----------
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp
index 805b8423..4bb90015 100644
--- a/backend/src/RAM/RA_tasks.cpp
+++ b/backend/src/RAM/RA_tasks.cpp
@@ -679,6 +679,7 @@ bool RAM::local_compute(int* offset)
                   << "   Size Sync Time >>> " << size_sync_time
                   << "  Input Size >>> " << ibf_size
                   << "  Target Count >>> " << jtarget_size
+                  << "  Join Count >>> " << total_join_tuples
                   << std::endl;
     }
 
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index ea9671e3..da06df04 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -8,7 +8,6 @@
 #include "../parallel_RA_inc.h"
 #include "balanced_hash_relation.h"
 #include "mpi.h"
-#include "shmap_relation.h"
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 65f8e0c8..4b53c04e 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -438,15 +438,12 @@ int main(int argc, char **argv) {
           ".edge.2.table",
       FULL);
 
-  // rel__edge__2__1->balance_flag = true;
-  rel__edge__2__1->default_sub_bucket_per_bucket_count = 2;
-
-  // relation *rel__edge__2__1__2 = new relation(
-  //     2, true, 2, get_tag_for_rel("edge", "1__2"),
-  //     std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
-  //     slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
-  //         ".edge.2.table",
-  //     FULL);
+  relation *rel__edge__2__1__2 = new relation(
+      2, true, 2, get_tag_for_rel("edge", "1__2"),
+      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+          ".edge.2.table",
+      FULL);
 
   relation *rel__cc__2__1 = new  relation(
     1, true, 2, get_tag_for_rel("cc", "1"),
@@ -494,8 +491,7 @@ int main(int argc, char **argv) {
   ));
 
   RAM *cc_init_scc = new RAM(false, 1);
-  // cc_init_scc->balance_flag = true;
-  // cc_init_scc->add_relation(rel__edge__2__1__2, false);
+  cc_init_scc->add_relation(rel__edge__2__1__2, false);
   cc_init_scc->add_relation(rel__edge__2__1, false);
   cc_init_scc->add_relation(rel__cc__2__1, true);
   cc_init_scc->add_relation(rel__node__1__1, true);
@@ -564,7 +560,7 @@ int main(int argc, char **argv) {
   cc_lie->add_relation(rel__cc__2__1);
   cc_lie->add_relation(rel__cc_final__2__1);
   cc_lie->add_relation(rel__cc_represent__1__1);
-  // cc_lie->add_relation(rel__edge__2__1__2);
+  cc_lie->add_relation(rel__edge__2__1__2);
 
   cc_lie->add_scc(to_undirected_scc);
   cc_lie->add_scc(cc_init_scc);
@@ -594,7 +590,7 @@ int main(int argc, char **argv) {
 
 
   // rel__edge__2__1__2->test_calc_hash_rank(4096);
-  // rel__edge__2__1->test_calc_hash_rank(4096);
+  rel__edge__2__1->test_calc_hash_rank(4096);
   // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();

From 1c8f027009af2699aab17c56385ace15f4ff6f0d Mon Sep 17 00:00:00 2001
From: ysun67 <ysun67@its-rc-kkmicins.ad.syr.edu>
Date: Thu, 5 Jan 2023 12:46:37 -0500
Subject: [PATCH 35/36] stage change

---
 backend/src/lie/lie.cpp                       |  2 --
 .../src/relation/balanced_hash_relation.cpp   |  9 +++++++--
 backend/src/relation/balanced_hash_relation.h |  2 ++
 backend/tests/cc/compiled_pre/cc.cpp          | 20 ++++++++++---------
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 6e4de91a..88d85b4e 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -433,9 +433,7 @@ bool LIE::execute ()
 #endif
 
         // load balance before a SCC executed
-        if (executable_task->balance_flag) {
         executable_task->load_balance();
-        }
 
         if (restart_flag == false)
         {
diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp
index da06df04..1934af27 100644
--- a/backend/src/relation/balanced_hash_relation.cpp
+++ b/backend/src/relation/balanced_hash_relation.cpp
@@ -1433,8 +1433,13 @@ bool relation::check_dependent_value_insert_avalible(const std::vector<u64>& tup
     // if (bucket_id != mcomm.get_rank()) {
     //     std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; 
     // }
-    int bucket_id = mcomm.get_rank();
-    return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
+    // int bucket_id = mcomm.get_rank();
+    bool res = true;
+    for (int i = 0 ; i < mcomm.get_nprocs(); i ++) {
+        res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple);
+    }
+    // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ;
+    return res;
 }
 
 void relation::test_calc_hash_rank(u64 rank_n) {
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index 4b9f5215..5368811f 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -98,6 +98,8 @@ class relation
     u32 default_sub_bucket_per_bucket_count = 1;    /// 1
     bool balance_flag = false;
 
+    bool balance_flag = false;
+
     /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL);
     /// 2: arity (Internally one extra id (intern id) column is added to every relation)
     /// true: arity == join column count
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 4b53c04e..19cad2c9 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -438,12 +438,14 @@ int main(int argc, char **argv) {
           ".edge.2.table",
       FULL);
 
-  relation *rel__edge__2__1__2 = new relation(
-      2, true, 2, get_tag_for_rel("edge", "1__2"),
-      std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
-      slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
-          ".edge.2.table",
-      FULL);
+  rel__edge__2__1->balance_flag = true;
+
+  // relation *rel__edge__2__1__2 = new relation(
+  //     2, true, 2, get_tag_for_rel("edge", "1__2"),
+  //     std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table",
+  //     slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) +
+  //         ".edge.2.table",
+  //     FULL);
 
   relation *rel__cc__2__1 = new  relation(
     1, true, 2, get_tag_for_rel("cc", "1"),
@@ -491,7 +493,7 @@ int main(int argc, char **argv) {
   ));
 
   RAM *cc_init_scc = new RAM(false, 1);
-  cc_init_scc->add_relation(rel__edge__2__1__2, false);
+  // cc_init_scc->add_relation(rel__edge__2__1__2, false);
   cc_init_scc->add_relation(rel__edge__2__1, false);
   cc_init_scc->add_relation(rel__cc__2__1, true);
   cc_init_scc->add_relation(rel__node__1__1, true);
@@ -560,7 +562,7 @@ int main(int argc, char **argv) {
   cc_lie->add_relation(rel__cc__2__1);
   cc_lie->add_relation(rel__cc_final__2__1);
   cc_lie->add_relation(rel__cc_represent__1__1);
-  cc_lie->add_relation(rel__edge__2__1__2);
+  // cc_lie->add_relation(rel__edge__2__1__2);
 
   cc_lie->add_scc(to_undirected_scc);
   cc_lie->add_scc(cc_init_scc);
@@ -590,7 +592,7 @@ int main(int argc, char **argv) {
 
 
   // rel__edge__2__1__2->test_calc_hash_rank(4096);
-  rel__edge__2__1->test_calc_hash_rank(4096);
+  // rel__edge__2__1->test_calc_hash_rank(4096);
   // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; 
   // rel__node__1__1->print();
   // rel__edge__2__1->print();

From b4d1b7f2ff859409e8a7ec85b33c0803fc12ce67 Mon Sep 17 00:00:00 2001
From: Yihao Sun <stargazermiao@gmail.com>
Date: Thu, 5 Jan 2023 21:08:03 -0500
Subject: [PATCH 36/36] add manual sub rank split

---
 backend/src/RA/parallel_join.cpp              | 116 ++++++++++--------
 backend/src/lie/lie.cpp                       |   2 +-
 backend/src/relation/balanced_hash_relation.h |   2 -
 backend/tests/cc/compiled_pre/cc.cpp          |  19 +--
 4 files changed, 73 insertions(+), 66 deletions(-)

diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp
index c04806c8..1333dcdc 100644
--- a/backend/src/RA/parallel_join.cpp
+++ b/backend/src/RA/parallel_join.cpp
@@ -61,20 +61,22 @@ bool parallel_join::local_join(int threshold, int* offset,
                 //std::cout << "PREFIX " << input0_buffer[k1 + jc] << std::endl;
             }
 
-            u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-
+            // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
+            
             auto before_actual_join = MPI_Wtime();
-            input1[bucket_id].as_all_to_allv_left_join_buffer(
-                prefix, join_buffer,
-                input0_buffer + k1,input0_buffer_width,
-                input1_buffer_width, counter,
-                buckets, output_sub_bucket_count,
-                output_sub_bucket_rank, reorder_map_array,
-                join_column_count, deduplicate,
-                &local_join_count, global_join_duplicates,
-                global_join_inserts, output->get_join_column_count(),
-                output->get_is_canonical(),
-                generator_mode, generator_func);
+            for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                input1[bucket_id].as_all_to_allv_left_join_buffer(
+                    prefix, join_buffer,
+                    input0_buffer + k1,input0_buffer_width,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts, output->get_join_column_count(),
+                    output->get_is_canonical(),
+                    generator_mode, generator_func);
+            }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
@@ -114,20 +116,22 @@ bool parallel_join::local_join(int threshold, int* offset,
                 } else {
                     if (input_ts.size() != 0) {
                         auto before_actual_join = MPI_Wtime();
-                        u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
-                        input1[bucket_id].as_all_to_allv_right_join_buffer(
-                            std::vector<u64>(prev_non_dependent_columns.begin(),
-                                             prev_non_dependent_columns.begin()+join_column_count),
-                            join_buffer,
-                            input_ts,
-                            input1_buffer_width, counter,
-                            buckets, output_sub_bucket_count,
-                            output_sub_bucket_rank, reorder_map_array,
-                            join_column_count, deduplicate,
-                            &local_join_count, global_join_duplicates,
-                            global_join_inserts,
-                            output->get_join_column_count(),output->get_is_canonical(),
-                            generator_mode, generator_func);
+                        // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
+                        for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                            input1[bucket_id].as_all_to_allv_right_join_buffer(
+                                std::vector<u64>(prev_non_dependent_columns.begin(),
+                                                prev_non_dependent_columns.begin()+join_column_count),
+                                join_buffer,
+                                input_ts,
+                                input1_buffer_width, counter,
+                                buckets, output_sub_bucket_count,
+                                output_sub_bucket_rank, reorder_map_array,
+                                join_column_count, deduplicate,
+                                &local_join_count, global_join_duplicates,
+                                global_join_inserts,
+                                output->get_join_column_count(),output->get_is_canonical(),
+                                generator_mode, generator_func);
+                        }
                         auto after_actual_join = MPI_Wtime();
                         join_time_total += after_actual_join - before_actual_join;
                         input_ts.clear();
@@ -137,21 +141,23 @@ bool parallel_join::local_join(int threshold, int* offset,
                 }
             }
             if (input_ts.size() != 0) {
-                u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
+                // u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets;
                 auto before_actual_join = MPI_Wtime();
-                input1[bucket_id].as_all_to_allv_right_join_buffer(
-                    std::vector<u64>(prev_non_dependent_columns.begin(),
-                                    prev_non_dependent_columns.begin()+join_column_count),
-                    join_buffer,
-                    input_ts,
-                    input1_buffer_width, counter,
-                    buckets, output_sub_bucket_count,
-                    output_sub_bucket_rank, reorder_map_array,
-                    join_column_count, deduplicate,
-                    &local_join_count, global_join_duplicates,
-                    global_join_inserts,
-                    output->get_join_column_count(),output->get_is_canonical(),
-                    generator_mode, generator_func);
+                for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                    input1[bucket_id].as_all_to_allv_right_join_buffer(
+                        std::vector<u64>(prev_non_dependent_columns.begin(),
+                                        prev_non_dependent_columns.begin()+join_column_count),
+                        join_buffer,
+                        input_ts,
+                        input1_buffer_width, counter,
+                        buckets, output_sub_bucket_count,
+                        output_sub_bucket_rank, reorder_map_array,
+                        join_column_count, deduplicate,
+                        &local_join_count, global_join_duplicates,
+                        global_join_inserts,
+                        output->get_join_column_count(),output->get_is_canonical(),
+                        generator_mode, generator_func);
+                }
                 auto after_actual_join = MPI_Wtime();
                 join_time_total += after_actual_join - before_actual_join;
                 input_ts.clear();
@@ -164,22 +170,24 @@ bool parallel_join::local_join(int threshold, int* offset,
             for (int jc=0; jc < join_column_count; jc++)
                 prefix.push_back(input0_buffer[k1 + jc]);
 
-            u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
+            // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets;
             std::vector<std::vector<u64>> input_ts;
             input_ts.push_back(std::vector<u64>(input0_buffer+k1, input0_buffer+k1+input0_buffer_width));
             auto before_actual_join = MPI_Wtime();
-            input1[bucket_id].as_all_to_allv_right_join_buffer(
-                prefix, join_buffer,
-                // input0_buffer + k1, input0_buffer_width,
-                input_ts,
-                input1_buffer_width, counter,
-                buckets, output_sub_bucket_count,
-                output_sub_bucket_rank, reorder_map_array,
-                join_column_count, deduplicate,
-                &local_join_count, global_join_duplicates,
-                global_join_inserts,
-                output->get_join_column_count(),output->get_is_canonical(),
-                generator_mode, generator_func);
+            for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) {
+                input1[bucket_id].as_all_to_allv_right_join_buffer(
+                    prefix, join_buffer,
+                    // input0_buffer + k1, input0_buffer_width,
+                    input_ts,
+                    input1_buffer_width, counter,
+                    buckets, output_sub_bucket_count,
+                    output_sub_bucket_rank, reorder_map_array,
+                    join_column_count, deduplicate,
+                    &local_join_count, global_join_duplicates,
+                    global_join_inserts,
+                    output->get_join_column_count(),output->get_is_canonical(),
+                    generator_mode, generator_func);
+            }
             auto after_actual_join = MPI_Wtime();
             join_time_total += after_actual_join - before_actual_join;
 
diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp
index 88d85b4e..c9ea272e 100644
--- a/backend/src/lie/lie.cpp
+++ b/backend/src/lie/lie.cpp
@@ -433,7 +433,7 @@ bool LIE::execute ()
 #endif
 
         // load balance before a SCC executed
-        executable_task->load_balance();
+        // executable_task->load_balance();
 
         if (restart_flag == false)
         {
diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h
index 5368811f..4b9f5215 100644
--- a/backend/src/relation/balanced_hash_relation.h
+++ b/backend/src/relation/balanced_hash_relation.h
@@ -98,8 +98,6 @@ class relation
     u32 default_sub_bucket_per_bucket_count = 1;    /// 1
     bool balance_flag = false;
 
-    bool balance_flag = false;
-
     /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL);
     /// 2: arity (Internally one extra id (intern id) column is added to every relation)
     /// true: arity == join column count
diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp
index 19cad2c9..d27afb1a 100644
--- a/backend/tests/cc/compiled_pre/cc.cpp
+++ b/backend/tests/cc/compiled_pre/cc.cpp
@@ -1,5 +1,5 @@
 // location of `parallel_RA_inc.h` here
-#include "/home/ysun67/workspace/slog/backend/src/parallel_RA_inc.h"
+#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h"
 #include "mpi.h"
 
 // #include <bit>
@@ -439,6 +439,7 @@ int main(int argc, char **argv) {
       FULL);
 
   rel__edge__2__1->balance_flag = true;
+  rel__edge__2__1->default_sub_bucket_per_bucket_count = 2;
 
   // relation *rel__edge__2__1__2 = new relation(
   //     2, true, 2, get_tag_for_rel("edge", "1__2"),
@@ -527,11 +528,11 @@ int main(int argc, char **argv) {
       std::vector<u64> res(2, 0);
       res[0] = input_v[1];
       res[1] = target_v[1];
-      // if (target_v[0] == 21) {
-      // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl;
-      // std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl;
-      // std::cout << "res " << res[0] << " " << res[1] << std::endl;
-      // }
+      if (target_v[0] == 21) {
+      std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl;
+      std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl;
+      std::cout << "res " << res[0] << " " << res[1] << std::endl;
+      }
       res_set.push_back(res);
       return true;
     }
@@ -595,10 +596,10 @@ int main(int argc, char **argv) {
   // rel__edge__2__1->test_calc_hash_rank(4096);
   // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; 
   // rel__node__1__1->print();
-  // rel__edge__2__1->print();
+  rel__edge__2__1->print();
 //  rel__cc__2__1->print();
- rel__cc_final__2__1->print();
-  rel__cc_represent__1__1->print();
+ // rel__cc_final__2__1->print();
+  // rel__cc_represent__1__1->print();
   // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
   mcomm.destroy();