From 58d176b8313e1e27d82b47f164358610e866dac5 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Mon, 3 Oct 2022 00:24:35 -0400 Subject: [PATCH 01/36] add negative number support --- backend/utility/bin_tsv.py | 4 ++ backend/utility/tsv_to_bin.cpp | 47 +++++++++++++++++----- compiler/src/builtins.cpp | 17 +++++++- slog/common/tuple.py | 6 ++- slog/tests/testcase/number/ground_truth | 3 ++ slog/tests/testcase/number/input/foo.facts | 2 + slog/tests/testcase/number/number.slog | 8 ++++ 7 files changed, 74 insertions(+), 13 deletions(-) create mode 100644 slog/tests/testcase/number/ground_truth create mode 100644 slog/tests/testcase/number/input/foo.facts create mode 100644 slog/tests/testcase/number/number.slog diff --git a/backend/utility/bin_tsv.py b/backend/utility/bin_tsv.py index 9b54643e..44054398 100755 --- a/backend/utility/bin_tsv.py +++ b/backend/utility/bin_tsv.py @@ -50,6 +50,8 @@ BUCKET_MASK = 0x00003FFFF0000000 TUPLE_ID_MASK = 0xFFFFFFFFF0000000 VAL_MASK = ~ TAG_MASK +SIGN_FILP_CONST = 0x0000200000000000 +SIGNED_NUM_MASK = 0xFFFFE00000000000 INT_TAG = 0 STRING_TAG = 2 @@ -115,6 +117,8 @@ def bin_to_tsv(filename, arity, output, index, meta_folder): val_tag = raw_val >> 46 if val_tag == INT_TAG: attr_val = raw_val & VAL_MASK + if attr_val >= SIGN_FILP_CONST: + attr_val = -(attr_val - SIGN_FILP_CONST) elif val_tag == STRING_TAG: attr_val = string_dict[raw_val & VAL_MASK] # elif val_tag == SYMBOL_TAG: diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp index e3fc574e..62758039 100644 --- a/backend/utility/tsv_to_bin.cpp +++ b/backend/utility/tsv_to_bin.cpp @@ -2,7 +2,10 @@ // Subsequently by Kris Micinski // Convert Souffle CSV (tab-separated value) files to Slog input tuple files // compile with >= c++14 +#include #include +#include +#include #include #include #include @@ -37,6 +40,8 @@ #define BUCKET_MASK 0x00003FFFF0000000 #define BUCKET_MASK_LENGTH 18 #define TAG_MASK 0xFFFFC00000000000 +#define SIGN_FILP_CONST 0x0000200000000000 
+#define SIGNED_NUM_MASK 0xFFFFE00000000000 using namespace std; @@ -232,16 +237,40 @@ void file_to_slog(char *input_file, char *output_file, { break; } - try - { - // TODO: support float later - // FIXME: detect empty space here! - u64 u64_v = stoi(col); - tuple_buffer[col_count] = TUPLE_MASK & u64_v; - // cout << "number at " << col_count << " : " << u64_v << endl; + bool convert_success_flag = false; + if (!convert_success_flag) { + // integer + try { + // FIXME: detect empty space here! + long long int_v = stoll(col); + if (int_v < 0) { + int_v = SIGN_FILP_CONST - int_v; + } + tuple_buffer[col_count] = (~ TAG_MASK) & ((u64)int_v); + // cout << col << " number at " << col_count << " : " <> tag_position == int_tag) << "\n"; @@ -25,13 +27,24 @@ inline bool is_number(u64 datum) { } inline i64 datum_to_number(u64 datum) { - return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + i64 signed_val = (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + if (signed_val >= sign_flip_const) { + signed_val = sign_flip_const - signed_val; + } + return signed_val; + // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); } const auto d2n = datum_to_number; inline u64 number_to_datum(i64 number) { - return (number & ~tag_mask) | (int_tag << tag_position); + i64 unsigned_value = number; + if (number < 0) { + unsigned_value = (- number) + sign_flip_const; + } + return (unsigned_value & ~tag_mask) | (int_tag << tag_position); + // return (number & ~tag_mask) | (int_tag << tag_position); } + const auto n2d = number_to_datum; inline u64 string_to_datum(std::string str) diff --git a/slog/common/tuple.py b/slog/common/tuple.py index 5cec57db..ff3d7ee4 100644 --- a/slog/common/tuple.py +++ b/slog/common/tuple.py @@ -9,6 +9,8 @@ BUCKET_MASK = 0x00003FFFF0000000 TUPLE_ID_MASK = 0xFFFFFFFFF0000000 U32_MASK = 0x00000000FFFFFFFF +SIGN_FILP_CONST = 0x0000200000000000 +SIGNED_NUM_MASK = 0xFFFFE00000000000 VAL_MASK = ~ 
TAG_MASK INT_TAG = 0 STRING_TAG = 2 @@ -67,8 +69,8 @@ def parse_tuple_row(self, u64_list, rel_name, intern_string_dict) -> SlogTuple: val_tag = u64 >> 46 if val_tag == INT_TAG: attr_val = (u64 & VAL_MASK) - if attr_val > 2 ** 31: - attr_val = attr_val - 2 ** 32 + if attr_val >= SIGN_FILP_CONST: + attr_val = SIGN_FILP_CONST - attr_val elif val_tag == STRING_TAG: attr_val = intern_string_dict[u64 & U32_MASK] else: diff --git a/slog/tests/testcase/number/ground_truth b/slog/tests/testcase/number/ground_truth new file mode 100644 index 00000000..c5a9d6d1 --- /dev/null +++ b/slog/tests/testcase/number/ground_truth @@ -0,0 +1,3 @@ +bar-res-check-1,1, 1 +bar-res-check-2,1, 1 +bar-res-check-3,1, 1 diff --git a/slog/tests/testcase/number/input/foo.facts b/slog/tests/testcase/number/input/foo.facts new file mode 100644 index 00000000..5aba8f93 --- /dev/null +++ b/slog/tests/testcase/number/input/foo.facts @@ -0,0 +1,2 @@ +-2 +-3 \ No newline at end of file diff --git a/slog/tests/testcase/number/number.slog b/slog/tests/testcase/number/number.slog new file mode 100644 index 00000000..b4ae865b --- /dev/null +++ b/slog/tests/testcase/number/number.slog @@ -0,0 +1,8 @@ +; testing negative and floating + +(foo -1) +[(bar {+ 2 x}) <-- (foo x)] + +[(bar-res-check-1 "pass") <-- (bar 1)] +[(bar-res-check-2 "pass") <-- (bar 0)] +[(bar-res-check-3 "pass") <-- (bar -1)] From a0f26d224c5220d313260eda37f7071bca19a077 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Wed, 2 Nov 2022 15:40:57 -0400 Subject: [PATCH 02/36] add already compiled check in client --- examples/datalog-example | 1 + slog/common/client.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 160000 examples/datalog-example diff --git a/examples/datalog-example b/examples/datalog-example new file mode 160000 index 00000000..9b29866c --- /dev/null +++ b/examples/datalog-example @@ -0,0 +1 @@ +Subproject commit 9b29866cadd18644be52da674585831b7416dfc6 diff --git a/slog/common/client.py 
b/slog/common/client.py index 03217114..ccb07ed7 100644 --- a/slog/common/client.py +++ b/slog/common/client.py @@ -199,7 +199,7 @@ def csv_request_generator(csv_hash_map): writer.write(f" {response.error_msg} fail to update!") ftp_conn.close() - @lru_cache(maxsize=None) + # @lru_cache(maxsize=None) def compile_slog(self, filename, writer=Writer()): ''' compile a slog file, and set current DB as the resultant DB. @@ -254,6 +254,9 @@ def _compile(self, program_hashes, writer=Writer()): req.using_database = "" req.hashes.extend(program_hashes) response = self._stub.CompileHashes(req) + if response.promise_id == MAXSIZE: + writer.write("Already compiled!") + return self.cur_db # Wait to resolve the promise in the terminal... # Break when promise is resolved edb = self.run_until_promised(response.promise_id, PING_INTERVAL, writer) From 4c9c25573f1c0ab4b204158deac3d337866fd1f0 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Wed, 2 Nov 2022 15:47:28 -0400 Subject: [PATCH 03/36] remove recompile message --- slog/common/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slog/common/client.py b/slog/common/client.py index ccb07ed7..4d170714 100644 --- a/slog/common/client.py +++ b/slog/common/client.py @@ -255,7 +255,7 @@ def _compile(self, program_hashes, writer=Writer()): req.hashes.extend(program_hashes) response = self._stub.CompileHashes(req) if response.promise_id == MAXSIZE: - writer.write("Already compiled!") + # writer.write("Already compiled!") return self.cur_db # Wait to resolve the promise in the terminal... 
# Break when promise is resolved From 9b81caf425b504d905094ad15fda88968d731be2 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Fri, 11 Nov 2022 15:51:59 -0500 Subject: [PATCH 04/36] single threaded --- backend/src/RA/parallel_agg.cpp | 18 +- backend/src/RA/parallel_agg.h | 2 +- backend/src/RA/parallel_join.cpp | 14 +- backend/src/RA/parallel_join.h | 12 +- backend/src/RAM/RA_tasks.cpp | 5 - backend/src/lie/lie.cpp | 22 +- backend/src/parallel_RA_inc.h | 7 +- .../src/relation/balanced_hash_relation.cpp | 137 ++++++------- backend/src/relation/balanced_hash_relation.h | 37 +++- backend/src/relation/shmap_relation.h | 35 ++-- backend/src/relation/shmap_relation_exp.cpp | 190 +++++++++++++++--- .../compiled_pre/CMakeLists.txt | 4 +- .../checkpoint-final/256.edge.3.table_full | Bin .../checkpoint-final/257.spath.3.table_full | Bin 0 -> 512 bytes .../checkpoints/checkpoint-final/$strings.csv | 0 .../checkpoint-final/256.edge.3.table_full | Bin 0 -> 288 bytes .../checkpoint-final/257.spath.3.table_full | Bin .../compiled_pre/compiler-out | 0 .../compiled_pre/input-data/$strings.csv | 0 .../compiled_pre/input-data/256.edge.3.table | Bin .../compiled_pre/input-data/257.spath.3.table | 0 .../compiled_pre/sssp.cpp} | 103 +++++++++- .../compiled_pre/sssp.cpp.backup} | 0 backend/tests/sssp/sssp.slog | 3 + .../test-input-graph/edge.csv | 0 backend/tests/update/sssp.slog | 3 - slog/common/client.py | 12 +- slogdb | 22 ++ 28 files changed, 448 insertions(+), 178 deletions(-) rename backend/tests/{update => sssp}/compiled_pre/CMakeLists.txt (94%) rename backend/tests/{update/compiled_pre/checkpoints => sssp/compiled_pre}/checkpoint-final/256.edge.3.table_full (100%) create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full rename backend/tests/{update => sssp}/compiled_pre/checkpoints/checkpoint-final/$strings.csv (100%) create mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full rename backend/tests/{update 
=> sssp}/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full (100%) rename backend/tests/{update => sssp}/compiled_pre/compiler-out (100%) rename backend/tests/{update => sssp}/compiled_pre/input-data/$strings.csv (100%) rename backend/tests/{update => sssp}/compiled_pre/input-data/256.edge.3.table (100%) rename backend/tests/{update => sssp}/compiled_pre/input-data/257.spath.3.table (100%) rename backend/tests/{update/sssp_update.cpp => sssp/compiled_pre/sssp.cpp} (82%) rename backend/tests/{update/compiled_pre/sssp.cpp => sssp/compiled_pre/sssp.cpp.backup} (100%) create mode 100644 backend/tests/sssp/sssp.slog rename backend/tests/{update => sssp}/test-input-graph/edge.csv (100%) delete mode 100644 backend/tests/update/sssp.slog create mode 100755 slogdb diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp index 6e315515..90c07c3d 100644 --- a/backend/src/RA/parallel_agg.cpp +++ b/backend/src/RA/parallel_agg.cpp @@ -80,7 +80,7 @@ void parallel_join_aggregate::local_aggregate( u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count(); u32** output_sub_bucket_rank = output->get_sub_bucket_rank(); - int real_join_count = output->get_join_column_count() - 1; + u32 real_join_count = output->get_join_column_count() - 1; agg_buffer.width[ra_counter] = real_join_count + 1; shmap_relation* agg_target; @@ -95,7 +95,7 @@ void parallel_join_aggregate::local_aggregate( } btree::btree_map, u64, shmap_relation::t_comparator> res_map; - for (int bucket=0; bucket < buckets; bucket ++) { + for (u32 bucket=0; bucket < buckets; bucket ++) { for (auto tuple: input->get_full()[bucket]) { std::vector data_v(tuple.begin(), tuple.begin()+target->get_join_column_count()); // std::cout << "On rank " << mcomm.get_rank() << " bucket " << *(target->get_sub_bucket_per_bucket_count()) << std::endl; @@ -111,18 +111,18 @@ void parallel_join_aggregate::local_aggregate( } } - for (int bucket=0; bucket < buckets; bucket ++) { + for (u32 bucket=0; 
bucket < buckets; bucket ++) { for (auto input_tuple: input->get_full()[bucket]) { std::vector joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count()); auto agg_res = res_map[joined_input_tuple]; std::vector tuple(reorder_mapping.size(), 0); int reorder_agg_index = input->get_arity() + 1; - for (int j = 0; j < reorder_mapping.size(); j++) { - if (reorder_mapping[j] == reorder_agg_index) { - tuple[j] = agg_res; - } else { - tuple[j] = input_tuple[reorder_mapping[j]]; - } + for (long unsigned int j = 0; j < reorder_mapping.size(); j++) { + if (reorder_mapping[j] == reorder_agg_index) { + tuple[j] = agg_res; + } else { + tuple[j] = input_tuple[reorder_mapping[j]]; + } } uint64_t bucket_id = tuple_hash(tuple.data(), output->get_join_column_count()) % buckets; diff --git a/backend/src/RA/parallel_agg.h b/backend/src/RA/parallel_agg.h index 8c07c7a8..7189142d 100644 --- a/backend/src/RA/parallel_agg.h +++ b/backend/src/RA/parallel_agg.h @@ -79,7 +79,7 @@ class parallel_join_aggregate : public parallel_RA local_agg_func_t local_func; reduce_agg_func_t reduce_func; global_agg_func_t global_func; - std::vector reorder_mapping; + std::vector reorder_mapping; parallel_join_aggregate(relation* output, relation* target_rel, relation* input, int t_type, local_agg_func_t local_agg_func, diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 76cc949c..38d9e20c 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include bool parallel_join::local_join(int threshold, int* offset, @@ -24,6 +25,13 @@ bool parallel_join::local_join(int threshold, int* offset, join_buffer.width[counter] = reorder_map_array.size(); shmap_relation deduplicate(join_column_count, false); + auto out_dep_cols = output->get_dependent_column(); + if (out_dep_cols.size() != 0) { + for (size_t i = 0; i < out_dep_cols.size() - 1; i++) { + 
deduplicate.dependent_column_indices.push_back(out_dep_cols[i]); + } + deduplicate.update_compare_func = output->get_update_compare_func(); + } u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count(); u32** output_sub_bucket_rank = output->get_sub_bucket_rank(); @@ -53,7 +61,8 @@ bool parallel_join::local_join(int threshold, int* offset, join_column_count, deduplicate, &local_join_count, global_join_duplicates, global_join_inserts, output->get_join_column_count(), - output->get_is_canonical()); + output->get_is_canonical(), + generator_mode, generator_func); // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) @@ -84,7 +93,8 @@ bool parallel_join::local_join(int threshold, int* offset, join_column_count, deduplicate, &local_join_count, global_join_duplicates, global_join_inserts, - output->get_join_column_count(),output->get_is_canonical()); + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h index f7b20979..30e15000 100644 --- a/backend/src/RA/parallel_join.h +++ b/backend/src/RA/parallel_join.h @@ -6,7 +6,9 @@ #pragma once +#include "../parallel_RA_inc.h" #include "../ds.h" +#include class parallel_join: public parallel_RA { @@ -23,6 +25,11 @@ class parallel_join: public parallel_RA { std::vector projection_reorder_index_array; int projection_reorder_index_array_length; + // a function used to generate new tuple based on join input, target tuple (optional) + // if this is provided, it will make join works similar to `copy_generate` + join_generator_func_t generator_func; + 
bool generator_mode = false; + public: parallel_join() { @@ -64,6 +71,7 @@ class parallel_join: public parallel_RA { int get_join_input1_graph_type() {return join_input1_graph_type;} relation* get_join_output() {return join_output_table;} void get_join_projection_index(std::vector* projection_reorder_index_array) {*projection_reorder_index_array = this->projection_reorder_index_array; } + void set_generator_func(join_generator_func_t func) { generator_func = func; generator_mode = true; } #ifdef GOOGLE_MAP bool local_join(int threshold, int* offset, @@ -75,7 +83,7 @@ class parallel_join: public parallel_RA { relation* output, all_to_allv_buffer& join_buffer, int counter, - int join_colun_count, + int join_column_count, u32* local_join_duplicates, u32* local_join_inserts); #else @@ -88,7 +96,7 @@ class parallel_join: public parallel_RA { relation* output, all_to_allv_buffer& join_buffer, int counter, - int join_colun_count, + int join_column_count, u32* local_join_duplicates, u32* local_join_inserts); diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 9528aeb7..b7a8a029 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -1003,15 +1003,10 @@ void RAM::local_insert_in_newt(std::map& intern_map) void RAM::local_insert_in_full() { for (u32 i=0; i < ram_relation_count; i++) - //for (std::map::iterator it = ram_relations.begin() ; it != ram_relations.end(); ++it) { - //relation* current_r = it->first; relation* current_r = ram_relations[i]; current_r->insert_delta_in_full(); current_r->local_insert_in_delta(); - - //if (current_r->get_debug_id() == 11) - // current_r->print(); } return; } diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 6018caf6..ad504f1d 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -80,7 +80,7 @@ void LIE::update_task_graph(RAM* executable_task) taskgraph.erase(lie_sccs[i]); // check if relation in this scc need gc auto gc_rels = 
executable_task->get_gc_relation(); - for (int j=0; j < gc_rels.size(); j++) { + for (size_t j=0; j < gc_rels.size(); j++) { auto pos = std::find(lie_relations.begin(), lie_relations.end(), gc_rels[j]); if (pos != lie_relations.end()) { lie_relations.erase(pos); @@ -440,7 +440,7 @@ bool LIE::execute () delta_filename = delta_filename + "_" + std::to_string(mcomm.get_local_rank()); scc_relation[i]->set_filename(delta_filename); - scc_relation[i]->set_initailization_type(0); + scc_relation[i]->set_initialization_type(0); int is_access = access(delta_filename.c_str(), F_OK); int access_sum = 0; @@ -476,9 +476,9 @@ bool LIE::execute () else executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); - // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; - // for (u32 i = 0 ; i < scc_relation_count; i++) - // print_relation_size(scc_relation[i]); + std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; + for (u32 i = 0 ; i < scc_relation_count; i++) + print_relation_size(scc_relation[i]); // stat_intermediate(); //executed_scc_id.push_back(executable_task->get_id()); #if 0 @@ -544,10 +544,12 @@ bool LIE::execute () // std::cout << "Writing checkpoint dump " << checkpoint_dumps_num << " takes " << max_write_cp_time << "(s)" << std::endl; checkpoint_dumps_num++; } -#endif - // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; - // for (u32 i = 0 ; i < scc_relation_count; i++) - // print_relation_size(scc_relation[i]); +#endif + // if (loop_counter < 20) { + std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; + for 
(u32 i = 0 ; i < scc_relation_count; i++) + print_relation_size(scc_relation[i]); + // } // stat_intermediate(); // loop_counter++; //iteration_count[executable_task->get_id()] = loop_counter; @@ -565,7 +567,7 @@ bool LIE::execute () if (mcomm.get_rank() == 0) { - // std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." << std::endl; + std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." << std::endl; // print_all_relation_size(); } full_iteration_count += loop_counter; diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h index 957d8c42..e02739c5 100644 --- a/backend/src/parallel_RA_inc.h +++ b/backend/src/parallel_RA_inc.h @@ -13,11 +13,13 @@ #include "compat.h" // #include "shmap/shmap.h" #include "shmap/shmap_goog.h" -#include //#define DEBUG_OUTPUT 1 #define MAX_LOOP_COUNT 120000 +using update_partial_compare_func_t = std::function(std::vector old_v, std::vector new_v)>; +using join_generator_func_t = std::function& target_v, std::vector& input_v, u64* res)>; + #include "log/logger.h" #include "hash/hash.h" #include "comm/comm.h" @@ -33,7 +35,7 @@ enum class SpecialAggregator { count, maximum, minimum, - recusive + recursive }; // TODO: remove unused argument @@ -46,6 +48,7 @@ using global_agg_func_t = std::function& data, local_agg_res_t agg_data, int agg_data_count, std::vector& output); + #include "relation/balanced_hash_relation.h" #include "RA/parallel_RA.h" #include "RA/fact.h" diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 50d31ea0..7b5deef9 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -528,9 +528,9 @@ void relation::load_data_from_separate_files() double read_data_end = MPI_Wtime(); double read_data_time = read_data_end - read_data_start; - if (initailization_type 
== DELTA) + if (initialization_type == DELTA) populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); - else if (initailization_type == FULL) + else if (initialization_type == FULL) populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); file_io.delete_hash_buffers(); @@ -539,7 +539,7 @@ void relation::load_data_from_separate_files() MPI_Reduce(&read_data_time, &max_read_data_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm()); std::string read_io = (share_io == true)? "MPI IO": "POSIX IO"; - std::string type = (initailization_type == DELTA)? "DELTA": "FULL"; + std::string type = (initialization_type == DELTA)? "DELTA": "FULL"; if (mcomm.get_rank() == 0 && restart_flag == true) std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n " << type << " [RD], " << @@ -554,9 +554,9 @@ void relation::load_data_from_file_with_offset() double read_data_end = MPI_Wtime(); double read_data_time = read_data_end - read_data_start; - if (initailization_type == DELTA) + if (initialization_type == DELTA) populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); - else if (initailization_type == FULL) + else if (initialization_type == FULL) populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); file_io.delete_hash_buffers(); @@ -565,7 +565,7 @@ void relation::load_data_from_file_with_offset() MPI_Reduce(&read_data_time, &max_read_data_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm()); std::string read_io = (share_io == true)? "MPI IO": "POSIX IO"; - std::string type = (initailization_type == DELTA)? "DELTA": "FULL"; + std::string type = (initialization_type == DELTA)? 
"DELTA": "FULL"; if (mcomm.get_rank() == 0 && restart_flag == true) std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n " << type << " [RD], " << @@ -584,7 +584,7 @@ void relation::load_data_from_file() // // << "c++ object " << this // << "start normal IO" << std::endl; /// reading from file - if (initailization_type != -1) + if (initialization_type != -1) { /// Main : Execute : init : io : end double read_data_start = MPI_Wtime(); @@ -601,10 +601,10 @@ void relation::load_data_from_file() file_io.delete_raw_buffers(); /* Copy data from buffer to relation */ - if (initailization_type == DELTA) + if (initialization_type == DELTA) populate_delta(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); - else if (initailization_type == FULL) + else if (initialization_type == FULL) populate_full(file_io.get_hash_buffer_size(), file_io.get_hash_buffer()); file_io.delete_hash_buffers(); @@ -615,7 +615,7 @@ void relation::load_data_from_file() MPI_Reduce(&all_to_all_time, &max_all_to_all_time, 1, MPI_DOUBLE, MPI_MAX, 0, mcomm.get_local_comm()); std::string read_io = (share_io == true)? "MPI IO": "POSIX IO"; - std::string type = (initailization_type == DELTA)? "DELTA": "FULL"; + std::string type = (initialization_type == DELTA)? 
"DELTA": "FULL"; if (mcomm.get_rank() == 0 && restart_flag == true) std::cout << "Read " << get_debug_id() << " (" << read_io << ") :\n " << type << " [RD] [AC], " << @@ -655,13 +655,16 @@ void relation::initialize_relation(mpi_comm& mcomm, std::map& intern_m full = new shmap_relation[buckets]; newt = new shmap_relation[buckets]; - for (int i = 0 ; i < buckets; i++) { + for (u32 i = 0 ; i < buckets; i++) { delta[i].arity = arity; - delta[i].dependant_column_index = dependant_column_index; + delta[i].dependent_column_indices = dependent_column_indices; + delta[i].update_compare_func = update_compare_func; full[i].arity = arity; - full[i].dependant_column_index = dependant_column_index; + full[i].dependent_column_indices = dependent_column_indices; + full[i].update_compare_func = update_compare_func; newt[i].arity = arity; - newt[i].dependant_column_index = dependant_column_index; + newt[i].dependent_column_indices = dependent_column_indices; + newt[i].update_compare_func = update_compare_func; } #endif @@ -777,6 +780,7 @@ void relation::populate_full(int buffer_size, u64* buffer) u32 counter = 0; u64 t[arity+1]; u32 buckets = get_bucket_count(); + std::cout << "populating full for " << intern_tag << std::endl; for (int i = 0; i < buffer_size; i = i + (arity+1)) { @@ -800,6 +804,7 @@ void relation::populate_delta (int buffer_size, u64* buffer) { u64 t[arity+1]; u32 buckets = get_bucket_count(); + std::cout << "populating delta for " << intern_tag << std::endl; for (int i = 0; i < buffer_size; i = i + (arity+1)) { @@ -908,7 +913,7 @@ void relation::finalize_relation() full_element_count = 0; delta_element_count = 0; - initailization_type = -1; + initialization_type = -1; delete[] distinct_sub_bucket_rank_count; for (u64 b = 0; b < buckets; b++) @@ -1064,7 +1069,7 @@ void relation::copy_relation(relation*& recv_rel, mpi_comm output_comm, int targ finalize_relation(); - recv_rel->set_initailization_type(-1); + recv_rel->set_initialization_type(-1); 
//recv_rel->initialize_relation(output_comm); @@ -1128,6 +1133,7 @@ bool relation::insert_in_delta(u64* t) if (is_canonical == false && arity != 0 && arity >= join_column_count) sub_bucket_id = tuple_hash(t + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id]; + // std::cout << "inserting delta for " << intern_tag << std::endl; //assert((int)bucket_id == mcomm.get_local_rank()); if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true) { @@ -1150,6 +1156,7 @@ bool relation::insert_in_newt(u64* t) if (is_canonical == false && arity != 0 && arity >= join_column_count) sub_bucket_id = tuple_hash(t + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id]; + // std::cout << "inserting newt for " << intern_tag << std::endl; //assert((int)bucket_id == mcomm.get_local_rank()); if (newt[bucket_id].insert_tuple_from_array(t, arity+1) == true) { @@ -1184,7 +1191,9 @@ bool relation::insert_in_full(u64* t) std::cout << std::endl; } #endif + // std::cout << "inserting full for " << intern_tag << std::endl; + // TODO: use normal insert here! 
if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true) { full_element_count++; @@ -1274,75 +1283,49 @@ int relation::insert_full_in_delta() return insert_success; } - - void relation::local_insert_in_delta() { int rank; MPI_Comm_rank(mcomm.get_comm(), &rank); u32 buckets = get_bucket_count(); - delete[] delta; - - - delta = newt; - - - /* - u32 i = mcomm.get_rank(); - vector_buffer *vb_newt = new vector_buffer[buckets]; - vb_newt[i].vector_buffer_create_empty(); - std::vector prefix = {}; - newt[i].as_vector_buffer_recursive(&(vb_newt[i]), prefix); - - if (i == 0) - std::cout << "XX [" << get_debug_id() << "] Test " << mcomm.get_rank() << " DELTA " << vb_newt[i].size/(sizeof(u64) * (arity + 1)) << " arity " << arity + 1 << std::endl; - - vb_newt[i].vector_buffer_free(); - - delete[] vb_newt; - - - - //u32 i = mcomm.get_rank(); - vector_buffer *vb_delta = new vector_buffer[buckets]; - vb_delta[i].vector_buffer_create_empty(); - //std::vector prefix = {}; - delta[i].as_vector_buffer_recursive(&(vb_delta[i]), prefix); - - if (i == 0) - std::cout << "YY [" << get_debug_id() << "] Test " << mcomm.get_rank() << " DELTA " << vb_delta[i].size/(sizeof(u64) * (arity + 1)) << " arity " << arity + 1 << std::endl; - - vb_delta[i].vector_buffer_free(); - - delete[] vb_delta; - */ - - - delta_element_count = newt_element_count; - //if (rank == 0) - // std::cout << "[" << get_debug_id() << "] copyng newt pointer to delta " << delta_element_count << std::endl; - - memcpy(delta_bucket_element_count, newt_bucket_element_count, buckets * sizeof(u32)); - for (u32 b = 0; b < buckets; b++) - { - memcpy(delta_sub_bucket_element_count[b], newt_sub_bucket_element_count[b], sub_bucket_per_bucket_count[b] * sizeof(u32)); - memset(newt_sub_bucket_element_count[b], 0, sub_bucket_per_bucket_count[b] * sizeof(u32)); - } - -#ifdef GOOGLE_MAP - newt = new google_relation[buckets]; -#else - newt = new shmap_relation[buckets]; - - for (int i = 0; i < buckets; i++) { - newt[i].arity = arity; 
- newt[i].dependant_column_index = dependant_column_index; + if (dependent_column_indices.size() > 0) { + delta_element_count = 0; + for (u32 i = 0; i < buckets; i++) { + delta[i].purge(); + memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); + for (auto& t: newt[i]) { + if (full[i].check_dependent_insertion(t)) { + delta[i].insert(t); + uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % get_bucket_count(); + u32 sub_bucket_id = 0; + if (is_canonical == false && arity != 0 && arity >= join_column_count) + sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id]; + delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++; + delta_element_count++; + } + } + newt[i].purge(); + memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); + } + } else { + delete[] delta; + delta = newt; + delta_element_count = newt_element_count; + memcpy(delta_bucket_element_count, newt_bucket_element_count, buckets * sizeof(u32)); + for (u32 b = 0; b < buckets; b++) + { + memcpy(delta_sub_bucket_element_count[b], newt_sub_bucket_element_count[b], sub_bucket_per_bucket_count[b] * sizeof(u32)); + memset(newt_sub_bucket_element_count[b], 0, sub_bucket_per_bucket_count[b] * sizeof(u32)); + } + newt = new shmap_relation[buckets]; + for (u32 i = 0; i < buckets; i++) { + newt[i].arity = arity; + newt[i].dependent_column_indices = dependent_column_indices; + newt[i].update_compare_func = update_compare_func; + } } -#endif - //for(u32 i=0; i +#include +#include #include +#include enum {LEFT=0, RIGHT}; enum {DELTA=0, FULL, FULL_AND_DELTA}; -enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION}; +enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE}; enum {STATIC=0, DYNAMIC}; +// this is update function for column has functional dependence +// the size of vector arguments must have 
exactly same size as dependent_column_indices + class relation { @@ -27,7 +34,7 @@ class relation u32 intern_tag; /// id of relation (to be used for interning) std::string debug_id; - int initailization_type = -1; /// used when task balancing is required + int initialization_type = -1; /// used when task balancing is required std::string filename = NULL; /// Name of file to open @@ -79,7 +86,8 @@ class relation bool restart_flag; //bool fact_load=false; //std::vector init_val; - std::optional dependant_column_index = std::nullopt; + std::vector dependent_column_indices; + update_partial_compare_func_t update_compare_func; public: @@ -92,7 +100,7 @@ class relation /// "/var/tmp/g13236/path_2_1_2": location of data file that gets loaded in the relation /// FULL: load in FULL (other option is to loadin DELTA, but we alwys load in FULL) relation (u32 jcc, bool is_c, u32 ar, u32 tg, std::string fname, int version) - :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initailization_type(version), filename(fname) + :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initialization_type(version), filename(fname) { //fact_load = false; full_element_count=0; @@ -100,7 +108,7 @@ class relation } relation (u32 jcc, bool is_c, u32 ar, u32 tg, std::string did, std::string fname, int version) - :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), debug_id(did), initailization_type(version), filename(fname) + :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), debug_id(did), initialization_type(version), filename(fname) { //fact_load = false; full_element_count=0; @@ -108,7 +116,7 @@ class relation } relation (u32 jcc, bool is_c, u32 ar, u32 tg, int version) - :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initailization_type(version), filename("") + :join_column_count(jcc), is_canonical(is_c), arity(ar), intern_tag(tg), initialization_type(version), filename("") { //fact_load 
= false; full_element_count=0; @@ -134,14 +142,27 @@ class relation //void set_init_val(std::vector temp_init_val) {init_val = temp_init_val;} - void set_dependant_column(int idx) { dependant_column_index = idx; } + void set_dependent_column_update(std::vector idx, update_partial_compare_func_t f) { + dependent_column_indices = idx; + update_compare_func= f; + // for (int i = 0; i < get_bucket_count(); i++) { + // delta[i].dependent_column_indices = dependent_column_indices; + // delta[i].update_compare_func = update_compare_func; + // full[i].dependent_column_indices = dependent_column_indices; + // full[i].update_compare_func = update_compare_func; + // newt[i].dependent_column_indices = dependent_column_indices; + // newt[i].update_compare_func = update_compare_func; + // } + } + std::vector get_dependent_column() { return dependent_column_indices; } + update_partial_compare_func_t get_update_compare_func() { return update_compare_func; } /// used for load balancing void set_last_rank(int lr) {last_rank = lr;} int get_last_rank() { return last_rank;} /// used for task-level parallelism - void set_initailization_type(int x) { initailization_type = x; } + void set_initialization_type(int x) { initialization_type = x; } bool get_is_canonical() {return is_canonical;} diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index e299cbc3..19287e53 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -11,7 +11,6 @@ #include "../btree/btree_set.h" #include #include -#include struct shmap_relation { @@ -19,37 +18,28 @@ struct shmap_relation { int data_structure_type; - std::optional dependant_column_index; // some column may have functional dependance its support for lattice like language feature + // some column may have functional dependance its support for lattice like language feature + // please always consider id column as a functional dependent column + std::vector 
dependent_column_indices; + update_partial_compare_func_t update_compare_func; using t_tuple = std::vector; struct t_comparator { // 0-arity compare will fail - t_comparator() : _id_flag(true) { dependant_column_index = std::nullopt; } - t_comparator(std::optional dt): dependant_column_index(dt) {} + t_comparator() {} bool operator()(const t_tuple &a, const t_tuple &b) const { // make it an unroll loop when change to array int size = a.size(); - if (dependant_column_index.has_value()) { for (int i=0; i < size; i++) { - if (i == dependant_column_index.value()) { continue; } if (a[i] < b[i]) return true; if (a[i] > b[i]) return false; } - } else { - for (int i=0; i < size; i++) { - if (a[i] < b[i]) - return true; - if (a[i] > b[i]) - return false; - } - } + return false; } - bool _id_flag; - std::optional dependant_column_index; }; // souffle use multi set for some relation @@ -123,7 +113,7 @@ struct shmap_relation { shmap_relation(int arity, bool id_flag); shmap_relation() { // id_flag = true; - dependant_column_index = std::nullopt; + // dependent_column_indices = std::nullopt; // ind = new t_ind(t_comparator(id_flag)); // int rank; // MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -134,9 +124,12 @@ struct shmap_relation { bool insert_tuple_from_array(u64* t, int arity); void remove_tuple(); bool find_tuple_from_array(u64* t, int arity); + bool check_dependent_insertion(const std::vector &v); void as_vector_buffer_recursive(vector_buffer* vb, std::vector prefix); + // TODO: move all these logic to RA operation! 
+ void as_all_to_allv_copy_buffer(all_to_allv_buffer& buffer, std::vector prefix, std::vector reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int head_rel_hash_col_count, bool canonical); void as_all_to_allv_copy_filter_buffer(all_to_allv_buffer& buffer, std::vector prefix, std::vector reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, bool(*lambda)(const u64* const), int head_rel_hash_col_count, bool canonical); @@ -150,7 +143,8 @@ struct shmap_relation { int join_column_count, shmap_relation& deduplicate, int* local_join_count, u32* local_join_duplicates, u32* local_join_inserts, - int head_rel_hash_col_count, bool canonical); + int head_rel_hash_col_count, bool canonical, + bool generator_mode, join_generator_func_t gen_func); void as_all_to_allv_left_join_buffer( std::vector prefix, all_to_allv_buffer& join_buffer, @@ -161,7 +155,8 @@ struct shmap_relation { int join_column_count, shmap_relation& deduplicate, int* local_join_count, u32* local_join_duplicates, u32* local_join_inserts, int head_rel_hash_col_count, - bool canonical); + bool canonical, + bool generator_mode, join_generator_func_t gen_func); void as_all_to_allv_right_outer_join_buffer( shmap_relation* target_relation, @@ -171,7 +166,7 @@ struct shmap_relation { int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, std::vector& reorder_map, - int join_column_count, int out_airty, + int join_column_count, int out_arity, int head_rel_hash_col_count, bool canonical); void as_all_to_allv_copy_generate_buffer(all_to_allv_buffer& buffer, std::vector prefix, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int(*lambda)(const u64* const, u64* const), int head_rel_hash_col_count, bool canonical); diff --git a/backend/src/relation/shmap_relation_exp.cpp 
b/backend/src/relation/shmap_relation_exp.cpp index d9f6c6d6..654cc350 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -12,10 +12,12 @@ #include "../parallel_RA_inc.h" #include "shmap_relation.h" +#include #include #include - - +#include +#include +#include shmap_relation::shmap_relation(int arity, bool id_flag) { @@ -26,8 +28,134 @@ shmap_relation::shmap_relation(int arity, bool id_flag) bool shmap_relation::insert_tuple_from_array(u64 *t, int width) { t_tuple tp(t, t+width); + // check if relation has functional dependance + if (dependent_column_indices.size() > 0) { + std::vector index_columns; + std::vector dependent_columns; + t_tuple upper_bound(width, std::numeric_limits::max()); + t_tuple lower_bound(width, std::numeric_limits::min()); + for (int i = 0; i < width-dependent_column_indices.size(); i++) { + upper_bound[i] = tp[i]; + lower_bound[i] = tp[i]; + } + for (auto i: dependent_column_indices) { + dependent_columns.push_back(t[i]); + } + auto exist_tuples_range = lowerUpperRange(lower_bound, upper_bound); + if (exist_tuples_range.first == ind.end()) { + // std::cout << "adding to lattice with <<<<<< "; + // for (auto c: tp) { + // std::cout << c << " "; + // } + // std::cout << " while lower bound ... 
"; + // for (auto c: lower_bound) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // std::cout << "The current btree: " << std::endl; + // for (auto t: ind) { + // std::cout << "Tuple : "; + // for (auto c: t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } + return insert(tp); + } else { + // update + // iterator need_delete = ind.end(); + std::vector need_deletes; + for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) { + auto cur_tuple = *it; + // std::cout << "comparing <<<<<< "; + // for (auto c: cur_tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + std::vector old_t; + for (auto i: dependent_column_indices) { + old_t.push_back(cur_tuple[i]); + } + auto compare_res = update_compare_func(old_t, dependent_columns); + if (compare_res.has_value() && compare_res.value()) { + need_deletes.push_back(it); + // std::cout << "update with <<<<<< "; + // for (auto c: tp) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + } + } + if (!need_deletes.empty()) { + for (auto d: need_deletes) { + ind.erase(*d); + } + return insert(tp); + } else { + return false; + } + } + } else { + // std::cout << "adding to normal "<< arity << " with <<<<<< "; + // for (auto c: tp) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + return insert(tp); + } +} - return insert(tp); +bool +shmap_relation::check_dependent_insertion(const std::vector &tp) { + if (dependent_column_indices.size() > 0) { + std::vector index_columns; + std::vector dependent_columns; + t_tuple upper_bound(tp.size(), std::numeric_limits::max()); + t_tuple lower_bound(tp.size(), std::numeric_limits::min()); + for (size_t i = 0; i < tp.size()-dependent_column_indices.size(); i++) { + upper_bound[i] = tp[i]; + lower_bound[i] = tp[i]; + } + for (auto i: dependent_column_indices) { + dependent_columns.push_back(tp[i]); + } + auto exist_tuples_range = lowerUpperRange(lower_bound, upper_bound); + if 
(exist_tuples_range.first == ind.end()) { + return true; + } else { + for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) { + auto cur_tuple = *it; + std::vector old_t; + for (auto i: dependent_column_indices) { + old_t.push_back(cur_tuple[i]); + } + auto compare_res = update_compare_func(old_t, dependent_columns); + if (compare_res.has_value() && compare_res.value()) { + return true; + } + } + // std::cout << " not adding to lattice with <<<<<< "; + // for (auto c: tp) { + // std::cout << c << " "; + // } + // std::cout << " while lower bound ... "; + // for (auto c: lower_bound) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // std::cout << "The current btree: " << std::endl; + // for (auto& t: ind) { + // std::cout << "Tuple : "; + // for (auto c: t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } + return false; + } + } else { + return true; + } } std::pair @@ -267,7 +395,8 @@ void shmap_relation::as_all_to_allv_right_join_buffer( u32 *local_join_duplicates, u32 *local_join_inserts, int head_rel_hash_col_count, - bool canonical) + bool canonical, + bool generator_mode, join_generator_func_t gen_func) { if (size() == 0) return; @@ -284,16 +413,20 @@ void shmap_relation::as_all_to_allv_right_join_buffer( { auto cur_path = *it; u64 projected_path[join_buffer.width[ra_id]]; - u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; - for (int i = 0; i < input1_buffer_width; i++) - reordered_cur_path[i] = cur_path[i]; - - for (int i = join_column_count; i < input0_buffer_width; i++) - reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i]; - - for (int i =0; i < join_buffer.width[ra_id]; i++) - projected_path[i] = reordered_cur_path[reorder_map[i]]; - + if (generator_mode) { + std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + gen_func(input_t, cur_path, projected_path); + } else { + u64 
reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; + for (int i = 0; i < input1_buffer_width; i++) + reordered_cur_path[i] = cur_path[i]; + + for (int i = join_column_count; i < input0_buffer_width; i++) + reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i]; + + for (int i =0; i < join_buffer.width[ra_id]; i++) + projected_path[i] = reordered_cur_path[reorder_map[i]]; + } if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) { uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; @@ -318,7 +451,6 @@ void shmap_relation::as_all_to_allv_right_join_buffer( (*local_join_duplicates)++; } } - // std::cout << "inserted " << *local_join_inserts << std::endl; } void shmap_relation::as_all_to_allv_left_join_buffer( @@ -335,7 +467,8 @@ void shmap_relation::as_all_to_allv_left_join_buffer( u32 *local_join_duplicates, u32 *local_join_inserts, int head_rel_hash_col_count, - bool canonical) + bool canonical, + bool generator_mode, join_generator_func_t gen_func) { if (size() == 0) return; @@ -352,16 +485,21 @@ void shmap_relation::as_all_to_allv_left_join_buffer( { auto cur_path = *it; u64 projected_path[join_buffer.width[ra_id]]; - u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; - for (int i = 0; i < input0_buffer_width; i++) - reordered_cur_path[i] = input0_buffer[i]; - - for (int i = join_column_count; i < input1_buffer_width; i++) - reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i]; - - for (int i =0; i < join_buffer.width[ra_id]; i++) - projected_path[i] = reordered_cur_path[reorder_map[i]]; - + if (generator_mode) { + std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + gen_func(cur_path, input_t, projected_path); + } else { + u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; + for (int i = 0; i < input0_buffer_width; 
i++) + reordered_cur_path[i] = input0_buffer[i]; + + for (int i = join_column_count; i < input1_buffer_width; i++) + reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i]; + + for (int i =0; i < join_buffer.width[ra_id]; i++) + projected_path[i] = reordered_cur_path[reorder_map[i]]; + } + //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl; if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) { diff --git a/backend/tests/update/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt similarity index 94% rename from backend/tests/update/compiled_pre/CMakeLists.txt rename to backend/tests/sssp/compiled_pre/CMakeLists.txt index cb2c1d5f..a5e5801d 100644 --- a/backend/tests/update/compiled_pre/CMakeLists.txt +++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt @@ -15,8 +15,8 @@ find_package(MPI REQUIRED) # endif() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive") -set (base_dir "${PROJECT_SOURCE_DIR}/../backend") -set (source_dir "${base_dir}/src") +# set (base_dir "${PROJECT_SOURCE_DIR}/../backend") +set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" 
"${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp.cpp") diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full similarity index 100% rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full rename to backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full new file mode 100644 index 0000000000000000000000000000000000000000..478adf3b2a8e2bf5756cf0669e86d003e15ca8d3 GIT binary patch literal 512 zcmZ9ITMB?M5CdD^qKKl1$Lak~<#x?rsXw8couoB}?`x!;^vIf0h&s2@LHcAZe7Iiz zS^3P#@m%BSMTLh;JaahCuIcCL=YDs^)6bsg8BZ=s|1O?6+^hvZ8_%4*QNH&TK6~VE d%I98<4F0#j!+3o1(Y4~qCugdBa#Rx*`7ihw2~z+7 literal 0 HcmV?d00001 diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/$strings.csv b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv similarity index 100% rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/$strings.csv rename to backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full new file mode 100644 index 0000000000000000000000000000000000000000..a5b47390726befd417416b8e76e64db49a1e53f8 GIT binary patch literal 288 zcmYL@+YP`l3<61CX)=}fpQ=*f&_L+d4S=*-u@*FJgclJ_f= g&{z83&?&Hw-a literal 0 HcmV?d00001 diff --git a/backend/tests/update/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full 
similarity index 100% rename from backend/tests/update/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full rename to backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full diff --git a/backend/tests/update/compiled_pre/compiler-out b/backend/tests/sssp/compiled_pre/compiler-out similarity index 100% rename from backend/tests/update/compiled_pre/compiler-out rename to backend/tests/sssp/compiled_pre/compiler-out diff --git a/backend/tests/update/compiled_pre/input-data/$strings.csv b/backend/tests/sssp/compiled_pre/input-data/$strings.csv similarity index 100% rename from backend/tests/update/compiled_pre/input-data/$strings.csv rename to backend/tests/sssp/compiled_pre/input-data/$strings.csv diff --git a/backend/tests/update/compiled_pre/input-data/256.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/256.edge.3.table similarity index 100% rename from backend/tests/update/compiled_pre/input-data/256.edge.3.table rename to backend/tests/sssp/compiled_pre/input-data/256.edge.3.table diff --git a/backend/tests/update/compiled_pre/input-data/257.spath.3.table b/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table similarity index 100% rename from backend/tests/update/compiled_pre/input-data/257.spath.3.table rename to backend/tests/sssp/compiled_pre/input-data/257.spath.3.table diff --git a/backend/tests/update/sssp_update.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp similarity index 82% rename from backend/tests/update/sssp_update.cpp rename to backend/tests/sssp/compiled_pre/sssp.cpp index 6c399409..00313a55 100644 --- a/backend/tests/update/sssp_update.cpp +++ b/backend/tests/sssp/compiled_pre/sssp.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -401,24 +402,116 @@ int main(int argc, char **argv) { slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", FULL); + relation* rel__edge__3__1 = new relation( + 1, false, 3, 
get_tag_for_rel("edge","1"), + std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", + FULL); + + // the dependent column must be exclude from hash computation, so join column count is 3 - 1 = 2 relation *rel__spath__3__1__2__3 = new relation( - 3, true, 3, get_tag_for_rel("spath", "1__2__3"), + 2, true, 3, get_tag_for_rel("spath", "1__2__3"), std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", FULL); + // set functional dependency for spath + rel__spath__3__1__2__3->set_dependent_column_update( + {2, 3}, // len and id column + [](std::vector old_v, std::vector new_v) -> std::optional { + // if (new_v[0] < old_v[0]) { + // std::cout << "Comparing >>>> "; + // for (auto v: old_v) { + // std::cout << v << " "; + // } + // std::cout << " with "; + // for (auto v: new_v) { + // std::cout << v << " "; + // } + // std::cout << std::endl; + // } + return new_v[0] < old_v[0]; + } + ); + relation* rel__spath__3__2 = new relation( + 1, false, 3, get_tag_for_rel("spath","2"), + std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", + FULL); + rel__spath__3__2->set_dependent_column_update( + {2, 3}, + [](std::vector old_v, std::vector new_v) -> std::optional { + // if (new_v[0] < old_v[0]) { + // std::cout << "Comparing >>>> "; + // for (auto v: old_v) { + // std::cout << v << " "; + // } + // std::cout << " with "; + // for (auto v: new_v) { + // std::cout << v << " "; + // } + // std::cout << std::endl; + // } + return new_v[0] < old_v[0]; + } + ); - RAM *scc0 = new RAM(false, 0); - scc0->add_relation(rel__edge__3__1__2__3, false, false); - scc0->add_relation(rel__spath__3__1__2__3, true, false); - scc0->add_rule(new parallel_copy(rel__spath__3__1__2__3, + RAM* scc0 = new RAM(false, 0); + scc0->add_relation(rel__edge__3__1, true, false); + scc0->add_relation(rel__edge__3__1__2__3, true, false); + scc0->add_rule(new 
parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, DELTA, {0, 3, 1, 2})); + + RAM *scc1 = new RAM(false, 0); + scc1->add_relation(rel__edge__3__1__2__3, false, false); + scc1->add_relation(rel__spath__3__1__2__3, true, false); + scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL, {0, 1, 2})); + RAM *scc2 = new RAM(true, 1); + scc2->add_relation(rel__edge__3__1__2__3, false, false); + scc2->add_relation(rel__spath__3__2, true, false); + scc2->add_relation(rel__spath__3__1__2__3, true, false); + // the order of non join column also need to be carefully arranged because, dependent column + // should always at last + scc2->add_rule(new parallel_acopy( + rel__spath__3__2, + rel__spath__3__1__2__3, DELTA, + {1, 0, 2, 3})); // 2, 1, 3, id + parallel_join* update_spath_j = new parallel_join( + rel__spath__3__1__2__3, + rel__edge__3__1, FULL, + rel__spath__3__2, DELTA, + {5, 2, 3}// useless + ); + update_spath_j->set_generator_func([](std::vector& target_v, std::vector& input_v, u64* res) { + // std::cout << "Join >>>> "; + // for (auto v: target_v) { + // std::cout << v << " "; + // } + // std::cout << " with "; + // for (auto v: input_v) { + // std::cout << v << " "; + // } + // std::cout << std::endl; + res[0] = target_v[1]; + res[1] = input_v[2]; + if (res[0] == res[1]) { + res[2] = 0; + } else { + res[2] = target_v[2] + input_v[3]; + } + }); + scc2->add_rule(update_spath_j); + LIE *lie = new LIE(); + lie->add_relation(rel__edge__3__1); lie->add_relation(rel__edge__3__1__2__3); + lie->add_relation(rel__spath__3__2); lie->add_relation(rel__spath__3__1__2__3); lie->add_scc(scc0); + lie->add_scc(scc1); + lie->add_scc(scc2); + lie->add_scc_dependance(scc0, scc2); + lie->add_scc_dependance(scc1, scc2); // Enable IO lie->enable_all_to_all_dump(); diff --git a/backend/tests/update/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp.backup similarity index 100% rename from backend/tests/update/compiled_pre/sssp.cpp rename to 
backend/tests/sssp/compiled_pre/sssp.cpp.backup diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog new file mode 100644 index 00000000..abdace44 --- /dev/null +++ b/backend/tests/sssp/sssp.slog @@ -0,0 +1,3 @@ + +[(spath from to dist) <-- (edge from to dist)] +[(spath from to l) <-- (spath from mid dist) (edge mid to l)] diff --git a/backend/tests/update/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv similarity index 100% rename from backend/tests/update/test-input-graph/edge.csv rename to backend/tests/sssp/test-input-graph/edge.csv diff --git a/backend/tests/update/sssp.slog b/backend/tests/update/sssp.slog deleted file mode 100644 index 71bfbe56..00000000 --- a/backend/tests/update/sssp.slog +++ /dev/null @@ -1,3 +0,0 @@ - -[(spath from to dist) <-- (edge from to dist)] - diff --git a/slog/common/client.py b/slog/common/client.py index 4d170714..6398146c 100644 --- a/slog/common/client.py +++ b/slog/common/client.py @@ -318,12 +318,12 @@ def _run(self, program_hashes:list, input_database:str, cores=2, writer=Writer() def _update_intern_strings(self, db_id): """ update cached string.csv data """ if self.local_db_path: - with open(os.path.join(self.local_db_path, '$strings.csv'), 'r') as string_file: - for s_line in string_file: - if s_line.strip() == '': - continue - sv = s_line.split('\t')[1] - self.intern_string_dict[string_hash(sv.strip())] = sv.strip() + # with open(os.path.join(self.local_db_path, '$strings.csv'), 'r') as string_file: + # for s_line in string_file: + # if s_line.strip() == '': + # continue + # sv = s_line.split('\t')[1] + # self.intern_string_dict[string_hash(sv.strip())] = sv.strip() return req = slog_pb2.StringRequest() req.database_id = db_id diff --git a/slogdb b/slogdb new file mode 100755 index 00000000..aefb6b59 --- /dev/null +++ b/slogdb @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +""" +show a local database with limited feature in REPL + +Yihao Sun +""" + +import argparse + +from 
slog.repl.repl import Repl + +def run_repl(db_path): + repl = Repl(local_db_path=db_path) + repl.loop() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("db_path", help="The file folder path of a slog database.") + + args = parser.parse_args() + run_repl(args.db_path) From 357c7c49f9c7a7749d5dd3f51731fee306fd7b61 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Sun, 13 Nov 2022 23:49:38 -0500 Subject: [PATCH 05/36] use real sssp --- backend/src/lie/lie.cpp | 12 +- backend/src/relation/shmap_relation_exp.cpp | 1 + .../checkpoint-final/257.spath.3.table_full | Bin 512 -> 0 bytes .../checkpoint-final/258.spath.2.table_full | Bin 0 -> 96 bytes backend/tests/sssp/compiled_pre/sssp.cpp | 261 ++++++++++++------ backend/tests/sssp/sssp.slog | 4 +- slog/tests/benchmark.py | 191 +++++++++++++ 7 files changed, 373 insertions(+), 96 deletions(-) delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full create mode 100644 slog/tests/benchmark.py diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index ad504f1d..b2d16761 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -476,9 +476,9 @@ bool LIE::execute () else executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); - std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; - for (u32 i = 0 ; i < scc_relation_count; i++) - print_relation_size(scc_relation[i]); + // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; + // for (u32 i = 0 ; i < scc_relation_count; i++) + // print_relation_size(scc_relation[i]); // 
stat_intermediate(); //executed_scc_id.push_back(executable_task->get_id()); #if 0 @@ -546,9 +546,9 @@ bool LIE::execute () } #endif // if (loop_counter < 20) { - std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; - for (u32 i = 0 ; i < scc_relation_count; i++) - print_relation_size(scc_relation[i]); + // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; + // for (u32 i = 0 ; i < scc_relation_count; i++) + // print_relation_size(scc_relation[i]); // } // stat_intermediate(); // loop_counter++; diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 654cc350..2db1941b 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -417,6 +417,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer( std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); gen_func(input_t, cur_path, projected_path); } else { + // std::cout << "here" << std::endl; u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; for (int i = 0; i < input1_buffer_width; i++) reordered_cur_path[i] = cur_path[i]; diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/257.spath.3.table_full deleted file mode 100644 index 478adf3b2a8e2bf5756cf0669e86d003e15ca8d3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 512 zcmZ9ITMB?M5CdD^qKKl1$Lak~<#x?rsXw8couoB}?`x!;^vIf0h&s2@LHcAZe7Iiz zS^3P#@m%BSMTLh;JaahCuIcCL=YDs^)6bsg8BZ=s|1O?6+^hvZ8_%4*QNH&TK6~VE d%I98<4F0#j!+3o1(Y4~qCugdBa#Rx*`7ihw2~z+7 diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full new file mode 100644 index 
0000000000000000000000000000000000000000..d164029436e14bdd3dce073e8e69fa10f32f91de GIT binary patch literal 96 ycmZQ#fB-Hi4W=3#7+9fvW+=@FqJiQ(P=3O^f}TJ13=Aw#aTuQoDi78NR1W}19SRiy literal 0 HcmV?d00001 diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp index 00313a55..2c95a173 100644 --- a/backend/tests/sssp/compiled_pre/sssp.cpp +++ b/backend/tests/sssp/compiled_pre/sssp.cpp @@ -396,121 +396,201 @@ int main(int argc, char **argv) { mpi_comm mcomm; mcomm.create(argc, argv); + // relation *rel__edge__3__1__2__3 = new relation( + // 3, true, 3, get_tag_for_rel("edge", "1__2__3"), + // std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", + // slog_input_dir + "/" + + // std::to_string(get_tag_for_rel("edge", "1__2__3")) + + // ".edge.3.table", + // FULL); + // relation* rel__edge__3__1 = new relation( + // 1, false, 3, get_tag_for_rel("edge","1"), + // std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", + // FULL); + + // // the dependent column must be exclude from hash computation, so join + // column count is 3 - 1 = 2 relation *rel__spath__3__1__2__3 = new relation( + // 2, true, 3, get_tag_for_rel("spath", "1__2__3"), + // std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", + // slog_input_dir + "/" + + // std::to_string(get_tag_for_rel("spath", "1__2__3")) + + // ".spath.3.table", + // FULL); + // // set functional dependency for spath + // rel__spath__3__1__2__3->set_dependent_column_update( + // {2, 3}, // len and id column + // [](std::vector old_v, std::vector new_v) -> std::optional + // { + // return new_v[0] < old_v[0]; + // } + // ); + // relation* rel__spath__3__2 = new relation( + // 1, false, 3, get_tag_for_rel("spath","2"), + // std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", + // FULL); + // rel__spath__3__2->set_dependent_column_update( + // {2, 3}, + // [](std::vector old_v, std::vector new_v) -> std::optional + // { + // return new_v[0] < 
old_v[0]; + // } + // ); + + // RAM* scc0 = new RAM(false, 0); + // scc0->add_relation(rel__edge__3__1, true, false); + // scc0->add_relation(rel__edge__3__1__2__3, true, false); + // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, + // DELTA, {0, 3, 1, 2})); + + // RAM *scc1 = new RAM(false, 0); + // scc1->add_relation(rel__edge__3__1__2__3, false, false); + // scc1->add_relation(rel__spath__3__1__2__3, true, false); + // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, + // rel__edge__3__1__2__3, FULL, {0, 1, 2})); + + // RAM *scc2 = new RAM(true, 1); + // scc2->add_relation(rel__edge__3__1__2__3, false, false); + // scc2->add_relation(rel__spath__3__2, true, false); + // scc2->add_relation(rel__spath__3__1__2__3, true, false); + // // the order of non join column also need to be carefully arranged + // because, dependent column + // // should always at last + // scc2->add_rule(new parallel_acopy( + // rel__spath__3__2, + // rel__spath__3__1__2__3, DELTA, + // {1, 0, 2, 3})); // 2, 1, 3, id + // parallel_join* update_spath_j = new parallel_join( + // rel__spath__3__1__2__3, + // rel__edge__3__1, FULL, + // rel__spath__3__2, DELTA, + // {5, 2, 3}// useless + // ); + // update_spath_j->set_generator_func([](std::vector& target_v, + // std::vector& input_v, u64* res) { + // res[0] = target_v[1]; + // res[1] = input_v[2]; + // if (res[0] == res[1]) { + // res[2] = 0; + // } else { + // res[2] = target_v[2] + input_v[3]; + // } + // }); + // scc2->add_rule(update_spath_j); + + // LIE *lie = new LIE(); + // lie->add_relation(rel__edge__3__1); + // lie->add_relation(rel__edge__3__1__2__3); + // lie->add_relation(rel__spath__3__2); + // lie->add_relation(rel__spath__3__1__2__3); + // lie->add_scc(scc0); + // lie->add_scc(scc1); + // lie->add_scc(scc2); + // lie->add_scc_dependance(scc0, scc2); + // lie->add_scc_dependance(scc1, scc2); + + relation *rel__spath__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("spath", "1__2"), + 
std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + + ".spath.2.table", + FULL); + rel__spath__2__1__2->set_dependent_column_update( + {1, 2}, // len and id column + [](std::vector old_v, std::vector new_v) -> std::optional + { + return new_v[0] < old_v[0]; + } + ); + relation *rel__edge__3__1 = new relation( + 1, false, 3, get_tag_for_rel("edge", "1"), + std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL); relation *rel__edge__3__1__2__3 = new relation( 3, true, 3, get_tag_for_rel("edge", "1__2__3"), std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", FULL); - relation* rel__edge__3__1 = new relation( - 1, false, 3, get_tag_for_rel("edge","1"), - std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", - FULL); - - // the dependent column must be exclude from hash computation, so join column count is 3 - 1 = 2 - relation *rel__spath__3__1__2__3 = new relation( - 2, true, 3, get_tag_for_rel("spath", "1__2__3"), - std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", - slog_input_dir + "/" + - std::to_string(get_tag_for_rel("spath", "1__2__3")) + - ".spath.3.table", - FULL); - // set functional dependency for spath - rel__spath__3__1__2__3->set_dependent_column_update( - {2, 3}, // len and id column - [](std::vector old_v, std::vector new_v) -> std::optional { - // if (new_v[0] < old_v[0]) { - // std::cout << "Comparing >>>> "; - // for (auto v: old_v) { - // std::cout << v << " "; - // } - // std::cout << " with "; - // for (auto v: new_v) { - // std::cout << v << " "; - // } - // std::cout << std::endl; - // } - return new_v[0] < old_v[0]; - } - ); - relation* rel__spath__3__2 = new relation( - 1, false, 3, get_tag_for_rel("spath","2"), - std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", - FULL); - 
rel__spath__3__2->set_dependent_column_update( - {2, 3}, - [](std::vector old_v, std::vector new_v) -> std::optional { - // if (new_v[0] < old_v[0]) { - // std::cout << "Comparing >>>> "; - // for (auto v: old_v) { - // std::cout << v << " "; - // } - // std::cout << " with "; - // for (auto v: new_v) { - // std::cout << v << " "; - // } - // std::cout << std::endl; - // } + relation *rel__spath__2__1 = new relation( + 1, false, 2, get_tag_for_rel("spath", "1"), + std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL); + rel__spath__2__1->set_dependent_column_update( + {1, 2}, + [](std::vector old_v, std::vector new_v) -> std::optional + { return new_v[0] < old_v[0]; } ); - RAM* scc0 = new RAM(false, 0); + RAM *scc0 = new RAM(false, 0); scc0->add_relation(rel__edge__3__1, true, false); scc0->add_relation(rel__edge__3__1__2__3, true, false); - scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, DELTA, {0, 3, 1, 2})); - - RAM *scc1 = new RAM(false, 0); - scc1->add_relation(rel__edge__3__1__2__3, false, false); - scc1->add_relation(rel__spath__3__1__2__3, true, false); - scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, - rel__edge__3__1__2__3, FULL, {0, 1, 2})); - - RAM *scc2 = new RAM(true, 1); - scc2->add_relation(rel__edge__3__1__2__3, false, false); - scc2->add_relation(rel__spath__3__2, true, false); - scc2->add_relation(rel__spath__3__1__2__3, true, false); - // the order of non join column also need to be carefully arranged because, dependent column - // should always at last - scc2->add_rule(new parallel_acopy( - rel__spath__3__2, - rel__spath__3__1__2__3, DELTA, - {1, 0, 2, 3})); // 2, 1, 3, id + scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, + DELTA, {0, 3, 1, 2})); + + RAM *scc1 = new RAM(false, 1); + scc1->add_relation(rel__spath__2__1__2, true, false); + scc1->add_relation(rel__edge__3__1, false, false); + scc1->add_rule(new parallel_copy_generate( + rel__spath__2__1__2, 
rel__edge__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + auto args_for_old_bi = std::array{data[0]}; + using TState = std::tuple; + TState state = std::make_tuple(data, output); + auto callback = [](u64 res_0, TState state) -> TState { + auto [data, output] = state; + auto head_tuple = output; + + bool compatible = true && res_0 == n2d(1); + if (!compatible) + return state; + + head_tuple[0] = data[2]; + head_tuple[1] = data[3]; + return std::make_tuple(data, output + 2); + }; + auto [_, new_ptr] = + builtin_eq_1(args_for_old_bi.data(), state, callback); + auto tuples_count = (new_ptr - output) / 2; + return tuples_count; + })); + + RAM *scc2 = new RAM(true, 2); + scc2->add_relation(rel__spath__2__1__2, true, false); + scc2->add_relation(rel__edge__3__1, false, false); + scc2->add_relation(rel__spath__2__1, true, false); + // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA, + // rel__edge__3__1, FULL, {4, 5})); parallel_join* update_spath_j = new parallel_join( - rel__spath__3__1__2__3, + rel__spath__2__1__2, rel__edge__3__1, FULL, - rel__spath__3__2, DELTA, - {5, 2, 3}// useless + rel__spath__2__1, DELTA, + {5,4}// useless ); - update_spath_j->set_generator_func([](std::vector& target_v, std::vector& input_v, u64* res) { - // std::cout << "Join >>>> "; - // for (auto v: target_v) { - // std::cout << v << " "; - // } - // std::cout << " with "; - // for (auto v: input_v) { - // std::cout << v << " "; - // } - // std::cout << std::endl; - res[0] = target_v[1]; - res[1] = input_v[2]; - if (res[0] == res[1]) { - res[2] = 0; + update_spath_j->set_generator_func([](std::vector& target_v, + std::vector& input_v, u64* res) { + res[0] = target_v[0]; + // res[1] = input_v[2]; + if (res[0] == input_v[2]) { + res[1] = 0; } else { - res[2] = target_v[2] + input_v[3]; + res[1] = target_v[1] + input_v[3]; } }); + scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2, + DELTA, {0, 1, 2})); 
scc2->add_rule(update_spath_j); + LIE *lie = new LIE(); + lie->add_relation(rel__spath__2__1__2); lie->add_relation(rel__edge__3__1); lie->add_relation(rel__edge__3__1__2__3); - lie->add_relation(rel__spath__3__2); - lie->add_relation(rel__spath__3__1__2__3); + lie->add_relation(rel__spath__2__1); lie->add_scc(scc0); lie->add_scc(scc1); lie->add_scc(scc2); lie->add_scc_dependance(scc0, scc2); + lie->add_scc_dependance(scc0, scc1); lie->add_scc_dependance(scc1, scc2); // Enable IO @@ -524,6 +604,11 @@ int main(int argc, char **argv) { lie->print_all_relation_size(); // Continuously print relation sizes lie->stat_intermediate(); + // rel__spath__2__1__2->print(); + // rel__spath__2__1->print(); + // rel__edge__3__1->print(); + // rel__edge__3__1__2__3->print(); + // print all variants(non-canonical index of each relation) if (mcomm.get_rank() == 0) { std::cout << "rel_name" diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog index abdace44..ae617a6d 100644 --- a/backend/tests/sssp/sssp.slog +++ b/backend/tests/sssp/sssp.slog @@ -1,3 +1,3 @@ -[(spath from to dist) <-- (edge from to dist)] -[(spath from to l) <-- (spath from mid dist) (edge mid to l)] +[(spath to dist) <-- (edge 1 to dist)] +[(spath to l) <-- (spath mid dist) (edge mid to l)] diff --git a/slog/tests/benchmark.py b/slog/tests/benchmark.py new file mode 100644 index 00000000..db1bd2d1 --- /dev/null +++ b/slog/tests/benchmark.py @@ -0,0 +1,191 @@ +""" +Benchmark Harness +""" + +import logging +import os +import shutil +import tempfile +from typing import Iterator + + +# class ExecutionResult: +# """ result class for each datalog run """ + +# def __init__(self, engine_name, dataset_name, cores, +# runtime, memory_usage) -> None: +# self.engine_name = engine_name +# self.dataset_name = dataset_name +# self.cores = cores +# self.runtime = runtime +# self.memory_usage = memory_usage + + +class Dataset: + """ dataset class, all file inside dataset folder must be either csv/tsv/facts/ """ + 
+ def __init__(self, name: str, data_dir: str, row_sep: str) -> None: + """ type is csv/tsv/facts """ + self.name = name + self.data_dir = data_dir + self.row_sep = row_sep + self.files = os.listdir(data_dir) + + def fetch_data(self, rel_fname) -> Iterator[list]: + """ + return a tuple iterator of each row, tuple in dataset is processed as python list + """ + rel_file_path = os.path.join(self.data_dir, rel_fname) + if os.path.exists(rel_file_path): + with open(rel_file_path) as rel_f: + for row in rel_f: + cols = row.split(self.row_sep) + if cols != []: + yield list(map(lambda x: x.strip(), cols)) + else: + logging.error("Relation %s not exists in dataset %s", + rel_fname, self.name) + return [] + + def dump(self, out_dir, fname_mapping, data_format="tsv", customize_format_function=None): + """ + dump a dataset to target path + """ + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + os.mkdir(out_dir) + if (self.row_sep == '\t' and data_format in ['tsv', 'facts']) or \ + (self.row_sep == ',' and data_format in ['csv']): + for fname in self.files: + shutil.copyfile(os.path.join(self.data_dir, fname), + os.path.join(out_dir, fname_mapping[fname])) + else: + for fname in fname_mapping.keys(): + with open(fname_mapping[fname], "w+") as out_f: + for row in self.fetch_data(fname): + new_row_txt = "" + if data_format in ['tsv', 'facts']: + new_row_txt = "\t".join(row) + elif data_format in ['csv']: + new_row_txt = ",".join(row) + else: + new_row_txt = customize_format_function(row) + out_f.write(new_row_txt+'\n') + + +class DatalogEngine: + """ datalog engine abstract class """ + + def __init__(self, name, verbose=False) -> None: + self.name = name + self.verbose = verbose + + def run(self, dataset: Dataset, output_file, src, file_mapping, cores): + """ + data_input: dataset + output: statistic info output path + core: core counts used to run benchmark + file_mapping: mapping from dataset file to datalog input facts file + """ + + +class BenchmarkCase: + """ one time 
test """ + + def __init__(self, datalog: DatalogEngine, dataset: Dataset, src, file_mapping, cores) -> None: + self.datalog = datalog + self.dataset = dataset + self.file_mapping = file_mapping + self.cores = cores + self.datalog_file = src + + def run(self, output_file): + self.datalog.run(self.dataset, output_file, self.datalog_file, + self.file_mapping, self.cores) + + def __str__(self) -> str: + prog_name = os.path.basename(self.datalog_file) + return f"{self.datalog.name}_{self.dataset.name}_{prog_name}_{self.cores}" + + +class Slog(DatalogEngine): + """ slog test harness """ + + def __init__(self, verbose=False) -> None: + super().__init__("slog", verbose) + + def run(self, dataset: Dataset, output_file, src, file_mapping, cores): + program_name = os.path.basename(src)[:-5] + with tempfile.TemporaryDirectory() as tempdir_name: + dataset.dump(tempdir_name+'/in', file_mapping, 'facts') + print(os.listdir(tempdir_name+'/in')) + logging.info( + "Running slog %d cores, dataset %s ..., file %s", cores, dataset.data_dir, src) + os.system( + f"cd /slog && ./runslog -v -co -j {cores} -f {tempdir_name}/in {src} out") + os.system( + f"cd /slog/out/build && /usr/bin/time -v -o {output_file} mpirun -np {cores} ./{program_name} ../input-data ../") + + +class Souffle(DatalogEngine): + """ souffle test harness """ + + def __init__(self, verbose=False) -> None: + super().__init__("souffle", verbose) + + def run(self, dataset: Dataset, output_file, src, file_mapping, cores): + program_name = os.path.basename(src)[:-3] + with tempfile.TemporaryDirectory() as tempdir_name: + dataset.dump(tempdir_name+'/in', file_mapping, 'facts') + print(os.listdir(tempdir_name+'/in')) + out_dir = os.path.join(tempdir_name, "out") + os.mkdir(out_dir) + logging.info( + "Running souffle %d cores, dataset %s ..., file %s", cores, dataset.data_dir, src) + os.system( + f"souffle -o {tempdir_name}/{program_name} -j {cores} -F {tempdir_name}/in -D {out_dir} {src}") + os.system( + f"/usr/bin/time -v -o 
{output_file} {tempdir_name}/{program_name} -j {cores} -F {tempdir_name}/in -D {out_dir}") + + +class Benchmark: + """ benchmark entrance class """ + + def __init__(self, case_list, output_dir) -> None: + self.case_list = case_list + self.output_dir = output_dir + + def run(self): + """ start benchmark """ + for bench_case in self.case_list: + output_fpath = os.path.join(self.output_dir, str(bench_case)) + bench_case.run(output_fpath) + print(f"case finish, output in {str(bench_case)}") + + +if __name__ == "__main__": + """ test code """ + test_dataset = Dataset("test", "/slog/slog/tests/testcase/tc/input", "\t") + souffle_engine = Souffle() + slog_engine = Slog() + target_slog_program = "/slog/slog/tests/testcase/tc/tc.slog" + target_souffle_program = "/slog/examples/souffle/tc.dl" + case_list = [] + for i in [1, 3, 6]: + case_list.append(BenchmarkCase( + slog_engine, test_dataset, target_slog_program, + { + "edge.facts": "edge.facts" + }, + i + )) + case_list.append(BenchmarkCase( + souffle_engine, test_dataset, target_souffle_program, + { + "edge.facts": "edge.facts" + }, + i + )) + bench_out = "/benchmark_out" + Benchmark(case_list, bench_out).run() + print(f"Benchmark finished, result in {bench_out}.") From d2c7ed617be11bda0d652d522a285eba254678d2 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Mon, 14 Nov 2022 01:07:55 -0500 Subject: [PATCH 06/36] add loop --- .../checkpoint-final/258.spath.2.table_full | Bin 96 -> 96 bytes backend/tests/sssp/compiled_pre/sssp.cpp | 48 +++++++++++------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full index d164029436e14bdd3dce073e8e69fa10f32f91de..32cf0a911bf6c18b1d59c6e2b5f3f49f85a4ea6a 100644 GIT binary patch literal 96 vcmZQ(fB<$V4W=3#7+9fv7Bq1lC_iCdLC>Fh1_rP^P(6eIq7(LiwkM|}zt literal 96 ycmZQ#fB-Hi4W=3#7+9fvW+=@FqJiQ(P=3O^f}TJ13=Aw#aTuQoDi78NR1W}19SRiy diff --git 
a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp index 2c95a173..499370d0 100644 --- a/backend/tests/sssp/compiled_pre/sssp.cpp +++ b/backend/tests/sssp/compiled_pre/sssp.cpp @@ -33,6 +33,7 @@ const u64 int_tag = 0; const u64 str_tag = 2; const u64 sign_flip_const = 0x0000200000000000; const u64 signed_num_mask = 0xFFFFE00000000000; +int start_node = 1; inline bool is_number(u64 datum) { // cout << "is_number(" << datum << "): " << (datum >> tag_position == @@ -381,20 +382,10 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { return max_rel; } -int main(int argc, char **argv) { - // input dir from compiler - std::string slog_input_dir = - "/home/stargazermiao/workspace/PL/slog/out/input-data"; - // output dir from compiler - std::string slog_output_dir = - "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; - if (argc == 3) { - slog_input_dir = argv[1]; - slog_output_dir = argv[2]; - } - load_input_relation(slog_input_dir); - mpi_comm mcomm; - mcomm.create(argc, argv); +void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::string output_dir, int argc, char **argv) { + start_node = sp; + load_input_relation(input_dir); + // relation *rel__edge__3__1__2__3 = new relation( // 3, true, 3, get_tag_for_rel("edge", "1__2__3"), @@ -491,7 +482,7 @@ int main(int argc, char **argv) { relation *rel__spath__2__1__2 = new relation( 2, true, 2, get_tag_for_rel("spath", "1__2"), std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", - slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + + input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", FULL); rel__spath__2__1__2->set_dependent_column_update( @@ -507,7 +498,7 @@ int main(int argc, char **argv) { relation *rel__edge__3__1__2__3 = new relation( 3, true, 3, get_tag_for_rel("edge", "1__2__3"), std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", - 
slog_input_dir + "/" + + input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", FULL); relation *rel__spath__2__1 = new relation( @@ -540,7 +531,7 @@ int main(int argc, char **argv) { auto [data, output] = state; auto head_tuple = output; - bool compatible = true && res_0 == n2d(1); + bool compatible = true && res_0 == n2d(start_node); if (!compatible) return state; @@ -597,7 +588,7 @@ int main(int argc, char **argv) { lie->enable_all_to_all_dump(); lie->enable_data_IO(); lie->enable_IO(); - lie->set_output_dir(slog_output_dir); // Write to this directory + lie->set_output_dir(output_dir); // Write to this directory lie->set_comm(mcomm); lie->set_batch_size(1); lie->execute(); @@ -624,7 +615,26 @@ int main(int argc, char **argv) { delete lie; - mcomm.destroy(); +} +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = + "/home/stargazermiao/workspace/PL/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = + "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; + if (argc == 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + } + mpi_comm mcomm; + mcomm.create(argc, argv); + + for (int i = 0; i < 5; i++) { + compute_sssp_from(mcomm, i, slog_input_dir, slog_output_dir, argc, argv); + } + + mcomm.destroy(); return 0; } From 649a9d708dd3b2a60dd9638578bfa40ea82eed03 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Mon, 14 Nov 2022 01:25:40 -0500 Subject: [PATCH 07/36] w --- backend/tests/sssp/compiled_pre/run_sssp.sh | 6 ++++++ backend/tests/sssp/compiled_pre/sssp.cpp | 4 +--- 2 files changed, 7 insertions(+), 3 deletions(-) create mode 100755 backend/tests/sssp/compiled_pre/run_sssp.sh diff --git a/backend/tests/sssp/compiled_pre/run_sssp.sh b/backend/tests/sssp/compiled_pre/run_sssp.sh new file mode 100755 index 00000000..9866b9b6 --- /dev/null +++ b/backend/tests/sssp/compiled_pre/run_sssp.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +for i in {1..5} +do + mpirun 
-np 1 ./build/sssp ./input-data ./ $i +done diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp index 499370d0..56846cc4 100644 --- a/backend/tests/sssp/compiled_pre/sssp.cpp +++ b/backend/tests/sssp/compiled_pre/sssp.cpp @@ -631,9 +631,7 @@ int main(int argc, char **argv) { mpi_comm mcomm; mcomm.create(argc, argv); - for (int i = 0; i < 5; i++) { - compute_sssp_from(mcomm, i, slog_input_dir, slog_output_dir, argc, argv); - } + compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, argv); mcomm.destroy(); return 0; From 80f2a14959c86daa5ab454292918aa1eb91e65a3 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Mon, 14 Nov 2022 01:41:41 -0500 Subject: [PATCH 08/36] add loop script to run sssp w --- backend/src/lie/lie.cpp | 2 +- backend/src/relation/balanced_hash_relation.cpp | 2 +- backend/tests/sssp/compiled_pre/run_sssp.sh | 2 +- backend/tests/sssp/compiled_pre/sssp.cpp | 2 +- backend/tests/sssp/sssp.slog | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index b2d16761..59a0ec57 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -589,7 +589,7 @@ bool LIE::execute () write_final_checkpoint_dump(); - std::cout << "finish writting checkpoint!" << std::endl; + // std::cout << "finish writting checkpoint!" 
<< std::endl; delete[] rotate_index_array; for (int i=0; i < nprocs; i++) diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 7b5deef9..b3f4e879 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -780,7 +780,7 @@ void relation::populate_full(int buffer_size, u64* buffer) u32 counter = 0; u64 t[arity+1]; u32 buckets = get_bucket_count(); - std::cout << "populating full for " << intern_tag << std::endl; + // std::cout << "populating full for " << intern_tag << std::endl; for (int i = 0; i < buffer_size; i = i + (arity+1)) { diff --git a/backend/tests/sssp/compiled_pre/run_sssp.sh b/backend/tests/sssp/compiled_pre/run_sssp.sh index 9866b9b6..c2af3466 100755 --- a/backend/tests/sssp/compiled_pre/run_sssp.sh +++ b/backend/tests/sssp/compiled_pre/run_sssp.sh @@ -2,5 +2,5 @@ for i in {1..5} do - mpirun -np 1 ./build/sssp ./input-data ./ $i + mpirun -np 1 ./build/sssp ./input-data ./ $i || exit 1; done diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/sssp/compiled_pre/sssp.cpp index 56846cc4..2c0603e8 100644 --- a/backend/tests/sssp/compiled_pre/sssp.cpp +++ b/backend/tests/sssp/compiled_pre/sssp.cpp @@ -624,7 +624,7 @@ int main(int argc, char **argv) { // output dir from compiler std::string slog_output_dir = "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; - if (argc == 3) { + if (argc > 2) { slog_input_dir = argv[1]; slog_output_dir = argv[2]; } diff --git a/backend/tests/sssp/sssp.slog b/backend/tests/sssp/sssp.slog index ae617a6d..bd00df2c 100644 --- a/backend/tests/sssp/sssp.slog +++ b/backend/tests/sssp/sssp.slog @@ -1,3 +1,3 @@ -[(spath to dist) <-- (edge 1 to dist)] -[(spath to l) <-- (spath mid dist) (edge mid to l)] +[(spath to dist) <-- (edge 1 to dist)] ; loop from 1 ~ 10000 compute average (in c++) +[(spath to {l+dist}) <-- (spath mid dist) (edge mid to l)] From 
a707d5353351293b9e51c8bf51634f0c953754c0 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Mon, 14 Nov 2022 22:28:57 -0500 Subject: [PATCH 09/36] w --- examples/datalog-example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/datalog-example b/examples/datalog-example index 30c6423b..be103a21 160000 --- a/examples/datalog-example +++ b/examples/datalog-example @@ -1 +1 @@ -Subproject commit 30c6423bf1b1a101075e9712f77c156a688459a2 +Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa From 058f1dccc7144ee66113b539b481dcbef2906890 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Tue, 15 Nov 2022 01:29:24 -0500 Subject: [PATCH 10/36] fix reorder --- .../src/relation/balanced_hash_relation.cpp | 6 ++-- backend/src/relation/shmap_relation_exp.cpp | 31 ++++++++++++++++++ .../checkpoint-final/258.edge.3.table_full | Bin 0 -> 288 bytes .../checkpoint-final/259.spath.2.table_full | Bin 0 -> 144 bytes .../compiled_pre/input-data/256.edge.3.table | Bin 288 -> 0 bytes .../compiled_pre/input-data/258.edge.3.table | Bin 0 -> 288 bytes backend/tests/sssp/compiled_pre/sssp.cpp | 24 +++++++++++--- backend/tests/sssp/test-input-graph/edge.csv | 16 ++++----- 8 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full create mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full delete mode 100644 backend/tests/sssp/compiled_pre/input-data/256.edge.3.table create mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.3.table diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index b3f4e879..7663343b 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -1307,6 +1307,7 @@ void relation::local_insert_in_delta() } newt[i].purge(); memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); + 
newt_element_count = 0; } } else { delete[] delta; @@ -1324,8 +1325,7 @@ void relation::local_insert_in_delta() newt[i].dependent_column_indices = dependent_column_indices; newt[i].update_compare_func = update_compare_func; } + newt_element_count = 0; + memset(newt_bucket_element_count, 0, buckets * sizeof(u32)); } - - newt_element_count = 0; - memset(newt_bucket_element_count, 0, buckets * sizeof(u32)); } diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 2db1941b..fec862d4 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -408,6 +408,14 @@ void shmap_relation::as_all_to_allv_right_join_buffer( upper_bound[i] = prefix[i]; lower_bound[i] = prefix[i]; } + // std::cout << "cur tree >>> " << std::endl; + // for (auto r: ind) { + // std::cout << ">>> "; + // for (auto c: r) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } auto joined_range = lowerUpperRange(lower_bound, upper_bound); for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) { @@ -415,6 +423,11 @@ void shmap_relation::as_all_to_allv_right_join_buffer( u64 projected_path[join_buffer.width[ra_id]]; if (generator_mode) { std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + // std::cout << "join facts "; + // for (auto c: input_t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; gen_func(input_t, cur_path, projected_path); } else { // std::cout << "here" << std::endl; @@ -428,6 +441,11 @@ void shmap_relation::as_all_to_allv_right_join_buffer( for (int i =0; i < join_buffer.width[ra_id]; i++) projected_path[i] = reordered_cur_path[reorder_map[i]]; } + // std::cout << "add new facts "; + // for (auto c: projected_path) { + // std::cout << c << " "; + // } + // std::cout << std::endl; if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) { uint64_t bucket_id = 
tuple_hash(projected_path, head_rel_hash_col_count) % buckets; @@ -481,6 +499,19 @@ void shmap_relation::as_all_to_allv_left_join_buffer( upper_bound[i] = prefix[i]; lower_bound[i] = prefix[i]; } + // std::cout << "join >>> "; + // for (auto c: prefix) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // std::cout << "cur tree >>> " << std::endl; + // for (auto r: ind) { + // std::cout << ">>> "; + // for (auto c: r) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } auto joined_range = lowerUpperRange(lower_bound, upper_bound); for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) { diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full new file mode 100644 index 0000000000000000000000000000000000000000..ff8d44ee05bfce6725eb4fca00771199d6e4af8b GIT binary patch literal 288 zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz} bX3h>*zs=d<>fM|jKB_67=pgih{QcG_o` fymP&ee=q-OpI!RbpDUr$pXr+=baGF8zf<}EYF7fz diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table b/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table new file mode 100644 index 0000000000000000000000000000000000000000..ff8d44ee05bfce6725eb4fca00771199d6e4af8b GIT binary patch literal 288 zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz} bX3h>*zs=d<>fM|jKB_ old_v, std::vector new_v) -> std::optional { + // std::cout << "Comparing "; + // for (auto c : old_v) { + // std::cout << c << " "; + // } + // std::cout << " <<<<<< "; + // for (auto c : new_v) { + // std::cout << c << " "; + // } return new_v[0] < old_v[0]; } ); @@ -508,6 +516,14 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri {1, 2}, [](std::vector old_v, std::vector new_v) -> std::optional { + // std::cout << "Comparing "; + // for (auto c : old_v) { + // std::cout << c << " "; + // } + // 
std::cout << " <<<<<< "; + // for (auto c : new_v) { + // std::cout << c << " "; + // } return new_v[0] < old_v[0]; } ); @@ -559,17 +575,17 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri ); update_spath_j->set_generator_func([](std::vector& target_v, std::vector& input_v, u64* res) { - res[0] = target_v[0]; - // res[1] = input_v[2]; - if (res[0] == input_v[2]) { + // res[0] = target_v[0]; + res[0] = input_v[2]; + if (res[0] == start_node) { res[1] = 0; } else { res[1] = target_v[1] + input_v[3]; } }); + scc2->add_rule(update_spath_j); scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2, DELTA, {0, 1, 2})); - scc2->add_rule(update_spath_j); LIE *lie = new LIE(); diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv index 936bb262..de8668e0 100644 --- a/backend/tests/sssp/test-input-graph/edge.csv +++ b/backend/tests/sssp/test-input-graph/edge.csv @@ -1,9 +1,9 @@ -1 2 10 -1 5 3 -5 2 1 -2 5 4 +1 2 1 +1 3 2 2 3 2 -5 3 8 -5 4 2 -4 3 7 -3 4 9 +3 4 1 +4 5 1 +5 6 1 +6 7 1 +8 9 1 +9 10 1 From 329e9417212e86ce2281df397bfe61b53c1965ce Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 16 Nov 2022 21:50:02 +0000 Subject: [PATCH 11/36] change to compute multi sssp --- backend/src/parallel_RA_inc.h | 3 +- backend/src/relation/shmap_relation_exp.cpp | 4 +- .../checkpoint-final/256.edge.3.table_full | Bin 288 -> 0 bytes .../checkpoint-final/258.edge.3.table_full | Bin 288 -> 0 bytes .../checkpoint-final/258.spath.2.table_full | Bin 96 -> 0 bytes .../checkpoint-final/259.spath.2.table_full | Bin 144 -> 0 bytes .../sssp/compiled_pre/res-128-1000.output | 24 ++ backend/tests/sssp/compiled_pre/run_sssp.sh | 4 +- backend/tests/sssp/compiled_pre/sssp.cpp | 359 ++++++++++-------- examples/datalog-example | 2 +- 10 files changed, 224 insertions(+), 172 deletions(-) delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full delete mode 100644 
backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/258.spath.2.table_full delete mode 100644 backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full create mode 100644 backend/tests/sssp/compiled_pre/res-128-1000.output diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h index e02739c5..00c3f688 100644 --- a/backend/src/parallel_RA_inc.h +++ b/backend/src/parallel_RA_inc.h @@ -13,11 +13,12 @@ #include "compat.h" // #include "shmap/shmap.h" #include "shmap/shmap_goog.h" +#include //#define DEBUG_OUTPUT 1 #define MAX_LOOP_COUNT 120000 -using update_partial_compare_func_t = std::function(std::vector old_v, std::vector new_v)>; +using update_partial_compare_func_t = std::function(const std::vector& old_v, const std::vector& new_v, const std::vector& prefix)>; using join_generator_func_t = std::function& target_v, std::vector& input_v, u64* res)>; #include "log/logger.h" diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index fec862d4..700c7a6f 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -76,7 +76,7 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) for (auto i: dependent_column_indices) { old_t.push_back(cur_tuple[i]); } - auto compare_res = update_compare_func(old_t, dependent_columns); + auto compare_res = update_compare_func(old_t, dependent_columns, tp); if (compare_res.has_value() && compare_res.value()) { need_deletes.push_back(it); // std::cout << "update with <<<<<< "; @@ -129,7 +129,7 @@ shmap_relation::check_dependent_insertion(const std::vector &tp) { for (auto i: dependent_column_indices) { old_t.push_back(cur_tuple[i]); } - auto compare_res = update_compare_func(old_t, dependent_columns); + auto compare_res = update_compare_func(old_t, dependent_columns, tp); if (compare_res.has_value() && 
compare_res.value()) { return true; } diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/256.edge.3.table_full deleted file mode 100644 index a5b47390726befd417416b8e76e64db49a1e53f8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmYL@+YP`l3<61CX)=}fpQ=*f&_L+d4S=*-u@*FJgclJ_f= g&{z83&?&Hw-a diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/258.edge.3.table_full deleted file mode 100644 index ff8d44ee05bfce6725eb4fca00771199d6e4af8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz} bX3h>*zs=d<>fM|jKB_Fh1_rP^P(6eIq7(LiwkM|}zt diff --git a/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full b/backend/tests/sssp/compiled_pre/checkpoint-final/259.spath.2.table_full deleted file mode 100644 index f9fa0f4550a97cb1cde84f74525d202b0c48713a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 144 zcmXwu$qfJ?48uyf)?#! #include @@ -333,7 +333,7 @@ void load_input_relation(std::string db_dir) { for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { // check if ends with table std::string filename_ss = entry.path().filename().string(); - std::cout << "input database has file " << filename_ss << std::endl; + //std::cout << "input database has file " << filename_ss << std::endl; std::string suffix = ".table"; int ft = filename_ss.size() - suffix.size(); if (ft < 0) @@ -356,8 +356,8 @@ void load_input_relation(std::string db_dir) { } if (tag > max_rel) max_rel = tag; - std::cout << "load " << tag << "." << index_stream.str() << "has arity " - << arity << std::endl; + //std::cout << "load " << tag << "." 
<< index_stream.str() << "has arity " + // << arity << std::endl; rel_tag_map[index_stream.str()] = tag; } } @@ -377,8 +377,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { } max_rel++; rel_tag_map[name_arity] = max_rel; - std::cout << "generate rel tag: " << name_arity << " " << max_rel - << std::endl; + //std::cout << "generate rel tag: " << name_arity << " " << max_rel + // << std::endl; return max_rel; } @@ -386,173 +386,75 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri start_node = sp; load_input_relation(input_dir); - - // relation *rel__edge__3__1__2__3 = new relation( - // 3, true, 3, get_tag_for_rel("edge", "1__2__3"), - // std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", - // slog_input_dir + "/" + - // std::to_string(get_tag_for_rel("edge", "1__2__3")) + - // ".edge.3.table", - // FULL); - // relation* rel__edge__3__1 = new relation( - // 1, false, 3, get_tag_for_rel("edge","1"), - // std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", - // FULL); - - // // the dependent column must be exclude from hash computation, so join - // column count is 3 - 1 = 2 relation *rel__spath__3__1__2__3 = new relation( - // 2, true, 3, get_tag_for_rel("spath", "1__2__3"), - // std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", - // slog_input_dir + "/" + - // std::to_string(get_tag_for_rel("spath", "1__2__3")) + - // ".spath.3.table", - // FULL); - // // set functional dependency for spath - // rel__spath__3__1__2__3->set_dependent_column_update( - // {2, 3}, // len and id column - // [](std::vector old_v, std::vector new_v) -> std::optional - // { - // return new_v[0] < old_v[0]; - // } - // ); - // relation* rel__spath__3__2 = new relation( - // 1, false, 3, get_tag_for_rel("spath","2"), - // std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", - // FULL); - // rel__spath__3__2->set_dependent_column_update( - // {2, 3}, - // [](std::vector 
old_v, std::vector new_v) -> std::optional - // { - // return new_v[0] < old_v[0]; - // } - // ); - - // RAM* scc0 = new RAM(false, 0); - // scc0->add_relation(rel__edge__3__1, true, false); - // scc0->add_relation(rel__edge__3__1__2__3, true, false); - // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, - // DELTA, {0, 3, 1, 2})); - - // RAM *scc1 = new RAM(false, 0); - // scc1->add_relation(rel__edge__3__1__2__3, false, false); - // scc1->add_relation(rel__spath__3__1__2__3, true, false); - // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, - // rel__edge__3__1__2__3, FULL, {0, 1, 2})); - - // RAM *scc2 = new RAM(true, 1); - // scc2->add_relation(rel__edge__3__1__2__3, false, false); - // scc2->add_relation(rel__spath__3__2, true, false); - // scc2->add_relation(rel__spath__3__1__2__3, true, false); - // // the order of non join column also need to be carefully arranged - // because, dependent column - // // should always at last - // scc2->add_rule(new parallel_acopy( - // rel__spath__3__2, - // rel__spath__3__1__2__3, DELTA, - // {1, 0, 2, 3})); // 2, 1, 3, id - // parallel_join* update_spath_j = new parallel_join( - // rel__spath__3__1__2__3, - // rel__edge__3__1, FULL, - // rel__spath__3__2, DELTA, - // {5, 2, 3}// useless - // ); - // update_spath_j->set_generator_func([](std::vector& target_v, - // std::vector& input_v, u64* res) { - // res[0] = target_v[1]; - // res[1] = input_v[2]; - // if (res[0] == res[1]) { - // res[2] = 0; - // } else { - // res[2] = target_v[2] + input_v[3]; - // } - // }); - // scc2->add_rule(update_spath_j); - - // LIE *lie = new LIE(); - // lie->add_relation(rel__edge__3__1); - // lie->add_relation(rel__edge__3__1__2__3); - // lie->add_relation(rel__spath__3__2); - // lie->add_relation(rel__spath__3__1__2__3); - // lie->add_scc(scc0); - // lie->add_scc(scc1); - // lie->add_scc(scc2); - // lie->add_scc_dependance(scc0, scc2); - // lie->add_scc_dependance(scc1, scc2); - - relation 
*rel__spath__2__1__2 = new relation( - 2, true, 2, get_tag_for_rel("spath", "1__2"), - std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", - input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + - ".spath.2.table", - FULL); - rel__spath__2__1__2->set_dependent_column_update( - {1, 2}, // len and id column - [](std::vector old_v, std::vector new_v) -> std::optional - { - // std::cout << "Comparing "; - // for (auto c : old_v) { - // std::cout << c << " "; - // } - // std::cout << " <<<<<< "; - // for (auto c : new_v) { - // std::cout << c << " "; - // } - return new_v[0] < old_v[0]; - } - ); - relation *rel__edge__3__1 = new relation( - 1, false, 3, get_tag_for_rel("edge", "1"), - std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL); relation *rel__edge__3__1__2__3 = new relation( 3, true, 3, get_tag_for_rel("edge", "1__2__3"), std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", input_dir + "/" + - std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", + std::to_string(get_tag_for_rel("edge", "1__2__3")) + + ".edge.3.table", FULL); - relation *rel__spath__2__1 = new relation( - 1, false, 2, get_tag_for_rel("spath", "1"), - std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL); - rel__spath__2__1->set_dependent_column_update( - {1, 2}, - [](std::vector old_v, std::vector new_v) -> std::optional + relation* rel__edge__3__1 = new relation( + 1, false, 3, get_tag_for_rel("edge","1"), + std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", + FULL); + + // the dependent column must be exclude from hash computation, so join + // column count is 3 - 1 = 2 + relation *rel__spath__3__1__2__3 = new relation( + 2, true, 3, get_tag_for_rel("spath", "1__2__3"), + std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", + input_dir + "/" + + std::to_string(get_tag_for_rel("spath", "1__2__3")) + + ".spath.3.table", + FULL); + // set functional dependency for 
spath + rel__spath__3__1__2__3->set_dependent_column_update( + {2, 3}, // len and id column + [](const std::vector& old_v, const std::vector& new_v, const vector& nt) -> std::optional + { + return new_v[0] < old_v[0]; + } + ); + relation* rel__spath__3__2 = new relation( + 1, false, 3, get_tag_for_rel("spath","2"), + std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", + FULL); + rel__spath__3__2->set_dependent_column_update( + {2, 3}, + [](const std::vector& old_v, const std::vector& new_v, const vector& nt) -> std::optional { - // std::cout << "Comparing "; - // for (auto c : old_v) { - // std::cout << c << " "; - // } - // std::cout << " <<<<<< "; - // for (auto c : new_v) { - // std::cout << c << " "; - // } return new_v[0] < old_v[0]; } ); - RAM *scc0 = new RAM(false, 0); + RAM* scc0 = new RAM(false, 0); scc0->add_relation(rel__edge__3__1, true, false); scc0->add_relation(rel__edge__3__1__2__3, true, false); scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, - DELTA, {0, 3, 1, 2})); + DELTA, {0, 3, 1, 2})); RAM *scc1 = new RAM(false, 1); - scc1->add_relation(rel__spath__2__1__2, true, false); - scc1->add_relation(rel__edge__3__1, false, false); + scc1->add_relation(rel__edge__3__1__2__3, false, false); + scc1->add_relation(rel__spath__3__1__2__3, true, false); + // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, + // rel__edge__3__1__2__3, FULL, {0, 1, 2})); scc1->add_rule(new parallel_copy_generate( - rel__spath__2__1__2, rel__edge__3__1, FULL, + rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL, [](const u64 *const data, u64 *const output) -> int { - auto args_for_old_bi = std::array{data[0]}; + auto args_for_old_bi = std::array{data[0], data[1], data[2]}; using TState = std::tuple; TState state = std::make_tuple(data, output); auto callback = [](u64 res_0, TState state) -> TState { auto [data, output] = state; auto head_tuple = output; - bool compatible = true && res_0 == n2d(start_node); + bool compatible = 
true && res_0 < n2d(start_node); if (!compatible) return state; - head_tuple[0] = data[2]; - head_tuple[1] = data[3]; + head_tuple[0] = data[0]; + head_tuple[1] = data[1]; + head_tuple[2] = data[2]; return std::make_tuple(data, output + 2); }; auto [_, new_ptr] = @@ -562,44 +464,167 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri })); RAM *scc2 = new RAM(true, 2); - scc2->add_relation(rel__spath__2__1__2, true, false); - scc2->add_relation(rel__edge__3__1, false, false); - scc2->add_relation(rel__spath__2__1, true, false); - // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA, - // rel__edge__3__1, FULL, {4, 5})); + scc2->add_relation(rel__edge__3__1__2__3, false, false); + scc2->add_relation(rel__spath__3__2, true, false); + scc2->add_relation(rel__spath__3__1__2__3, true, false); + // the order of non join column also need to be carefully arranged + // because, dependent column + // should always at last + scc2->add_rule(new parallel_acopy( + rel__spath__3__2, + rel__spath__3__1__2__3, DELTA, + {1, 0, 2, 3})); // 2, 1, 3, id parallel_join* update_spath_j = new parallel_join( - rel__spath__2__1__2, + rel__spath__3__1__2__3, rel__edge__3__1, FULL, - rel__spath__2__1, DELTA, - {5,4}// useless + rel__spath__3__2, DELTA, + {5, 2, 3}// useless ); update_spath_j->set_generator_func([](std::vector& target_v, std::vector& input_v, u64* res) { - // res[0] = target_v[0]; - res[0] = input_v[2]; - if (res[0] == start_node) { - res[1] = 0; + res[0] = target_v[1]; + res[1] = input_v[2]; + if (res[0] == res[1]) { + res[2] = 0; } else { - res[1] = target_v[1] + input_v[3]; + res[2] = target_v[2] + input_v[3]; } }); scc2->add_rule(update_spath_j); - scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2, - DELTA, {0, 1, 2})); - LIE *lie = new LIE(); - lie->add_relation(rel__spath__2__1__2); lie->add_relation(rel__edge__3__1); lie->add_relation(rel__edge__3__1__2__3); - 
lie->add_relation(rel__spath__2__1); + lie->add_relation(rel__spath__3__2); + lie->add_relation(rel__spath__3__1__2__3); lie->add_scc(scc0); lie->add_scc(scc1); lie->add_scc(scc2); lie->add_scc_dependance(scc0, scc2); - lie->add_scc_dependance(scc0, scc1); lie->add_scc_dependance(scc1, scc2); + // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + // relation *rel__spath__2__1__2 = new relation( + // 2, true, 2, get_tag_for_rel("spath", "1__2"), + // std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", + // input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + + // ".spath.2.table", + // FULL); + // rel__spath__2__1__2->set_dependent_column_update( + // {1, 2}, // len and id column + // [](std::vector old_v, std::vector new_v) -> std::optional + // { + // // std::cout << "Comparing "; + // // for (auto c : old_v) { + // // std::cout << c << " "; + // // } + // // std::cout << " <<<<<< "; + // // for (auto c : new_v) { + // // std::cout << c << " "; + // // } + // return new_v[0] < old_v[0]; + // } + // ); + // relation *rel__edge__3__1 = new relation( + // 1, false, 3, get_tag_for_rel("edge", "1"), + // std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL); + // relation *rel__edge__3__1__2__3 = new relation( + // 3, true, 3, get_tag_for_rel("edge", "1__2__3"), + // std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", + // input_dir + "/" + + // std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", + // FULL); + // relation *rel__spath__2__1 = new relation( + // 1, false, 2, get_tag_for_rel("spath", "1"), + // std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL); + // rel__spath__2__1->set_dependent_column_update( + // {1, 2}, + // [](std::vector old_v, std::vector new_v) -> std::optional + // { + // // std::cout << "Comparing "; + // // for (auto c : old_v) { + // // std::cout << c << " "; + // // } + // // std::cout << " <<<<<< "; + // // for (auto c : new_v) 
{ + // // std::cout << c << " "; + // // } + // return new_v[0] < old_v[0]; + // } + // ); + + // RAM *scc0 = new RAM(false, 0); + // scc0->add_relation(rel__edge__3__1, true, false); + // scc0->add_relation(rel__edge__3__1__2__3, true, false); + // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, + // DELTA, {0, 3, 1, 2})); + + // RAM *scc1 = new RAM(false, 1); + // scc1->add_relation(rel__spath__2__1__2, true, false); + // scc1->add_relation(rel__edge__3__1, false, false); + // scc1->add_rule(new parallel_copy_generate( + // rel__spath__2__1__2, rel__edge__3__1, FULL, + // [](const u64 *const data, u64 *const output) -> int { + // auto args_for_old_bi = std::array{data[0]}; + // using TState = std::tuple; + // TState state = std::make_tuple(data, output); + // auto callback = [](u64 res_0, TState state) -> TState { + // auto [data, output] = state; + // auto head_tuple = output; + + // bool compatible = true && res_0 == n2d(start_node); + // if (!compatible) + // return state; + + // head_tuple[0] = data[2]; + // head_tuple[1] = data[3]; + // return std::make_tuple(data, output + 2); + // }; + // auto [_, new_ptr] = + // builtin_eq_1(args_for_old_bi.data(), state, callback); + // auto tuples_count = (new_ptr - output) / 2; + // return tuples_count; + // })); + + // RAM *scc2 = new RAM(true, 2); + // scc2->add_relation(rel__spath__2__1__2, true, false); + // scc2->add_relation(rel__edge__3__1, false, false); + // scc2->add_relation(rel__spath__2__1, true, false); + // // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA, + // // rel__edge__3__1, FULL, {4, 5})); + // parallel_join* update_spath_j = new parallel_join( + // rel__spath__2__1__2, + // rel__edge__3__1, FULL, + // rel__spath__2__1, DELTA, + // {5,4}// useless + // ); + // update_spath_j->set_generator_func([](std::vector& target_v, + // std::vector& input_v, u64* res) { + // // res[0] = target_v[0]; + // res[0] = input_v[2]; + // if (res[0] == 
start_node) { + // res[1] = 0; + // } else { + // res[1] = target_v[1] + input_v[3]; + // } + // }); + // scc2->add_rule(update_spath_j); + // scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2, + // DELTA, {0, 1, 2})); + + + // LIE *lie = new LIE(); + // lie->add_relation(rel__spath__2__1__2); + // lie->add_relation(rel__edge__3__1); + // lie->add_relation(rel__edge__3__1__2__3); + // lie->add_relation(rel__spath__2__1); + // lie->add_scc(scc0); + // lie->add_scc(scc1); + // lie->add_scc(scc2); + // lie->add_scc_dependance(scc0, scc2); + // lie->add_scc_dependance(scc0, scc1); + // lie->add_scc_dependance(scc1, scc2); + // Enable IO lie->enable_all_to_all_dump(); lie->enable_data_IO(); @@ -611,6 +636,8 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri lie->print_all_relation_size(); // Continuously print relation sizes lie->stat_intermediate(); + // rel__spath__3__1__2__3->print(); + // rel__spath__2__1__2->print(); // rel__spath__2__1->print(); // rel__edge__3__1->print(); diff --git a/examples/datalog-example b/examples/datalog-example index be103a21..87266643 160000 --- a/examples/datalog-example +++ b/examples/datalog-example @@ -1 +1 @@ -Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa +Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891 From f155e41352f9557716dd693fd6fccd3a2e3b6bd5 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Sun, 20 Nov 2022 15:26:17 -0500 Subject: [PATCH 12/36] delete copy rule --- .../tests/sssp/compiled_pre/CMakeLists.txt | 2 +- .../compiled_pre/input-data/257.spath.3.table | 0 .../compiled_pre/input-data/258.edge.3.table | Bin 288 -> 0 bytes .../sssp/compiled_pre/res-128-1000.output | 24 - backend/tests/sssp/compiled_pre/sssp.cpp | 2 +- backend/tests/sssp/compiled_pre/sssp_opt.cpp | 525 ++++++++++++++++++ backend/tests/sssp/test-input-graph/edge.csv | 18 +- examples/datalog-example | 2 +- runslog | 2 + 9 files changed, 539 insertions(+), 36 deletions(-) delete 
mode 100644 backend/tests/sssp/compiled_pre/input-data/257.spath.3.table delete mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.3.table delete mode 100644 backend/tests/sssp/compiled_pre/res-128-1000.output create mode 100644 backend/tests/sssp/compiled_pre/sssp_opt.cpp diff --git a/backend/tests/sssp/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt index a5e5801d..89ee3ea4 100644 --- a/backend/tests/sssp/compiled_pre/CMakeLists.txt +++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt @@ -19,7 +19,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") -file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp.cpp") +file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp_opt.cpp") ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") diff --git a/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table b/backend/tests/sssp/compiled_pre/input-data/257.spath.3.table deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table 
b/backend/tests/sssp/compiled_pre/input-data/258.edge.3.table deleted file mode 100644 index ff8d44ee05bfce6725eb4fca00771199d6e4af8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmZ9^K@xx<3`5aYMUYc@G>&d2Gz-I~!+(X0eU6%0lViQ1`&s?B2iM)59j<=a|Lkz} bX3h>*zs=d<>fM|jKB_ #include diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp new file mode 100644 index 00000000..748dc4ca --- /dev/null +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -0,0 +1,525 @@ +// location of `parallel_RA_inc.h` here +#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h" + +#include +#include +#include +#include +#include +#include +#include + +// builtins.cpp goes here! +// builtins.cpp +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +#define u64 uint64_t +#define u32 uint32_t +using i64 = int64_t; + +const u64 tag_mask = 0xffffc00000000000; +const u64 tag_position = 46; +const u64 int_tag = 0; +const u64 str_tag = 2; +const u64 sign_flip_const = 0x0000200000000000; +const u64 signed_num_mask = 0xFFFFE00000000000; +int start_node = 1; + +inline bool is_number(u64 datum) { + // cout << "is_number(" << datum << "): " << (datum >> tag_position == + // int_tag) << "\n"; + return datum >> tag_position == int_tag; +} + +inline i64 datum_to_number(u64 datum) { + i64 signed_val = + (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + if (signed_val >= sign_flip_const) { + signed_val = sign_flip_const - signed_val; + } + return signed_val; + // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - + // tag_position); +} +const auto d2n = datum_to_number; + +inline u64 number_to_datum(i64 number) { + i64 unsigned_value = number; + if (number < 0) { + unsigned_value = (-number) + sign_flip_const; + } + return (unsigned_value & ~tag_mask) | (int_tag << 
tag_position); + // return (number & ~tag_mask) | (int_tag << tag_position); +} + +const auto n2d = number_to_datum; + +inline u64 string_to_datum(std::string str) { + u32 str_hash = string_hash(str); + return (str_hash & ~tag_mask) | (str_tag << tag_position); +} +const auto s2d = string_to_datum; + +vector> builtin_div_rem(const u64 *const data) { + if (is_number(data[0]) && is_number(data[1])) { + auto div = number_to_datum(d2n(data[0]) / d2n(data[1])); + auto rem = number_to_datum(d2n(data[0]) % d2n(data[1])); + return {{div, rem}}; + } else { + return {}; + } +} + +#define BUILTIN_BINARY_NUMBER_PRED(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (is_number(data[0]) && is_number(data[1]) && \ + datum_to_number(data[0]) op datum_to_number(data[1])) { \ + return callback(init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_PRED(builtin_less, <) +BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >) +BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=) +BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=) + +#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum(datum_to_number(data[0]) \ + op datum_to_number(data[1])); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +) +BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -) +BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *) +BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /) + +#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum( \ + impl(datum_to_number(data[0]), 
datum_to_number(data[1]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; } +BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1) + +#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0])) { \ + auto res = number_to_datum(impl(datum_to_number(data[0]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 add1(u64 x) { return x + 1; } +inline u64 sub1(u64 x) { return x - 1; } + +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1) + +vector> builtin_range(const u64 *const data) { + vector> res; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + res.reserve(ub - lb); + for (u64 x = lb; x < ub; x++) + res.push_back({number_to_datum(x)}); + } + return res; +} + +template +TState callback_builtin_range(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + auto state = init_state; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + for (u64 x = lb; x < ub; x++) + state = callback(number_to_datum(x), state); + } + return state; +} + +#define BUILTIN_BINARY_PRED(name, op) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (data[0] op data[1]) \ + return callback(init_state); \ + else \ + return init_state; \ + } +BUILTIN_BINARY_PRED(builtin_eq, ==) +BUILTIN_BINARY_PRED(builtin_neq, !=) + +template +TState builtin_eq_1(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState 
state)) { + return callback(data[0], init_state); +} + +#define BUILTIN_UNARY_PRED(name, pred) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (pred(data[0])) \ + return callback(init_state); \ + else \ + return init_state; \ + } + +bool is_not_number(u64 datum) { return !is_number(datum); } +BUILTIN_UNARY_PRED(builtin_number_huh, is_number) +BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number) + +// for generate-cpp-lambda-for-computational-join +struct CL2CB_State { + void *original_callback; // There be dragons? + void *original_state; + const u64 *original_data; + u64 *cl1_output_args; +}; + +// for generate-cpp-lambda-for-computational-copy +struct BCLCB_State { + void *original_callback; + void *original_state; + const u64 *original_data; +}; + +// an experiment: +template bool builtin_binary_number_pred(const u64 *data) { + if (is_number(data[0]) && is_number(data[1])) { + return f(datum_to_number(data[0]), datum_to_number(data[1])); + } else { + return false; + } +} +bool _less(u64 x, u64 y) { return x < y; } +auto builtin_less2 = builtin_binary_number_pred<_less>; + +template +inline TState builtin_nop(const u64 *data, TState init_state, + TState (*callback)(TState state)) { + return callback(init_state); +} + +// //////////////////// AGGREGATORS Alternative design //////////////////// + +// TODO: add number type check +////////////////////////////// count ///////////////////////////////////// + +local_agg_res_t +agg_count_local(std::pair + joined_range) { + local_agg_res_t cnt = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + cnt++; + } + return cnt; +} + +local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// sum ///////////////////////////////////// + +local_agg_res_t +agg_sum_local(std::pair + joined_range) { + local_agg_res_t sum_res = 0; + for (auto it = joined_range.first; it != 
joined_range.second; ++it) { + auto tuple = (*it); + sum_res += tuple[tuple.size() - 1]; + } + return sum_res; +} + +local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// maximum ///////////////////////////////////// + +local_agg_res_t +agg_maximum_local(std::pair + joined_range) { + local_agg_res_t max_res = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v > max_res) { + max_res = current_v; + } + } + return max_res; +} + +local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x > y) { + return x; + } else { + return y; + } +} + +////////////////////////////// minimum ///////////////////////////////////// + +local_agg_res_t +agg_minimum_local(std::pair + joined_range) { + local_agg_res_t min_res = std::numeric_limits::max(); + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v < min_res) { + min_res = current_v; + } + } + return min_res; +} + +local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x < y) { + return x; + } else { + return y; + } +} + +// // end of builtins.cpp + +// global definitions: + +int max_rel = 255; +std::map rel_tag_map; +std::map> rel_index_map; + +// load all relation inside input database +void load_input_relation(std::string db_dir) { + for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { + // check if ends with table + std::string filename_ss = entry.path().filename().string(); + // std::cout << "input database has file " << filename_ss << std::endl; + std::string suffix = ".table"; + int ft = filename_ss.size() - suffix.size(); + if (ft < 0) + ft = 0; + if (filename_ss.rfind(suffix) != ft) { + continue; + } + std::string filename_s = entry.path().stem().string(); + int tag = 
std::stoi(filename_s.substr(0, filename_s.find("."))); + std::string name_arity = filename_s.substr( + filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1); + std::string name = name_arity.substr(0, name_arity.rfind(".")); + std::string arity_s = + name_arity.substr(name_arity.rfind(".") + 1, name_arity.size()); + int arity = std::stoi(arity_s); + std::stringstream index_stream; + index_stream << name; + for (int i = 1; i <= arity; i++) { + index_stream << "__" << i; + } + if (tag > max_rel) + max_rel = tag; + // std::cout << "load " << tag << "." << index_stream.str() << "has arity " + // << arity << std::endl; + rel_tag_map[index_stream.str()] = tag; + } +} + +int get_tag_for_rel(std::string relation_name, std::string index_str) { + std::string name_arity = relation_name + "__" + index_str; + if (rel_index_map.find(relation_name) != rel_index_map.end()) { + rel_index_map[relation_name].insert(index_str); + } else { + rel_index_map[relation_name] = {index_str}; + } + + if (rel_tag_map.find(name_arity) != rel_tag_map.end()) { + // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << + // std::endl; + return rel_tag_map[name_arity]; + } + max_rel++; + rel_tag_map[name_arity] = max_rel; + // std::cout << "generate rel tag: " << name_arity << " " << max_rel + // << std::endl; + return max_rel; +} + +void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, + std::string output_dir, int argc, char **argv) { + start_node = sp; + load_input_relation(input_dir); + + relation *rel__edge__2__1__2 = new relation( + 1, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", + FULL); + + relation *rel__spath__3__2 = new relation( + 1, true, 3, get_tag_for_rel("spath", "2"), + std::to_string(get_tag_for_rel("spath", "2")) + ".spath.3.table", + std::to_string(get_tag_for_rel("spath", 
"2")) + ".spath.3.table", FULL); + rel__spath__3__2->set_dependent_column_update( + {2, 3}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + return new_v[0] < old_v[0]; + }); + + RAM *scc0 = new RAM(false, 0); + scc0->add_relation(rel__edge__2__1__2, false, false); + scc0->add_relation(rel__spath__3__2, true, false); + scc0->add_rule(new parallel_copy_generate( + rel__spath__3__2, rel__edge__2__1__2, FULL, + [](const u64 *const data, u64 *const output) -> int { + auto args_for_old_bi = std::array{data[0], data[1], n2d(1)}; + using TState = std::tuple; + TState state = std::make_tuple(args_for_old_bi.data(), output); + auto callback = [](u64 res_0, TState state) -> TState { + auto [data, output] = state; + auto head_tuple = output; + + bool compatible = true && res_0 < n2d(start_node); + if (!compatible) + return state; + + head_tuple[0] = data[1]; + head_tuple[1] = data[0]; + head_tuple[2] = data[2]; + return std::make_tuple(data, output + 2); + }; + auto [_, new_ptr] = + builtin_eq_1(args_for_old_bi.data(), state, callback); + auto tuples_count = (new_ptr - output) / 2; + return tuples_count; + })); + + RAM *scc1 = new RAM(true, 1); + scc1->add_relation(rel__edge__2__1__2, false, false); + scc1->add_relation(rel__spath__3__2, true, false); + parallel_join *update_spath_j = + new parallel_join(rel__spath__3__2, rel__edge__2__1__2, FULL, + rel__spath__3__2, DELTA, {5, 2, 3} // useless + ); + update_spath_j->set_generator_func( + [](std::vector &target_v, std::vector &input_v, u64 *res) { + // std::cout << "Joining >>> "; + // for (auto c : input_v) { + // std::cout << c << " "; + // } + // std::cout << " and >>>>>>>"; + // for (auto c : target_v) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + res[0] = input_v[1]; + res[1] = target_v[1]; + if (res[0] == res[1]) { + res[2] = 0; + } else { + res[2] = target_v[2] + 1; + } + }); + scc1->add_rule(update_spath_j); + + LIE *lie = new LIE(); + 
lie->add_relation(rel__edge__2__1__2); + lie->add_relation(rel__spath__3__2); + lie->add_scc(scc0); + lie->add_scc(scc1); + lie->add_scc_dependance(scc0, scc1); + + // Enable IO + lie->enable_all_to_all_dump(); + lie->enable_data_IO(); + // lie->enable_share_io(); + lie->enable_IO(); + lie->set_output_dir(output_dir); // Write to this directory + lie->set_comm(mcomm); + lie->set_batch_size(1); + lie->execute(); + lie->print_all_relation_size(); // Continuously print relation sizes + // lie->stat_intermediate(); + + // rel__spath__3__1__2__3->print(); + + // rel__spath__2__1__2->print(); +// rel__spath__3__2->print(); + // rel__edge__3__1->print(); + // rel__edge__3__1__2__3->print(); + + // print all variants(non-canonical index of each relation) + if (mcomm.get_rank() == 0) { + std::cout << "rel_name" + << ",\t" + << "indices\n"; + for (auto const &rel_p : rel_index_map) { + std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; + } + std::cout << std::endl; + } + + // lie->print_all_relation_size(); // Continuously print relation sizes + + delete lie; +} + +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = + "/home/stargazermiao/workspace/PL/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = + "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; + if (argc > 2) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + } + mpi_comm mcomm; + mcomm.create(argc, argv); + + compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, + argv); + + mcomm.destroy(); + return 0; +} diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv index de8668e0..1d997fb7 100644 --- a/backend/tests/sssp/test-input-graph/edge.csv +++ b/backend/tests/sssp/test-input-graph/edge.csv @@ -1,9 +1,9 @@ -1 2 1 -1 3 2 -2 3 2 -3 4 1 -4 5 1 -5 6 1 -6 7 1 -8 9 1 -9 10 1 +1 2 +1 3 +2 3 +3 4 +4 5 +5 6 +6 7 +8 9 +9 10 diff --git 
a/examples/datalog-example b/examples/datalog-example index 87266643..be103a21 160000 --- a/examples/datalog-example +++ b/examples/datalog-example @@ -1 +1 @@ -Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891 +Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa diff --git a/runslog b/runslog index 13112e90..73e747e0 100755 --- a/runslog +++ b/runslog @@ -89,6 +89,8 @@ def ingest_facts(factloc, inputloc, tsv_bin_path, cores): tableloc = os.path.join(inputloc, table[1]) try: # idk why 16 is buckets, got from rpc.py + print(" ".join([tsv_bin_path, factfile, str(arity), + tableloc, str(cores), str(tabletag), inputloc])) subprocess.check_output([tsv_bin_path, factfile, str(arity), tableloc, str(cores), str(tabletag), inputloc]) except subprocess.CalledProcessError as e: From 61b9eba688bb488a9d912406fc17298c0b88dbdd Mon Sep 17 00:00:00 2001 From: ysun67 Date: Mon, 21 Nov 2022 13:01:05 -0500 Subject: [PATCH 13/36] add debug --- backend/src/IO/parallel_io.cpp | 2 +- backend/src/RAM/RA_tasks.cpp | 29 +++++++++++++----- backend/src/RAM/RA_tasks.h | 2 ++ backend/src/lie/lie.cpp | 4 +-- .../checkpoints/checkpoint-final/$strings.csv | 0 .../checkpoint-final/256.edge.3.table_full | Bin 288 -> 0 bytes .../checkpoint-final/257.spath.3.table_full | Bin 288 -> 0 bytes backend/tests/sssp/compiled_pre/sssp_opt.cpp | 5 +-- 8 files changed, 30 insertions(+), 12 deletions(-) delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full delete mode 100644 backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full diff --git a/backend/src/IO/parallel_io.cpp b/backend/src/IO/parallel_io.cpp index 123b11a1..ce57ae83 100644 --- a/backend/src/IO/parallel_io.cpp +++ b/backend/src/IO/parallel_io.cpp @@ -158,7 +158,7 @@ void parallel_io::parallel_read_input_relation_from_file_to_local_buffer(u32 ari /* Read all data in 
parallel */ uint64_t read_offset; - read_offset = ceil((float)global_row_count / nprocs) * rank; + read_offset = (int)ceil((float)global_row_count / nprocs) * rank; if (read_offset > (uint64_t)global_row_count) { diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index b7a8a029..71a85d34 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include "mpi.h" #include RAM::~RAM() @@ -579,6 +580,7 @@ bool RAM::local_compute(int* offset) else if ((*it)->get_RA_type() == JOIN) { + // auto before_time = MPI_Wtime(); parallel_join* current_ra = (parallel_join*) *it; relation* output_relation = current_ra->get_join_output(); @@ -603,6 +605,7 @@ bool RAM::local_compute(int* offset) &join_tuples_duplicates, &join_tuples); total_join_tuples = total_join_tuples + join_tuples; + } else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) { @@ -654,8 +657,12 @@ bool RAM::local_compute(int* offset) &join_tuples); total_join_tuples = total_join_tuples + join_tuples; } + // auto after_time = MPI_Wtime(); + // if (mcomm.get_local_rank() == 0) { + // std::cout << "local join on rank " << mcomm.get_local_rank() << " takes " << after_time - before_time << std::endl; + // } } - counter++; + counter++; } #if 0 @@ -714,12 +721,14 @@ void RAM::local_comm() int cnt=0; cumulative_all_to_allv_buffer_cmp = new u64*[RA_list.size()]; cumulative_all_to_allv_recv_process_size_array_cmp = new int[RA_list.size()]; - + auto before_time = MPI_Wtime(); for (std::vector::iterator it = RA_list.begin() ; it != RA_list.end(); ++it) { all_to_all_comm(compute_buffer.local_compute_output[cnt], compute_buffer.local_compute_output_size_rel[cnt], compute_buffer.local_compute_output_size[cnt], &cumulative_all_to_allv_recv_process_size_array_cmp[cnt], &cumulative_all_to_allv_buffer_cmp[cnt], mcomm.get_local_comm()); cnt++; } + auto after_time = MPI_Wtime(); + 
all_to_all_time += (after_time - before_time); } #endif @@ -1226,22 +1235,28 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s bool local_join_status = false; while (local_join_status == false) { + auto allocate_buffers_start = MPI_Wtime(); allocate_compute_buffers(); + auto allocate_buffers_end = MPI_Wtime(); - + auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); + auto compute_end = MPI_Wtime(); + auto all_to_all_start = MPI_Wtime(); comm_compaction_all_to_all(compute_buffer, &cumulative_all_to_allv_recv_process_count_array, &cumulative_all_to_allv_buffer, mcomm.get_local_comm(), *loop_counter, task_id, output_dir, all_to_all_record, sloav_mode, rotate_index_array, send_indexes, sendb_num); + auto all_to_all_end = MPI_Wtime(); - + auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); + auto free_buffers_end = MPI_Wtime(); - - + auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt_comm_compaction(intern_map); + auto insert_in_newt_end = MPI_Wtime(); -#if DEBUG_OUTPUT +#if 1 if (mcomm.get_rank() == 0) { #if 0 diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h index 0c650a8d..8b5d8e0d 100644 --- a/backend/src/RAM/RA_tasks.h +++ b/backend/src/RAM/RA_tasks.h @@ -56,6 +56,8 @@ class RAM public: + double all_to_all_time = 0; + ~RAM(); RAM (bool ic, int ram_id); diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 59a0ec57..7b25b044 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -511,8 +511,8 @@ bool LIE::execute () /// For SCCs that runs till fixed point is reached else { - //if (mcomm.get_rank() == 0) - // std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl; + if (mcomm.get_rank() == 0) + std::cout << 
"name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl; u64 delta_in_scc = 0; do { diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/$strings.csv deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/256.edge.3.table_full deleted file mode 100644 index a5b47390726befd417416b8e76e64db49a1e53f8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmYL@+YP`l3<61CX)=}fpQ=*f&_L+d4S=*-u@*FJgclJ_f= g&{z83&?&Hw-a diff --git a/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full b/backend/tests/sssp/compiled_pre/checkpoints/checkpoint-final/257.spath.3.table_full deleted file mode 100644 index b6a5a0c18677b3766072782e2050000c95305bde..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 288 zcmYk0+Y!JZ5CYXaOj4EoR~16onezv^J<#6%Gonw_vR3QyM|6vNE>4a)-dZ^Mz4nNc jV}6I*i!*0$hPzLk-ckRh`}z68$zR!XPvN{nkN3o1lcog@ diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp index 748dc4ca..dc3e9c86 100644 --- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -1,7 +1,7 @@ // location of `parallel_RA_inc.h` here -#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" -#include +#include #include #include #include @@ -21,6 +21,7 @@ #include #include #include +#include using namespace std; #define u64 uint64_t From a11ac849f0d5f74a4cfe04ece0fe02ae3825b38a Mon Sep 17 00:00:00 2001 
From: Ubuntu Date: Wed, 23 Nov 2022 07:04:56 +0000 Subject: [PATCH 14/36] add support for multi dependent column --- backend/src/RA/parallel_join.cpp | 2 + backend/src/RAM/RA_tasks.cpp | 168 ++++--- backend/src/lie/lie.cpp | 4 +- backend/src/parallel_RA_inc.h | 20 +- .../src/relation/balanced_hash_relation.cpp | 65 ++- backend/src/relation/balanced_hash_relation.h | 2 + backend/src/relation/shmap_relation_exp.cpp | 223 ++++++--- .../tests/msum/compiled_pre/CMakeLists.txt | 28 ++ backend/tests/msum/compiled_pre/compiler-out | 18 + .../msum/compiled_pre/input-data/$strings.csv | 0 .../compiled_pre/input-data/257.edge.2.table | Bin 0 -> 264 bytes .../sssp.cpp => msum/compiled_pre/msum.cpp} | 344 ++++---------- backend/tests/msum/msum.slog | 3 + backend/tests/pagerank/pagerank.slog | 3 + .../compiled_pre/input-data/258.edge.2.table | Bin 0 -> 528 bytes .../tests/sssp/compiled_pre/sssp.cpp.backup | 429 ------------------ backend/tests/sssp/compiled_pre/sssp_opt.cpp | 14 +- backend/tests/sssp/sssp.py | 18 + backend/tests/sssp/test-input-graph/edge.csv | 2 + examples/datalog-example | 2 +- 20 files changed, 491 insertions(+), 854 deletions(-) create mode 100644 backend/tests/msum/compiled_pre/CMakeLists.txt create mode 100644 backend/tests/msum/compiled_pre/compiler-out create mode 100644 backend/tests/msum/compiled_pre/input-data/$strings.csv create mode 100644 backend/tests/msum/compiled_pre/input-data/257.edge.2.table rename backend/tests/{sssp/compiled_pre/sssp.cpp => msum/compiled_pre/msum.cpp} (58%) create mode 100644 backend/tests/msum/msum.slog create mode 100644 backend/tests/pagerank/pagerank.slog create mode 100644 backend/tests/sssp/compiled_pre/input-data/258.edge.2.table delete mode 100644 backend/tests/sssp/compiled_pre/sssp.cpp.backup create mode 100644 backend/tests/sssp/sssp.py diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 38d9e20c..59b2fd48 100644 --- a/backend/src/RA/parallel_join.cpp +++ 
b/backend/src/RA/parallel_join.cpp @@ -7,6 +7,7 @@ #include "../parallel_RA_inc.h" #include +#include bool parallel_join::local_join(int threshold, int* offset, @@ -34,6 +35,7 @@ bool parallel_join::local_join(int threshold, int* offset, } u32* output_sub_bucket_count = output->get_sub_bucket_per_bucket_count(); u32** output_sub_bucket_rank = output->get_sub_bucket_rank(); + // std::cout << "wwwwwwwww " << input0_buffer_size << " " << input0_buffer_size << " " << i1_size << std::endl; if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0) return true; diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 71a85d34..2712e343 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -7,7 +7,9 @@ #include "../parallel_RA_inc.h" #include "mpi.h" +#include #include +#include RAM::~RAM() { @@ -256,12 +258,22 @@ u64 RAM::intra_bucket_comm_execute() /// Join between full and delta else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) { - intra_bucket_comm(get_bucket_count(), - input1->get_delta(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + // std::cout << "here>>>>>>>>>>>>>" << std::endl; + if (input1->get_dependent_column().size() > 0) { + intra_bucket_comm(get_bucket_count(), + input0->get_full(), input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], mcomm.get_local_comm()); + } else { + intra_bucket_comm(get_bucket_count(), + input1->get_delta(), + input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + input0->get_distinct_sub_bucket_rank_count(), 
input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], + mcomm.get_local_comm()); + } total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; } @@ -626,19 +638,33 @@ bool RAM::local_compute(int* offset) } else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) { - - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - RIGHT, - get_bucket_count(), - intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], - input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, - reorder_map_array, - output_relation, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); + if (input1->get_dependent_column().size() > 0) { + join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + LEFT, + get_bucket_count(), + intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], + input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, + reorder_map_array, + output_relation, + compute_buffer, + counter, + join_column_count, + &join_tuples_duplicates, + &join_tuples); + } else { + join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + RIGHT, + get_bucket_count(), + intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], + input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, + reorder_map_array, + output_relation, + compute_buffer, + counter, + join_column_count, + &join_tuples_duplicates, + &join_tuples); + } total_join_tuples = total_join_tuples + join_tuples; } else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) @@ -824,10 +850,22 @@ void 
RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) { u32 x = starting + tuple_ind * width; - if (output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false && + bool insert_flag = true; + if (output->get_dependent_column().size() > 1) { + std::vector tt; + for (int i = 0; i < width; i++) { + tt.push_back(cumulative_all_to_allv_buffer[x+i]); + } + // temporary index column just to match size of column + tt.push_back(0); + insert_flag = output->check_dependent_value_insert_avalible(tt); + + } else { + insert_flag = output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false && output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false && - output->find_in_newt(cumulative_all_to_allv_buffer + x, width) == false) - { + output->find_in_newt(cumulative_all_to_allv_buffer + x, width) == false; + } + if (insert_flag){ for (u32 i = 0; i < width; i++) tuple[i] = cumulative_all_to_allv_buffer[x+i]; @@ -850,17 +888,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) if (output->insert_in_newt(tuple) == true) successful_insert++; - - //if (RA_list[ra_id]->get_RA_type() == FACT) - // std::cout << "FFFFFFFFFF "<< tuple[0] << " " << tuple[1] << " " << successful_insert << std::endl; } - // else { - // std::cout << "insert fail "; - // for (int i = 0; i < width; i++) { - // std::cout << cumulative_all_to_allv_buffer[i] << " "; - // } - // std::cout << std::endl; - // } } starting = starting + elements_to_read; } @@ -1097,35 +1125,35 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& // std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl; #endif - + auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); + auto intra_end = MPI_Wtime(); - + std::cout << std::setiosflags(std::ios::fixed); bool local_join_status = false; while (local_join_status == false) { - + 
auto allocate_buffers_start = MPI_Wtime(); allocate_compute_buffers(); + auto allocate_buffers_end = MPI_Wtime(); - - + auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); + auto compute_end = MPI_Wtime(); - - + auto all_to_all_start = MPI_Wtime(); local_comm(); + auto all_to_all_end = MPI_Wtime(); - - - + auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); + auto free_buffers_end = MPI_Wtime(); - - + auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt(intern_map); + auto insert_in_newt_end = MPI_Wtime(); - -#if DEBUG_OUTPUT +#if 1 if (mcomm.get_rank() == 0) { #if 0 @@ -1145,19 +1173,24 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& << " newt " << *running_insert_newt << std::endl; #endif - std::cout << loop_count_tracker << "\t" - << (allocate_buffers_end - allocate_buffers_start) << "\t" - << (compute_end - compute_start) << "\t" - << (all_to_all_end - all_to_all_start) << "\t" - << (free_buffers_end - free_buffers_start) << "\t" - << (insert_in_newt_end - insert_in_newt_start) << "\t"; + std::cout << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12) + << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12) + << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ; + std::cout << loop_count_tracker << std::setprecision(4) << std::setw(12) + << (allocate_buffers_end - allocate_buffers_start) << std::setprecision(4) << std::setw(12) + << (compute_end - compute_start) << std::setprecision(4) << std::setw(12) + << (all_to_all_end - all_to_all_start) << std::setprecision(4) << std::setw(12) + << (free_buffers_end - free_buffers_start) << std::setprecision(4) << std::setw(12) + << (insert_in_newt_end - insert_in_newt_start) << std::setprecision(4) << std::setw(12); } #endif inner_loop++; } + auto insert_in_full_start = MPI_Wtime(); local_insert_in_full(); - -#if DEBUG_OUTPUT + auto 
insert_in_full_end = MPI_Wtime(); + +#if 1 if (mcomm.get_rank() == 0) { #if 0 @@ -1174,8 +1207,8 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& << " full " << *running_insert_in_full << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl; #endif - std::cout << (intra_end - intra_start) << "\t" - << (insert_in_full_end - insert_in_full_start) << "\t" + std::cout << (intra_end - intra_start) << std::setw(12) + << (insert_in_full_end - insert_in_full_start) << std::setw(12) << (insert_in_full_end - intra_start) << std::endl; } @@ -1229,8 +1262,10 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s // std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl; #endif - + std::cout << std::setiosflags(std::ios::fixed); + auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); + auto intra_end = MPI_Wtime(); bool local_join_status = false; while (local_join_status == false) @@ -1276,22 +1311,25 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s << " newt " << *running_insert_newt << std::endl; #endif - std::cout << loop_count_tracker << "\t" - << (allocate_buffers_end - allocate_buffers_start) << "\t" - << (compute_end - compute_start) << "\t" - << (all_to_all_end - all_to_all_start) << "\t" - << (free_buffers_end - free_buffers_start) << "\t" - << (insert_in_newt_end - insert_in_newt_start) << "\t"; + std::cout << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12) + << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12) + << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ; + std::cout << loop_count_tracker << std::setprecision(4) << std::setw(12) + << (allocate_buffers_end - 
allocate_buffers_start) << std::setprecision(4) << std::setw(12) + << (compute_end - compute_start) << std::setprecision(4) << std::setw(12) + << (all_to_all_end - all_to_all_start) << std::setprecision(4) << std::setw(12) + << (free_buffers_end - free_buffers_start) << std::setprecision(4) << std::setw(12) + << (insert_in_newt_end - insert_in_newt_start) << std::setprecision(4) << std::setw(12); } #endif inner_loop++; } - + auto insert_in_full_start = MPI_Wtime(); local_insert_in_full(); + auto insert_in_full_end = MPI_Wtime(); - -#if DEBUG_OUTPUT +#if 1 if (mcomm.get_rank() == 0) { #if 0 @@ -1308,8 +1346,8 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s << " full " << *running_insert_in_full << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl; #endif - std::cout << (intra_end - intra_start) << "\t" - << (insert_in_full_end - insert_in_full_start) << "\t" + std::cout << (intra_end - intra_start) << std::setw(12) + << (insert_in_full_end - insert_in_full_start) << std::setw(12) << (insert_in_full_end - intra_start) << std::endl; } diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 7b25b044..59a0ec57 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -511,8 +511,8 @@ bool LIE::execute () /// For SCCs that runs till fixed point is reached else { - if (mcomm.get_rank() == 0) - std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl; + //if (mcomm.get_rank() == 0) + // std::cout << "name\tnprocs\tmin\tmax\tmean\tIteration#\tBuffer_creation_time\tComputation_time\tAll_to_all_time\tBuffer_free_time\tInsert_in_newt_time\tIntra_comm_time\tInsert_in_full_time\tTotal_time" << std::endl; u64 
delta_in_scc = 0; do { diff --git a/backend/src/parallel_RA_inc.h b/backend/src/parallel_RA_inc.h index 00c3f688..a0047436 100644 --- a/backend/src/parallel_RA_inc.h +++ b/backend/src/parallel_RA_inc.h @@ -18,8 +18,26 @@ //#define DEBUG_OUTPUT 1 #define MAX_LOOP_COUNT 120000 +struct vec_comparator { + vec_comparator() {} + + bool operator()(const std::vector &a, const std::vector &b) const { + // make it an unroll loop when change to array + int size = a.size(); + for (int i=0; i < size; i++) { + if (a[i] < b[i]) + return true; + if (a[i] > b[i]) + return false; + } + + return false; + } +}; + +using depend_val_t = std::vector>; using update_partial_compare_func_t = std::function(const std::vector& old_v, const std::vector& new_v, const std::vector& prefix)>; -using join_generator_func_t = std::function& target_v, std::vector& input_v, u64* res)>; +using join_generator_func_t = std::function& input_v, depend_val_t& res_set)>; #include "log/logger.h" #include "hash/hash.h" diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 7663343b..7182fc24 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include "balanced_hash_relation.h" #include #include #include @@ -1196,6 +1197,7 @@ bool relation::insert_in_full(u64* t) // TODO: use normal insert here! 
if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true) { + // TODO: change how to deal with element counts full_element_count++; full_bucket_element_count[bucket_id]++; full_sub_bucket_element_count[bucket_id][sub_bucket_id]++; @@ -1289,27 +1291,27 @@ void relation::local_insert_in_delta() MPI_Comm_rank(mcomm.get_comm(), &rank); u32 buckets = get_bucket_count(); - if (dependent_column_indices.size() > 0) { - delta_element_count = 0; - for (u32 i = 0; i < buckets; i++) { - delta[i].purge(); - memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); - for (auto& t: newt[i]) { - if (full[i].check_dependent_insertion(t)) { - delta[i].insert(t); - uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % get_bucket_count(); - u32 sub_bucket_id = 0; - if (is_canonical == false && arity != 0 && arity >= join_column_count) - sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id]; - delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++; - delta_element_count++; - } - } - newt[i].purge(); - memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); - newt_element_count = 0; - } - } else { + // if (dependent_column_indices.size() > 0) { + // delta_element_count = 0; + // for (u32 i = 0; i < buckets; i++) { + // delta[i].purge(); + // memset(delta_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); + // for (auto& t: newt[i]) { + // if (full[i].check_dependent_insertion(t)) { + // delta[i].insert(t); + // uint64_t bucket_id = tuple_hash(t.data(), join_column_count) % get_bucket_count(); + // u32 sub_bucket_id = 0; + // if (is_canonical == false && arity != 0 && arity >= join_column_count) + // sub_bucket_id = tuple_hash(t.data() + join_column_count, arity-join_column_count) % sub_bucket_per_bucket_count[bucket_id]; + // delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++; + // 
delta_element_count++; + // } + // } + // newt[i].purge(); + // memset(newt_sub_bucket_element_count[i], 0, sub_bucket_per_bucket_count[i] * sizeof(u32)); + // newt_element_count = 0; + // } + // } else { delete[] delta; delta = newt; delta_element_count = newt_element_count; @@ -1327,5 +1329,24 @@ void relation::local_insert_in_delta() } newt_element_count = 0; memset(newt_bucket_element_count, 0, buckets * sizeof(u32)); - } + // } +} + +bool relation::check_dependent_value_insert_avalible(const std::vector& tuple) { + uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count(); + // return newt[bucket_id].check_dependent_insertion(tuple); + // if (!(full[bucket_id].check_dependent_insertion(tuple) && delta[bucket_id].check_dependent_insertion(tuple))) { + // for (auto c: tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // std::cout << "current tree >>" << std::endl; + // for (auto t: delta[bucket_id]) { + // for (auto c: t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } + // } + return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; } diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index c0e88f9e..cfd322ad 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -267,6 +267,8 @@ class relation void local_insert_in_delta(); void copy_newt_to_delta() {delta = newt;} + // lattice value check + bool check_dependent_value_insert_avalible(const std::vector& tuple); /// for load balancing (implemented in relation_load_balance.cpp) bool load_balance_merge_full_and_delta(float rf); diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 700c7a6f..9ceff570 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -123,6 
+123,7 @@ shmap_relation::check_dependent_insertion(const std::vector &tp) { if (exist_tuples_range.first == ind.end()) { return true; } else { + auto joined = false; for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) { auto cur_tuple = *it; std::vector old_t; @@ -130,8 +131,14 @@ shmap_relation::check_dependent_insertion(const std::vector &tp) { old_t.push_back(cur_tuple[i]); } auto compare_res = update_compare_func(old_t, dependent_columns, tp); - if (compare_res.has_value() && compare_res.value()) { + if (!compare_res.has_value()) { + continue; + } + if (compare_res.value()) { + joined = true; return true; + } else { + joined = true; } } // std::cout << " not adding to lattice with <<<<<< "; @@ -151,7 +158,11 @@ shmap_relation::check_dependent_insertion(const std::vector &tp) { // } // std::cout << std::endl; // } - return false; + if (!joined) { + return true; + } else { + return false; + } } } else { return true; @@ -416,42 +427,41 @@ void shmap_relation::as_all_to_allv_right_join_buffer( // } // std::cout << std::endl; // } + // std::cout << "upper bound >> "; + // for (auto c: upper_bound) { + // std::cout << c << " "; + // } + std::cout << std::endl; auto joined_range = lowerUpperRange(lower_bound, upper_bound); - for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) - { - auto cur_path = *it; - u64 projected_path[join_buffer.width[ra_id]]; - if (generator_mode) { - std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); - // std::cout << "join facts "; - // for (auto c: input_t) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - gen_func(input_t, cur_path, projected_path); - } else { - // std::cout << "here" << std::endl; - u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; - for (int i = 0; i < input1_buffer_width; i++) - reordered_cur_path[i] = cur_path[i]; - - for (int i = join_column_count; i < input0_buffer_width; i++) - 
reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i]; - for (int i =0; i < join_buffer.width[ra_id]; i++) - projected_path[i] = reordered_cur_path[reorder_map[i]]; + if (generator_mode) { + std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + std::vector> eq_tuple_set; + std::vector> generated_tuple_set; + std::vector prev_non_dependent_columns; + for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){ + auto cur_path = *it; + std::vector cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size()); + if (cur_non_dependent_columns == prev_non_dependent_columns) { + eq_tuple_set.push_back(cur_path); + continue; + } else { + if (eq_tuple_set.size() != 0) { + gen_func(eq_tuple_set, input_t, generated_tuple_set); + eq_tuple_set.clear(); + } + prev_non_dependent_columns = cur_non_dependent_columns; + eq_tuple_set.push_back(cur_path); + } } - // std::cout << "add new facts "; - // for (auto c: projected_path) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) - { - uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; + if (eq_tuple_set.size() != 0) { + gen_func(eq_tuple_set, input_t, generated_tuple_set); + } + for (auto& tp: generated_tuple_set) { + uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets; uint64_t sub_bucket_id=0; if (canonical == false) - sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; + sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; @@ -459,15 +469,55 @@ void 
shmap_relation::as_all_to_allv_right_join_buffer( join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id]; join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id]; join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++; - join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; - join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]); + join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]); (*local_join_inserts)++; (*local_join_count)++; } - else { - (*local_join_duplicates)++; + } else { + for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) + { + auto cur_path = *it; + u64 projected_path[join_buffer.width[ra_id]]; + u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; + for (int i = 0; i < input1_buffer_width; i++) + reordered_cur_path[i] = cur_path[i]; + + for (int i = join_column_count; i < input0_buffer_width; i++) + reordered_cur_path[input1_buffer_width + (i - join_column_count)] = input0_buffer[i]; + + for (int i =0; i < join_buffer.width[ra_id]; i++) + projected_path[i] = reordered_cur_path[reorder_map[i]]; + // std::cout << "add new facts "; + // for (auto c: projected_path) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + if 
(deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) + { + uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; + uint64_t sub_bucket_id=0; + if (canonical == false) + sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; + + int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; + + join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++; + join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; + + join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; + join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]); + (*local_join_inserts)++; + (*local_join_count)++; + } + else { + (*local_join_duplicates)++; + } } } } @@ -499,11 +549,9 @@ void shmap_relation::as_all_to_allv_left_join_buffer( upper_bound[i] = prefix[i]; lower_bound[i] = prefix[i]; } - // std::cout << "join >>> "; - // for (auto c: prefix) { - // std::cout << c << " "; - // } - // std::cout << std::endl; + + auto joined_range = lowerUpperRange(lower_bound, upper_bound); + // std::cout << "cur tree >>> " << std::endl; // for (auto r: ind) { // std::cout << ">>> "; @@ -512,33 +560,35 @@ void shmap_relation::as_all_to_allv_left_join_buffer( 
// } // std::cout << std::endl; // } - auto joined_range = lowerUpperRange(lower_bound, upper_bound); - for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) - { - auto cur_path = *it; - u64 projected_path[join_buffer.width[ra_id]]; - if (generator_mode) { - std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); - gen_func(cur_path, input_t, projected_path); - } else { - u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; - for (int i = 0; i < input0_buffer_width; i++) - reordered_cur_path[i] = input0_buffer[i]; - for (int i = join_column_count; i < input1_buffer_width; i++) - reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i]; - - for (int i =0; i < join_buffer.width[ra_id]; i++) - projected_path[i] = reordered_cur_path[reorder_map[i]]; + if (generator_mode) { + std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + std::vector> eq_tuple_set; + std::vector> generated_tuple_set; + std::vector prev_non_dependent_columns; + for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){ + auto cur_path = *it; + std::vector cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size()); + if (cur_non_dependent_columns == prev_non_dependent_columns) { + eq_tuple_set.push_back(cur_path); + continue; + } else { + if (eq_tuple_set.size() != 0) { + gen_func(eq_tuple_set, input_t, generated_tuple_set); + eq_tuple_set.clear(); + } + prev_non_dependent_columns = cur_non_dependent_columns; + eq_tuple_set.push_back(cur_path); + } } - - //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl; - if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) - { - uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; + if (eq_tuple_set.size() != 0) { + gen_func(eq_tuple_set, input_t, generated_tuple_set); + } 
+ for (auto& tp: generated_tuple_set) { + uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets; uint64_t sub_bucket_id=0; if (canonical == false) - sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; + sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; @@ -549,12 +599,49 @@ void shmap_relation::as_all_to_allv_left_join_buffer( join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; - join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]); + join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]); (*local_join_inserts)++; (*local_join_count)++; } - else { - (*local_join_duplicates)++; + } else { + for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) + { + auto cur_path = *it; + u64 projected_path[join_buffer.width[ra_id]]; + u64 reordered_cur_path[input0_buffer_width + input1_buffer_width - join_column_count]; + for (int i = 0; i < input0_buffer_width; i++) + reordered_cur_path[i] = input0_buffer[i]; + + for (int i = join_column_count; i < input1_buffer_width; i++) + reordered_cur_path[input0_buffer_width + (i - join_column_count)] = cur_path[i]; + + for (int i =0; i < join_buffer.width[ra_id]; i++) + projected_path[i] = reordered_cur_path[reorder_map[i]]; + + //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl; + if 
(deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) + { + uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; + uint64_t sub_bucket_id=0; + if (canonical == false) + sub_bucket_id = tuple_hash(projected_path + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; + + int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; + + join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++; + + join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; + join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; + join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)projected_path, sizeof(u64)*join_buffer.width[ra_id]); + (*local_join_inserts)++; + (*local_join_count)++; + } + else { + (*local_join_duplicates)++; + } } } // std::cout << "inserted " << *local_join_inserts << std::endl; diff --git a/backend/tests/msum/compiled_pre/CMakeLists.txt b/backend/tests/msum/compiled_pre/CMakeLists.txt new file mode 100644 index 00000000..2930b4c2 --- /dev/null +++ b/backend/tests/msum/compiled_pre/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required (VERSION 3.9) + +project (msum) + +add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive) + +link_libraries(stdc++fs) + 
+find_package(MPI REQUIRED) +# find_package(OpenMP) +# if (OPENMP_FOUND) +# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# endif() + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive") +# set (base_dir "${PROJECT_SOURCE_DIR}/../backend") +set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") + +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_msum "${PROJECT_SOURCE_DIR}/msum.cpp") + +ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") + +add_executable(msum ${source_files_msum}) +INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH}) +TARGET_LINK_LIBRARIES(msum parallel_RA ${MPI_LIBRARIES}) diff --git a/backend/tests/msum/compiled_pre/compiler-out b/backend/tests/msum/compiled_pre/compiler-out new file mode 100644 index 00000000..d617368b --- /dev/null +++ b/backend/tests/msum/compiled_pre/compiler-out @@ -0,0 +1,18 @@ +parsing + compilation took 4 ms. 
+ir-small: +RULES: +------------------------------------------------------ +/home/ubuntu/workspace/slog/backend/tests/msum/msum.slog 1: +[((rel-arity cpath 4 db) x y x $=1) <-- + (= $_1 ((rel-arity edge 2 db) x y)) + (= $_5 ((rel-arity = 2 comp) $=1 1))] +------------------------------------------------------ +/home/ubuntu/workspace/slog/backend/tests/msum/msum.slog 2: +[((rel-arity cpath 4 db) x z y l) <-- + (= $_3 ((rel-arity cpath 4 db) x y prev l)) + (= $_4 ((rel-arity edge 2 db) y z))] + + +All rules: 5, arules: 3, copy rules: 0, join rules: 2, facts: 0 +rels: 2, sccs: 4 +[wrote C++ driver and data to "/home/ubuntu/workspace/slog/out/msum.cpp"] diff --git a/backend/tests/msum/compiled_pre/input-data/$strings.csv b/backend/tests/msum/compiled_pre/input-data/$strings.csv new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/msum/compiled_pre/input-data/257.edge.2.table b/backend/tests/msum/compiled_pre/input-data/257.edge.2.table new file mode 100644 index 0000000000000000000000000000000000000000..a1596313ab507848e78f8864707b0e9340f23b48 GIT binary patch literal 264 zcmYk%K@NZ*00Th~5%g93|5Y?=o4}>n2^^^9KbMzTt5kRM((>Gk)AICvzLTfBotCE` bJ1tKSJ1tL7J1tMYc3Pg!Kj_0h-)H3;;?@Q! 
literal 0 HcmV?d00001 diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp b/backend/tests/msum/compiled_pre/msum.cpp similarity index 58% rename from backend/tests/sssp/compiled_pre/sssp.cpp rename to backend/tests/msum/compiled_pre/msum.cpp index ee96ecb9..2df29aa4 100644 --- a/backend/tests/sssp/compiled_pre/sssp.cpp +++ b/backend/tests/msum/compiled_pre/msum.cpp @@ -1,5 +1,5 @@ // location of `parallel_RA_inc.h` here -#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include #include @@ -33,7 +33,6 @@ const u64 int_tag = 0; const u64 str_tag = 2; const u64 sign_flip_const = 0x0000200000000000; const u64 signed_num_mask = 0xFFFFE00000000000; -int start_node = 1; inline bool is_number(u64 datum) { // cout << "is_number(" << datum << "): " << (datum >> tag_position == @@ -333,7 +332,7 @@ void load_input_relation(std::string db_dir) { for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { // check if ends with table std::string filename_ss = entry.path().filename().string(); - //std::cout << "input database has file " << filename_ss << std::endl; + std::cout << "input database has file " << filename_ss << std::endl; std::string suffix = ".table"; int ft = filename_ss.size() - suffix.size(); if (ft < 0) @@ -356,8 +355,8 @@ void load_input_relation(std::string db_dir) { } if (tag > max_rel) max_rel = tag; - //std::cout << "load " << tag << "." << index_stream.str() << "has arity " - // << arity << std::endl; + std::cout << "load " << tag << "." 
<< index_stream.str() << "has arity " + << arity << std::endl; rel_tag_map[index_stream.str()] = tag; } } @@ -377,271 +376,107 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { } max_rel++; rel_tag_map[name_arity] = max_rel; - //std::cout << "generate rel tag: " << name_arity << " " << max_rel - // << std::endl; + std::cout << "generate rel tag: " << name_arity << " " << max_rel + << std::endl; return max_rel; } -void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::string output_dir, int argc, char **argv) { - start_node = sp; - load_input_relation(input_dir); +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; + if (argc == 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + } + load_input_relation(slog_input_dir); + mpi_comm mcomm; + mcomm.create(argc, argv); - relation *rel__edge__3__1__2__3 = new relation( - 3, true, 3, get_tag_for_rel("edge", "1__2__3"), - std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", - input_dir + "/" + - std::to_string(get_tag_for_rel("edge", "1__2__3")) + - ".edge.3.table", - FULL); - relation* rel__edge__3__1 = new relation( - 1, false, 3, get_tag_for_rel("edge","1"), - std::to_string(get_tag_for_rel("edge","1")) + ".edge.3.table", - FULL); - - // the dependent column must be exclude from hash computation, so join - // column count is 3 - 1 = 2 - relation *rel__spath__3__1__2__3 = new relation( - 2, true, 3, get_tag_for_rel("spath", "1__2__3"), - std::to_string(get_tag_for_rel("spath", "1__2__3")) + ".spath.3.table", - input_dir + "/" + - std::to_string(get_tag_for_rel("spath", "1__2__3")) + - ".spath.3.table", + relation *rel__edge__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + 
".edge.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", FULL); - // set functional dependency for spath - rel__spath__3__1__2__3->set_dependent_column_update( - {2, 3}, // len and id column - [](const std::vector& old_v, const std::vector& new_v, const vector& nt) -> std::optional - { - return new_v[0] < old_v[0]; - } - ); - relation* rel__spath__3__2 = new relation( - 1, false, 3, get_tag_for_rel("spath","2"), - std::to_string(get_tag_for_rel("spath","2")) + ".spath.3.table", - FULL); - rel__spath__3__2->set_dependent_column_update( - {2, 3}, - [](const std::vector& old_v, const std::vector& new_v, const vector& nt) -> std::optional - { - return new_v[0] < old_v[0]; + relation *rel__cpath__4__2 = new relation( + 1, true, 4, get_tag_for_rel("cpath", "2"), + std::to_string(get_tag_for_rel("cpath", "2")) + ".cpath.4.table", + std::to_string(get_tag_for_rel("cpath", "2")) + ".cpath.4.table", FULL); + rel__cpath__4__2->set_dependent_column_update( + {2,3,4}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + if (new_v[0] != old_v[0]) { + return std::nullopt; + } else { + // monotonic + assert(new_v[1] > old_v[1]); + return new_v[1] > old_v[1]; + } } ); - RAM* scc0 = new RAM(false, 0); - scc0->add_relation(rel__edge__3__1, true, false); - scc0->add_relation(rel__edge__3__1__2__3, true, false); - scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, - DELTA, {0, 3, 1, 2})); - - RAM *scc1 = new RAM(false, 1); - scc1->add_relation(rel__edge__3__1__2__3, false, false); - scc1->add_relation(rel__spath__3__1__2__3, true, false); - // scc1->add_rule(new parallel_copy(rel__spath__3__1__2__3, - // rel__edge__3__1__2__3, FULL, {0, 1, 2})); - scc1->add_rule(new parallel_copy_generate( - rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL, + RAM *scc0 = new RAM(false, 0); + scc0->add_relation(rel__edge__2__1__2, false, false); + 
scc0->add_relation(rel__cpath__4__2, true, false); + scc0->add_rule(new parallel_copy_generate( + rel__cpath__4__2, rel__edge__2__1__2, FULL, [](const u64 *const data, u64 *const output) -> int { - auto args_for_old_bi = std::array{data[0], data[1], data[2]}; - using TState = std::tuple; - TState state = std::make_tuple(data, output); - auto callback = [](u64 res_0, TState state) -> TState { - auto [data, output] = state; - auto head_tuple = output; - - bool compatible = true && res_0 < n2d(start_node); - if (!compatible) - return state; - - head_tuple[0] = data[0]; - head_tuple[1] = data[1]; - head_tuple[2] = data[2]; - return std::make_tuple(data, output + 2); - }; - auto [_, new_ptr] = - builtin_eq_1(args_for_old_bi.data(), state, callback); - auto tuples_count = (new_ptr - output) / 2; - return tuples_count; + output[0] = data[1]; + output[1] = data[0]; + output[2] = data[1]; + output[3] = n2d(1); + return 1; })); - RAM *scc2 = new RAM(true, 2); - scc2->add_relation(rel__edge__3__1__2__3, false, false); - scc2->add_relation(rel__spath__3__2, true, false); - scc2->add_relation(rel__spath__3__1__2__3, true, false); - // the order of non join column also need to be carefully arranged - // because, dependent column - // should always at last - scc2->add_rule(new parallel_acopy( - rel__spath__3__2, - rel__spath__3__1__2__3, DELTA, - {1, 0, 2, 3})); // 2, 1, 3, id - parallel_join* update_spath_j = new parallel_join( - rel__spath__3__1__2__3, - rel__edge__3__1, FULL, - rel__spath__3__2, DELTA, - {5, 2, 3}// useless + RAM *scc1 = new RAM(true, 1); + scc1->add_relation(rel__cpath__4__2, true, false); + scc1->add_relation(rel__edge__2__1__2, false, false); + auto pj = new parallel_join( + rel__cpath__4__2, + rel__edge__2__1__2, FULL, + rel__cpath__4__2, DELTA, + {4, 2, 0, 6} // useless ); - update_spath_j->set_generator_func([](std::vector& target_v, - std::vector& input_v, u64* res) { - res[0] = target_v[1]; - res[1] = input_v[2]; - if (res[0] == res[1]) { - res[2] = 0; 
- } else { - res[2] = target_v[2] + input_v[3]; + pj->set_generator_func( + [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { + auto sum_res = 0; + for (auto& tv: target_vs) { + sum_res += tv[3]; + } + std::vector res_tuple(4, 0); + res_tuple[0] = input_v[1]; + res_tuple[1] = target_vs[0][1]; + res_tuple[2] = target_vs[0][0]; + res_tuple[3] = sum_res; + res_set.push_back(res_tuple); + return true; } - }); - scc2->add_rule(update_spath_j); + ); + scc1->add_rule(pj); + + LIE *lie = new LIE(); - lie->add_relation(rel__edge__3__1); - lie->add_relation(rel__edge__3__1__2__3); - lie->add_relation(rel__spath__3__2); - lie->add_relation(rel__spath__3__1__2__3); + lie->add_relation(rel__edge__2__1__2); + lie->add_relation(rel__cpath__4__2); lie->add_scc(scc0); lie->add_scc(scc1); - lie->add_scc(scc2); - lie->add_scc_dependance(scc0, scc2); - lie->add_scc_dependance(scc1, scc2); - - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - // relation *rel__spath__2__1__2 = new relation( - // 2, true, 2, get_tag_for_rel("spath", "1__2"), - // std::to_string(get_tag_for_rel("spath", "1__2")) + ".spath.2.table", - // input_dir + "/" + std::to_string(get_tag_for_rel("spath", "1__2")) + - // ".spath.2.table", - // FULL); - // rel__spath__2__1__2->set_dependent_column_update( - // {1, 2}, // len and id column - // [](std::vector old_v, std::vector new_v) -> std::optional - // { - // // std::cout << "Comparing "; - // // for (auto c : old_v) { - // // std::cout << c << " "; - // // } - // // std::cout << " <<<<<< "; - // // for (auto c : new_v) { - // // std::cout << c << " "; - // // } - // return new_v[0] < old_v[0]; - // } - // ); - // relation *rel__edge__3__1 = new relation( - // 1, false, 3, get_tag_for_rel("edge", "1"), - // std::to_string(get_tag_for_rel("edge", "1")) + ".edge.3.table", FULL); - // relation *rel__edge__3__1__2__3 = new relation( - // 3, true, 3, get_tag_for_rel("edge", "1__2__3"), - // 
std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", - // input_dir + "/" + - // std::to_string(get_tag_for_rel("edge", "1__2__3")) + ".edge.3.table", - // FULL); - // relation *rel__spath__2__1 = new relation( - // 1, false, 2, get_tag_for_rel("spath", "1"), - // std::to_string(get_tag_for_rel("spath", "1")) + ".spath.2.table", FULL); - // rel__spath__2__1->set_dependent_column_update( - // {1, 2}, - // [](std::vector old_v, std::vector new_v) -> std::optional - // { - // // std::cout << "Comparing "; - // // for (auto c : old_v) { - // // std::cout << c << " "; - // // } - // // std::cout << " <<<<<< "; - // // for (auto c : new_v) { - // // std::cout << c << " "; - // // } - // return new_v[0] < old_v[0]; - // } - // ); - - // RAM *scc0 = new RAM(false, 0); - // scc0->add_relation(rel__edge__3__1, true, false); - // scc0->add_relation(rel__edge__3__1__2__3, true, false); - // scc0->add_rule(new parallel_acopy(rel__edge__3__1, rel__edge__3__1__2__3, - // DELTA, {0, 3, 1, 2})); - - // RAM *scc1 = new RAM(false, 1); - // scc1->add_relation(rel__spath__2__1__2, true, false); - // scc1->add_relation(rel__edge__3__1, false, false); - // scc1->add_rule(new parallel_copy_generate( - // rel__spath__2__1__2, rel__edge__3__1, FULL, - // [](const u64 *const data, u64 *const output) -> int { - // auto args_for_old_bi = std::array{data[0]}; - // using TState = std::tuple; - // TState state = std::make_tuple(data, output); - // auto callback = [](u64 res_0, TState state) -> TState { - // auto [data, output] = state; - // auto head_tuple = output; - - // bool compatible = true && res_0 == n2d(start_node); - // if (!compatible) - // return state; - - // head_tuple[0] = data[2]; - // head_tuple[1] = data[3]; - // return std::make_tuple(data, output + 2); - // }; - // auto [_, new_ptr] = - // builtin_eq_1(args_for_old_bi.data(), state, callback); - // auto tuples_count = (new_ptr - output) / 2; - // return tuples_count; - // })); - - // RAM *scc2 = new RAM(true, 
2); - // scc2->add_relation(rel__spath__2__1__2, true, false); - // scc2->add_relation(rel__edge__3__1, false, false); - // scc2->add_relation(rel__spath__2__1, true, false); - // // scc2->add_rule(new parallel_join(rel__spath__2__1__2, rel__spath__2__1, DELTA, - // // rel__edge__3__1, FULL, {4, 5})); - // parallel_join* update_spath_j = new parallel_join( - // rel__spath__2__1__2, - // rel__edge__3__1, FULL, - // rel__spath__2__1, DELTA, - // {5,4}// useless - // ); - // update_spath_j->set_generator_func([](std::vector& target_v, - // std::vector& input_v, u64* res) { - // // res[0] = target_v[0]; - // res[0] = input_v[2]; - // if (res[0] == start_node) { - // res[1] = 0; - // } else { - // res[1] = target_v[1] + input_v[3]; - // } - // }); - // scc2->add_rule(update_spath_j); - // scc2->add_rule(new parallel_acopy(rel__spath__2__1, rel__spath__2__1__2, - // DELTA, {0, 1, 2})); - - - // LIE *lie = new LIE(); - // lie->add_relation(rel__spath__2__1__2); - // lie->add_relation(rel__edge__3__1); - // lie->add_relation(rel__edge__3__1__2__3); - // lie->add_relation(rel__spath__2__1); - // lie->add_scc(scc0); - // lie->add_scc(scc1); - // lie->add_scc(scc2); - // lie->add_scc_dependance(scc0, scc2); - // lie->add_scc_dependance(scc0, scc1); - // lie->add_scc_dependance(scc1, scc2); + lie->add_scc_dependance(scc0, scc1); // Enable IO lie->enable_all_to_all_dump(); lie->enable_data_IO(); lie->enable_IO(); - lie->set_output_dir(output_dir); // Write to this directory + // lie->enable_share_io(); + lie->set_output_dir(slog_output_dir); // Write to this directory lie->set_comm(mcomm); lie->set_batch_size(1); lie->execute(); lie->print_all_relation_size(); // Continuously print relation sizes - lie->stat_intermediate(); - - // rel__spath__3__1__2__3->print(); - - // rel__spath__2__1__2->print(); - // rel__spath__2__1->print(); - // rel__edge__3__1->print(); - // rel__edge__3__1__2__3->print(); + // lie->stat_intermediate(); // print all variants(non-canonical index of each 
relation) if (mcomm.get_rank() == 0) { @@ -655,27 +490,12 @@ void compute_sssp_from(mpi_comm& mcomm, int sp, std::string input_dir, std::stri } // lie->print_all_relation_size(); // Continuously print relation sizes + rel__edge__2__1__2->print(); + rel__cpath__4__2->print(); delete lie; -} - -int main(int argc, char **argv) { - // input dir from compiler - std::string slog_input_dir = - "/home/stargazermiao/workspace/PL/slog/out/input-data"; - // output dir from compiler - std::string slog_output_dir = - "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; - if (argc > 2) { - slog_input_dir = argv[1]; - slog_output_dir = argv[2]; - } - mpi_comm mcomm; - mcomm.create(argc, argv); - - compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, argv); - mcomm.destroy(); + return 0; } diff --git a/backend/tests/msum/msum.slog b/backend/tests/msum/msum.slog new file mode 100644 index 00000000..98152e61 --- /dev/null +++ b/backend/tests/msum/msum.slog @@ -0,0 +1,3 @@ + +[(cpath x y {x 1}) <-- (edge x y)] +[(cpath x z {y l}) <-- (cpath x y prev l) (edge y z)] diff --git a/backend/tests/pagerank/pagerank.slog b/backend/tests/pagerank/pagerank.slog new file mode 100644 index 00000000..f52d9663 --- /dev/null +++ b/backend/tests/pagerank/pagerank.slog @@ -0,0 +1,3 @@ + +[(rank x x (computed1 x)) <-- (matrix x _ _)] +[(rank x y (computed2 c d)) <-- (rank y _ c) (matrix y x d)] diff --git a/backend/tests/sssp/compiled_pre/input-data/258.edge.2.table b/backend/tests/sssp/compiled_pre/input-data/258.edge.2.table new file mode 100644 index 0000000000000000000000000000000000000000..f0c99a55eee0e88f256ab0afb576049c5b55fbd2 GIT binary patch literal 528 zcmZ{gTMmLi5Cj)gR6xMjKS%Hg9F3#XJ9LFe0zZ@L)K2Ir()yL28e^KLRG3em;$P13 z$b*%`=j~nWfiKR3<-?V8arp4nd9Zx==3E>;e0Lr!AEt+R$p24X+t>W-TRLxcIsMv$ rJ3R6`?Z=~c=RGjJd;2iG2m3I+pY~yTzwE>Gd)tS(!{^`*U+-A}u#FT7 literal 0 HcmV?d00001 diff --git a/backend/tests/sssp/compiled_pre/sssp.cpp.backup 
b/backend/tests/sssp/compiled_pre/sssp.cpp.backup deleted file mode 100644 index 0b208342..00000000 --- a/backend/tests/sssp/compiled_pre/sssp.cpp.backup +++ /dev/null @@ -1,429 +0,0 @@ -// location of `parallel_RA_inc.h` here -#include "/home/stargazermiao/workspace/PL/slog/compiler/../backend/src/parallel_RA_inc.h" - -#include -#include -#include -#include -#include -#include - -// builtins.cpp goes here! -// builtins.cpp -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; -#define u64 uint64_t -#define u32 uint32_t -using i64 = int64_t; - -const u64 tag_mask = 0xffffc00000000000; -const u64 tag_position = 46; -const u64 int_tag = 0; -const u64 str_tag = 2; -const u64 sign_flip_const = 0x0000200000000000; -const u64 signed_num_mask = 0xFFFFE00000000000; - -inline bool is_number(u64 datum) { - // cout << "is_number(" << datum << "): " << (datum >> tag_position == int_tag) << "\n"; - return datum >> tag_position == int_tag; -} - -inline i64 datum_to_number(u64 datum) { - i64 signed_val = (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); - if (signed_val >= sign_flip_const) { - signed_val = sign_flip_const - signed_val; - } - return signed_val; - // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); -} -const auto d2n = datum_to_number; - -inline u64 number_to_datum(i64 number) { - i64 unsigned_value = number; - if (number < 0) { - unsigned_value = (- number) + sign_flip_const; - } - return (unsigned_value & ~tag_mask) | (int_tag << tag_position); - // return (number & ~tag_mask) | (int_tag << tag_position); -} - -const auto n2d = number_to_datum; - -inline u64 string_to_datum(std::string str) -{ - u32 str_hash = string_hash(str); - return (str_hash & ~tag_mask) | (str_tag << tag_position); -} -const auto s2d = string_to_datum; - - -vector> builtin_div_rem(const u64* const data){ - if (is_number(data[0]) && is_number(data[1])){ - auto div = 
number_to_datum(d2n(data[0]) / d2n(data[1])); - auto rem = number_to_datum(d2n(data[0]) % d2n(data[1])); - return {{div, rem}}; - } else { - return {}; - } -} - -#define BUILTIN_BINARY_NUMBER_PRED(name, op) \ -template inline TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \ - if (is_number(data[0]) && is_number(data[1]) &&\ - datum_to_number(data[0]) op datum_to_number(data[1])){\ - return callback(init_state);\ - } else \ - return init_state;\ -} - -BUILTIN_BINARY_NUMBER_PRED(builtin_less, <) -BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >) -BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=) -BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=) - -#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \ -template inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \ - if (is_number(data[0]) && is_number(data[1])){\ - auto res = number_to_datum(datum_to_number(data[0]) op datum_to_number(data[1]));\ - return callback(res, init_state);\ -} else \ - return init_state;\ -} - -BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +) -BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -) -BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *) -BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /) - -#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \ -template inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \ - if (is_number(data[0]) && is_number(data[1])){\ - auto res = number_to_datum(impl(datum_to_number(data[0]), datum_to_number(data[1])));\ - return callback(res, init_state);\ -} else \ - return init_state;\ -} - -inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) {return arg2 - arg1;} -BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1) - - -#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \ -template inline TState name(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ \ - if (is_number(data[0])){\ - auto res = 
number_to_datum(impl(datum_to_number(data[0])));\ - return callback(res, init_state);\ -} else \ - return init_state;\ -} - -inline u64 add1(u64 x) {return x + 1;} -inline u64 sub1(u64 x) {return x - 1;} - -BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1) -BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1) -BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1) -BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1) - - -vector> builtin_range(const u64* const data){ - vector> res; - if (is_number(data[0]) && is_number(data[1])){ - auto lb = datum_to_number(data[0]); - auto ub = datum_to_number(data[1]); - res.reserve(ub - lb); - for (u64 x = lb; x < ub; x++) - res.push_back({number_to_datum(x)}); - } - return res; -} - -template -TState callback_builtin_range(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ - auto state = init_state; - if (is_number(data[0]) && is_number(data[1])){ - auto lb = datum_to_number(data[0]); - auto ub = datum_to_number(data[1]); - for (u64 x = lb; x < ub; x++) - state = callback(number_to_datum(x), state); - } - return state; -} - - -#define BUILTIN_BINARY_PRED(name, op) \ -template TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \ - if (data[0] op data[1])\ - return callback(init_state);\ - else\ - return init_state;\ -} -BUILTIN_BINARY_PRED(builtin_eq, ==) -BUILTIN_BINARY_PRED(builtin_neq, !=) - -template -TState builtin_eq_1(const u64* data, TState init_state, TState (*callback) (u64 res, TState state)){ - return callback(data[0], init_state); -} - -#define BUILTIN_UNARY_PRED(name, pred) \ -template TState name(const u64* data, TState init_state, TState (*callback) (TState state)){ \ - if (pred(data[0]))\ - return callback(init_state);\ - else\ - return init_state;\ -} - -bool is_not_number(u64 datum) {return !is_number(datum);} -BUILTIN_UNARY_PRED(builtin_number_huh, is_number) -BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number) - -// for generate-cpp-lambda-for-computational-join 
-struct CL2CB_State{ - void* original_callback; // There be dragons? - void* original_state; - const u64* original_data; - u64* cl1_output_args; -}; - -// for generate-cpp-lambda-for-computational-copy -struct BCLCB_State{ - void* original_callback; - void* original_state; - const u64* original_data; -}; - -//an experiment: -template -bool builtin_binary_number_pred(const u64* data){ - if (is_number(data[0]) && is_number(data[1])){ - return f(datum_to_number(data[0]), datum_to_number(data[1])); - } else { - return false; - } -} -bool _less(u64 x, u64 y) { return x < y;} -auto builtin_less2 = builtin_binary_number_pred<_less>; - - -template inline TState builtin_nop(const u64* data, TState init_state, TState (*callback) (TState state)){ - return callback(init_state); -} - -// //////////////////// AGGREGATORS Alternative design //////////////////// - - -// TODO: add number type check -////////////////////////////// count ///////////////////////////////////// - -local_agg_res_t agg_count_local(std::pair joined_range) -{ - local_agg_res_t cnt = 0; - for(auto it = joined_range.first; it != joined_range.second ; ++it) { - cnt ++; - } - return cnt; -} - -local_agg_res_t agg_count_reduce (local_agg_res_t x, local_agg_res_t y) { - return x + y; -} - -////////////////////////////// sum ///////////////////////////////////// - -local_agg_res_t agg_sum_local(std::pair joined_range) -{ - local_agg_res_t sum_res = 0; - for(auto it = joined_range.first; it != joined_range.second ; ++it) { - auto tuple = (*it); - sum_res += tuple[tuple.size()-1]; - } - return sum_res; -} - -local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { - return x + y; -} - -////////////////////////////// maximum ///////////////////////////////////// - -local_agg_res_t agg_maximum_local(std::pair joined_range) -{ - local_agg_res_t max_res = 0; - for(auto it = joined_range.first; it != joined_range.second ; ++it) { - auto tuple = (*it); - auto current_v = tuple[tuple.size()-1]; - if 
(current_v > max_res) { - max_res = current_v; - } - } - return max_res; -} - -local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) { - if (x > y){ - return x; - } else{ - return y; - } -} - -////////////////////////////// minimum ///////////////////////////////////// - -local_agg_res_t agg_minimum_local(std::pair joined_range) -{ - local_agg_res_t min_res = std::numeric_limits::max(); - for(auto it = joined_range.first; it != joined_range.second ; ++it) { - auto tuple = (*it); - auto current_v = tuple[tuple.size()-1]; - if (current_v < min_res) { - min_res = current_v; - } - } - return min_res; -} - -local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) { - if (x < y){ - return x; - } else{ - return y; - } -} - -// // end of builtins.cpp - - -// global definitions: - - -int max_rel = 255; -std::map rel_tag_map; -std::map> rel_index_map; - -// load all relation inside input database -void load_input_relation(std::string db_dir) -{ - for (const auto & entry : std::filesystem::directory_iterator(db_dir)) - { - // check if ends with table - std::string filename_ss = entry.path().filename().string(); - std::cout << "input database has file " << filename_ss << std::endl; - std::string suffix = ".table"; - int ft = filename_ss.size()-suffix.size(); - if (ft < 0) - ft = 0; - if (filename_ss.rfind(suffix) != ft) - { - continue; - } - std::string filename_s = entry.path().stem().string(); - int tag = std::stoi(filename_s.substr(0, filename_s.find("."))); - std::string name_arity = filename_s.substr(filename_s.find(".")+1, filename_s.size()-filename_s.find(".")-1); - std::string name = name_arity.substr(0, name_arity.rfind(".")); - std::string arity_s = name_arity.substr(name_arity.rfind(".")+1, name_arity.size()); - int arity = std::stoi(arity_s); - std::stringstream index_stream; - index_stream << name; - for (int i = 1; i <= arity; i++) - { - index_stream << "__" << i; - } - if (tag > max_rel) - max_rel = tag; - std::cout << "load " 
<< tag << "." << index_stream.str() << "has arity " << arity << std::endl; - rel_tag_map[index_stream.str()] = tag; - } -} - -int get_tag_for_rel(std::string relation_name, std::string index_str) { - std::string name_arity = relation_name + "__" + index_str; - if (rel_index_map.find(relation_name) != rel_index_map.end()) { - rel_index_map[relation_name].insert(index_str); - } else { - rel_index_map[relation_name] = {index_str}; - } - - if (rel_tag_map.find(name_arity) != rel_tag_map.end()) - { - // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << std::endl; - return rel_tag_map[name_arity]; - } - max_rel++; - rel_tag_map[name_arity] = max_rel; - std::cout << "generate rel tag: " << name_arity << " " << max_rel << std::endl; - return max_rel; -} - -int main(int argc, char **argv) -{ - // input dir from compiler - std::string slog_input_dir = "/home/stargazermiao/workspace/PL/slog/out/input-data"; - // output dir from compiler - std::string slog_output_dir = "/home/stargazermiao/workspace/PL/slog/out/checkpoints"; - if (argc == 3) { - slog_input_dir = argv[1]; - slog_output_dir = argv[2]; - } - load_input_relation(slog_input_dir); - mpi_comm mcomm; - mcomm.create(argc, argv); - -relation* rel__edge__3__1__2__3 = new relation(3, true, 3, get_tag_for_rel("edge","1__2__3"), std::to_string(get_tag_for_rel("edge","1__2__3")) + ".edge.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge","1__2__3")) + ".edge.3.table", FULL); -relation* rel__spath__3__1__2__3 = new relation(3, true, 3, get_tag_for_rel("spath","1__2__3"), std::to_string(get_tag_for_rel("spath","1__2__3")) + ".spath.3.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("spath","1__2__3")) + ".spath.3.table", FULL); - -RAM* scc0 = new RAM(false, 0); -scc0->add_relation(rel__edge__3__1__2__3, false, false); -scc0->add_relation(rel__spath__3__1__2__3, true, false); -scc0->add_rule(new parallel_copy(rel__spath__3__1__2__3, rel__edge__3__1__2__3, FULL, {0, 1, 2})); 
- -LIE* lie = new LIE(); -lie->add_relation(rel__edge__3__1__2__3); -lie->add_relation(rel__spath__3__1__2__3); -lie->add_scc(scc0); - - - - - // Enable IO - lie->enable_all_to_all_dump(); - lie->enable_data_IO(); - // lie->enable_share_io(); - lie->enable_IO(); - // lie->enable_share_io(); - lie->set_output_dir(slog_output_dir); // Write to this directory - lie->set_comm(mcomm); - lie->set_batch_size(1); - lie->execute(); - lie->print_all_relation_size(); // Continuously print relation sizes - lie->stat_intermediate(); - - // print all variants(non-canonical index of each relation) - if (mcomm.get_rank() == 0) - { - std::cout << "rel_name" << ",\t" << "indices\n"; - for (auto const& rel_p : rel_index_map) { - std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; - } - std::cout << std::endl; - } - - // lie->print_all_relation_size(); // Continuously print relation sizes - - delete lie; - - mcomm.destroy(); - - return 0; -} diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp index dc3e9c86..6d41428b 100644 --- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -1,5 +1,5 @@ // location of `parallel_RA_inc.h` here -#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include #include @@ -412,7 +412,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, scc0->add_rule(new parallel_copy_generate( rel__spath__3__2, rel__edge__2__1__2, FULL, [](const u64 *const data, u64 *const output) -> int { - auto args_for_old_bi = std::array{data[0], data[1], n2d(1)}; + auto args_for_old_bi = std::array{data[0], data[1], 1}; using TState = std::tuple; TState state = std::make_tuple(args_for_old_bi.data(), output); auto callback = [](u64 res_0, TState state) -> TState { @@ -442,7 +442,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, 
std::string input_dir, rel__spath__3__2, DELTA, {5, 2, 3} // useless ); update_spath_j->set_generator_func( - [](std::vector &target_v, std::vector &input_v, u64 *res) { + [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { // std::cout << "Joining >>> "; // for (auto c : input_v) { // std::cout << c << " "; @@ -452,13 +452,18 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, // std::cout << c << " "; // } // std::cout << std::endl; + auto target_v = target_vs[0]; + std::vector res(3, 0); res[0] = input_v[1]; res[1] = target_v[1]; if (res[0] == res[1]) { + // std::cout << "Warning detect a loop for node " << res[0] << std::endl; res[2] = 0; } else { res[2] = target_v[2] + 1; } + res_set.push_back(res); + return true; }); scc1->add_rule(update_spath_j); @@ -484,7 +489,8 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, // rel__spath__3__1__2__3->print(); // rel__spath__2__1__2->print(); -// rel__spath__3__2->print(); + // rel__edge__2__1__2->print(); + // rel__spath__3__2->print(); // rel__edge__3__1->print(); // rel__edge__3__1__2__3->print(); diff --git a/backend/tests/sssp/sssp.py b/backend/tests/sssp/sssp.py new file mode 100644 index 00000000..bb7862a1 --- /dev/null +++ b/backend/tests/sssp/sssp.py @@ -0,0 +1,18 @@ + +import networkx as nx + +data_f = open("/home/ubuntu/workspace/dataset/soc-LiveJournal1.txt") +# data_f = open("/home/ubuntu/workspace/slog/backend/tests/sssp/test-input-graph/edge.csv") + +g = nx.DiGraph() +for l in data_f: + g.add_edge(*map(int, l.strip().split("\t"))) + +sssp_nodes = 0 +for i in range(1,10): + reached_map = nx.shortest_path(g, i) + sssp_nodes = sssp_nodes + len(reached_map.keys()) + for k, v in reached_map.items(): + print(f"{k} {i} {len(v)-1}") + +print(sssp_nodes) diff --git a/backend/tests/sssp/test-input-graph/edge.csv b/backend/tests/sssp/test-input-graph/edge.csv index 1d997fb7..20eddd3d 100644 --- 
a/backend/tests/sssp/test-input-graph/edge.csv +++ b/backend/tests/sssp/test-input-graph/edge.csv @@ -5,5 +5,7 @@ 4 5 5 6 6 7 +7 8 8 9 +9 1 9 10 diff --git a/examples/datalog-example b/examples/datalog-example index be103a21..87266643 160000 --- a/examples/datalog-example +++ b/examples/datalog-example @@ -1 +1 @@ -Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa +Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891 From 336e1a129bfcfc6d7fcf98a6c763e5f004d3c835 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 25 Nov 2022 02:51:13 +0000 Subject: [PATCH 15/36] add page rank test --- backend/CMakeLists.txt | 2 +- backend/src/RA/parallel_agg.cpp | 4 +- backend/src/compat.h | 1 + backend/src/relation/balanced_hash_relation.h | 8 + .../pagerank/compiled_pre/CMakeLists.txt | 28 + .../tests/pagerank/compiled_pre/pagerank.cpp | 631 ++++++++++++++++++ server_log | 12 +- 7 files changed, 677 insertions(+), 9 deletions(-) create mode 100644 backend/tests/pagerank/compiled_pre/CMakeLists.txt create mode 100644 backend/tests/pagerank/compiled_pre/pagerank.cpp diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt index 36256bd9..a348cc28 100644 --- a/backend/CMakeLists.txt +++ b/backend/CMakeLists.txt @@ -12,7 +12,7 @@ find_package(MPI REQUIRED) # set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") # endif() -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++20 -lstdc++fs -Werror=class-memaccess -fpermissive") set (source_dir "${PROJECT_SOURCE_DIR}/src") set (tests_dir "${PROJECT_SOURCE_DIR}/tests") set (data_dir "${PROJECT_SOURCE_DIR}/data") diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp index 90c07c3d..9fc5a204 100644 --- a/backend/src/RA/parallel_agg.cpp +++ b/backend/src/RA/parallel_agg.cpp @@ -80,8 +80,8 @@ void parallel_join_aggregate::local_aggregate( u32* 
output_sub_bucket_count = output->get_sub_bucket_per_bucket_count(); u32** output_sub_bucket_rank = output->get_sub_bucket_rank(); - u32 real_join_count = output->get_join_column_count() - 1; - agg_buffer.width[ra_counter] = real_join_count + 1; + u32 real_join_count = output->get_join_column_count(); + agg_buffer.width[ra_counter] = output->get_arity(); shmap_relation* agg_target; if (*(target->get_sub_bucket_per_bucket_count()) == 1) { diff --git a/backend/src/compat.h b/backend/src/compat.h index 397a5249..dbc42cf4 100644 --- a/backend/src/compat.h +++ b/backend/src/compat.h @@ -25,6 +25,7 @@ #include "btree/btree_set.h" #include #include +#include #ifdef __GNUC__ diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index cfd322ad..88d63404 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -89,6 +89,9 @@ class relation std::vector dependent_column_indices; update_partial_compare_func_t update_compare_func; + // This is only used when this relation need to be reused in another computation loop + bool init_flag = true; + public: /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL); @@ -283,4 +286,9 @@ class relation } return !is_canonical; } + + // skip initialization/loading facts + void disable_initialization() { init_flag = false; } + void enable_initialization() { init_flag = true; } + }; diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt new file mode 100644 index 00000000..44733818 --- /dev/null +++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required (VERSION 3.9) + +project (pagerank) + +add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive) + +link_libraries(stdc++fs) + +find_package(MPI REQUIRED) +# find_package(OpenMP) 
+# if (OPENMP_FOUND) +# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# endif() + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive") +# set (base_dir "${PROJECT_SOURCE_DIR}/../backend") +set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") + +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank.cpp") + +ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") + +add_executable(pagerank ${source_files_pagerank}) +INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH}) +TARGET_LINK_LIBRARIES(pagerank parallel_RA ${MPI_LIBRARIES}) diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp new file mode 100644 index 00000000..21a2aa94 --- /dev/null +++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp @@ -0,0 +1,631 @@ +// location of `parallel_RA_inc.h` here +#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" + +#include +#include 
+#include +#include +#include +#include +#include +#include + +// builtins.cpp goes here! +// builtins.cpp +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +#define u64 uint64_t +#define u32 uint32_t +using i64 = int64_t; + +const u64 tag_mask = 0xffffc00000000000; +const u64 tag_position = 46; +const u64 int_tag = 0; +const u64 str_tag = 2; +const u64 sign_flip_const = 0x0000200000000000; +const u64 signed_num_mask = 0xFFFFE00000000000; + +inline bool is_number(u64 datum) { + // cout << "is_number(" << datum << "): " << (datum >> tag_position == + // int_tag) << "\n"; + return datum >> tag_position == int_tag; +} + +inline i64 datum_to_number(u64 datum) { + i64 signed_val = + (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + if (signed_val >= sign_flip_const) { + signed_val = sign_flip_const - signed_val; + } + return signed_val; + // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - + // tag_position); +} +const auto d2n = datum_to_number; + +inline u64 number_to_datum(i64 number) { + i64 unsigned_value = number; + if (number < 0) { + unsigned_value = (-number) + sign_flip_const; + } + return (unsigned_value & ~tag_mask) | (int_tag << tag_position); + // return (number & ~tag_mask) | (int_tag << tag_position); +} + +const auto n2d = number_to_datum; + +inline u64 string_to_datum(std::string str) { + u32 str_hash = string_hash(str); + return (str_hash & ~tag_mask) | (str_tag << tag_position); +} +const auto s2d = string_to_datum; + +vector> builtin_div_rem(const u64 *const data) { + if (is_number(data[0]) && is_number(data[1])) { + auto div = number_to_datum(d2n(data[0]) / d2n(data[1])); + auto rem = number_to_datum(d2n(data[0]) % d2n(data[1])); + return {{div, rem}}; + } else { + return {}; + } +} + +#define BUILTIN_BINARY_NUMBER_PRED(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ 
+ if (is_number(data[0]) && is_number(data[1]) && \ + datum_to_number(data[0]) op datum_to_number(data[1])) { \ + return callback(init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_PRED(builtin_less, <) +BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >) +BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=) +BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=) + +#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum(datum_to_number(data[0]) \ + op datum_to_number(data[1])); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +) +BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -) +BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *) +BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /) + +#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum( \ + impl(datum_to_number(data[0]), datum_to_number(data[1]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; } +BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1) + +#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0])) { \ + auto res = number_to_datum(impl(datum_to_number(data[0]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 add1(u64 x) { return x + 1; } +inline u64 sub1(u64 x) { return x - 1; } + +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1) 
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1) + +vector> builtin_range(const u64 *const data) { + vector> res; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + res.reserve(ub - lb); + for (u64 x = lb; x < ub; x++) + res.push_back({number_to_datum(x)}); + } + return res; +} + +template +TState callback_builtin_range(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + auto state = init_state; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + for (u64 x = lb; x < ub; x++) + state = callback(number_to_datum(x), state); + } + return state; +} + +#define BUILTIN_BINARY_PRED(name, op) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (data[0] op data[1]) \ + return callback(init_state); \ + else \ + return init_state; \ + } +BUILTIN_BINARY_PRED(builtin_eq, ==) +BUILTIN_BINARY_PRED(builtin_neq, !=) + +template +TState builtin_eq_1(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + return callback(data[0], init_state); +} + +#define BUILTIN_UNARY_PRED(name, pred) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (pred(data[0])) \ + return callback(init_state); \ + else \ + return init_state; \ + } + +bool is_not_number(u64 datum) { return !is_number(datum); } +BUILTIN_UNARY_PRED(builtin_number_huh, is_number) +BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number) + +// for generate-cpp-lambda-for-computational-join +struct CL2CB_State { + void *original_callback; // There be dragons? 
+ void *original_state; + const u64 *original_data; + u64 *cl1_output_args; +}; + +// for generate-cpp-lambda-for-computational-copy +struct BCLCB_State { + void *original_callback; + void *original_state; + const u64 *original_data; +}; + +// an experiment: +template bool builtin_binary_number_pred(const u64 *data) { + if (is_number(data[0]) && is_number(data[1])) { + return f(datum_to_number(data[0]), datum_to_number(data[1])); + } else { + return false; + } +} +bool _less(u64 x, u64 y) { return x < y; } +auto builtin_less2 = builtin_binary_number_pred<_less>; + +template +inline TState builtin_nop(const u64 *data, TState init_state, + TState (*callback)(TState state)) { + return callback(init_state); +} + +// //////////////////// AGGREGATORS Alternative design //////////////////// + +// TODO: add number type check +////////////////////////////// count ///////////////////////////////////// + +local_agg_res_t +agg_count_local(std::pair + joined_range) { + local_agg_res_t cnt = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + cnt++; + } + return cnt; +} + +local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// sum ///////////////////////////////////// + +local_agg_res_t +agg_sum_local(std::pair + joined_range) { + local_agg_res_t sum_res = 0; + for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + sum_res += tuple[tuple.size() - 2]; + } + return sum_res; +} + +local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// maximum ///////////////////////////////////// + +local_agg_res_t +agg_maximum_local(std::pair + joined_range) { + local_agg_res_t max_res = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v > max_res) { + max_res = current_v; 
+ } + } + return max_res; +} + +local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x > y) { + return x; + } else { + return y; + } +} + +////////////////////////////// minimum ///////////////////////////////////// + +local_agg_res_t +agg_minimum_local(std::pair + joined_range) { + local_agg_res_t min_res = std::numeric_limits::max(); + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v < min_res) { + min_res = current_v; + } + } + return min_res; +} + +local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x < y) { + return x; + } else { + return y; + } +} + +// // end of builtins.cpp + +// global definitions: + +int max_rel = 255; +std::map rel_tag_map; +std::map> rel_index_map; + +// load all relation inside input database +void load_input_relation(std::string db_dir) { + for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { + // check if ends with table + std::string filename_ss = entry.path().filename().string(); + std::cout << "input database has file " << filename_ss << std::endl; + std::string suffix = ".table"; + int ft = filename_ss.size() - suffix.size(); + if (ft < 0) + ft = 0; + if (filename_ss.rfind(suffix) != ft) { + continue; + } + std::string filename_s = entry.path().stem().string(); + int tag = std::stoi(filename_s.substr(0, filename_s.find("."))); + std::string name_arity = filename_s.substr( + filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1); + std::string name = name_arity.substr(0, name_arity.rfind(".")); + std::string arity_s = + name_arity.substr(name_arity.rfind(".") + 1, name_arity.size()); + int arity = std::stoi(arity_s); + std::stringstream index_stream; + index_stream << name; + for (int i = 1; i <= arity; i++) { + index_stream << "__" << i; + } + if (tag > max_rel) + max_rel = tag; + std::cout << "load " << tag << "." 
<< index_stream.str() << "has arity " + << arity << std::endl; + rel_tag_map[index_stream.str()] = tag; + } +} + +int get_tag_for_rel(std::string relation_name, std::string index_str) { + std::string name_arity = relation_name + "__" + index_str; + if (rel_index_map.find(relation_name) != rel_index_map.end()) { + rel_index_map[relation_name].insert(index_str); + } else { + rel_index_map[relation_name] = {index_str}; + } + + if (rel_tag_map.find(name_arity) != rel_tag_map.end()) { + // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << + // std::endl; + return rel_tag_map[name_arity]; + } + max_rel++; + rel_tag_map[name_arity] = max_rel; + std::cout << "generate rel tag: " << name_arity << " " << max_rel + << std::endl; + return max_rel; +} + +float ALPHA = 0.85; +u64 total_node_size = 0; + +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; + if (argc == 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + } + load_input_relation(slog_input_dir); + mpi_comm mcomm; + mcomm.create(argc, argv); + + // (edge from to) + relation *rel__edge__2__1 = new relation( + 1, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", + FULL); + + // >>>>>>>>>>>>>>> compute node size + // (node x) + relation *rel__node__1__1 = new relation( + 1, true, 2, get_tag_for_rel("node", "1"), + std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) + + ".node.1.table", + FULL); + + // (total_node_cnt n) + relation *rel__total_node_cnt__1__1 = + new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"), + 
std::to_string(get_tag_for_rel("total_node_cnt", "1")) + + ".total_node_cnt.1.table", + slog_input_dir + "/" + + std::to_string(get_tag_for_rel("total_node_cnt", "1")) + + ".total_node_cnt.1.table", + FULL); + + // helper relation for non-join aggregation + relation *rel___dollorunit__1__1 = new relation( + 0, true, 1, get_tag_for_rel("$unit", "1"), + std::to_string(get_tag_for_rel("$unit", "1")) + ".$unit.1.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("$unit", "1")) + + ".$unit.1.table", + FULL); + + RAM *scc_helper_fact = new RAM(false, 0); + scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false); + scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)})); + + // [(node a) (node b) <-- (edge a b)] + RAM *scc_compute_node = new RAM(false, 1); + scc_compute_node->add_relation(rel__edge__2__1, false, false); + scc_compute_node->add_relation(rel__node__1__1, true, false); + scc_compute_node->add_rule(new parallel_copy_generate( + rel__node__1__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + return 1; + })); + scc_compute_node->add_rule(new parallel_copy_generate( + rel__node__1__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[1]; + return 1; + })); + + // (total_node_cnt {count node _}) + RAM *scc_count_nodes = new RAM(false, 2); + scc_count_nodes->add_relation(rel__node__1__1, false, false); + scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false); + scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false); + scc_count_nodes->add_rule(new parallel_join_aggregate( + rel__total_node_cnt__1__1, rel__node__1__1, rel___dollorunit__1__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {2})); + + LIE *cnt_lie = new LIE(); + cnt_lie->add_relation(rel__edge__2__1); + cnt_lie->add_relation(rel__node__1__1); + cnt_lie->add_relation(rel___dollorunit__1__1); + 
cnt_lie->add_relation(rel__total_node_cnt__1__1); + cnt_lie->add_scc(scc_helper_fact); + cnt_lie->add_scc(scc_compute_node); + cnt_lie->add_scc(scc_count_nodes); + cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); + cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); + + cnt_lie->enable_all_to_all_dump(); + cnt_lie->set_output_dir(slog_output_dir); // Write to this directory + cnt_lie->set_comm(mcomm); + cnt_lie->set_batch_size(1); + cnt_lie->execute(); + cnt_lie->print_all_relation_size(); // Continuously print relation sizes + + // only 1 data in this rel so its safe + rel__total_node_cnt__1__1->print(); + + for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { + total_node_size = t[0]; + std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl; + } + + // >>>>>>>>>>>>>>> compute page rank + std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl; + + rel__edge__2__1->disable_initialization(); + rel__node__1__1->disable_initialization(); + +// matrix edge + successor count + relation *rel__matrix__3__1 = new relation( + 1, true, 3, get_tag_for_rel("matrix", "1"), + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); + + relation *rel__rank__3__1 = new relation( + 1, true, 3, get_tag_for_rel("rank", "1"), + std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", + std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL); + + rel__rank__3__1->set_dependent_column_update( + {2, 3, 4}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + if (new_v[0] != old_v[0]) { + return std::nullopt; + } else { + // monotonic + assert(new_v[1] > old_v[1]); + return new_v[1] > old_v[1]; + } + }); + + relation *rel__result__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("result", "1__2"), + std::to_string(get_tag_for_rel("result", "1__2")) + 
".result.2.table", + std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", FULL); + + // + + RAM* scc_compute_matrix = new RAM(false, 0); + scc_compute_matrix->add_relation(rel__edge__2__1, false, false); + scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); + scc_compute_matrix->add_rule( + new parallel_join_aggregate( + rel__matrix__3__1, rel__edge__2__1, rel__node__1__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {1, 2, 3})); + + RAM* scc_page_rank = new RAM(true, 1); + scc_page_rank->add_relation(rel__matrix__3__1, false, false); + scc_page_rank->add_relation(rel__rank__3__1, true, false); + scc_page_rank->add_rule(new parallel_copy_generate( + rel__rank__3__1, rel__matrix__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + output[1] = data[0]; + float init_pg_v = (1 - ALPHA) / total_node_size; + output[1] = *reinterpret_cast(&init_pg_v); + return 1; + } + )); + parallel_join* rank_join = new parallel_join( + rel__rank__3__1, + rel__matrix__3__1, FULL, + rel__rank__3__1, DELTA, + {3,1,2} // useless + ); + rank_join->set_generator_func( + [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { + float pg_sum = 0.0; + for (auto& tv: target_vs) { + u32 raw_succ_pg_v = (u32)d2n(tv[3]); // all columns are u64, cast to u32 first + auto succ_pg_v = *reinterpret_cast(raw_succ_pg_v); + pg_sum += ALPHA * succ_pg_v / d2n(input_v[3]); + } + // u64 encoded_sum = + std::vector res_tuple(3, 0); + res_tuple[0] = input_v[1]; + res_tuple[1] = input_v[0]; + res_tuple[2] = *reinterpret_cast(&pg_sum); + res_set.push_back(res_tuple); + return true; + } + ); + scc_page_rank->add_rule(rank_join); + + RAM *scc_result = new RAM(false, 2); + scc_result->add_relation(rel__rank__3__1, false, false); + scc_result->add_relation(rel__result__2__1__2, true, false); + scc_result->add_relation(rel__node__1__1, false, false); + 
scc_result->add_rule(new parallel_join_aggregate( + rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, + agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, + {0, 2} + )); + + + LIE *pg_lie = new LIE(); + pg_lie->add_relation(rel__edge__2__1); + pg_lie->add_relation(rel__matrix__3__1); + pg_lie->add_relation(rel__node__1__1); + pg_lie->add_relation(rel__rank__3__1); + pg_lie->add_relation(rel__result__2__1__2); + pg_lie->add_scc(scc_compute_matrix); + pg_lie->add_scc(scc_page_rank); + pg_lie->add_scc(scc_result); + pg_lie->add_scc_dependance(scc_compute_matrix, scc_page_rank); + pg_lie->add_scc_dependance(scc_page_rank, scc_result); + + // Enable IO + pg_lie->enable_all_to_all_dump(); + pg_lie->enable_data_IO(); + pg_lie->enable_IO(); + // lie->enable_share_io(); + pg_lie->set_output_dir(slog_output_dir); // Write to this directory + pg_lie->set_comm(mcomm); + pg_lie->set_batch_size(1); + pg_lie->execute(); + pg_lie->print_all_relation_size(); // Continuously print relation sizes + // lie->stat_intermediate(); + + // print all variants(non-canonical index of each relation) +// if (mcomm.get_rank() == 0) { +// std::cout << "rel_name" +// << ",\t" +// << "indices\n"; +// for (auto const &rel_p : rel_index_map) { +// std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; +// } +// std::cout << std::endl; +// } + + + delete pg_lie; + + // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + mcomm.destroy(); + + return 0; +} diff --git a/server_log b/server_log index 9c9807f1..3024eca0 100644 --- a/server_log +++ b/server_log @@ -1,8 +1,8 @@ -[I 2022-08-22 16:33:53] concurrency model: async -[I 2022-08-22 16:33:53] masquerade (NAT) address: None -[I 2022-08-22 16:33:53] passive ports: None -[I 2022-08-22 16:33:53] >>> starting FTP server on 0.0.0.0:2121, pid=160658 <<< +[I 2022-11-24 17:09:37] concurrency model: async +[I 2022-11-24 17:09:37] masquerade (NAT) address: None +[I 2022-11-24 17:09:37] passive ports: None +[I 2022-11-24 17:09:37] 
>>> starting FTP server on 0.0.0.0:2121, pid=1188430 <<< user break context...: - /home/stargazermiao/workspace/PL/slog/compiler/slog-process.rkt:45:0: loop - body of "/home/stargazermiao/workspace/PL/slog/compiler/slog-process.rkt" + /home/ubuntu/workspace/slog/compiler/slog-process.rkt:45:0: loop + body of "/home/ubuntu/workspace/slog/compiler/slog-process.rkt" From a3efb3a8a41a59a10e57dccd40f4f9897f013547 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 27 Nov 2022 19:55:37 +0000 Subject: [PATCH 16/36] fix dependent index --- backend/src/RA/parallel_agg.cpp | 13 +- backend/src/RAM/RA_tasks.cpp | 4 +- backend/src/lie/lie.cpp | 13 +- .../src/relation/balanced_hash_relation.cpp | 64 + backend/src/relation/balanced_hash_relation.h | 4 + backend/src/relation/shmap_relation_exp.cpp | 53 +- .../pagerank/compiled_pre/in/$strings.csv | 0 .../pagerank/compiled_pre/in/258.edge.2.table | Bin 0 -> 37392 bytes .../tests/pagerank/compiled_pre/pagerank.cpp | 242 ++- backend/tests/pagerank/ground_truth | 60 + backend/tests/pagerank/pagerank.py | 19 + backend/tests/pagerank/test-graph/edge.fasts | 1558 +++++++++++++++++ 12 files changed, 1947 insertions(+), 83 deletions(-) create mode 100644 backend/tests/pagerank/compiled_pre/in/$strings.csv create mode 100644 backend/tests/pagerank/compiled_pre/in/258.edge.2.table create mode 100644 backend/tests/pagerank/ground_truth create mode 100644 backend/tests/pagerank/pagerank.py create mode 100644 backend/tests/pagerank/test-graph/edge.fasts diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp index 9fc5a204..fc42c114 100644 --- a/backend/src/RA/parallel_agg.cpp +++ b/backend/src/RA/parallel_agg.cpp @@ -93,11 +93,10 @@ void parallel_join_aggregate::local_aggregate( agg_target->insert_tuple_from_array(input0_buffer+k1, target->get_arity()+1); } } - btree::btree_map, u64, shmap_relation::t_comparator> res_map; for (u32 bucket=0; bucket < buckets; bucket ++) { for (auto tuple: input->get_full()[bucket]) { - 
std::vector data_v(tuple.begin(), tuple.begin()+target->get_join_column_count()); + std::vector data_v(tuple.begin(), tuple.begin()+input->get_join_column_count()); // std::cout << "On rank " << mcomm.get_rank() << " bucket " << *(target->get_sub_bucket_per_bucket_count()) << std::endl; auto joined_range = agg_target->prefix_range(data_v); auto agg_data = local_func(joined_range); @@ -110,20 +109,28 @@ void parallel_join_aggregate::local_aggregate( } } } + + // std::cout << ">>>>>>>>>>>>>>>>>>>>> " << input->get_full()[0].size() << std::endl; for (u32 bucket=0; bucket < buckets; bucket ++) { for (auto input_tuple: input->get_full()[bucket]) { std::vector joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count()); auto agg_res = res_map[joined_input_tuple]; - std::vector tuple(reorder_mapping.size(), 0); + std::vector tuple(output->get_arity(), 0); int reorder_agg_index = input->get_arity() + 1; for (long unsigned int j = 0; j < reorder_mapping.size(); j++) { + // std::cout << reorder_mapping[j] << " " << reorder_agg_index << std::endl; if (reorder_mapping[j] == reorder_agg_index) { tuple[j] = agg_res; } else { tuple[j] = input_tuple[reorder_mapping[j]]; } } + // std::cout << "aggregated tuple <<<" << reorder_mapping.size() << " >>> "; + // for (auto c: tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; uint64_t bucket_id = tuple_hash(tuple.data(), output->get_join_column_count()) % buckets; uint64_t sub_bucket_id = 0; diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 2712e343..7bbd4a71 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -1129,7 +1129,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& intra_bucket_comm_execute(); auto intra_end = MPI_Wtime(); - std::cout << std::setiosflags(std::ios::fixed); + // std::cout << std::setiosflags(std::ios::fixed); bool local_join_status = false; while (local_join_status == false) 
{ @@ -1262,7 +1262,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s // std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl; #endif - std::cout << std::setiosflags(std::ios::fixed); + // std::cout << std::setiosflags(std::ios::fixed); auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); auto intra_end = MPI_Wtime(); diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 59a0ec57..674a2d92 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -305,12 +305,13 @@ bool LIE::execute () /// Initialize all relations for (u32 i = 0 ; i < lie_relations.size(); i++) { - lie_relations[i]->set_restart_flag(restart_flag); - lie_relations[i]->set_share_io(share_io); - lie_relations[i]->set_separate_io(separate_io); - lie_relations[i]->set_offset_io(offset_io); - lie_relations[i]->initialize_relation(mcomm, intern_map); - + if (lie_relations[i]->need_init_huh()) { + lie_relations[i]->set_restart_flag(restart_flag); + lie_relations[i]->set_share_io(share_io); + lie_relations[i]->set_separate_io(separate_io); + lie_relations[i]->set_offset_io(offset_io); + lie_relations[i]->initialize_relation(mcomm, intern_map); + } #if DEBUG_OUTPUT //lie_relations[i]->print(); #endif diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 7182fc24..9fcfe747 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -488,6 +488,70 @@ void relation::print() // } } +void relation::print(tuple_formator_t ft) +{ + u32 buckets = get_bucket_count(); +// if (mcomm.get_rank() == 0) +// { + vector_buffer *vb_full = new vector_buffer[buckets]; + for (u32 i=0; i < buckets; i++) + { + vb_full[i].vector_buffer_create_empty(); + std::vector prefix = {}; + full[i].as_vector_buffer_recursive(&(vb_full[i]), prefix); + + if (vb_full[i].size != 0) + std::cout << 
get_debug_id() << " " << mcomm.get_rank() << " FULL Rows " << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl; + for (u32 j=0; j < vb_full[i].size/sizeof(u64); j = j + arity+1) + { + if (j % (arity+1) == 0) { + std::cout << "F [" << j/(arity+1) << "] "; + + } + std::vector cur_tuple; + for (u32 k = 0; k < arity+1; k++) + { + u64 temp; + memcpy(&temp, (vb_full[i].buffer) + (j + k)*sizeof(u64), sizeof(u64)); + // std::cout << temp << " "; + cur_tuple.push_back(temp); + } + ft(cur_tuple); + } + + vb_full[i].vector_buffer_free(); + } + delete[] vb_full; + + + // vector_buffer *vb_delta = new vector_buffer[buckets]; + // for (u32 i=0; i < buckets; i++) + // { + // vb_delta[i].vector_buffer_create_empty(); + // std::vector prefix = {}; + // delta[i].as_vector_buffer_recursive(&(vb_delta[i]), prefix); + + // if (vb_delta[i].size != 0) + // std::cout << get_debug_id() << " " << mcomm.get_rank() << " DELTA Rows " << vb_delta[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl; + + // for (u32 j=0; j < vb_delta[i].size/sizeof(u64); j = j + arity+1) + // { + // if (j % (arity+1) == 0) + // std::cout << "D "; + + // for (u32 k = 0; k < arity+1; k++) + // { + // u64 temp; + // memcpy(&temp, (vb_delta[i].buffer) + (j + k)*sizeof(u64), sizeof(u64)); + // std::cout << temp << " "; + // } + // std::cout << std::endl; + // } + + // vb_delta[i].vector_buffer_free(); + // } + // delete[] vb_delta; +} #if 0 void relation::flush_full() diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index 88d63404..0757011d 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -20,6 +20,8 @@ enum {DELTA=0, FULL, FULL_AND_DELTA}; enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE}; enum {STATIC=0, DYNAMIC}; +using tuple_formator_t = std::function&)>; + // this is update function for 
column has functional dependence // the size of vector arguments must have exactly same size as dependent_column_indices @@ -223,6 +225,7 @@ class relation /// print all tuples of newt, delta and full void print(); + void print(tuple_formator_t ft); void serial_IO(std::string filename_template); @@ -290,5 +293,6 @@ class relation // skip initialization/loading facts void disable_initialization() { init_flag = false; } void enable_initialization() { init_flag = true; } + bool need_init_huh() { return init_flag; } }; diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 9ceff570..6d2ea852 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -65,29 +65,50 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) // update // iterator need_delete = ind.end(); std::vector need_deletes; + bool joined = false; for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) { auto cur_tuple = *it; - // std::cout << "comparing <<<<<< "; - // for (auto c: cur_tuple) { - // std::cout << c << " "; + // if (tp[0] == 59 && tp[1] == 58) { + // std::cout << "tppppp <<<<<< "; + // for (auto c: cur_tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; // } - // std::cout << std::endl; + std::vector old_t; for (auto i: dependent_column_indices) { old_t.push_back(cur_tuple[i]); } auto compare_res = update_compare_func(old_t, dependent_columns, tp); - if (compare_res.has_value() && compare_res.value()) { - need_deletes.push_back(it); - // std::cout << "update with <<<<<< "; - // for (auto c: tp) { - // std::cout << c << " "; + if (!compare_res.has_value()) { + continue; + } + if (compare_res.value()) { + need_deletes.push_back(it); + // if (tp[0] == 59 && tp[1] == 58) { + // for (auto c: cur_tuple) { + // std::cout << c << " "; + // } + // std::cout << "update with " << compare_res.value() <<" <<<<<< "; + // for (auto c: tp) { + // 
std::cout << c << " "; + // } + // std::cout << std::endl; // } - // std::cout << std::endl; } + joined = true; + } + if (!joined) { + return insert(tp); } if (!need_deletes.empty()) { for (auto d: need_deletes) { + // std::cout << "delete >>>> "; + // for (auto c: *d) { + // std::cout << c << " "; + // } + // std::cout << std::endl; ind.erase(*d); } return insert(tp); @@ -563,12 +584,22 @@ void shmap_relation::as_all_to_allv_left_join_buffer( if (generator_mode) { std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); + // std::cout << "Input >>>>>> "; + // for (auto c: input_t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; std::vector> eq_tuple_set; std::vector> generated_tuple_set; std::vector prev_non_dependent_columns; for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){ auto cur_path = *it; - std::vector cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size()); + std::vector cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+cur_path.size()-dependent_column_indices.size()); + // std::cout << " cur prefix >>>>>>> "; + // for (auto c: cur_path) { + // std::cout << c << " "; + // } + // std::cout << std::endl; if (cur_non_dependent_columns == prev_non_dependent_columns) { eq_tuple_set.push_back(cur_path); continue; diff --git a/backend/tests/pagerank/compiled_pre/in/$strings.csv b/backend/tests/pagerank/compiled_pre/in/$strings.csv new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/pagerank/compiled_pre/in/258.edge.2.table b/backend/tests/pagerank/compiled_pre/in/258.edge.2.table new file mode 100644 index 0000000000000000000000000000000000000000..dab8cf6b7808db7f2375a2e0273c5839937d6f20 GIT binary patch literal 37392 zcmZAAcf61F!^iQjz4zV+$EFY^Dk>@sp-4tXs7Pg`BCDc|wgwFuC9v(iO?o{gBN2c&O{xq`L|9+J@d=+0$c$VPt z^@L{)9$!y*w&3yggl7*PUr%_B;PLf@=L{ZSPk652@%4n~4jx}mc%I<#^@QgQ9$!y* 
zzTology#<)Ur%^};PLf@7YrU>Pk5o=@%4lk4jx}mc#+`o^@JA<9$!y*vEcFbgclDU zUr%_6;PLf@mkb_XPk5=|@%4n44jx}mc$whw^@NuV9$!y*x#02jgqIHj*3$JY~HCwP24;pYX9uP6Nc;PLf@*9{(DPk6oH@%4n)4<27n z_yxh^>j`fVJiearhQZ_O3BNFSd_Cb81&^;MyixG@dcrRb9$!y*PV zwsOwpCOPMFvz&8jC+A$+%Q=@@OXaCW1`Y=Y${*TMq|B2w)qtA(tvC-EVJLBv;PR`EbX{H&avpOdrm^Ky2cD5o1Q$l3WtIURT@c#i0E;^XD$>x`XocAg|>=gD$*enrmC zugcl^H932}E@#gva`t>f&Yn}{>^V)&p3~*@WQLrb-;{Gdza^(9GlS=hJ|{ljj=s*= zA7}qra`vAsXa9HP>_11&{&VHr=kw&8%X~TKvOvzcyesEi7Rou7MRI!bo}B&Pm$UN+ z!E;5Q6CWQ&UuW!$v-3xCc3v!J=OuD>UMgqjWpZ}@SkBI$$k}&R>{~9^>|5`ccvMzY;=yT#@ee`w4x!|13XL8PEgPi?0%GrODoc%Y;*?)_i z{kO{5f18~9`Exn@e<5f8FXim~m7JZo%h`E{oc@0uJWupF@$pUcb;kZU`|p&q|F?3U zi(PWgWw)ID_sH3Qublh&J2^XlFK5pm<^GBbP zADJ?Kk+CPvo|)wAnOV-BS>)`QRnDH-$KP z`Q+T^`Q_Z_1>~GpK{@w-Avxz*ICz2N8D@N4M9w)Dm2-~8V|h6{SCF%Fnw*_01}~_cE6LfpvYegM{d~oo%eVWSIrxlYdh*UQ=e208n;lC%Gfa&~Sl z=ecepXaBZxcD_l@&Ns{1vz?s#xxJh{Z;`WShu}rE=dE(?^Nw=%>?CKm&T{VCE^_v~ zP0pTO<=nU3j-vPtN_>U(WMBK+gUH^wxy&O_zw{IHx343o3-a5+1VkhAkhIXjP%v-2atOK8tWjAXJs+2I zKR+R7=dp5j9w%q#@p5*aAg3Eo%DMlal5;Lk%Q=^4_0`${%^?Hf2y4Qr^$IP zrUx&j{b$J8|4li2z9nb3nR5Pp-03`Tft8bN|efbN|ej z^Y6Pr&U5#!oO53&=NuQwImh?p?DM{y9X<$NMmu~cXNQmE?66qQ4ol?huvE@-v`o(P z^0A!fFMtdO(YN;&t>DmlA-8oaD_TPe5p+vMEOpUc_*3pxGyQqH-26}+6zWxJen*&*j#zLs+? 
z-^kg2r=0!2m9zgYIs5OHv-2J~d+wF9=XY}M^Y7*K;RiYU{}{Zy_WViCp8MqN`Lmoo zf01*a@0YXl0XaJ#l(Xj{IrsCga(4br&d$Hf+4&DSJO3$X=fC9K&wtCgpZ^J7LHi$; zv;V(x_CF$L|D$sD|4+{T$K>pPT+V%dLe9CIlyfep+d@1?B8nNY0*x{(3Cp2g+tSwhZzTT;%>rR3~fTF!l2 zM$Z0a`)bh|?EO4>h7&i)nU>|aUF{*~qIpDt(rDsuL(Drf&{a`vw-Xa5>< z_OB^t|5|ePuPtZ)I&wO5o}6z^l5;Md8IhVe2&gE`7=h9Ek z^Ky@z^SW2gdEF=HT>8s7mjS`6>0AcNIhR3l&gFhN=Q3E%xjZ1}TppBjE)U5$mmzZY zA1Y_hhvn=xEO>S8FkH?KBjoHbQqB&eoE;vO^Y8nZocnOJoE^r8Y zD$f>tnml{(>GB-GXUO^Q^xu^8-<7{5&mI0gGv#@Lzb(%je3m?4@Y(YG!QYV=2tG$% zF!)?~q2Tl6g@ezR7Y)8ZUM%>#^5Vf4%1Z=aBrh5KJ$b3%@5@UE|3F?Q_=j@-ockj= zf6lsC&Yu@9k@M#+OXX?dpSw)XpC5cIuN40JPvrbuwOr2o{t7wop)2LQ2d$FxefU#3 z-FydiU0o}utLx-+b-kRfekP}@8{~9#qnxg8lGD}Aa=N-jPFJ_e>FPE)UHx26 zSHFFqCadb?jvZx6`n?Lm2wbH|c8B&WB( z%IWQIa(er_oZkK+r>lR;>FQr{y85@AuKpvZtB2)u^FNnNT|FtMtEc32^|YL>o{{t3bXHDRn)~bhU(>u9lS3)lzc$Sz1m%%gE_xSvmbIC#Rp~ z<#e)woKB|6>10JY@2i#M^s};@ex}RmXB9d9tSYCU)#UWEx}1L2kkikaa{5_IPCskQ z)8e}=m8v7BpXbRdg}?rMdF9}B<@B?joPO4q)6Wa!^s|ASem0cT&kN=B^CCIDY$T_j z7t85qV>$h7BB!61$m!>$a{76hoPIWy)6Zsd`gysWeqJG`pI6H1XLC9IY$2zgSIOz; z)xp^}+&-ymd8?dGc9he}PI5ZgSxzUr$oaYcHaY$5DyN^_?fy_ z_sHqwy>i}z?vvBW{&G4wKu#wI%IV}FIi0*;PA3P;>Er`)I{BcSPCg{3lSAZma;Th6 zJ}jq)!-BI{xJ^>S<@9iboF0yp)4fr0y7!2j?ma4}dymQK-e@`98zZNCkIQ*)d_qq5 z#>(m5I62)LFX!j(337V)q?{f;C8vi^%Xy!CMouT6mD9uLE0AM-Frh$_om9}-ZVMgn=Yq& zGvsvdO*!v1Z^`N2OgY_qTTb_8$?4u~Io*3lPWR@>>E2v9-J2(;d-LUVZ-Jcdy({Pa zaG{(YE(*?G^zc17J$zqI4?mF8!w=>3@FO`rTr8)1OXPHKshsXDlk>j!v78=$BBzJT z<@9icoF1-})5BGAdibfF9EY+{%Hf9oLQW6Al+(knZ;zbr?UmEL@8opvdpX_vK~DF6l+(SRE3=h-8&GR zUFhCHIo&%Xr+dH3qhCfJzsc$0?{a$hhnyb%Dd+w0FFBq3TTUnck<-b;ayt31obDZw z)4ii|y7!-)z8#bEzIR+s_fE*^-bp##J0+)kr{#3-jGXttvvRtZiVk#SvlRCa-OD7W zZ<*z3(M^)^kws45vdZaOHhKEFzfCH;oQ~y?)3Ka#I+jb$dtGiheaj=KZ+YePEuWme z<(Jd90&@CRP|o{HAvt|3ET?ZpqmE?4+vYd{k%jsAZIUTDi=e@3)ocFrw za{5+7PTy+E>02#19jh&;V|C^wOgJ6}%6>dNU@Jvkk#FQ;P{$mv)EIqz=`<@D`B zIUT!5PRAO_E1$cqQWwk9gEyAbw
c8Q#hT`H$zm&xf^Q#l=LCZ}VU%jwt^ayoXU zoQ^e@^L?g;oQ_>3r(;*k>DV=LI@VH7$F7yrvFqe??0PvJyFpIJTFL3yjluc%47Wq7 zwVaN%k<+oZayoXCoQ~Zrr(^BpbgaFcj@=@sV;$sl>{dA)>nNvVo#b?^vz(4~k@Iut zZF1hzy2|NWH#xoPE~i&L~SOmz*y3meZv>Y>D53vy&5E^SNF^5)nGZjdO*(m$b)iv z^^lxi4UyBUp>lflu$*2Elhdo=a(XpFPOnDF>D8#<{QJD5PadbLSH!r%PMqbZMKM{(LT{KVQh{&zExg^OcQ)1SR^`tzNf{(LW|KR?Ln&yT_R{e@#s z{UoP9`{eZJXF2`(MNUul%jwAhIqx|K<@Dr`oNoLorw_l$>A>%D-e>-h(}zFh^x-c# z?>T?V>Bc{DeqK8)rzii)>B$i}Kd&8?)06+?ycZpl^ImjZo)&JD6LR`(gS zPC1>)C8smF<-AAak<*#HaypYwPG|DV=}ZARohc}%Glhcl`v}LLDlDfnMdWm*sGQCe zlSj8l^if<+XG+NFOi4MNDJAE&>l#%n^QdUlv%E{?Wc{x3)Ag3p3a(YrxPERVy z=}BcdKWC-O=}8qiJ*g_EC)MO>@oA+})#dc0hMb<%l=D7QOI{`X{cFqVOdUBrIZsYc z&X?1Zx^j9_Pfkzj%jwAla(dE0PEQ)j>B)t1dUBDRo-~rvlZ)l_wXvL@G?CMjOXT$A zQaL@jOioXl2Irp_jyKgzPERhE(~~RY(J!NqE9LxL(_BtBTFB|ZRdSy9tL40hTqCCg zE#R9JgPabulGA}3<-BjSmeYYYayrmf&d)VB$$8#ymh-%~lc%3M z##DQGmEgC?=|BfL?;E$u`SX^Ja=Ot;PB%Ks=|&eh-MCFoH@eE{MmIU#=q{%lJ>+!b zb~)YXDW@B~TWq5=qKmB;vPBg75B=c zUq&DI$@zJwznu4w0dhJsP)=tC$$3AyUrv7p%jwSpa(>=BHl4`tXFD zK8%&qhjDWHFkVg{Cdlc-lXCj-l$<_1EvFC9$mzqga{BO`oIX4+rwSa0a50m8dVX~Y)ydsZ&8GXDervtCadEQ@_^Sn=y^Yh3Xa(?cZDyIX} zp%jw2Da-R1&a-QqCa-QpX z@+x7U`EuSL7RY(t-<9*cFO>7VFOu`TzbEH;e_zh?{(+q5{X;p=`$uw~_r-Fa_a$1ALKmmKgxODf0FaO@00Vq|19Tu|3%L8zF*Gsen8Iieo)Tyen`&u^IzpW@4p4- z*F5jP%lW?lhnx=lDd&CQFFEi3f6IBU|B>_k`mmhm_g^{B?-4oA=}|e)>3?#b(_?a; z)8lg9|4+#IK7LZpd;Tdo-``Kmd9KgMd9KgOdH+vEgO6;M;(U32Gs&xjgU>9l8a#`f z=Q^vL=Qo?2=Qq2Y=QoF(@8db;JlDD8JlDD9JlA>TJlA>UJlFZ;JlFZ{Qm$w3|%<@ literal 0 HcmV?d00001 diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp index 21a2aa94..34214be3 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp @@ -1,8 +1,8 @@ // location of `parallel_RA_inc.h` here #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" -#include #include +#include #include #include #include @@ -261,7 +261,8 @@ local_agg_res_t agg_sum_local(std::pair 
joined_range) { local_agg_res_t sum_res = 0; - for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) { + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { auto tuple = (*it); sum_res += tuple[tuple.size() - 2]; } @@ -272,6 +273,33 @@ local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { return x + y; } +local_agg_res_t agg_sum_float_local( + std::pair + joined_range) { + float sum_res = 0.0; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + u32 agg_column_raw = tuple[tuple.size() - 2]; + + sum_res += *reinterpret_cast(&agg_column_raw); + } + // std::cout << ">>>>>>> " << sum_res << " " << + // *reinterpret_cast(&sum_res) << std::endl; + u32 sum_res_encoded = *reinterpret_cast(&sum_res); + return sum_res_encoded; +} + +local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw, + local_agg_res_t y_raw) { + float x = *reinterpret_cast(&x_raw); + float y = *reinterpret_cast(&y_raw); + float res = x + y; + // std::cout << res << std::endl; + u32 res_encoded = *reinterpret_cast(&res); + return res_encoded; +} + ////////////////////////////// maximum ///////////////////////////////////// local_agg_res_t @@ -409,7 +437,7 @@ int main(int argc, char **argv) { // >>>>>>>>>>>>>>> compute node size // (node x) relation *rel__node__1__1 = new relation( - 1, true, 2, get_tag_for_rel("node", "1"), + 1, true, 1, get_tag_for_rel("node", "1"), std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table", slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table", @@ -496,7 +524,12 @@ int main(int argc, char **argv) { rel__edge__2__1->disable_initialization(); rel__node__1__1->disable_initialization(); -// matrix edge + successor count + relation *rel__edge__2__2 = new relation( + 1, false, 2, get_tag_for_rel("edge", "2"), + std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", + 
std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); + + // matrix edge + successor count relation *rel__matrix__3__1 = new relation( 1, true, 3, get_tag_for_rel("matrix", "1"), std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", @@ -508,93 +541,167 @@ int main(int argc, char **argv) { std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL); rel__rank__3__1->set_dependent_column_update( - {2, 3, 4}, + {1, 2, 3}, [](const std::vector &old_v, const std::vector &new_v, const vector &nt) -> std::optional { + // if (nt[0] == 59 && nt[1] == 58) { + // std::cout << "dependent column size " << new_v.size() << std::endl; + // std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << " + // comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2] + // << std::endl; + // } if (new_v[0] != old_v[0]) { + // std::cout << " www >>>>>>>>>" << std::endl; return std::nullopt; } else { // monotonic - assert(new_v[1] > old_v[1]); + // assert(new_v[1] > old_v[1]); + // u32 new_sum_raw = new_v[1]; + // u32 old_sum_raw = old_v[1]; + // float new_sum = *reinterpret_cast(&new_sum_raw); + // float old_sum = *reinterpret_cast(&old_sum_raw); + // if (new_sum > old_sum) { + // std::cout << "new >> " << new_sum << " old >> " << old_sum << + // std::endl; + // } + // return new_sum > old_sum; return new_v[1] > old_v[1]; + // return true; } }); relation *rel__result__2__1__2 = new relation( 2, true, 2, get_tag_for_rel("result", "1__2"), std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", - std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", FULL); + std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", + FULL); - // + // - RAM* scc_compute_matrix = new RAM(false, 0); + RAM *scc_copy_edge = new RAM(false, 0); + scc_copy_edge->add_relation(rel__edge__2__1, false, false); + scc_copy_edge->add_relation(rel__edge__2__2, true, false); + scc_copy_edge->add_rule( + new 
parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); + + RAM *scc_compute_matrix = new RAM(false, 1); scc_compute_matrix->add_relation(rel__edge__2__1, false, false); + scc_compute_matrix->add_relation(rel__edge__2__2, false, false); scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); - scc_compute_matrix->add_rule( - new parallel_join_aggregate( - rel__matrix__3__1, rel__edge__2__1, rel__node__1__1, FULL, + scc_compute_matrix->add_rule(new parallel_join_aggregate( + rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL, agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, - {1, 2, 3})); + {0, 1, 3})); - RAM* scc_page_rank = new RAM(true, 1); - scc_page_rank->add_relation(rel__matrix__3__1, false, false); - scc_page_rank->add_relation(rel__rank__3__1, true, false); - scc_page_rank->add_rule(new parallel_copy_generate( - rel__rank__3__1, rel__matrix__3__1, FULL, - [](const u64 *const data, u64 *const output) -> int { + RAM *scc_init = new RAM(false, 2); + scc_init->add_relation(rel__matrix__3__1, false, false); + scc_init->add_relation(rel__rank__3__1, true, false); + scc_init->add_rule(new parallel_copy_generate( + rel__rank__3__1, rel__matrix__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { output[0] = data[0]; output[1] = data[0]; - float init_pg_v = (1 - ALPHA) / total_node_size; - output[1] = *reinterpret_cast(&init_pg_v); + // float init_pg_v = (1 - ALPHA) / total_node_size; + u64 init_pg_v = (u64)(((1 - ALPHA) / total_node_size) * 100000); + // std::cout << init_pg_v << std::endl; + // output[2] = *reinterpret_cast(&init_pg_v); + output[2] = init_pg_v; return 1; + })); + + RAM *scc_page_rank = new RAM(true, 3); + scc_page_rank->add_relation(rel__matrix__3__1, false, false); + scc_page_rank->add_relation(rel__rank__3__1, true, false); + parallel_join *rank_join = + new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, + rel__rank__3__1, DELTA, {3, 1, 2} // useless + ); + 
rank_join->set_generator_func([](const depend_val_t &target_vs, + const std::vector &input_v, + depend_val_t &res_set) -> bool { + // float pg_sum = 0.0; + u64 pg_sum = 0; + + int count = 0; + for (auto &tv : target_vs) { + // std::cout << "tagret v >>>>> "; + // for (auto c: tv) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first + // std::cout << ">>>>>>>>>>>>>>> " << + // *reinterpret_cast(&raw_succ_pg_v) << std::endl; + // auto succ_pg_v = *reinterpret_cast(&raw_succ_pg_v); + // if(succ_pg_v == 0) { + // // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl; + // std::cout << "tagret v >>>>> "; + // for (auto c: tv) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } + if (input_v[2] != 0) { + // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]); + pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]); + // if (input_v[1] == 51) { + // std::cout << "Sum 51 " << input_v[0] << " with "; + // for (auto c: tv) { + // std::cout << c << " "; + // } + // std::cout << " result " << pg_sum << std::endl; + // } + } + count++; } - )); - parallel_join* rank_join = new parallel_join( - rel__rank__3__1, - rel__matrix__3__1, FULL, - rel__rank__3__1, DELTA, - {3,1,2} // useless - ); - rank_join->set_generator_func( - [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { - float pg_sum = 0.0; - for (auto& tv: target_vs) { - u32 raw_succ_pg_v = (u32)d2n(tv[3]); // all columns are u64, cast to u32 first - auto succ_pg_v = *reinterpret_cast(raw_succ_pg_v); - pg_sum += ALPHA * succ_pg_v / d2n(input_v[3]); - } - // u64 encoded_sum = - std::vector res_tuple(3, 0); - res_tuple[0] = input_v[1]; - res_tuple[1] = input_v[0]; - res_tuple[2] = *reinterpret_cast(&pg_sum); - res_set.push_back(res_tuple); - return true; + if (pg_sum == 0) { + return false; } - ); + if (count == 0) { + return false; + } + std::vector res_tuple(3, 0); + 
res_tuple[0] = input_v[1]; + res_tuple[1] = input_v[0]; + // res_tuple[2] = *reinterpret_cast(&pg_sum); + res_tuple[2] = pg_sum; + // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl; + // for (auto c: res_tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + res_set.push_back(res_tuple); + return true; + }); scc_page_rank->add_rule(rank_join); - RAM *scc_result = new RAM(false, 2); + RAM *scc_result = new RAM(false, 4); scc_result->add_relation(rel__rank__3__1, false, false); scc_result->add_relation(rel__result__2__1__2, true, false); scc_result->add_relation(rel__node__1__1, false, false); + // scc_result->add_rule(new parallel_join_aggregate( + // rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, + // agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce, + // nullptr, {0, 2})); scc_result->add_rule(new parallel_join_aggregate( - rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, - {0, 2} - )); + rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, + agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2})); - LIE *pg_lie = new LIE(); pg_lie->add_relation(rel__edge__2__1); pg_lie->add_relation(rel__matrix__3__1); pg_lie->add_relation(rel__node__1__1); + pg_lie->add_relation(rel__edge__2__2); pg_lie->add_relation(rel__rank__3__1); pg_lie->add_relation(rel__result__2__1__2); + pg_lie->add_scc(scc_copy_edge); pg_lie->add_scc(scc_compute_matrix); + pg_lie->add_scc(scc_init); pg_lie->add_scc(scc_page_rank); pg_lie->add_scc(scc_result); - pg_lie->add_scc_dependance(scc_compute_matrix, scc_page_rank); + pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); + pg_lie->add_scc_dependance(scc_compute_matrix, scc_init); + pg_lie->add_scc_dependance(scc_init, scc_page_rank); pg_lie->add_scc_dependance(scc_page_rank, scc_result); // Enable IO @@ -608,18 +715,31 @@ int main(int argc, char **argv) { 
pg_lie->execute(); pg_lie->print_all_relation_size(); // Continuously print relation sizes // lie->stat_intermediate(); + // rel__matrix__3__1->print(); + // rel__rank__3__1->print( + // [](const std::vector& tp){ + // u32 pg_v = tp[2]; + // // std::cout << tp[0] << " " << tp[1] << " " << + // *reinterpret_cast(&pg_v) << std::cout << tp[0] << " " << tp[1] + // << " " << pg_v << std::endl; + // } + // ); + rel__result__2__1__2->print([](const std::vector &tp) { + u32 pg_v = tp[1]; + // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << + std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl; + }); // print all variants(non-canonical index of each relation) -// if (mcomm.get_rank() == 0) { -// std::cout << "rel_name" -// << ",\t" -// << "indices\n"; -// for (auto const &rel_p : rel_index_map) { -// std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; -// } -// std::cout << std::endl; -// } - + // if (mcomm.get_rank() == 0) { + // std::cout << "rel_name" + // << ",\t" + // << "indices\n"; + // for (auto const &rel_p : rel_index_map) { + // std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; + // } + // std::cout << std::endl; + // } delete pg_lie; diff --git a/backend/tests/pagerank/ground_truth b/backend/tests/pagerank/ground_truth new file mode 100644 index 00000000..e896aa19 --- /dev/null +++ b/backend/tests/pagerank/ground_truth @@ -0,0 +1,60 @@ +0 0.0360 +1 0.0113 +2 0.0119 +3 0.0103 +4 0.0093 +5 0.0103 +6 0.0103 +7 0.0108 +8 0.0103 +9 0.0113 +10 0.0108 +11 0.0087 +12 0.0098 +13 0.0098 +14 0.0124 +15 0.0103 +16 0.0087 +17 0.0087 +18 0.0113 +19 0.0098 +20 0.0087 +21 0.0108 +22 0.0098 +23 0.0113 +24 0.0108 +25 0.0067 +26 0.0082 +27 0.0093 +28 0.0113 +29 0.0098 +30 0.0108 +31 0.0103 +32 0.0098 +33 0.0108 +34 0.0113 +35 0.0108 +36 0.0108 +37 0.0113 +38 0.0113 +39 0.0082 +40 0.0093 +41 0.0119 +42 0.0354 +43 0.0347 +44 0.0341 +45 0.0331 +46 0.0329 +47 0.0324 +48 0.0317 +49 0.0310 +50 0.0305 +51 0.0300 +52 0.0294 +53 0.0288 +54 
0.0281 +55 0.0271 +56 0.0272 +57 0.0265 +58 0.0259 +59 0.0254 diff --git a/backend/tests/pagerank/pagerank.py b/backend/tests/pagerank/pagerank.py new file mode 100644 index 00000000..753feb8e --- /dev/null +++ b/backend/tests/pagerank/pagerank.py @@ -0,0 +1,19 @@ +""" +generate a test graph and pagerank ground truth for testing + +""" + +import networkx as nx + +bag = nx.barabasi_albert_graph(60, 41).to_directed() +pr = nx.pagerank(bag) + +with open("test-graph/edge.fasts", "w+") as edge_f: + for f, t in bag.edges: + edge_f.write(f"{f}\t{t}\n") + +with open("ground_truth", "w+") as truth_f: + for node, val in pr.items(): + truth_f.write(f"{node}\t{val:.4f}\n") + +print("done!") diff --git a/backend/tests/pagerank/test-graph/edge.fasts b/backend/tests/pagerank/test-graph/edge.fasts new file mode 100644 index 00000000..dc33fde7 --- /dev/null +++ b/backend/tests/pagerank/test-graph/edge.fasts @@ -0,0 +1,1558 @@ +0 1 +0 2 +0 3 +0 4 +0 5 +0 6 +0 7 +0 8 +0 9 +0 10 +0 11 +0 12 +0 13 +0 14 +0 15 +0 16 +0 17 +0 18 +0 19 +0 20 +0 21 +0 22 +0 23 +0 24 +0 25 +0 26 +0 27 +0 28 +0 29 +0 30 +0 31 +0 32 +0 33 +0 34 +0 35 +0 36 +0 37 +0 38 +0 39 +0 40 +0 41 +0 42 +0 43 +0 44 +0 45 +0 46 +0 47 +0 48 +0 49 +0 50 +0 51 +0 52 +0 53 +0 54 +0 55 +0 56 +0 57 +0 58 +0 59 +1 0 +1 42 +1 43 +1 44 +1 45 +1 46 +1 47 +1 48 +1 49 +1 50 +1 51 +1 52 +1 53 +1 54 +1 55 +1 57 +1 59 +2 0 +2 42 +2 43 +2 44 +2 45 +2 46 +2 47 +2 48 +2 49 +2 50 +2 51 +2 52 +2 53 +2 54 +2 55 +2 56 +2 58 +2 59 +3 0 +3 42 +3 43 +3 44 +3 45 +3 48 +3 49 +3 50 +3 51 +3 52 +3 53 +3 54 +3 56 +3 57 +3 59 +4 0 +4 42 +4 43 +4 44 +4 45 +4 46 +4 47 +4 48 +4 52 +4 54 +4 56 +4 57 +4 58 +5 0 +5 42 +5 43 +5 44 +5 45 +5 46 +5 47 +5 48 +5 49 +5 50 +5 51 +5 52 +5 55 +5 56 +5 57 +6 0 +6 42 +6 43 +6 44 +6 45 +6 46 +6 47 +6 48 +6 49 +6 50 +6 51 +6 52 +6 53 +6 56 +6 59 +7 0 +7 42 +7 43 +7 44 +7 45 +7 46 +7 48 +7 49 +7 50 +7 51 +7 52 +7 53 +7 54 +7 56 +7 57 +7 58 +8 0 +8 42 +8 43 +8 44 +8 45 +8 46 +8 47 +8 48 +8 49 +8 50 +8 51 +8 52 +8 55 +8 56 
+8 58 +9 0 +9 42 +9 43 +9 44 +9 45 +9 46 +9 47 +9 48 +9 49 +9 50 +9 51 +9 53 +9 54 +9 55 +9 56 +9 57 +9 58 +10 0 +10 42 +10 43 +10 45 +10 46 +10 47 +10 48 +10 49 +10 50 +10 52 +10 53 +10 54 +10 56 +10 57 +10 58 +10 59 +11 0 +11 42 +11 43 +11 44 +11 45 +11 47 +11 48 +11 51 +11 52 +11 54 +11 57 +11 58 +12 0 +12 42 +12 43 +12 44 +12 46 +12 47 +12 49 +12 50 +12 52 +12 53 +12 54 +12 55 +12 56 +12 58 +13 0 +13 42 +13 43 +13 44 +13 45 +13 46 +13 47 +13 49 +13 50 +13 51 +13 52 +13 54 +13 57 +13 59 +14 0 +14 42 +14 43 +14 44 +14 45 +14 46 +14 47 +14 48 +14 49 +14 50 +14 51 +14 52 +14 53 +14 54 +14 55 +14 56 +14 57 +14 58 +14 59 +15 0 +15 42 +15 43 +15 44 +15 45 +15 46 +15 47 +15 48 +15 49 +15 50 +15 51 +15 52 +15 57 +15 58 +15 59 +16 0 +16 42 +16 43 +16 46 +16 47 +16 48 +16 49 +16 51 +16 52 +16 53 +16 55 +16 58 +17 0 +17 42 +17 43 +17 44 +17 45 +17 46 +17 47 +17 50 +17 51 +17 53 +17 56 +17 57 +18 0 +18 42 +18 43 +18 44 +18 45 +18 46 +18 47 +18 48 +18 50 +18 51 +18 52 +18 53 +18 54 +18 55 +18 57 +18 58 +18 59 +19 0 +19 43 +19 44 +19 45 +19 46 +19 48 +19 49 +19 50 +19 52 +19 53 +19 55 +19 56 +19 58 +19 59 +20 0 +20 42 +20 44 +20 46 +20 47 +20 48 +20 49 +20 51 +20 53 +20 54 +20 58 +20 59 +21 0 +21 42 +21 43 +21 44 +21 45 +21 46 +21 47 +21 48 +21 49 +21 52 +21 53 +21 54 +21 55 +21 57 +21 58 +21 59 +22 0 +22 42 +22 43 +22 44 +22 45 +22 47 +22 48 +22 49 +22 50 +22 53 +22 54 +22 55 +22 57 +22 58 +23 0 +23 42 +23 43 +23 44 +23 45 +23 46 +23 47 +23 48 +23 49 +23 50 +23 51 +23 52 +23 53 +23 54 +23 55 +23 56 +23 59 +24 0 +24 42 +24 43 +24 44 +24 45 +24 46 +24 47 +24 48 +24 49 +24 51 +24 52 +24 53 +24 54 +24 55 +24 56 +24 58 +25 0 +25 42 +25 45 +25 47 +25 52 +25 56 +25 57 +25 59 +26 0 +26 42 +26 43 +26 44 +26 45 +26 50 +26 51 +26 54 +26 55 +26 56 +26 59 +27 0 +27 42 +27 43 +27 44 +27 45 +27 46 +27 48 +27 49 +27 50 +27 53 +27 55 +27 57 +27 59 +28 0 +28 42 +28 43 +28 44 +28 45 +28 46 +28 47 +28 48 +28 49 +28 50 +28 51 +28 52 +28 53 +28 54 +28 57 +28 58 +28 59 +29 0 +29 42 +29 43 +29 44 
+29 46 +29 47 +29 48 +29 49 +29 51 +29 53 +29 54 +29 55 +29 56 +29 57 +30 0 +30 42 +30 43 +30 44 +30 45 +30 46 +30 47 +30 49 +30 50 +30 51 +30 53 +30 54 +30 55 +30 56 +30 58 +30 59 +31 0 +31 42 +31 43 +31 44 +31 45 +31 46 +31 47 +31 48 +31 49 +31 50 +31 51 +31 52 +31 55 +31 56 +31 59 +32 0 +32 42 +32 43 +32 44 +32 45 +32 46 +32 47 +32 48 +32 50 +32 51 +32 53 +32 54 +32 56 +32 58 +33 0 +33 42 +33 43 +33 44 +33 45 +33 46 +33 47 +33 48 +33 49 +33 50 +33 51 +33 52 +33 53 +33 54 +33 56 +33 57 +34 0 +34 42 +34 43 +34 44 +34 45 +34 46 +34 47 +34 48 +34 49 +34 50 +34 51 +34 52 +34 53 +34 54 +34 55 +34 56 +34 58 +35 0 +35 42 +35 43 +35 44 +35 45 +35 46 +35 47 +35 49 +35 50 +35 52 +35 54 +35 55 +35 56 +35 57 +35 58 +35 59 +36 0 +36 42 +36 43 +36 44 +36 45 +36 46 +36 47 +36 48 +36 49 +36 51 +36 52 +36 53 +36 55 +36 56 +36 57 +36 58 +37 0 +37 42 +37 43 +37 44 +37 45 +37 46 +37 47 +37 48 +37 49 +37 50 +37 51 +37 52 +37 54 +37 55 +37 56 +37 57 +37 59 +38 0 +38 42 +38 43 +38 44 +38 45 +38 46 +38 47 +38 48 +38 49 +38 50 +38 51 +38 52 +38 53 +38 54 +38 55 +38 57 +38 59 +39 0 +39 42 +39 43 +39 44 +39 45 +39 46 +39 47 +39 48 +39 50 +39 53 +39 55 +40 0 +40 42 +40 43 +40 44 +40 45 +40 46 +40 48 +40 49 +40 50 +40 51 +40 55 +40 58 +40 59 +41 0 +41 42 +41 43 +41 44 +41 45 +41 46 +41 47 +41 48 +41 49 +41 50 +41 51 +41 52 +41 53 +41 54 +41 55 +41 56 +41 57 +41 58 +42 0 +42 1 +42 2 +42 3 +42 4 +42 5 +42 6 +42 7 +42 8 +42 9 +42 10 +42 11 +42 12 +42 13 +42 14 +42 15 +42 16 +42 17 +42 18 +42 20 +42 21 +42 22 +42 23 +42 24 +42 25 +42 26 +42 27 +42 28 +42 29 +42 30 +42 31 +42 32 +42 33 +42 34 +42 35 +42 36 +42 37 +42 38 +42 39 +42 40 +42 41 +42 43 +42 44 +42 45 +42 46 +42 47 +42 48 +42 49 +42 50 +42 51 +42 52 +42 53 +42 54 +42 55 +42 56 +42 57 +42 58 +42 59 +43 0 +43 1 +43 2 +43 3 +43 4 +43 5 +43 6 +43 7 +43 8 +43 9 +43 10 +43 11 +43 12 +43 13 +43 14 +43 15 +43 16 +43 17 +43 18 +43 19 +43 21 +43 22 +43 23 +43 24 +43 26 +43 27 +43 28 +43 29 +43 30 +43 31 +43 32 +43 33 +43 34 +43 35 +43 36 +43 37 
+43 38 +43 39 +43 40 +43 41 +43 42 +43 44 +43 45 +43 46 +43 47 +43 48 +43 49 +43 50 +43 51 +43 52 +43 53 +43 54 +43 55 +43 56 +43 57 +43 58 +43 59 +44 0 +44 1 +44 2 +44 3 +44 4 +44 5 +44 6 +44 7 +44 8 +44 9 +44 11 +44 12 +44 13 +44 14 +44 15 +44 17 +44 18 +44 19 +44 20 +44 21 +44 22 +44 23 +44 24 +44 26 +44 27 +44 28 +44 29 +44 30 +44 31 +44 32 +44 33 +44 34 +44 35 +44 36 +44 37 +44 38 +44 39 +44 40 +44 41 +44 42 +44 43 +44 45 +44 46 +44 47 +44 48 +44 49 +44 50 +44 51 +44 52 +44 53 +44 54 +44 55 +44 56 +44 57 +44 58 +44 59 +45 0 +45 1 +45 2 +45 3 +45 4 +45 5 +45 6 +45 7 +45 8 +45 9 +45 10 +45 11 +45 13 +45 14 +45 15 +45 17 +45 18 +45 19 +45 21 +45 22 +45 23 +45 24 +45 25 +45 26 +45 27 +45 28 +45 30 +45 31 +45 32 +45 33 +45 34 +45 35 +45 36 +45 37 +45 38 +45 39 +45 40 +45 41 +45 42 +45 43 +45 44 +45 46 +45 47 +45 48 +45 49 +45 50 +45 51 +45 52 +45 53 +45 54 +45 55 +45 56 +45 57 +45 59 +46 0 +46 1 +46 2 +46 4 +46 5 +46 6 +46 7 +46 8 +46 9 +46 10 +46 12 +46 13 +46 14 +46 15 +46 16 +46 17 +46 18 +46 19 +46 20 +46 21 +46 23 +46 24 +46 27 +46 28 +46 29 +46 30 +46 31 +46 32 +46 33 +46 34 +46 35 +46 36 +46 37 +46 38 +46 39 +46 40 +46 41 +46 42 +46 43 +46 44 +46 45 +46 47 +46 48 +46 49 +46 50 +46 51 +46 52 +46 53 +46 54 +46 55 +46 56 +46 57 +46 58 +46 59 +47 0 +47 1 +47 2 +47 4 +47 5 +47 6 +47 8 +47 9 +47 10 +47 11 +47 12 +47 13 +47 14 +47 15 +47 16 +47 17 +47 18 +47 20 +47 21 +47 22 +47 23 +47 24 +47 25 +47 28 +47 29 +47 30 +47 31 +47 32 +47 33 +47 34 +47 35 +47 36 +47 37 +47 38 +47 39 +47 41 +47 42 +47 43 +47 44 +47 45 +47 46 +47 48 +47 49 +47 50 +47 51 +47 52 +47 53 +47 54 +47 55 +47 56 +47 57 +47 58 +47 59 +48 0 +48 1 +48 2 +48 3 +48 4 +48 5 +48 6 +48 7 +48 8 +48 9 +48 10 +48 11 +48 14 +48 15 +48 16 +48 18 +48 19 +48 20 +48 21 +48 22 +48 23 +48 24 +48 27 +48 28 +48 29 +48 31 +48 32 +48 33 +48 34 +48 36 +48 37 +48 38 +48 39 +48 40 +48 41 +48 42 +48 43 +48 44 +48 45 +48 46 +48 47 +48 49 +48 50 +48 51 +48 52 +48 53 +48 54 +48 55 +48 56 +48 57 +48 58 +48 59 +49 0 +49 1 +49 
2 +49 3 +49 5 +49 6 +49 7 +49 8 +49 9 +49 10 +49 12 +49 13 +49 14 +49 15 +49 16 +49 19 +49 20 +49 21 +49 22 +49 23 +49 24 +49 27 +49 28 +49 29 +49 30 +49 31 +49 33 +49 34 +49 35 +49 36 +49 37 +49 38 +49 40 +49 41 +49 42 +49 43 +49 44 +49 45 +49 46 +49 47 +49 48 +49 50 +49 51 +49 52 +49 53 +49 54 +49 55 +49 56 +49 57 +49 58 +49 59 +50 0 +50 1 +50 2 +50 3 +50 5 +50 6 +50 7 +50 8 +50 9 +50 10 +50 12 +50 13 +50 14 +50 15 +50 17 +50 18 +50 19 +50 22 +50 23 +50 26 +50 27 +50 28 +50 30 +50 31 +50 32 +50 33 +50 34 +50 35 +50 37 +50 38 +50 39 +50 40 +50 41 +50 42 +50 43 +50 44 +50 45 +50 46 +50 47 +50 48 +50 49 +50 51 +50 52 +50 53 +50 54 +50 55 +50 56 +50 57 +50 58 +50 59 +51 0 +51 1 +51 2 +51 3 +51 5 +51 6 +51 7 +51 8 +51 9 +51 11 +51 13 +51 14 +51 15 +51 16 +51 17 +51 18 +51 20 +51 23 +51 24 +51 26 +51 28 +51 29 +51 30 +51 31 +51 32 +51 33 +51 34 +51 36 +51 37 +51 38 +51 40 +51 41 +51 42 +51 43 +51 44 +51 45 +51 46 +51 47 +51 48 +51 49 +51 50 +51 52 +51 53 +51 54 +51 55 +51 56 +51 57 +51 58 +51 59 +52 0 +52 1 +52 2 +52 3 +52 4 +52 5 +52 6 +52 7 +52 8 +52 10 +52 11 +52 12 +52 13 +52 14 +52 15 +52 16 +52 18 +52 19 +52 21 +52 23 +52 24 +52 25 +52 28 +52 31 +52 33 +52 34 +52 35 +52 36 +52 37 +52 38 +52 41 +52 42 +52 43 +52 44 +52 45 +52 46 +52 47 +52 48 +52 49 +52 50 +52 51 +52 53 +52 54 +52 55 +52 56 +52 57 +52 58 +52 59 +53 0 +53 1 +53 2 +53 3 +53 6 +53 7 +53 9 +53 10 +53 12 +53 14 +53 16 +53 17 +53 18 +53 19 +53 20 +53 21 +53 22 +53 23 +53 24 +53 27 +53 28 +53 29 +53 30 +53 32 +53 33 +53 34 +53 36 +53 38 +53 39 +53 41 +53 42 +53 43 +53 44 +53 45 +53 46 +53 47 +53 48 +53 49 +53 50 +53 51 +53 52 +53 54 +53 55 +53 56 +53 57 +53 58 +53 59 +54 0 +54 1 +54 2 +54 3 +54 4 +54 7 +54 9 +54 10 +54 11 +54 12 +54 13 +54 14 +54 18 +54 20 +54 21 +54 22 +54 23 +54 24 +54 26 +54 28 +54 29 +54 30 +54 32 +54 33 +54 34 +54 35 +54 37 +54 38 +54 41 +54 42 +54 43 +54 44 +54 45 +54 46 +54 47 +54 48 +54 49 +54 50 +54 51 +54 52 +54 53 +54 55 +54 56 +54 57 +54 58 +54 59 +55 0 +55 1 +55 2 +55 5 +55 
8 +55 9 +55 12 +55 14 +55 16 +55 18 +55 19 +55 21 +55 22 +55 23 +55 24 +55 26 +55 27 +55 29 +55 30 +55 31 +55 34 +55 35 +55 36 +55 37 +55 38 +55 39 +55 40 +55 41 +55 42 +55 43 +55 44 +55 45 +55 46 +55 47 +55 48 +55 49 +55 50 +55 51 +55 52 +55 53 +55 54 +55 57 +55 58 +55 59 +56 0 +56 2 +56 3 +56 4 +56 5 +56 6 +56 7 +56 8 +56 9 +56 10 +56 12 +56 14 +56 17 +56 19 +56 23 +56 24 +56 25 +56 26 +56 29 +56 30 +56 31 +56 32 +56 33 +56 34 +56 35 +56 36 +56 37 +56 41 +56 42 +56 43 +56 44 +56 45 +56 46 +56 47 +56 48 +56 49 +56 50 +56 51 +56 52 +56 53 +56 54 +56 57 +56 58 +56 59 +57 0 +57 1 +57 3 +57 4 +57 5 +57 7 +57 9 +57 10 +57 11 +57 13 +57 14 +57 15 +57 17 +57 18 +57 21 +57 22 +57 25 +57 27 +57 28 +57 29 +57 33 +57 35 +57 36 +57 37 +57 38 +57 41 +57 42 +57 43 +57 44 +57 45 +57 46 +57 47 +57 48 +57 49 +57 50 +57 51 +57 52 +57 53 +57 54 +57 55 +57 56 +57 58 +57 59 +58 0 +58 2 +58 4 +58 7 +58 8 +58 9 +58 10 +58 11 +58 12 +58 14 +58 15 +58 16 +58 18 +58 19 +58 20 +58 21 +58 22 +58 24 +58 28 +58 30 +58 32 +58 34 +58 35 +58 36 +58 40 +58 41 +58 42 +58 43 +58 44 +58 46 +58 47 +58 48 +58 49 +58 50 +58 51 +58 52 +58 53 +58 54 +58 55 +58 56 +58 57 +58 59 +59 0 +59 1 +59 2 +59 3 +59 6 +59 10 +59 13 +59 14 +59 15 +59 18 +59 19 +59 20 +59 21 +59 23 +59 25 +59 26 +59 27 +59 28 +59 30 +59 31 +59 35 +59 37 +59 38 +59 40 +59 42 +59 43 +59 44 +59 45 +59 46 +59 47 +59 48 +59 49 +59 50 +59 51 +59 52 +59 53 +59 54 +59 55 +59 56 +59 57 +59 58 From ec9250cc34010f73ae968beb200997b6da1a75f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 29 Nov 2022 22:25:47 +0000 Subject: [PATCH 17/36] stage --- backend/src/RAM/RA_tasks.cpp | 4 +- backend/src/lie/lie.cpp | 14 +- .../src/relation/balanced_hash_relation.cpp | 2 + .../tests/pagerank/compiled_pre/pagerank.cpp | 397 ++++----- .../pagerank/compiled_pre/pagerank_full.cpp | 753 ++++++++++++++++++ 5 files changed, 964 insertions(+), 206 deletions(-) create mode 100644 backend/tests/pagerank/compiled_pre/pagerank_full.cpp diff --git 
a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 7bbd4a71..2712e343 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -1129,7 +1129,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& intra_bucket_comm_execute(); auto intra_end = MPI_Wtime(); - // std::cout << std::setiosflags(std::ios::fixed); + std::cout << std::setiosflags(std::ios::fixed); bool local_join_status = false; while (local_join_status == false) { @@ -1262,7 +1262,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s // std::cout << "--------------FIXED POINT ITERATION " << loop_count_tracker << "--------------" << std::endl; #endif - // std::cout << std::setiosflags(std::ios::fixed); + std::cout << std::setiosflags(std::ios::fixed); auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); auto intra_end = MPI_Wtime(); diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 674a2d92..e517288f 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -588,7 +588,9 @@ bool LIE::execute () if (mcomm.get_rank() == 0) std::cout << "Total interation count: " << full_iteration_count << std::endl; - write_final_checkpoint_dump(); + if (enable_data_io) { + write_final_checkpoint_dump(); + } // std::cout << "finish writting checkpoint!" 
<< std::endl; @@ -604,9 +606,9 @@ bool LIE::execute () LIE::~LIE () { - for (u32 i = 0 ; i < lie_relations.size(); i++) - { - lie_relations[i]->finalize_relation(); - delete (lie_relations[i]); - } + // for (u32 i = 0 ; i < lie_relations.size(); i++) + // { + // lie_relations[i]->finalize_relation(); + // delete (lie_relations[i]); + // } } diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 9fcfe747..660f5a7d 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -23,6 +23,7 @@ u32 relation::get_global_delta_element_count() u32 relation::get_global_full_element_count() { + // TODO: change to use size of shamp_relation rather than counter u32 global_full_element_count; MPI_Allreduce(&full_element_count, &global_full_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm()); return global_full_element_count; @@ -856,6 +857,7 @@ void relation::populate_full(int buffer_size, u64* buffer) if (full[bucket_id].insert_tuple_from_array(t, (arity+1)) == true) { + // TODO: check if its update, if it is keep full count same full_element_count++; full_bucket_element_count[bucket_id]++; counter++; diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp index 34214be3..ec7133ba 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp @@ -1,5 +1,6 @@ // location of `parallel_RA_inc.h` here #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "mpi.h" #include #include @@ -35,6 +36,14 @@ const u64 str_tag = 2; const u64 sign_flip_const = 0x0000200000000000; const u64 signed_num_mask = 0xFFFFE00000000000; +#define FLOAT_SCALE_CONST 100000 +float ALPHA = 0.85; +u64 total_node_size = 0; +u64 dangling_value = 0; +u64 current_iter = 0; +int MAX_PG_ITERATION = 2; +u64 dangling_node_cnt; + inline bool 
is_number(u64 datum) { // cout << "is_number(" << datum << "): " << (datum >> tag_position == // int_tag) << "\n"; @@ -264,7 +273,9 @@ agg_sum_local(std::pair for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) { auto tuple = (*it); - sum_res += tuple[tuple.size() - 2]; + // if (tuple[1] == MAX_PG_ITERATION) { + sum_res += tuple[tuple.size() - 2]; + // } } return sum_res; } @@ -410,9 +421,6 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { return max_rel; } -float ALPHA = 0.85; -u64 total_node_size = 0; - int main(int argc, char **argv) { // input dir from compiler std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; @@ -426,6 +434,8 @@ int main(int argc, char **argv) { mpi_comm mcomm; mcomm.create(argc, argv); + MAX_PG_ITERATION = atoi(argv[3]); + // (edge from to) relation *rel__edge__2__1 = new relation( 1, true, 2, get_tag_for_rel("edge", "1__2"), @@ -461,6 +471,39 @@ int main(int argc, char **argv) { ".$unit.1.table", FULL); + // relation *rel__edge__2__2 = new relation( + // 1, false, 2, get_tag_for_rel("edge", "2"), + // std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", + // std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); + + // matrix edge + successor count + relation *rel__matrix__3__1 = new relation( + 1, true, 3, get_tag_for_rel("matrix", "1"), + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); + + + relation *rel__dangling_node = new relation( + 1, true, 1, get_tag_for_rel("dangling_node", "1"), + std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table", + std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table", + FULL); + + // RAM *scc_copy_edge = new RAM(false, 0); + // scc_copy_edge->add_relation(rel__edge__2__1, false, false); + // scc_copy_edge->add_relation(rel__edge__2__2, true, false); + 
// scc_copy_edge->add_rule( + // new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); + + RAM *scc_compute_matrix = new RAM(false, 1); + scc_compute_matrix->add_relation(rel__edge__2__1, false, false); + // scc_compute_matrix->add_relation(rel__edge__2__2, false, false); + scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); + scc_compute_matrix->add_rule(new parallel_join_aggregate( + rel__matrix__3__1, rel__edge__2__1, rel__edge__2__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {0, 1, 3})); + RAM *scc_helper_fact = new RAM(false, 0); scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false); scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)})); @@ -492,14 +535,30 @@ int main(int argc, char **argv) { agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, {2})); + RAM *scc_populate_dangling = new RAM(false, 3); + scc_populate_dangling->add_relation(rel__edge__2__1, false); + scc_populate_dangling->add_relation(rel__dangling_node, true); + scc_populate_dangling->add_relation(rel__node__1__1, false); + scc_populate_dangling->add_rule(new parallel_join_negate( + rel__dangling_node, rel__node__1__1, FULL, rel__edge__2__1, + {0} + )); + LIE *cnt_lie = new LIE(); cnt_lie->add_relation(rel__edge__2__1); + // cnt_lie->add_relation(rel__edge__2__2); cnt_lie->add_relation(rel__node__1__1); cnt_lie->add_relation(rel___dollorunit__1__1); cnt_lie->add_relation(rel__total_node_cnt__1__1); + cnt_lie->add_relation(rel__matrix__3__1); + cnt_lie->add_relation(rel__dangling_node); cnt_lie->add_scc(scc_helper_fact); cnt_lie->add_scc(scc_compute_node); cnt_lie->add_scc(scc_count_nodes); + // cnt_lie->add_scc(scc_copy_edge); + cnt_lie->add_scc(scc_compute_matrix); + cnt_lie->add_scc(scc_populate_dangling); + // cnt_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); 
cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); @@ -513,28 +572,32 @@ int main(int argc, char **argv) { // only 1 data in this rel so its safe rel__total_node_cnt__1__1->print(); + u64 local_node_size = 0; for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { - total_node_size = t[0]; - std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl; + if (t[0] != 0) { + local_node_size = t[0]; + } } + rel__matrix__3__1->print(); + MPI_Barrier(mcomm.get_comm()); - // >>>>>>>>>>>>>>> compute page rank - std::cout << ">>>>>>>>>> Computing pagerank ... " << std::endl; + MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, mcomm.get_comm()); + + dangling_node_cnt = rel__dangling_node->get_global_full_element_count(); + dangling_value = FLOAT_SCALE_CONST / total_node_size; + std::cout << ">>>>>>>>> Number of nodes: " << total_node_size + << " >>>>>>>>> Dangling node count: " << dangling_node_cnt + << " >>>>>>>>> Dangling value: " << dangling_value * 1.0 / FLOAT_SCALE_CONST + << std::endl; rel__edge__2__1->disable_initialization(); rel__node__1__1->disable_initialization(); + rel__matrix__3__1->disable_initialization(); + rel__dangling_node->disable_initialization(); + + // rel__matrix__3__1->print(); - relation *rel__edge__2__2 = new relation( - 1, false, 2, get_tag_for_rel("edge", "2"), - std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", - std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); - - // matrix edge + successor count - relation *rel__matrix__3__1 = new relation( - 1, true, 3, get_tag_for_rel("matrix", "1"), - std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", - std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); - + ////////////////// compute Page rank relation *rel__rank__3__1 = new relation( 1, true, 3, get_tag_for_rel("rank", "1"), std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", @@ -544,205 
+607,143 @@ int main(int argc, char **argv) { {1, 2, 3}, [](const std::vector &old_v, const std::vector &new_v, const vector &nt) -> std::optional { - // if (nt[0] == 59 && nt[1] == 58) { - // std::cout << "dependent column size " << new_v.size() << std::endl; - // std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << " - // comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2] - // << std::endl; - // } if (new_v[0] != old_v[0]) { - // std::cout << " www >>>>>>>>>" << std::endl; return std::nullopt; - } else { - // monotonic - // assert(new_v[1] > old_v[1]); - // u32 new_sum_raw = new_v[1]; - // u32 old_sum_raw = old_v[1]; - // float new_sum = *reinterpret_cast(&new_sum_raw); - // float old_sum = *reinterpret_cast(&old_sum_raw); - // if (new_sum > old_sum) { - // std::cout << "new >> " << new_sum << " old >> " << old_sum << - // std::endl; - // } - // return new_sum > old_sum; - return new_v[1] > old_v[1]; - // return true; } + // if (std::abs((int)new_v[1] - (int)old_v[1]) < 10) { + // return false; + // } + return true; }); - relation *rel__result__2__1__2 = new relation( - 2, true, 2, get_tag_for_rel("result", "1__2"), - std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", - std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", - FULL); - - // - - RAM *scc_copy_edge = new RAM(false, 0); - scc_copy_edge->add_relation(rel__edge__2__1, false, false); - scc_copy_edge->add_relation(rel__edge__2__2, true, false); - scc_copy_edge->add_rule( - new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); - - RAM *scc_compute_matrix = new RAM(false, 1); - scc_compute_matrix->add_relation(rel__edge__2__1, false, false); - scc_compute_matrix->add_relation(rel__edge__2__2, false, false); - scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); - scc_compute_matrix->add_rule(new parallel_join_aggregate( - rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL, - agg_count_local, 
SpecialAggregator::count, agg_count_reduce, nullptr, - {0, 1, 3})); - - RAM *scc_init = new RAM(false, 2); - scc_init->add_relation(rel__matrix__3__1, false, false); - scc_init->add_relation(rel__rank__3__1, true, false); - scc_init->add_rule(new parallel_copy_generate( - rel__rank__3__1, rel__matrix__3__1, FULL, - [](const u64 *const data, u64 *const output) -> int { - output[0] = data[0]; - output[1] = data[0]; - // float init_pg_v = (1 - ALPHA) / total_node_size; - u64 init_pg_v = (u64)(((1 - ALPHA) / total_node_size) * 100000); - // std::cout << init_pg_v << std::endl; - // output[2] = *reinterpret_cast(&init_pg_v); - output[2] = init_pg_v; - return 1; - })); - - RAM *scc_page_rank = new RAM(true, 3); - scc_page_rank->add_relation(rel__matrix__3__1, false, false); - scc_page_rank->add_relation(rel__rank__3__1, true, false); - parallel_join *rank_join = - new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, - rel__rank__3__1, DELTA, {3, 1, 2} // useless - ); - rank_join->set_generator_func([](const depend_val_t &target_vs, - const std::vector &input_v, - depend_val_t &res_set) -> bool { - // float pg_sum = 0.0; - u64 pg_sum = 0; - - int count = 0; - for (auto &tv : target_vs) { - // std::cout << "tagret v >>>>> "; - // for (auto c: tv) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first - // std::cout << ">>>>>>>>>>>>>>> " << - // *reinterpret_cast(&raw_succ_pg_v) << std::endl; - // auto succ_pg_v = *reinterpret_cast(&raw_succ_pg_v); - // if(succ_pg_v == 0) { - // // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl; - // std::cout << "tagret v >>>>> "; - // for (auto c: tv) { - // std::cout << c << " "; + std::vector pg_lie_list; + + for (int i = 0; i < MAX_PG_ITERATION; i++) { + std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter + << std::endl; + LIE *pg_lie = new LIE(); + + // RAM *scc_ + RAM *scc_init = new RAM(false, 0); + 
scc_init->add_relation(rel__matrix__3__1, false, false); + scc_init->add_relation(rel__rank__3__1, true, false); + scc_init->add_rule(new parallel_copy_generate( + rel__rank__3__1, rel__matrix__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + output[1] = data[0]; + output[2] = dangling_value; + return 1; + })); + RAM *scc_page_rank = new RAM(false, 1); + scc_page_rank->add_relation(rel__matrix__3__1, false, false); + scc_page_rank->add_relation(rel__rank__3__1, true, false); + parallel_join *rank_join = + new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, + rel__rank__3__1, DELTA, {3, 1, 2} // useless + ); + rank_join->set_generator_func([](const depend_val_t &target_vs, + const std::vector &input_v, + depend_val_t &res_set) -> bool { + // if (current_iter > MAX_PG_ITERATION) { + // return false; // } - // std::cout << std::endl; - // } - if (input_v[2] != 0) { - // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]); - pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]); - // if (input_v[1] == 51) { - // std::cout << "Sum 51 " << input_v[0] << " with "; - // for (auto c: tv) { - // std::cout << c << " "; - // } - // std::cout << " result " << pg_sum << std::endl; - // } + u64 pg_sum = dangling_node_cnt * dangling_value; + int count = 0; + for (auto &tv : target_vs) { + if ((tv[0] == tv[1]) && (current_iter != 0)) { + continue; + } + u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first + if (current_iter == 0) { + raw_succ_pg_v_sub = raw_succ_pg_v_sub / input_v[2]; + } + pg_sum += (u64)(raw_succ_pg_v_sub * ALPHA); + count++; } - count++; + pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size; + std::vector res_tuple(3, 0); + res_tuple[0] = input_v[1]; + res_tuple[1] = input_v[0]; + res_tuple[2] = pg_sum / input_v[2]; + res_set.push_back(res_tuple); + return true; + }); + scc_page_rank->add_rule(rank_join); + + pg_lie_list.push_back(pg_lie); + pg_lie->add_relation(rel__matrix__3__1); + 
pg_lie->add_relation(rel__node__1__1); + pg_lie->add_relation(rel__rank__3__1); + pg_lie->add_scc(scc_page_rank); + if (current_iter == 0) { + pg_lie->add_scc(scc_init); + pg_lie->add_scc_dependance(scc_init, scc_page_rank); } - if (pg_sum == 0) { - return false; + // Enable IO + if (i == MAX_PG_ITERATION - 1) { + pg_lie->enable_all_to_all_dump(); + pg_lie->enable_data_IO(); + pg_lie->enable_IO(); } - if (count == 0) { - return false; - } - std::vector res_tuple(3, 0); - res_tuple[0] = input_v[1]; - res_tuple[1] = input_v[0]; - // res_tuple[2] = *reinterpret_cast(&pg_sum); - res_tuple[2] = pg_sum; - // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl; - // for (auto c: res_tuple) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - res_set.push_back(res_tuple); - return true; - }); - scc_page_rank->add_rule(rank_join); + // lie->enable_share_io(); + pg_lie->set_output_dir(slog_output_dir); // Write to this directory + pg_lie->set_comm(mcomm); + pg_lie->set_batch_size(1); + pg_lie->execute(); + current_iter++; + rel__rank__3__1->disable_initialization(); + pg_lie->print_all_relation_size(); // Continuously print relation sizes + } + // rel__rank__4__1->print( + // [](const std::vector& tp){ + // u32 pg_v = tp[3]; + // std::cout << tp[0] << " " << tp[1] << " " + // // *reinterpret_cast(&pg_v) << std::cout << tp[0] << " " << tp[1] + // << tp[2] << " " << pg_v << std::endl; + // } + // ); + // delete pg_pre_lie; + // delete pg_lie; + + std::cout << "Aggregating sum ..." 
<< std::endl; + relation *rel__result__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("result", "1__2"), + std::to_string(get_tag_for_rel("result", "1__2")) + + ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2")) + + ".result.2.table", FULL); RAM *scc_result = new RAM(false, 4); scc_result->add_relation(rel__rank__3__1, false, false); scc_result->add_relation(rel__result__2__1__2, true, false); scc_result->add_relation(rel__node__1__1, false, false); - // scc_result->add_rule(new parallel_join_aggregate( - // rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - // agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce, - // nullptr, {0, 2})); scc_result->add_rule(new parallel_join_aggregate( rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2})); - - LIE *pg_lie = new LIE(); - pg_lie->add_relation(rel__edge__2__1); - pg_lie->add_relation(rel__matrix__3__1); - pg_lie->add_relation(rel__node__1__1); - pg_lie->add_relation(rel__edge__2__2); - pg_lie->add_relation(rel__rank__3__1); - pg_lie->add_relation(rel__result__2__1__2); - pg_lie->add_scc(scc_copy_edge); - pg_lie->add_scc(scc_compute_matrix); - pg_lie->add_scc(scc_init); - pg_lie->add_scc(scc_page_rank); - pg_lie->add_scc(scc_result); - pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); - pg_lie->add_scc_dependance(scc_compute_matrix, scc_init); - pg_lie->add_scc_dependance(scc_init, scc_page_rank); - pg_lie->add_scc_dependance(scc_page_rank, scc_result); - - // Enable IO - pg_lie->enable_all_to_all_dump(); - pg_lie->enable_data_IO(); - pg_lie->enable_IO(); - // lie->enable_share_io(); - pg_lie->set_output_dir(slog_output_dir); // Write to this directory - pg_lie->set_comm(mcomm); - pg_lie->set_batch_size(1); - pg_lie->execute(); - pg_lie->print_all_relation_size(); // Continuously print relation sizes - // lie->stat_intermediate(); - // rel__matrix__3__1->print(); - // 
rel__rank__3__1->print( - // [](const std::vector& tp){ - // u32 pg_v = tp[2]; - // // std::cout << tp[0] << " " << tp[1] << " " << - // *reinterpret_cast(&pg_v) << std::cout << tp[0] << " " << tp[1] - // << " " << pg_v << std::endl; - // } - // ); + agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, + 2})); + + LIE* final_lie = new LIE(); + final_lie->add_relation(rel__result__2__1__2); + final_lie->add_relation(rel__node__1__1); + final_lie->add_relation(rel__rank__3__1); + final_lie->add_scc(scc_result); + final_lie->enable_all_to_all_dump(); + final_lie->enable_data_IO(); + final_lie->enable_IO(); + + final_lie->set_output_dir(slog_output_dir); // Write to this directory + final_lie->set_comm(mcomm); + final_lie->set_batch_size(1); + final_lie->execute(); + final_lie->print_all_relation_size(); // Continuously print relation sizes + + // rel__rank__3__1->print(); + rel__result__2__1__2->print([](const std::vector &tp) { u32 pg_v = tp[1]; // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << - std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl; + std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; }); - - // print all variants(non-canonical index of each relation) - // if (mcomm.get_rank() == 0) { - // std::cout << "rel_name" - // << ",\t" - // << "indices\n"; - // for (auto const &rel_p : rel_index_map) { - // std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; - // } - // std::cout << std::endl; - // } - - delete pg_lie; - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mcomm.destroy(); diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp new file mode 100644 index 00000000..c0636f61 --- /dev/null +++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp @@ -0,0 +1,753 @@ +// location of `parallel_RA_inc.h` here +#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" + +#include +#include +#include 
+#include +#include +#include +#include +#include + +// builtins.cpp goes here! +// builtins.cpp +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +#define u64 uint64_t +#define u32 uint32_t +using i64 = int64_t; + +const u64 tag_mask = 0xffffc00000000000; +const u64 tag_position = 46; +const u64 int_tag = 0; +const u64 str_tag = 2; +const u64 sign_flip_const = 0x0000200000000000; +const u64 signed_num_mask = 0xFFFFE00000000000; + +inline bool is_number(u64 datum) { + // cout << "is_number(" << datum << "): " << (datum >> tag_position == + // int_tag) << "\n"; + return datum >> tag_position == int_tag; +} + +inline i64 datum_to_number(u64 datum) { + i64 signed_val = + (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + if (signed_val >= sign_flip_const) { + signed_val = sign_flip_const - signed_val; + } + return signed_val; + // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - + // tag_position); +} +const auto d2n = datum_to_number; + +inline u64 number_to_datum(i64 number) { + i64 unsigned_value = number; + if (number < 0) { + unsigned_value = (-number) + sign_flip_const; + } + return (unsigned_value & ~tag_mask) | (int_tag << tag_position); + // return (number & ~tag_mask) | (int_tag << tag_position); +} + +const auto n2d = number_to_datum; + +inline u64 string_to_datum(std::string str) { + u32 str_hash = string_hash(str); + return (str_hash & ~tag_mask) | (str_tag << tag_position); +} +const auto s2d = string_to_datum; + +vector> builtin_div_rem(const u64 *const data) { + if (is_number(data[0]) && is_number(data[1])) { + auto div = number_to_datum(d2n(data[0]) / d2n(data[1])); + auto rem = number_to_datum(d2n(data[0]) % d2n(data[1])); + return {{div, rem}}; + } else { + return {}; + } +} + +#define BUILTIN_BINARY_NUMBER_PRED(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if 
(is_number(data[0]) && is_number(data[1]) && \ + datum_to_number(data[0]) op datum_to_number(data[1])) { \ + return callback(init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_PRED(builtin_less, <) +BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >) +BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=) +BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=) + +#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum(datum_to_number(data[0]) \ + op datum_to_number(data[1])); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +) +BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -) +BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *) +BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /) + +#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum( \ + impl(datum_to_number(data[0]), datum_to_number(data[1]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; } +BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1) + +#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0])) { \ + auto res = number_to_datum(impl(datum_to_number(data[0]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 add1(u64 x) { return x + 1; } +inline u64 sub1(u64 x) { return x - 1; } + +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1) 
+BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1) + +vector> builtin_range(const u64 *const data) { + vector> res; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + res.reserve(ub - lb); + for (u64 x = lb; x < ub; x++) + res.push_back({number_to_datum(x)}); + } + return res; +} + +template +TState callback_builtin_range(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + auto state = init_state; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + for (u64 x = lb; x < ub; x++) + state = callback(number_to_datum(x), state); + } + return state; +} + +#define BUILTIN_BINARY_PRED(name, op) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (data[0] op data[1]) \ + return callback(init_state); \ + else \ + return init_state; \ + } +BUILTIN_BINARY_PRED(builtin_eq, ==) +BUILTIN_BINARY_PRED(builtin_neq, !=) + +template +TState builtin_eq_1(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + return callback(data[0], init_state); +} + +#define BUILTIN_UNARY_PRED(name, pred) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (pred(data[0])) \ + return callback(init_state); \ + else \ + return init_state; \ + } + +bool is_not_number(u64 datum) { return !is_number(datum); } +BUILTIN_UNARY_PRED(builtin_number_huh, is_number) +BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number) + +// for generate-cpp-lambda-for-computational-join +struct CL2CB_State { + void *original_callback; // There be dragons? 
+ void *original_state; + const u64 *original_data; + u64 *cl1_output_args; +}; + +// for generate-cpp-lambda-for-computational-copy +struct BCLCB_State { + void *original_callback; + void *original_state; + const u64 *original_data; +}; + +// an experiment: +template bool builtin_binary_number_pred(const u64 *data) { + if (is_number(data[0]) && is_number(data[1])) { + return f(datum_to_number(data[0]), datum_to_number(data[1])); + } else { + return false; + } +} +bool _less(u64 x, u64 y) { return x < y; } +auto builtin_less2 = builtin_binary_number_pred<_less>; + +template +inline TState builtin_nop(const u64 *data, TState init_state, + TState (*callback)(TState state)) { + return callback(init_state); +} + +// //////////////////// AGGREGATORS Alternative design //////////////////// + +// TODO: add number type check +////////////////////////////// count ///////////////////////////////////// + +local_agg_res_t +agg_count_local(std::pair + joined_range) { + local_agg_res_t cnt = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + cnt++; + } + return cnt; +} + +local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// sum ///////////////////////////////////// + +local_agg_res_t +agg_sum_local(std::pair + joined_range) { + local_agg_res_t sum_res = 0; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + sum_res += tuple[tuple.size() - 2]; + } + return sum_res; +} + +local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +local_agg_res_t agg_sum_float_local( + std::pair + joined_range) { + float sum_res = 0.0; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + u32 agg_column_raw = tuple[tuple.size() - 2]; + + sum_res += *reinterpret_cast(&agg_column_raw); + } + // std::cout << ">>>>>>> " << sum_res << " " << + 
// *reinterpret_cast(&sum_res) << std::endl; + u32 sum_res_encoded = *reinterpret_cast(&sum_res); + return sum_res_encoded; +} + +local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw, + local_agg_res_t y_raw) { + float x = *reinterpret_cast(&x_raw); + float y = *reinterpret_cast(&y_raw); + float res = x + y; + // std::cout << res << std::endl; + u32 res_encoded = *reinterpret_cast(&res); + return res_encoded; +} + +////////////////////////////// maximum ///////////////////////////////////// + +local_agg_res_t +agg_maximum_local(std::pair + joined_range) { + local_agg_res_t max_res = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v > max_res) { + max_res = current_v; + } + } + return max_res; +} + +local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x > y) { + return x; + } else { + return y; + } +} + +////////////////////////////// minimum ///////////////////////////////////// + +local_agg_res_t +agg_minimum_local(std::pair + joined_range) { + local_agg_res_t min_res = std::numeric_limits::max(); + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v < min_res) { + min_res = current_v; + } + } + return min_res; +} + +local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x < y) { + return x; + } else { + return y; + } +} + +// // end of builtins.cpp + +// global definitions: + +int max_rel = 255; +std::map rel_tag_map; +std::map> rel_index_map; + +// load all relation inside input database +void load_input_relation(std::string db_dir) { + for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { + // check if ends with table + std::string filename_ss = entry.path().filename().string(); + std::cout << "input database has file " << filename_ss << std::endl; + std::string suffix = ".table"; 
+ int ft = filename_ss.size() - suffix.size(); + if (ft < 0) + ft = 0; + if (filename_ss.rfind(suffix) != ft) { + continue; + } + std::string filename_s = entry.path().stem().string(); + int tag = std::stoi(filename_s.substr(0, filename_s.find("."))); + std::string name_arity = filename_s.substr( + filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1); + std::string name = name_arity.substr(0, name_arity.rfind(".")); + std::string arity_s = + name_arity.substr(name_arity.rfind(".") + 1, name_arity.size()); + int arity = std::stoi(arity_s); + std::stringstream index_stream; + index_stream << name; + for (int i = 1; i <= arity; i++) { + index_stream << "__" << i; + } + if (tag > max_rel) + max_rel = tag; + std::cout << "load " << tag << "." << index_stream.str() << "has arity " + << arity << std::endl; + rel_tag_map[index_stream.str()] = tag; + } +} + +int get_tag_for_rel(std::string relation_name, std::string index_str) { + std::string name_arity = relation_name + "__" + index_str; + if (rel_index_map.find(relation_name) != rel_index_map.end()) { + rel_index_map[relation_name].insert(index_str); + } else { + rel_index_map[relation_name] = {index_str}; + } + + if (rel_tag_map.find(name_arity) != rel_tag_map.end()) { + // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << + // std::endl; + return rel_tag_map[name_arity]; + } + max_rel++; + rel_tag_map[name_arity] = max_rel; + std::cout << "generate rel tag: " << name_arity << " " << max_rel + << std::endl; + return max_rel; +} + +float ALPHA = 0.85; +u64 total_node_size = 0; +u64 dangling_value = 0; + +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; + if (argc == 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + } + load_input_relation(slog_input_dir); + mpi_comm 
mcomm; + mcomm.create(argc, argv); + + // (edge from to) + relation *rel__edge__2__1 = new relation( + 1, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", + FULL); + + // >>>>>>>>>>>>>>> compute node size + // (node x) + relation *rel__node__1__1 = new relation( + 1, true, 1, get_tag_for_rel("node", "1"), + std::to_string(get_tag_for_rel("node", "1")) + ".node.1.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("node", "1")) + + ".node.1.table", + FULL); + + // (total_node_cnt n) + relation *rel__total_node_cnt__1__1 = + new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"), + std::to_string(get_tag_for_rel("total_node_cnt", "1")) + + ".total_node_cnt.1.table", + slog_input_dir + "/" + + std::to_string(get_tag_for_rel("total_node_cnt", "1")) + + ".total_node_cnt.1.table", + FULL); + + // helper relation for non-join aggregation + relation *rel___dollorunit__1__1 = new relation( + 0, true, 1, get_tag_for_rel("$unit", "1"), + std::to_string(get_tag_for_rel("$unit", "1")) + ".$unit.1.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("$unit", "1")) + + ".$unit.1.table", + FULL); + + RAM *scc_helper_fact = new RAM(false, 0); + scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false); + scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)})); + + // [(node a) (node b) <-- (edge a b)] + RAM *scc_compute_node = new RAM(false, 1); + scc_compute_node->add_relation(rel__edge__2__1, false, false); + scc_compute_node->add_relation(rel__node__1__1, true, false); + scc_compute_node->add_rule(new parallel_copy_generate( + rel__node__1__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + return 1; + })); + scc_compute_node->add_rule(new parallel_copy_generate( + rel__node__1__1, rel__edge__2__1, FULL, + [](const 
u64 *const data, u64 *const output) -> int { + output[0] = data[1]; + return 1; + })); + + // (total_node_cnt {count node _}) + RAM *scc_count_nodes = new RAM(false, 2); + scc_count_nodes->add_relation(rel__node__1__1, false, false); + scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false); + scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false); + scc_count_nodes->add_rule(new parallel_join_aggregate( + rel__total_node_cnt__1__1, rel__node__1__1, rel___dollorunit__1__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {2})); + + LIE *cnt_lie = new LIE(); + cnt_lie->add_relation(rel__edge__2__1); + cnt_lie->add_relation(rel__node__1__1); + cnt_lie->add_relation(rel___dollorunit__1__1); + cnt_lie->add_relation(rel__total_node_cnt__1__1); + cnt_lie->add_scc(scc_helper_fact); + cnt_lie->add_scc(scc_compute_node); + cnt_lie->add_scc(scc_count_nodes); + cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); + cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); + + cnt_lie->enable_all_to_all_dump(); + cnt_lie->set_output_dir(slog_output_dir); // Write to this directory + cnt_lie->set_comm(mcomm); + cnt_lie->set_batch_size(1); + cnt_lie->execute(); + cnt_lie->print_all_relation_size(); // Continuously print relation sizes + + // only 1 data in this rel so its safe + rel__total_node_cnt__1__1->print(); + + for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { + total_node_size = t[0]; + dangling_value = (u64)(((1 - ALPHA) / total_node_size) * 100000); + std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl; + } + + // >>>>>>>>>>>>>>> compute page rank + std::cout << ">>>>>>>>>> Computing pagerank ... 
" << std::endl; + + rel__edge__2__1->disable_initialization(); + rel__node__1__1->disable_initialization(); + + relation *rel__edge__2__2 = new relation( + 1, false, 2, get_tag_for_rel("edge", "2"), + std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", + std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); + + // matrix edge + successor count + relation *rel__matrix__3__1 = new relation( + 1, true, 3, get_tag_for_rel("matrix", "1"), + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); + + relation *rel__rank__3__1 = new relation( + 1, true, 3, get_tag_for_rel("rank", "1"), + std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", + std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL); + + rel__rank__3__1->set_dependent_column_update( + {1, 2, 3}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + // if (nt[0] == 59 && nt[1] == 58) { + // std::cout << "dependent column size " << new_v.size() << std::endl; + // std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << " + // comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2] + // << std::endl; + // } + if (new_v[0] != old_v[0]) { + // std::cout << " www >>>>>>>>>" << std::endl; + return std::nullopt; + } else { + // monotonic + // assert(new_v[1] > old_v[1]); + // u32 new_sum_raw = new_v[1]; + // u32 old_sum_raw = old_v[1]; + // float new_sum = *reinterpret_cast(&new_sum_raw); + // float old_sum = *reinterpret_cast(&old_sum_raw); + // if (new_sum > old_sum) { + // std::cout << "new >> " << new_sum << " old >> " << old_sum << + // std::endl; + // } + // return new_sum > old_sum; + return new_v[1] > old_v[1]; + // return true; + } + }); + + relation *rel__result__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("result", "1__2"), + std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", 
+ std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", + FULL); + + // + + RAM *scc_copy_edge = new RAM(false, 0); + scc_copy_edge->add_relation(rel__edge__2__1, false, false); + scc_copy_edge->add_relation(rel__edge__2__2, true, false); + scc_copy_edge->add_rule( + new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); + + RAM *scc_compute_matrix = new RAM(false, 1); + scc_compute_matrix->add_relation(rel__edge__2__1, false, false); + scc_compute_matrix->add_relation(rel__edge__2__2, false, false); + scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); + scc_compute_matrix->add_rule(new parallel_join_aggregate( + rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {0, 1, 3})); + + RAM *scc_init = new RAM(false, 2); + scc_init->add_relation(rel__matrix__3__1, false, false); + scc_init->add_relation(rel__rank__3__1, true, false); + scc_init->add_rule(new parallel_copy_generate( + rel__rank__3__1, rel__matrix__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + output[1] = data[0]; + // float init_pg_v = (1 - ALPHA) / total_node_size; + u64 init_pg_v = dangling_value; + // std::cout << init_pg_v << std::endl; + // output[2] = *reinterpret_cast(&init_pg_v); + output[2] = init_pg_v; + return 1; + })); + + RAM *scc_page_rank = new RAM(true, 3); + scc_page_rank->add_relation(rel__matrix__3__1, false, false); + scc_page_rank->add_relation(rel__rank__3__1, true, false); + parallel_join *rank_join = + new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, + rel__rank__3__1, DELTA, {3, 1, 2} // useless + ); + rank_join->set_generator_func([](const depend_val_t &target_vs, + const std::vector &input_v, + depend_val_t &res_set) -> bool { + // float pg_sum = 0.0; + u64 pg_sum = dangling_value; + + int count = 0; + for (auto &tv : target_vs) { + // std::cout << "tagret v >>>>> "; + // for (auto c: tv) { + 
// std::cout << c << " "; + // } + // std::cout << std::endl; + u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first + // std::cout << ">>>>>>>>>>>>>>> " << + // *reinterpret_cast(&raw_succ_pg_v) << std::endl; + // auto succ_pg_v = *reinterpret_cast(&raw_succ_pg_v); + // if(succ_pg_v == 0) { + // // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl; + // std::cout << "tagret v >>>>> "; + // for (auto c: tv) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + // } + if (input_v[2] != 0) { + // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]); + pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]); + // if (input_v[1] == 51) { + // std::cout << "Sum 51 " << input_v[0] << " with "; + // for (auto c: tv) { + // std::cout << c << " "; + // } + // std::cout << " result " << pg_sum << std::endl; + // } + } + count++; + } + if (pg_sum == 0) { + return false; + } + if (count == 0) { + return false; + } + std::vector res_tuple(3, 0); + res_tuple[0] = input_v[1]; + res_tuple[1] = input_v[0]; + // res_tuple[2] = *reinterpret_cast(&pg_sum); + res_tuple[2] = pg_sum; + // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl; + // for (auto c: res_tuple) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + res_set.push_back(res_tuple); + return true; + }); + scc_page_rank->add_rule(rank_join); + + RAM *scc_result = new RAM(false, 4); + scc_result->add_relation(rel__rank__3__1, false, false); + scc_result->add_relation(rel__result__2__1__2, true, false); + scc_result->add_relation(rel__node__1__1, false, false); + // scc_result->add_rule(new parallel_join_aggregate( + // rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, + // agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce, + // nullptr, {0, 2})); + scc_result->add_rule(new parallel_join_aggregate( + rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, + agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2})); + + LIE 
*pg_lie = new LIE(); + pg_lie->add_relation(rel__edge__2__1); + pg_lie->add_relation(rel__matrix__3__1); + pg_lie->add_relation(rel__node__1__1); + pg_lie->add_relation(rel__edge__2__2); + pg_lie->add_relation(rel__rank__3__1); + pg_lie->add_relation(rel__result__2__1__2); + pg_lie->add_scc(scc_copy_edge); + pg_lie->add_scc(scc_compute_matrix); + pg_lie->add_scc(scc_init); + pg_lie->add_scc(scc_page_rank); + pg_lie->add_scc(scc_result); + pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); + pg_lie->add_scc_dependance(scc_compute_matrix, scc_init); + pg_lie->add_scc_dependance(scc_init, scc_page_rank); + pg_lie->add_scc_dependance(scc_page_rank, scc_result); + + // Enable IO + pg_lie->enable_all_to_all_dump(); + pg_lie->enable_data_IO(); + pg_lie->enable_IO(); + // lie->enable_share_io(); + pg_lie->set_output_dir(slog_output_dir); // Write to this directory + pg_lie->set_comm(mcomm); + pg_lie->set_batch_size(1); + pg_lie->execute(); + pg_lie->print_all_relation_size(); // Continuously print relation sizes + // lie->stat_intermediate(); + // rel__matrix__3__1->print(); + // rel__rank__3__1->print( + // [](const std::vector& tp){ + // u32 pg_v = tp[2]; + // // std::cout << tp[0] << " " << tp[1] << " " << + // *reinterpret_cast(&pg_v) << std::cout << tp[0] << " " << tp[1] + // << " " << pg_v << std::endl; + // } + // ); + rel__result__2__1__2->print([](const std::vector &tp) { + u32 pg_v = tp[1]; + // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << + std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl; + }); + + // print all variants(non-canonical index of each relation) + // if (mcomm.get_rank() == 0) { + // std::cout << "rel_name" + // << ",\t" + // << "indices\n"; + // for (auto const &rel_p : rel_index_map) { + // std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; + // } + // std::cout << std::endl; + // } + + delete pg_lie; + + // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + mcomm.destroy(); + + return 0; +} From 
ab530e68e605f0f99bc93ccc91c3dddebd784d0e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 2 Dec 2022 07:06:38 +0000 Subject: [PATCH 18/36] finish impl page rank --- backend/src/RA/parallel_agg.cpp | 1 + .../src/relation/balanced_hash_relation.cpp | 21 +- backend/src/relation/balanced_hash_relation.h | 3 +- backend/src/relation/shmap_relation.h | 2 +- backend/src/relation/shmap_relation_exp.cpp | 35 +- .../pagerank/compiled_pre/CMakeLists.txt | 2 +- .../tests/pagerank/compiled_pre/pagerank.cpp | 426 +++++++++++----- .../pagerank/compiled_pre/pagerank_full.cpp | 462 +++++++++--------- 8 files changed, 568 insertions(+), 384 deletions(-) diff --git a/backend/src/RA/parallel_agg.cpp b/backend/src/RA/parallel_agg.cpp index fc42c114..0b13ed7c 100644 --- a/backend/src/RA/parallel_agg.cpp +++ b/backend/src/RA/parallel_agg.cpp @@ -117,6 +117,7 @@ void parallel_join_aggregate::local_aggregate( std::vector joined_input_tuple(input_tuple.begin(), input_tuple.begin()+input->get_join_column_count()); auto agg_res = res_map[joined_input_tuple]; std::vector tuple(output->get_arity(), 0); + // std::cout << "wwwwwwwwwwwwwwwwwwwwwwww " << output->get_arity() << std::endl; int reorder_agg_index = input->get_arity() + 1; for (long unsigned int j = 0; j < reorder_mapping.size(); j++) { // std::cout << reorder_mapping[j] << " " << reorder_agg_index << std::endl; diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 660f5a7d..b3b7a1e7 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -14,6 +14,7 @@ u32 relation::get_global_delta_element_count() { + delta_element_count = delta[mcomm.get_rank()].count(); int dec = (int)delta_element_count; int global_delta_element_count; MPI_Allreduce(&dec, &global_delta_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm()); @@ -25,6 +26,7 @@ u32 relation::get_global_full_element_count() { // TODO: change to use size of 
shamp_relation rather than counter u32 global_full_element_count; + full_element_count = full[mcomm.get_rank()].count(); MPI_Allreduce(&full_element_count, &global_full_element_count, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm()); return global_full_element_count; } @@ -854,8 +856,8 @@ void relation::populate_full(int buffer_size, u64* buffer) for (u32 a = i; a < i + arity + 1; a++) t[a-i] = buffer[a]; - - if (full[bucket_id].insert_tuple_from_array(t, (arity+1)) == true) + int insert_res = full[bucket_id].insert_tuple_from_array(t, (arity+1)); + if (insert_res == INSERT_SUCCESS) { // TODO: check if its update, if it is keep full count same full_element_count++; @@ -880,7 +882,8 @@ void relation::populate_delta (int buffer_size, u64* buffer) for (u32 a = i; a < i + arity + 1; a++) t[a-i] = buffer[a]; - if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true) + int insert_res = delta[bucket_id].insert_tuple_from_array(t, arity+1); + if (insert_res == INSERT_SUCCESS) { delta_element_count++; delta_bucket_element_count[bucket_id]++; @@ -1202,13 +1205,17 @@ bool relation::insert_in_delta(u64* t) // std::cout << "inserting delta for " << intern_tag << std::endl; //assert((int)bucket_id == mcomm.get_local_rank()); - if (delta[bucket_id].insert_tuple_from_array(t, arity+1) == true) + int insert_res = delta[bucket_id].insert_tuple_from_array(t, arity+1); + if (insert_res == INSERT_SUCCESS) { delta_element_count++; delta_bucket_element_count[bucket_id]++; delta_sub_bucket_element_count[bucket_id][sub_bucket_id]++; bucket_map[bucket_id] = 1; + return true; + } else if (insert_res == INSERT_UPDATED) { + bucket_map[bucket_id] = 1; return true; } return false; @@ -1225,13 +1232,17 @@ bool relation::insert_in_newt(u64* t) // std::cout << "inserting newt for " << intern_tag << std::endl; //assert((int)bucket_id == mcomm.get_local_rank()); - if (newt[bucket_id].insert_tuple_from_array(t, arity+1) == true) + int insert_res = newt[bucket_id].insert_tuple_from_array(t, 
arity+1); + if (insert_res == INSERT_SUCCESS) { newt_element_count++; newt_bucket_element_count[bucket_id]++; newt_sub_bucket_element_count[bucket_id][sub_bucket_id]++; bucket_map[bucket_id] = 1; + return true; + } else if (insert_res == INSERT_UPDATED) { + bucket_map[bucket_id] = 1; return true; } return false; diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index 0757011d..e34d6e76 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -19,6 +19,7 @@ enum {LEFT=0, RIGHT}; enum {DELTA=0, FULL, FULL_AND_DELTA}; enum {COPY=0, COPY_FILTER, COPY_GENERATE, ACOPY, JOIN, FACT, NEGATION, AGGREGATION, UPDATE}; enum {STATIC=0, DYNAMIC}; +enum {INSERT_SUCCESS=0, INSERT_FAIL, INSERT_UPDATED}; using tuple_formator_t = std::function&)>; @@ -188,7 +189,7 @@ class relation void set_full_element_count(int val) {full_element_count = val;} - int get_full_element_count() {return full_element_count;} + int get_full_element_count() {return full[mcomm.get_rank()].count();} u32** get_full_sub_bucket_element_count() {return full_sub_bucket_element_count;} u32 get_global_full_element_count(); diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index 19287e53..3d1fe9a5 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -121,7 +121,7 @@ struct shmap_relation { }; int count(); - bool insert_tuple_from_array(u64* t, int arity); + int insert_tuple_from_array(u64* t, int arity); void remove_tuple(); bool find_tuple_from_array(u64* t, int arity); bool check_dependent_insertion(const std::vector &v); diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 6d2ea852..d0fbc176 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -11,6 +11,7 @@ */ #include "../parallel_RA_inc.h" +#include 
"balanced_hash_relation.h" #include "shmap_relation.h" #include #include @@ -25,7 +26,7 @@ shmap_relation::shmap_relation(int arity, bool id_flag) // ind = new t_ind(t_comparator(id_flag)); } -bool shmap_relation::insert_tuple_from_array(u64 *t, int width) +int shmap_relation::insert_tuple_from_array(u64 *t, int width) { t_tuple tp(t, t+width); // check if relation has functional dependance @@ -60,7 +61,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) // } // std::cout << std::endl; // } - return insert(tp); + if (insert(tp)) { + return INSERT_SUCCESS; + } else { + return INSERT_FAIL; + } } else { // update // iterator need_delete = ind.end(); @@ -100,7 +105,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) joined = true; } if (!joined) { - return insert(tp); + if (insert(tp)) { + return INSERT_SUCCESS; + } else { + return INSERT_FAIL; + } } if (!need_deletes.empty()) { for (auto d: need_deletes) { @@ -111,9 +120,13 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) // std::cout << std::endl; ind.erase(*d); } - return insert(tp); + if (insert(tp)) { + return INSERT_SUCCESS; + } else { + return INSERT_UPDATED; + } } else { - return false; + return INSERT_FAIL; } } } else { @@ -122,7 +135,11 @@ bool shmap_relation::insert_tuple_from_array(u64 *t, int width) // std::cout << c << " "; // } // std::cout << std::endl; - return insert(tp); + if (insert(tp)) { + return INSERT_SUCCESS; + } else { + return INSERT_FAIL; + } } } @@ -452,7 +469,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer( // for (auto c: upper_bound) { // std::cout << c << " "; // } - std::cout << std::endl; + // std::cout << std::endl; auto joined_range = lowerUpperRange(lower_bound, upper_bound); if (generator_mode) { @@ -516,7 +533,7 @@ void shmap_relation::as_all_to_allv_right_join_buffer( // std::cout << c << " "; // } // std::cout << std::endl; - if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == 
true) + if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) != INSERT_FAIL) { uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; uint64_t sub_bucket_id=0; @@ -650,7 +667,7 @@ void shmap_relation::as_all_to_allv_left_join_buffer( projected_path[i] = reordered_cur_path[reorder_map[i]]; //std::cout << "NT " << projected_path[0] << " " << projected_path[1] << std::endl; - if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) == true) + if (deduplicate.insert_tuple_from_array(projected_path, join_buffer.width[ra_id]) != INSERT_FAIL) { uint64_t bucket_id = tuple_hash(projected_path, head_rel_hash_col_count) % buckets; uint64_t sub_bucket_id=0; diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt index 44733818..38953a06 100644 --- a/backend/tests/pagerank/compiled_pre/CMakeLists.txt +++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt @@ -19,7 +19,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") 
-file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank.cpp") +file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank_full.cpp") ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp index ec7133ba..a24bcf25 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp @@ -273,6 +273,7 @@ agg_sum_local(std::pair for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) { auto tuple = (*it); + // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl; // if (tuple[1] == MAX_PG_ITERATION) { sum_res += tuple[tuple.size() - 2]; // } @@ -426,10 +427,10 @@ int main(int argc, char **argv) { std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; // output dir from compiler std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; - if (argc == 3) { + // if (argc 3) { slog_input_dir = argv[1]; slog_output_dir = argv[2]; - } + // } load_input_relation(slog_input_dir); mpi_comm mcomm; mcomm.create(argc, argv); @@ -455,7 +456,7 @@ int main(int argc, char **argv) { // (total_node_cnt n) relation *rel__total_node_cnt__1__1 = - new relation(1, true, 2, get_tag_for_rel("total_node_cnt", "1"), + new relation(1, true, 1, get_tag_for_rel("total_node_cnt", "1"), std::to_string(get_tag_for_rel("total_node_cnt", "1")) + ".total_node_cnt.1.table", slog_input_dir + "/" + @@ -476,7 +477,7 @@ int main(int argc, char **argv) { // std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", // std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); - // matrix edge + successor count + // from, to, outage degree of `from` relation *rel__matrix__3__1 = new relation( 1, true, 3, get_tag_for_rel("matrix", "1"), std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", @@ -489,13 +490,13 @@ int 
main(int argc, char **argv) { std::to_string(get_tag_for_rel("dangling_node", "1")) + ".dangling_node.table", FULL); - // RAM *scc_copy_edge = new RAM(false, 0); - // scc_copy_edge->add_relation(rel__edge__2__1, false, false); - // scc_copy_edge->add_relation(rel__edge__2__2, true, false); - // scc_copy_edge->add_rule( - // new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); + relation *rel__node_outage_degree = new relation( + 1, true, 2, get_tag_for_rel("node_outage_degree", "2"), + std::to_string(get_tag_for_rel("node_outage_degree", "2")) + ".node_outage_degree.table", + std::to_string(get_tag_for_rel("node_outage_degree", "2")) + ".node_outage_degree.table", + FULL); - RAM *scc_compute_matrix = new RAM(false, 1); + RAM *scc_compute_matrix = new RAM(false, 0); scc_compute_matrix->add_relation(rel__edge__2__1, false, false); // scc_compute_matrix->add_relation(rel__edge__2__2, false, false); scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); @@ -504,12 +505,12 @@ int main(int argc, char **argv) { agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, {0, 1, 3})); - RAM *scc_helper_fact = new RAM(false, 0); + RAM *scc_helper_fact = new RAM(false, 1); scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false); scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)})); // [(node a) (node b) <-- (edge a b)] - RAM *scc_compute_node = new RAM(false, 1); + RAM *scc_compute_node = new RAM(false, 2); scc_compute_node->add_relation(rel__edge__2__1, false, false); scc_compute_node->add_relation(rel__node__1__1, true, false); scc_compute_node->add_rule(new parallel_copy_generate( @@ -526,7 +527,7 @@ int main(int argc, char **argv) { })); // (total_node_cnt {count node _}) - RAM *scc_count_nodes = new RAM(false, 2); + RAM *scc_count_nodes = new RAM(false, 3); scc_count_nodes->add_relation(rel__node__1__1, false, false); scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false); 
scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false); @@ -535,7 +536,7 @@ int main(int argc, char **argv) { agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, {2})); - RAM *scc_populate_dangling = new RAM(false, 3); + RAM *scc_populate_dangling = new RAM(false, 4); scc_populate_dangling->add_relation(rel__edge__2__1, false); scc_populate_dangling->add_relation(rel__dangling_node, true); scc_populate_dangling->add_relation(rel__node__1__1, false); @@ -544,43 +545,54 @@ int main(int argc, char **argv) { {0} )); - LIE *cnt_lie = new LIE(); - cnt_lie->add_relation(rel__edge__2__1); + RAM *scc_degree = new RAM(false, 5); + scc_degree->add_relation(rel__node_outage_degree, true); + scc_degree->add_relation(rel__matrix__3__1, false); + scc_degree->add_rule(new parallel_copy( + rel__node_outage_degree, rel__matrix__3__1, FULL, {0, 2} + )); + + LIE *init_lie = new LIE(); + init_lie->add_relation(rel__edge__2__1); // cnt_lie->add_relation(rel__edge__2__2); - cnt_lie->add_relation(rel__node__1__1); - cnt_lie->add_relation(rel___dollorunit__1__1); - cnt_lie->add_relation(rel__total_node_cnt__1__1); - cnt_lie->add_relation(rel__matrix__3__1); - cnt_lie->add_relation(rel__dangling_node); - cnt_lie->add_scc(scc_helper_fact); - cnt_lie->add_scc(scc_compute_node); - cnt_lie->add_scc(scc_count_nodes); + init_lie->add_relation(rel__node__1__1); + init_lie->add_relation(rel___dollorunit__1__1); + init_lie->add_relation(rel__total_node_cnt__1__1); + init_lie->add_relation(rel__matrix__3__1); + init_lie->add_relation(rel__dangling_node); + init_lie->add_relation(rel__node_outage_degree); + // init_lie->add_relation(rel__page_rank__2__1); + init_lie->add_scc(scc_helper_fact); + init_lie->add_scc(scc_compute_node); + init_lie->add_scc(scc_count_nodes); // cnt_lie->add_scc(scc_copy_edge); - cnt_lie->add_scc(scc_compute_matrix); - cnt_lie->add_scc(scc_populate_dangling); + init_lie->add_scc(scc_compute_matrix); + init_lie->add_scc(scc_populate_dangling); + 
init_lie->add_scc(scc_degree); // cnt_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); - cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); - cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); - - cnt_lie->enable_all_to_all_dump(); - cnt_lie->set_output_dir(slog_output_dir); // Write to this directory - cnt_lie->set_comm(mcomm); - cnt_lie->set_batch_size(1); - cnt_lie->execute(); - cnt_lie->print_all_relation_size(); // Continuously print relation sizes + init_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); + init_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); + init_lie->add_scc_dependance(scc_compute_matrix, scc_degree); + + init_lie->enable_all_to_all_dump(); + init_lie->set_output_dir(slog_output_dir); // Write to this directory + init_lie->set_comm(mcomm); + init_lie->set_batch_size(1); + init_lie->execute(); + init_lie->print_all_relation_size(); // Continuously print relation sizes + MPI_Barrier(mcomm.get_comm()); // only 1 data in this rel so its safe rel__total_node_cnt__1__1->print(); + // rel__node_outage_degree->print(); - u64 local_node_size = 0; - for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { - if (t[0] != 0) { - local_node_size = t[0]; - } - } - rel__matrix__3__1->print(); - MPI_Barrier(mcomm.get_comm()); - + u64 local_node_size = rel__node__1__1->get_full_element_count(); + // for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { + // if (t[0] != 0) { + // local_node_size = t[0]; + // } + // } + // rel__matrix__3__1->print(); MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, mcomm.get_comm()); dangling_node_cnt = rel__dangling_node->get_global_full_element_count(); @@ -594,93 +606,188 @@ int main(int argc, char **argv) { rel__node__1__1->disable_initialization(); rel__matrix__3__1->disable_initialization(); rel__dangling_node->disable_initialization(); + rel__node_outage_degree->disable_initialization(); // 
rel__matrix__3__1->print(); ////////////////// compute Page rank - relation *rel__rank__3__1 = new relation( - 1, true, 3, get_tag_for_rel("rank", "1"), - std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", - std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL); + relation *rel__page_rank__2__1 = new relation( + 1, true, 2, get_tag_for_rel("page_rank", "1"), + std::to_string(get_tag_for_rel("page_rank", "1")) + ".page_rank.2.table", + std::to_string(get_tag_for_rel("page_rank", "1")) + ".page_rank.2.table", FULL); + rel__page_rank__2__1->set_dependent_column_update( + {1,2}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + // if (std::abs((int)new_v[1] - (int)old_v[1]) < 5) { + // return false; + // } else { + // return true; + // } + return true; + }); - rel__rank__3__1->set_dependent_column_update( + relation *rel__sub_rank__3__1 = new relation( + 1, true, 3, get_tag_for_rel("sub_rank", "1"), + std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table", + std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table", FULL); + + // page rank (node N, + // , + // ) + rel__sub_rank__3__1->set_dependent_column_update( {1, 2, 3}, [](const std::vector &old_v, const std::vector &new_v, const vector &nt) -> std::optional { if (new_v[0] != old_v[0]) { + // std::cout << "New " << new_v[0] << " " << new_v[1] << " Old " << old_v[0] << " " << old_v[1] << std::endl; return std::nullopt; } - // if (std::abs((int)new_v[1] - (int)old_v[1]) < 10) { - // return false; - // } return true; }); + LIE *pg_defaukt_lie = new LIE(); + RAM *scc_defaultv = new RAM(false, 0); + scc_defaultv->add_relation(rel__node_outage_degree, false, false); + scc_defaultv->add_relation(rel__page_rank__2__1, true, false); + scc_defaultv->add_rule(new parallel_copy_generate( + rel__page_rank__2__1, rel__node_outage_degree, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + 
output[1] = dangling_value; + return 1; + })); + + pg_defaukt_lie->add_relation(rel__node_outage_degree); + pg_defaukt_lie->add_relation(rel__page_rank__2__1); + pg_defaukt_lie->add_scc(scc_defaultv); + pg_defaukt_lie->set_output_dir(slog_output_dir); // Write to this directory + pg_defaukt_lie->set_comm(mcomm); + pg_defaukt_lie->set_batch_size(1); + pg_defaukt_lie->execute(); + + rel__page_rank__2__1->disable_initialization(); + std::vector pg_lie_list; for (int i = 0; i < MAX_PG_ITERATION; i++) { std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter << std::endl; LIE *pg_lie = new LIE(); - - // RAM *scc_ - RAM *scc_init = new RAM(false, 0); - scc_init->add_relation(rel__matrix__3__1, false, false); - scc_init->add_relation(rel__rank__3__1, true, false); - scc_init->add_rule(new parallel_copy_generate( - rel__rank__3__1, rel__matrix__3__1, FULL, - [](const u64 *const data, u64 *const output) -> int { - output[0] = data[0]; - output[1] = data[0]; - output[2] = dangling_value; - return 1; - })); RAM *scc_page_rank = new RAM(false, 1); scc_page_rank->add_relation(rel__matrix__3__1, false, false); - scc_page_rank->add_relation(rel__rank__3__1, true, false); - parallel_join *rank_join = - new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, - rel__rank__3__1, DELTA, {3, 1, 2} // useless + scc_page_rank->add_relation(rel__page_rank__2__1, false, false); + scc_page_rank->add_relation(rel__sub_rank__3__1, false, false); + // scc_page_rank->add_relation(rel__node_outage_degree, false, false); + parallel_join *sub_rank_join = + new parallel_join(rel__sub_rank__3__1, + rel__page_rank__2__1, FULL, + rel__matrix__3__1, FULL, + {3, 1, 2} // useless ); - rank_join->set_generator_func([](const depend_val_t &target_vs, + sub_rank_join->set_generator_func([](const depend_val_t &target_vs, const std::vector &input_v, depend_val_t &res_set) -> bool { - // if (current_iter > MAX_PG_ITERATION) { - // return false; - // } - u64 pg_sum = dangling_node_cnt * 
dangling_value; + u64 pg_v = dangling_node_cnt * dangling_value; + // std::cout << input_v[0] << " " << input_v[1] << " " << input_v[2] << std::endl; int count = 0; for (auto &tv : target_vs) { - if ((tv[0] == tv[1]) && (current_iter != 0)) { - continue; - } - u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first - if (current_iter == 0) { - raw_succ_pg_v_sub = raw_succ_pg_v_sub / input_v[2]; - } - pg_sum += (u64)(raw_succ_pg_v_sub * ALPHA); + u32 raw_succ_pg_v_sub = tv[1]; // all columns are u64, cast to u32 first + pg_v += raw_succ_pg_v_sub; count++; } - pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size; std::vector res_tuple(3, 0); res_tuple[0] = input_v[1]; res_tuple[1] = input_v[0]; - res_tuple[2] = pg_sum / input_v[2]; + res_tuple[2] = (u64)((pg_v * ALPHA) / input_v[2]); res_set.push_back(res_tuple); return true; }); - scc_page_rank->add_rule(rank_join); + scc_page_rank->add_rule(sub_rank_join); + + RAM *scc_sum = new RAM(false, 1); + scc_sum->add_relation(rel__page_rank__2__1, false, false); + scc_sum->add_relation(rel__sub_rank__3__1, false, false); + scc_sum->add_relation(rel__node__1__1, false, false); + scc_sum->add_rule( + new parallel_join_aggregate( + rel__page_rank__2__1, + rel__sub_rank__3__1, + rel__node__1__1, FULL, + [](std::pair joined_range) -> local_agg_res_t { + local_agg_res_t sum_res = (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size); + // std::cout << sum_res << std::endl; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tp = *it; + sum_res += tp[2]; + } + return sum_res; + }, SpecialAggregator::sum, agg_sum_reduce, nullptr, + {0, 2} + )); + + // RAM *scc_init = new RAM(false, 0); + // scc_init->add_relation(rel__matrix__3__1, false, false); + // scc_init->add_relation(rel__sub_rank__3__1, true, false); + // scc_init->add_rule(new parallel_copy_generate( + // rel__sub_rank__3__1, rel__matrix__3__1, FULL, + // [](const u64 *const data, u64 *const 
output) -> int { + // output[0] = data[0]; + // output[1] = data[0]; + // output[2] = (u64)((ALPHA * dangling_value) / data[2]); + // return 1; + // })); + // RAM *scc_page_rank = new RAM(false, 1); + // scc_page_rank->add_relation(rel__matrix__3__1, false, false); + // scc_page_rank->add_relation(rel__sub_rank__3__1, true, false); + // parallel_join *rank_join = + // new parallel_join(rel__sub_rank__3__1, rel__matrix__3__1, FULL, + // rel__sub_rank__3__1, FULL, {3, 1, 2} // useless + // ); + // rank_join->set_generator_func([](const depend_val_t &target_vs, + // const std::vector &input_v, + // depend_val_t &res_set) -> bool { + // // if (current_iter > MAX_PG_ITERATION) { + // // return false; + // // } + // u64 pg_sum = dangling_node_cnt * dangling_value; + // int count = 0; + // for (auto &tv : target_vs) { + // if ((tv[0] == tv[1]) && (current_iter != 0)) { + // continue; + // } + // u32 raw_succ_pg_v_sub = tv[2]; // all columns are u64, cast to u32 first + // pg_sum += raw_succ_pg_v_sub; + // count++; + // } + // pg_sum += (1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size; + // std::vector res_tuple(3, 0); + // res_tuple[0] = input_v[1]; + // res_tuple[1] = input_v[0]; + // res_tuple[2] = (u64)(pg_sum * ALPHA / input_v[2]); + // res_set.push_back(res_tuple); + // return true; + // }); + // scc_page_rank->add_rule(rank_join); + // pg_lie->add_relation(rel__matrix__3__1); + // pg_lie->add_relation(rel__node__1__1); + // pg_lie->add_relation(rel__sub_rank__3__1); + // pg_lie->add_scc(scc_page_rank); + // if (current_iter == 0) { + // pg_lie->add_scc(scc_init); + // pg_lie->add_scc_dependance(scc_init, scc_page_rank); + // } - pg_lie_list.push_back(pg_lie); - pg_lie->add_relation(rel__matrix__3__1); + pg_lie->add_relation(rel__page_rank__2__1); + pg_lie->add_relation(rel__sub_rank__3__1); pg_lie->add_relation(rel__node__1__1); - pg_lie->add_relation(rel__rank__3__1); + pg_lie->add_relation(rel__matrix__3__1); pg_lie->add_scc(scc_page_rank); - if (current_iter == 
0) { - pg_lie->add_scc(scc_init); - pg_lie->add_scc_dependance(scc_init, scc_page_rank); - } - // Enable IO + pg_lie->add_scc(scc_sum); + pg_lie->add_scc_dependance(scc_page_rank, scc_sum); + pg_lie_list.push_back(pg_lie); + if (i == MAX_PG_ITERATION - 1) { pg_lie->enable_all_to_all_dump(); pg_lie->enable_data_IO(); @@ -691,10 +798,19 @@ int main(int argc, char **argv) { pg_lie->set_comm(mcomm); pg_lie->set_batch_size(1); pg_lie->execute(); - current_iter++; - rel__rank__3__1->disable_initialization(); + rel__sub_rank__3__1->disable_initialization(); pg_lie->print_all_relation_size(); // Continuously print relation sizes + current_iter++; + // rel__page_rank__2__1->print([](const std::vector &tp) { + // u32 pg_v = tp[1]; + // // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << + // std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; + // }); + // // need this? + // MPI_Barrier(mcomm.get_comm()); } + rel__page_rank__2__1->print(); + // rel__rank__4__1->print( // [](const std::vector& tp){ // u32 pg_v = tp[3]; @@ -703,47 +819,91 @@ int main(int argc, char **argv) { // << tp[2] << " " << pg_v << std::endl; // } // ); - // delete pg_pre_lie; - // delete pg_lie; - - std::cout << "Aggregating sum ..." 
<< std::endl; - relation *rel__result__2__1__2 = new relation( - 2, true, 2, get_tag_for_rel("result", "1__2"), - std::to_string(get_tag_for_rel("result", "1__2")) + - ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2")) - + ".result.2.table", FULL); - - RAM *scc_result = new RAM(false, 4); - scc_result->add_relation(rel__rank__3__1, false, false); - scc_result->add_relation(rel__result__2__1__2, true, false); - scc_result->add_relation(rel__node__1__1, false, false); - scc_result->add_rule(new parallel_join_aggregate( - rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, - 2})); - - LIE* final_lie = new LIE(); - final_lie->add_relation(rel__result__2__1__2); - final_lie->add_relation(rel__node__1__1); - final_lie->add_relation(rel__rank__3__1); - final_lie->add_scc(scc_result); - final_lie->enable_all_to_all_dump(); - final_lie->enable_data_IO(); - final_lie->enable_IO(); - - final_lie->set_output_dir(slog_output_dir); // Write to this directory - final_lie->set_comm(mcomm); - final_lie->set_batch_size(1); - final_lie->execute(); - final_lie->print_all_relation_size(); // Continuously print relation sizes + + // std::cout << "Aggregating sum ..." 
<< std::endl; + // relation *rel__result__2__1__2 = new relation( + // 2, true, 2, get_tag_for_rel("result", "1__2"), + // std::to_string(get_tag_for_rel("result", "1__2")) + + // ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2")) + // + ".result.2.table", FULL); + // relation *rel__sum_pg__1__1__1 = new relation( + // 1, true, 1, get_tag_for_rel("sum_pg", "1"), + // std::to_string(get_tag_for_rel("sum_pg", "1")) + "sum_pg.2.table", + // std::to_string(get_tag_for_rel("sum_pg", "1")) + "sum_pg.2.table", + // FULL); + + // RAM *scc_result = new RAM(false, 4); + // scc_result->add_relation(rel__sub_rank__3__1, false, false); + // scc_result->add_relation(rel__result__2__1__2, true, false); + // scc_result->add_relation(rel__node__1__1, false, false); + // scc_result->add_relation(rel__sum_pg__1__1__1, true, false); + // scc_result->add_rule(new parallel_join_aggregate( + // rel__result__2__1__2, rel__sub_rank__3__1, rel__node__1__1, FULL, + // [](std::pair joined_range) { + // local_agg_res_t sum_res = 0; + // for (shmap_relation::iterator it = joined_range.first; + // it != joined_range.second; ++it) { + // auto tuple = (*it); + // // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl; + // if (tuple[0] != tuple[1]) { + // sum_res += tuple[tuple.size() - 2]; + // } + // } + // sum_res += (u64)((1 - ALPHA) / total_node_size); + // return sum_res; + // }, + // SpecialAggregator::sum, + // agg_sum_reduce, + // nullptr, {0, 2})); + // scc_result->add_rule(new parallel_join_aggregate( + // rel__sum_pg__1__1__1, rel__sub_rank__3__1, rel___dollorunit__1__1, FULL, + // [](std::pair joined_range) { + // local_agg_res_t sum_res = 0; + // for (shmap_relation::iterator it = joined_range.first; + // it != joined_range.second; ++it) { + // auto tuple = (*it); + // // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl; + // if (tuple[0] != tuple[1]) { + // sum_res += tuple[tuple.size() - 2]; + // } + // } + // // 
sum_res += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST); + // return sum_res; + // }, + // SpecialAggregator::sum, agg_sum_reduce, nullptr, + // {2})); + + // LIE* final_lie = new LIE(); + // final_lie->add_relation(rel__result__2__1__2); + // final_lie->add_relation(rel__node__1__1); + // final_lie->add_relation(rel__sub_rank__3__1); + // final_lie->add_relation(rel__sum_pg__1__1__1); + // final_lie->add_scc(scc_result); + // final_lie->enable_all_to_all_dump(); + // final_lie->enable_data_IO(); + // final_lie->enable_IO(); + + // final_lie->set_output_dir(slog_output_dir); // Write to this directory + // final_lie->set_comm(mcomm); + // final_lie->set_batch_size(1); + // final_lie->execute(); + // final_lie->print_all_relation_size(); // Continuously print relation sizes // rel__rank__3__1->print(); + // rel__node__1__1->print(); + + // rel__result__2__1__2->print([](const std::vector &tp) { + // u32 pg_v = tp[1]; + // // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << + // std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; + // }); + // rel__sum_pg__1__1__1->print([](const std::vector &tp) { + // u32 pg_v = tp[0]; + // // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << + // std::cout << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; + // }); + - rel__result__2__1__2->print([](const std::vector &tp) { - u32 pg_v = tp[1]; - // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << - std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; - }); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mcomm.destroy(); diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp index c0636f61..11055725 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp @@ -1,5 +1,6 @@ // location of `parallel_RA_inc.h` here #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" 
+#include "mpi.h" #include #include @@ -35,6 +36,14 @@ const u64 str_tag = 2; const u64 sign_flip_const = 0x0000200000000000; const u64 signed_num_mask = 0xFFFFE00000000000; +#define FLOAT_SCALE_CONST 100000 +float ALPHA = 0.85; +u64 total_node_size = 0; +u64 dangling_value = 0; +u64 current_iter = 0; +int MAX_PG_ITERATION = 2; +u64 dangling_node_cnt; + inline bool is_number(u64 datum) { // cout << "is_number(" << datum << "): " << (datum >> tag_position == // int_tag) << "\n"; @@ -264,7 +273,10 @@ agg_sum_local(std::pair for (shmap_relation::iterator it = joined_range.first; it != joined_range.second; ++it) { auto tuple = (*it); + // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl; + // if (tuple[1] == MAX_PG_ITERATION) { sum_res += tuple[tuple.size() - 2]; + // } } return sum_res; } @@ -410,23 +422,21 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { return max_rel; } -float ALPHA = 0.85; -u64 total_node_size = 0; -u64 dangling_value = 0; - int main(int argc, char **argv) { // input dir from compiler std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; // output dir from compiler std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; - if (argc == 3) { - slog_input_dir = argv[1]; - slog_output_dir = argv[2]; - } + // if (argc 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + // } load_input_relation(slog_input_dir); mpi_comm mcomm; mcomm.create(argc, argv); + MAX_PG_ITERATION = atoi(argv[3]); + // (edge from to) relation *rel__edge__2__1 = new relation( 1, true, 2, get_tag_for_rel("edge", "1__2"), @@ -462,12 +472,43 @@ int main(int argc, char **argv) { ".$unit.1.table", FULL); - RAM *scc_helper_fact = new RAM(false, 0); + // from, to, outage degree of `from` + relation *rel__matrix__3__1 = new relation( + 1, true, 3, get_tag_for_rel("matrix", "1"), + std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", + 
std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); + + relation *rel__dangling_node = + new relation(1, true, 1, get_tag_for_rel("dangling_node", "1"), + std::to_string(get_tag_for_rel("dangling_node", "1")) + + ".dangling_node.table", + std::to_string(get_tag_for_rel("dangling_node", "1")) + + ".dangling_node.table", + FULL); + + relation *rel__node_outage_degree = + new relation(1, true, 2, get_tag_for_rel("node_outage_degree", "2"), + std::to_string(get_tag_for_rel("node_outage_degree", "2")) + + ".node_outage_degree.table", + std::to_string(get_tag_for_rel("node_outage_degree", "2")) + + ".node_outage_degree.table", + FULL); + + RAM *scc_compute_matrix = new RAM(false, 0); + scc_compute_matrix->add_relation(rel__edge__2__1, false, false); + // scc_compute_matrix->add_relation(rel__edge__2__2, false, false); + scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); + scc_compute_matrix->add_rule(new parallel_join_aggregate( + rel__matrix__3__1, rel__edge__2__1, rel__edge__2__1, FULL, + agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, + {0, 1, 3})); + + RAM *scc_helper_fact = new RAM(false, 1); scc_helper_fact->add_relation(rel___dollorunit__1__1, true, false); scc_helper_fact->add_rule(new fact(rel___dollorunit__1__1, {n2d(0)})); // [(node a) (node b) <-- (edge a b)] - RAM *scc_compute_node = new RAM(false, 1); + RAM *scc_compute_node = new RAM(false, 2); scc_compute_node->add_relation(rel__edge__2__1, false, false); scc_compute_node->add_relation(rel__node__1__1, true, false); scc_compute_node->add_rule(new parallel_copy_generate( @@ -484,7 +525,7 @@ int main(int argc, char **argv) { })); // (total_node_cnt {count node _}) - RAM *scc_count_nodes = new RAM(false, 2); + RAM *scc_count_nodes = new RAM(false, 3); scc_count_nodes->add_relation(rel__node__1__1, false, false); scc_count_nodes->add_relation(rel__total_node_cnt__1__1, true, false); scc_count_nodes->add_relation(rel___dollorunit__1__1, false, false); 
@@ -493,258 +534,211 @@ int main(int argc, char **argv) { agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, {2})); - LIE *cnt_lie = new LIE(); - cnt_lie->add_relation(rel__edge__2__1); - cnt_lie->add_relation(rel__node__1__1); - cnt_lie->add_relation(rel___dollorunit__1__1); - cnt_lie->add_relation(rel__total_node_cnt__1__1); - cnt_lie->add_scc(scc_helper_fact); - cnt_lie->add_scc(scc_compute_node); - cnt_lie->add_scc(scc_count_nodes); - cnt_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); - cnt_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); - - cnt_lie->enable_all_to_all_dump(); - cnt_lie->set_output_dir(slog_output_dir); // Write to this directory - cnt_lie->set_comm(mcomm); - cnt_lie->set_batch_size(1); - cnt_lie->execute(); - cnt_lie->print_all_relation_size(); // Continuously print relation sizes + RAM *scc_populate_dangling = new RAM(false, 4); + scc_populate_dangling->add_relation(rel__edge__2__1, false); + scc_populate_dangling->add_relation(rel__dangling_node, true); + scc_populate_dangling->add_relation(rel__node__1__1, false); + scc_populate_dangling->add_rule(new parallel_join_negate( + rel__dangling_node, rel__node__1__1, FULL, rel__edge__2__1, {0})); + + RAM *scc_degree = new RAM(false, 5); + scc_degree->add_relation(rel__node_outage_degree, true); + scc_degree->add_relation(rel__matrix__3__1, false); + scc_degree->add_rule(new parallel_copy(rel__node_outage_degree, + rel__matrix__3__1, FULL, {0, 2})); + + LIE *init_lie = new LIE(); + init_lie->add_relation(rel__edge__2__1); + init_lie->add_relation(rel__node__1__1); + init_lie->add_relation(rel___dollorunit__1__1); + init_lie->add_relation(rel__total_node_cnt__1__1); + init_lie->add_relation(rel__matrix__3__1); + init_lie->add_relation(rel__dangling_node); + init_lie->add_relation(rel__node_outage_degree); + init_lie->add_scc(scc_helper_fact); + init_lie->add_scc(scc_compute_node); + init_lie->add_scc(scc_count_nodes); + 
init_lie->add_scc(scc_compute_matrix); + init_lie->add_scc(scc_populate_dangling); + init_lie->add_scc(scc_degree); + init_lie->add_scc_dependance(scc_helper_fact, scc_count_nodes); + init_lie->add_scc_dependance(scc_compute_node, scc_count_nodes); + init_lie->add_scc_dependance(scc_compute_matrix, scc_degree); + + init_lie->enable_all_to_all_dump(); + init_lie->set_output_dir(slog_output_dir); // Write to this directory + init_lie->set_comm(mcomm); + init_lie->set_batch_size(1); + init_lie->execute(); + init_lie->print_all_relation_size(); // Continuously print relation sizes // only 1 data in this rel so its safe - rel__total_node_cnt__1__1->print(); - for (auto &t : rel__total_node_cnt__1__1->get_full()[mcomm.get_rank()]) { - total_node_size = t[0]; - dangling_value = (u64)(((1 - ALPHA) / total_node_size) * 100000); - std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << std::endl; - } + u64 local_node_size = rel__node__1__1->get_full_element_count(); + MPI_Barrier(mcomm.get_comm()); - // >>>>>>>>>>>>>>> compute page rank - std::cout << ">>>>>>>>>> Computing pagerank ... 
" << std::endl; + MPI_Allreduce(&local_node_size, &total_node_size, 1, MPI_UNSIGNED_LONG_LONG, + MPI_SUM, mcomm.get_comm()); + + dangling_node_cnt = rel__dangling_node->get_global_full_element_count(); + dangling_value = FLOAT_SCALE_CONST / total_node_size; + std::cout << ">>>>>>>>> Number of nodes: " << total_node_size + << " >>>>>>>>> Dangling node count: " << dangling_node_cnt + << " >>>>>>>>> Dangling value: " + << dangling_value * 1.0 / FLOAT_SCALE_CONST << std::endl; rel__edge__2__1->disable_initialization(); rel__node__1__1->disable_initialization(); + rel__matrix__3__1->disable_initialization(); + rel__dangling_node->disable_initialization(); + rel__node_outage_degree->disable_initialization(); - relation *rel__edge__2__2 = new relation( - 1, false, 2, get_tag_for_rel("edge", "2"), - std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", - std::to_string(get_tag_for_rel("edge", "2")) + ".edge.2.table", FULL); - - // matrix edge + successor count - relation *rel__matrix__3__1 = new relation( - 1, true, 3, get_tag_for_rel("matrix", "1"), - std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", - std::to_string(get_tag_for_rel("matrix", "1")) + ".matrix.3.table", FULL); - - relation *rel__rank__3__1 = new relation( - 1, true, 3, get_tag_for_rel("rank", "1"), - std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", - std::to_string(get_tag_for_rel("rank", "1")) + ".rank.3.table", FULL); + // rel__matrix__3__1->print(); + relation *rel__sub_rank__3__1 = new relation( + 1, true, 3, get_tag_for_rel("sub_rank", "1"), + std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table", + std::to_string(get_tag_for_rel("sub_rank", "1")) + ".sub_rank.3.table", + FULL); - rel__rank__3__1->set_dependent_column_update( + // page rank (node N, + // , + // ) + rel__sub_rank__3__1->set_dependent_column_update( {1, 2, 3}, [](const std::vector &old_v, const std::vector &new_v, const vector &nt) -> std::optional { - // if (nt[0] == 59 && 
nt[1] == 58) { - // std::cout << "dependent column size " << new_v.size() << std::endl; - // std::cout << new_v[0] << " " << new_v[1] << " " << new_v[2] << " - // comparing with " << old_v[0] << " " << old_v[1] << " " << old_v[2] - // << std::endl; - // } if (new_v[0] != old_v[0]) { - // std::cout << " www >>>>>>>>>" << std::endl; return std::nullopt; - } else { - // monotonic - // assert(new_v[1] > old_v[1]); - // u32 new_sum_raw = new_v[1]; - // u32 old_sum_raw = old_v[1]; - // float new_sum = *reinterpret_cast(&new_sum_raw); - // float old_sum = *reinterpret_cast(&old_sum_raw); - // if (new_sum > old_sum) { - // std::cout << "new >> " << new_sum << " old >> " << old_sum << - // std::endl; - // } - // return new_sum > old_sum; - return new_v[1] > old_v[1]; - // return true; } + return true; }); - relation *rel__result__2__1__2 = new relation( - 2, true, 2, get_tag_for_rel("result", "1__2"), - std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", - std::to_string(get_tag_for_rel("result", "1__2")) + ".result.2.table", - FULL); - - // - - RAM *scc_copy_edge = new RAM(false, 0); - scc_copy_edge->add_relation(rel__edge__2__1, false, false); - scc_copy_edge->add_relation(rel__edge__2__2, true, false); - scc_copy_edge->add_rule( - new parallel_acopy(rel__edge__2__2, rel__edge__2__1, FULL, {1, 0, 2})); - - RAM *scc_compute_matrix = new RAM(false, 1); - scc_compute_matrix->add_relation(rel__edge__2__1, false, false); - scc_compute_matrix->add_relation(rel__edge__2__2, false, false); - scc_compute_matrix->add_relation(rel__matrix__3__1, true, false); - scc_compute_matrix->add_rule(new parallel_join_aggregate( - rel__matrix__3__1, rel__edge__2__2, rel__edge__2__1, FULL, - agg_count_local, SpecialAggregator::count, agg_count_reduce, nullptr, - {0, 1, 3})); - - RAM *scc_init = new RAM(false, 2); - scc_init->add_relation(rel__matrix__3__1, false, false); - scc_init->add_relation(rel__rank__3__1, true, false); - scc_init->add_rule(new parallel_copy_generate( - 
rel__rank__3__1, rel__matrix__3__1, FULL, - [](const u64 *const data, u64 *const output) -> int { - output[0] = data[0]; - output[1] = data[0]; - // float init_pg_v = (1 - ALPHA) / total_node_size; - u64 init_pg_v = dangling_value; - // std::cout << init_pg_v << std::endl; - // output[2] = *reinterpret_cast(&init_pg_v); - output[2] = init_pg_v; - return 1; - })); - - RAM *scc_page_rank = new RAM(true, 3); - scc_page_rank->add_relation(rel__matrix__3__1, false, false); - scc_page_rank->add_relation(rel__rank__3__1, true, false); - parallel_join *rank_join = - new parallel_join(rel__rank__3__1, rel__matrix__3__1, FULL, - rel__rank__3__1, DELTA, {3, 1, 2} // useless - ); - rank_join->set_generator_func([](const depend_val_t &target_vs, - const std::vector &input_v, - depend_val_t &res_set) -> bool { - // float pg_sum = 0.0; - u64 pg_sum = dangling_value; - - int count = 0; - for (auto &tv : target_vs) { - // std::cout << "tagret v >>>>> "; - // for (auto c: tv) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - u32 raw_succ_pg_v = tv[2]; // all columns are u64, cast to u32 first - // std::cout << ">>>>>>>>>>>>>>> " << - // *reinterpret_cast(&raw_succ_pg_v) << std::endl; - // auto succ_pg_v = *reinterpret_cast(&raw_succ_pg_v); - // if(succ_pg_v == 0) { - // // std::cout << ">>>>>>>>>> " << succ_pg_v << std::endl; - // std::cout << "tagret v >>>>> "; - // for (auto c: tv) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // } - if (input_v[2] != 0) { - // pg_sum += ((ALPHA * succ_pg_v) / input_v[2]); - pg_sum += (u64)(((u64)(ALPHA * raw_succ_pg_v)) / input_v[2]); - // if (input_v[1] == 51) { - // std::cout << "Sum 51 " << input_v[0] << " with "; - // for (auto c: tv) { - // std::cout << c << " "; - // } - // std::cout << " result " << pg_sum << std::endl; - // } + std::vector pg_lie_list; + + for (int i = 0; i < MAX_PG_ITERATION; i++) { + std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter + << std::endl; + LIE 
*pg_lie = new LIE(); + + RAM *scc_init = new RAM(false, 0); + scc_init->add_relation(rel__matrix__3__1, false, false); + scc_init->add_relation(rel__sub_rank__3__1, true, false); + scc_init->add_rule(new parallel_copy_generate( + rel__sub_rank__3__1, rel__matrix__3__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[1]; + output[1] = data[0]; + output[2] = (u64)((ALPHA * dangling_value) / data[2]); + return 1; + })); + RAM *scc_page_rank = new RAM(false, 1); + scc_page_rank->add_relation(rel__matrix__3__1, false, false); + scc_page_rank->add_relation(rel__sub_rank__3__1, true, false); + parallel_join *rank_join = + new parallel_join(rel__sub_rank__3__1, + rel__matrix__3__1, FULL, + rel__sub_rank__3__1, DELTA, + {3, 1, 2} // useless + ); + rank_join->set_generator_func([](const depend_val_t &target_vs, + const std::vector &input_v, + depend_val_t &res_set) -> bool { + u64 pg_sum = dangling_node_cnt * dangling_value; + int count = 0; + for (auto &tv : target_vs) { + u64 raw_succ_pg_v_sub = tv[2]; + pg_sum += raw_succ_pg_v_sub; + count++; } - count++; - } - if (pg_sum == 0) { - return false; + pg_sum += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size); + std::vector res_tuple(3, 0); + res_tuple[0] = input_v[1]; + res_tuple[1] = input_v[0]; + res_tuple[2] = (u64)(pg_sum * ALPHA / input_v[2]); + res_set.push_back(res_tuple); + return true; + }); + scc_page_rank->add_rule(rank_join); + pg_lie->add_relation(rel__matrix__3__1); + pg_lie->add_relation(rel__node__1__1); + pg_lie->add_relation(rel__sub_rank__3__1); + pg_lie->add_scc(scc_page_rank); + if (current_iter == 0) { + pg_lie->add_scc(scc_init); + pg_lie->add_scc_dependance(scc_init, scc_page_rank); } - if (count == 0) { - return false; + + pg_lie_list.push_back(pg_lie); + + if (i == MAX_PG_ITERATION - 1) { + pg_lie->enable_all_to_all_dump(); + pg_lie->enable_data_IO(); + pg_lie->enable_IO(); } - std::vector res_tuple(3, 0); - res_tuple[0] = input_v[1]; - res_tuple[1] = 
input_v[0]; - // res_tuple[2] = *reinterpret_cast(&pg_sum); - res_tuple[2] = pg_sum; - // std::cout << "New tuple >>>>>>> " << pg_sum << std::endl; - // for (auto c: res_tuple) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - res_set.push_back(res_tuple); - return true; - }); - scc_page_rank->add_rule(rank_join); + // lie->enable_share_io(); + pg_lie->set_output_dir(slog_output_dir); // Write to this directory + pg_lie->set_comm(mcomm); + pg_lie->set_batch_size(1); + pg_lie->execute(); + rel__sub_rank__3__1->disable_initialization(); + pg_lie->print_all_relation_size(); // Continuously print relation sizes + current_iter++; + // // need this? + // MPI_Barrier(mcomm.get_comm()); + } + + std::cout << "Aggregating Page Rank Result ..." << std::endl; + relation *rel__result__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("result", "1__2"), + std::to_string(get_tag_for_rel("result", "1__2")) + + ".result.2.table", std::to_string(get_tag_for_rel("result", "1__2")) + + ".result.2.table", FULL); RAM *scc_result = new RAM(false, 4); - scc_result->add_relation(rel__rank__3__1, false, false); + scc_result->add_relation(rel__sub_rank__3__1, false, false); scc_result->add_relation(rel__result__2__1__2, true, false); scc_result->add_relation(rel__node__1__1, false, false); - // scc_result->add_rule(new parallel_join_aggregate( - // rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - // agg_sum_float_local, SpecialAggregator::sum, agg_sum_float_reduce, - // nullptr, {0, 2})); + // scc_result->add_relation(rel__sum_pg__1__1__1, true, false); scc_result->add_rule(new parallel_join_aggregate( - rel__result__2__1__2, rel__rank__3__1, rel__node__1__1, FULL, - agg_sum_local, SpecialAggregator::sum, agg_sum_reduce, nullptr, {0, 2})); - - LIE *pg_lie = new LIE(); - pg_lie->add_relation(rel__edge__2__1); - pg_lie->add_relation(rel__matrix__3__1); - pg_lie->add_relation(rel__node__1__1); - pg_lie->add_relation(rel__edge__2__2); - 
pg_lie->add_relation(rel__rank__3__1); - pg_lie->add_relation(rel__result__2__1__2); - pg_lie->add_scc(scc_copy_edge); - pg_lie->add_scc(scc_compute_matrix); - pg_lie->add_scc(scc_init); - pg_lie->add_scc(scc_page_rank); - pg_lie->add_scc(scc_result); - pg_lie->add_scc_dependance(scc_copy_edge, scc_compute_matrix); - pg_lie->add_scc_dependance(scc_compute_matrix, scc_init); - pg_lie->add_scc_dependance(scc_init, scc_page_rank); - pg_lie->add_scc_dependance(scc_page_rank, scc_result); - - // Enable IO - pg_lie->enable_all_to_all_dump(); - pg_lie->enable_data_IO(); - pg_lie->enable_IO(); - // lie->enable_share_io(); - pg_lie->set_output_dir(slog_output_dir); // Write to this directory - pg_lie->set_comm(mcomm); - pg_lie->set_batch_size(1); - pg_lie->execute(); - pg_lie->print_all_relation_size(); // Continuously print relation sizes - // lie->stat_intermediate(); - // rel__matrix__3__1->print(); - // rel__rank__3__1->print( - // [](const std::vector& tp){ - // u32 pg_v = tp[2]; - // // std::cout << tp[0] << " " << tp[1] << " " << - // *reinterpret_cast(&pg_v) << std::cout << tp[0] << " " << tp[1] - // << " " << pg_v << std::endl; - // } - // ); + rel__result__2__1__2, rel__sub_rank__3__1, rel__node__1__1, FULL, + [](std::pair + joined_range) { + local_agg_res_t sum_res = 0; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + if (tuple[0] != tuple[1]) { + sum_res += tuple[tuple.size() - 2]; + } + } + sum_res += (u64)((1 - ALPHA) * FLOAT_SCALE_CONST / total_node_size); + return sum_res; + }, + SpecialAggregator::sum, + agg_sum_reduce, + nullptr, {0, 2})); + + LIE* final_lie = new LIE(); + final_lie->add_relation(rel__result__2__1__2); + final_lie->add_relation(rel__node__1__1); + final_lie->add_relation(rel__sub_rank__3__1); + // final_lie->add_relation(rel__sum_pg__1__1__1); + final_lie->add_scc(scc_result); + final_lie->enable_all_to_all_dump(); + final_lie->enable_data_IO(); + final_lie->enable_IO(); 
+ + final_lie->set_output_dir(slog_output_dir); // Write to this directory + final_lie->set_comm(mcomm); + final_lie->set_batch_size(1); + final_lie->execute(); + final_lie->print_all_relation_size(); // Continuously print relation sizes + rel__result__2__1__2->print([](const std::vector &tp) { u32 pg_v = tp[1]; - // std::cout << tp[0] << " " << *reinterpret_cast(&pg_v) << - std::cout << tp[0] << " " << pg_v * 1.0 / 100000 << std::endl; + std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; }); - // print all variants(non-canonical index of each relation) - // if (mcomm.get_rank() == 0) { - // std::cout << "rel_name" - // << ",\t" - // << "indices\n"; - // for (auto const &rel_p : rel_index_map) { - // std::cout << rel_p.first << ",\t" << rel_p.second.size() << "\n"; - // } - // std::cout << std::endl; - // } - - delete pg_lie; - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mcomm.destroy(); From b88f7794ec44efcd9dd74b083a925a5d4dbcad21 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Sun, 4 Dec 2022 19:02:21 -0500 Subject: [PATCH 19/36] add CC --- backend/src/RA/parallel_copy.cpp | 1 + backend/tests/cc/README.md | 0 backend/tests/cc/compiled_pre/CMakeLists.txt | 28 + backend/tests/cc/compiled_pre/cc.cpp | 575 +++++++++++++++++++ backend/tests/cc/ground_truth | 8 + backend/tests/cc/input-data/edge.facts | 15 + examples/datalog-example | 2 +- 7 files changed, 628 insertions(+), 1 deletion(-) create mode 100644 backend/tests/cc/README.md create mode 100644 backend/tests/cc/compiled_pre/CMakeLists.txt create mode 100644 backend/tests/cc/compiled_pre/cc.cpp create mode 100644 backend/tests/cc/ground_truth create mode 100644 backend/tests/cc/input-data/edge.facts diff --git a/backend/src/RA/parallel_copy.cpp b/backend/src/RA/parallel_copy.cpp index e5f3ade1..b6b4ca4c 100644 --- a/backend/src/RA/parallel_copy.cpp +++ b/backend/src/RA/parallel_copy.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include #ifdef GOOGLE_MAP void 
parallel_copy::local_copy(u32 buckets, google_relation* input, u32* input_bucket_map, relation* output, std::vector reorder_map, u32 arity, u32 join_column_count, all_to_allv_buffer& copy_buffer, int ra_counter) diff --git a/backend/tests/cc/README.md b/backend/tests/cc/README.md new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/cc/compiled_pre/CMakeLists.txt b/backend/tests/cc/compiled_pre/CMakeLists.txt new file mode 100644 index 00000000..36be513b --- /dev/null +++ b/backend/tests/cc/compiled_pre/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required (VERSION 3.9) + +project (cc) + +add_compile_options(--std=c++17 -lstdc++fs -Wno-strict-aliasing -Werror=class-memaccess -fpermissive) + +link_libraries(stdc++fs) + +find_package(MPI REQUIRED) +# find_package(OpenMP) +# if (OPENMP_FOUND) +# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# endif() + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=class-memaccess -fpermissive") +# set (base_dir "${PROJECT_SOURCE_DIR}/../backend") +set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") + +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" 
"${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_cc "${PROJECT_SOURCE_DIR}/cc.cpp") + +ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") + +add_executable(cc ${source_files_cc}) +INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH}) +TARGET_LINK_LIBRARIES(cc parallel_RA ${MPI_LIBRARIES}) diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp new file mode 100644 index 00000000..521597a3 --- /dev/null +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -0,0 +1,575 @@ +// location of `parallel_RA_inc.h` here +#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h" +#include "mpi.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// builtins.cpp goes here! +// builtins.cpp +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +#define u64 uint64_t +#define u32 uint32_t +using i64 = int64_t; + +const u64 tag_mask = 0xffffc00000000000; +const u64 tag_position = 46; +const u64 int_tag = 0; +const u64 str_tag = 2; +const u64 sign_flip_const = 0x0000200000000000; +const u64 signed_num_mask = 0xFFFFE00000000000; + +inline bool is_number(u64 datum) { + // cout << "is_number(" << datum << "): " << (datum >> tag_position == + // int_tag) << "\n"; + return datum >> tag_position == int_tag; +} + +inline i64 datum_to_number(u64 datum) { + i64 signed_val = + (datum & ~tag_mask) << (64 - tag_position) >> (64 - tag_position); + if (signed_val >= sign_flip_const) { + signed_val = sign_flip_const - signed_val; + } + return signed_val; + // return (i64) (datum & ~tag_mask) << (64 - tag_position) >> (64 - + // tag_position); +} +const auto d2n = datum_to_number; + +inline u64 number_to_datum(i64 number) { + i64 unsigned_value = number; + if (number < 0) { + unsigned_value = (-number) + sign_flip_const; + } + return (unsigned_value & ~tag_mask) | 
(int_tag << tag_position); + // return (number & ~tag_mask) | (int_tag << tag_position); +} + +const auto n2d = number_to_datum; + +inline u64 string_to_datum(std::string str) { + u32 str_hash = string_hash(str); + return (str_hash & ~tag_mask) | (str_tag << tag_position); +} +const auto s2d = string_to_datum; + +vector> builtin_div_rem(const u64 *const data) { + if (is_number(data[0]) && is_number(data[1])) { + auto div = number_to_datum(d2n(data[0]) / d2n(data[1])); + auto rem = number_to_datum(d2n(data[0]) % d2n(data[1])); + return {{div, rem}}; + } else { + return {}; + } +} + +#define BUILTIN_BINARY_NUMBER_PRED(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (is_number(data[0]) && is_number(data[1]) && \ + datum_to_number(data[0]) op datum_to_number(data[1])) { \ + return callback(init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_PRED(builtin_less, <) +BUILTIN_BINARY_NUMBER_PRED(builtin_greater, >) +BUILTIN_BINARY_NUMBER_PRED(builtin_le, <=) +BUILTIN_BINARY_NUMBER_PRED(builtin_ge, >=) + +#define BUILTIN_BINARY_NUMBER_FUNC(name, op) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum(datum_to_number(data[0]) \ + op datum_to_number(data[1])); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +BUILTIN_BINARY_NUMBER_FUNC(builtin_add, +) +BUILTIN_BINARY_NUMBER_FUNC(builtin_subtract, -) +BUILTIN_BINARY_NUMBER_FUNC(builtin_multiply, *) +BUILTIN_BINARY_NUMBER_FUNC(builtin_divide, /) + +#define BUILTIN_BINARY_NUMBER_FUNC2(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0]) && is_number(data[1])) { \ + auto res = number_to_datum( \ + impl(datum_to_number(data[0]), 
datum_to_number(data[1]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 impl_arg2_minus_arg1(u64 arg1, u64 arg2) { return arg2 - arg1; } +BUILTIN_BINARY_NUMBER_FUNC2(builtin_arg2_minus_arg1, impl_arg2_minus_arg1) + +#define BUILTIN_UNARY_NUMBER_FUNC(name, impl) \ + template \ + inline TState name(const u64 *data, TState init_state, \ + TState (*callback)(u64 res, TState state)) { \ + if (is_number(data[0])) { \ + auto res = number_to_datum(impl(datum_to_number(data[0]))); \ + return callback(res, init_state); \ + } else \ + return init_state; \ + } + +inline u64 add1(u64 x) { return x + 1; } +inline u64 sub1(u64 x) { return x - 1; } + +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1, add1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_add1_2, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1, sub1) +BUILTIN_UNARY_NUMBER_FUNC(builtin_sub1_2, add1) + +vector> builtin_range(const u64 *const data) { + vector> res; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + res.reserve(ub - lb); + for (u64 x = lb; x < ub; x++) + res.push_back({number_to_datum(x)}); + } + return res; +} + +template +TState callback_builtin_range(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState state)) { + auto state = init_state; + if (is_number(data[0]) && is_number(data[1])) { + auto lb = datum_to_number(data[0]); + auto ub = datum_to_number(data[1]); + for (u64 x = lb; x < ub; x++) + state = callback(number_to_datum(x), state); + } + return state; +} + +#define BUILTIN_BINARY_PRED(name, op) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (data[0] op data[1]) \ + return callback(init_state); \ + else \ + return init_state; \ + } +BUILTIN_BINARY_PRED(builtin_eq, ==) +BUILTIN_BINARY_PRED(builtin_neq, !=) + +template +TState builtin_eq_1(const u64 *data, TState init_state, + TState (*callback)(u64 res, TState 
state)) { + return callback(data[0], init_state); +} + +#define BUILTIN_UNARY_PRED(name, pred) \ + template \ + TState name(const u64 *data, TState init_state, \ + TState (*callback)(TState state)) { \ + if (pred(data[0])) \ + return callback(init_state); \ + else \ + return init_state; \ + } + +bool is_not_number(u64 datum) { return !is_number(datum); } +BUILTIN_UNARY_PRED(builtin_number_huh, is_number) +BUILTIN_UNARY_PRED(builtin_not_number_huh, is_not_number) + +// for generate-cpp-lambda-for-computational-join +struct CL2CB_State { + void *original_callback; // There be dragons? + void *original_state; + const u64 *original_data; + u64 *cl1_output_args; +}; + +// for generate-cpp-lambda-for-computational-copy +struct BCLCB_State { + void *original_callback; + void *original_state; + const u64 *original_data; +}; + +// an experiment: +template bool builtin_binary_number_pred(const u64 *data) { + if (is_number(data[0]) && is_number(data[1])) { + return f(datum_to_number(data[0]), datum_to_number(data[1])); + } else { + return false; + } +} +bool _less(u64 x, u64 y) { return x < y; } +auto builtin_less2 = builtin_binary_number_pred<_less>; + +template +inline TState builtin_nop(const u64 *data, TState init_state, + TState (*callback)(TState state)) { + return callback(init_state); +} + +// //////////////////// AGGREGATORS Alternative design //////////////////// + +// TODO: add number type check +////////////////////////////// count ///////////////////////////////////// + +local_agg_res_t +agg_count_local(std::pair + joined_range) { + local_agg_res_t cnt = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + cnt++; + } + return cnt; +} + +local_agg_res_t agg_count_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +////////////////////////////// sum ///////////////////////////////////// + +local_agg_res_t +agg_sum_local(std::pair + joined_range) { + local_agg_res_t sum_res = 0; + for (shmap_relation::iterator it = 
joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + // std::cout << tuple[0] << " " << tuple[1] << " " << tuple[2] << std::endl; + // if (tuple[1] == MAX_PG_ITERATION) { + sum_res += tuple[tuple.size() - 2]; + // } + } + return sum_res; +} + +local_agg_res_t agg_sum_reduce(local_agg_res_t x, local_agg_res_t y) { + return x + y; +} + +local_agg_res_t agg_sum_float_local( + std::pair + joined_range) { + float sum_res = 0.0; + for (shmap_relation::iterator it = joined_range.first; + it != joined_range.second; ++it) { + auto tuple = (*it); + u32 agg_column_raw = tuple[tuple.size() - 2]; + + sum_res += *reinterpret_cast(&agg_column_raw); + } + // std::cout << ">>>>>>> " << sum_res << " " << + // *reinterpret_cast(&sum_res) << std::endl; + u32 sum_res_encoded = *reinterpret_cast(&sum_res); + return sum_res_encoded; +} + +local_agg_res_t agg_sum_float_reduce(local_agg_res_t x_raw, + local_agg_res_t y_raw) { + float x = *reinterpret_cast(&x_raw); + float y = *reinterpret_cast(&y_raw); + float res = x + y; + // std::cout << res << std::endl; + u32 res_encoded = *reinterpret_cast(&res); + return res_encoded; +} + +////////////////////////////// maximum ///////////////////////////////////// + +local_agg_res_t +agg_maximum_local(std::pair + joined_range) { + local_agg_res_t max_res = 0; + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = tuple[tuple.size() - 1]; + if (current_v > max_res) { + max_res = current_v; + } + } + return max_res; +} + +local_agg_res_t agg_maximum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x > y) { + return x; + } else { + return y; + } +} + +////////////////////////////// minimum ///////////////////////////////////// + +local_agg_res_t +agg_minimum_local(std::pair + joined_range) { + local_agg_res_t min_res = std::numeric_limits::max(); + for (auto it = joined_range.first; it != joined_range.second; ++it) { + auto tuple = (*it); + auto current_v = 
tuple[tuple.size() - 1]; + if (current_v < min_res) { + min_res = current_v; + } + } + return min_res; +} + +local_agg_res_t agg_minimum_reduce(local_agg_res_t x, local_agg_res_t y) { + if (x < y) { + return x; + } else { + return y; + } +} + +// // end of builtins.cpp + +// global definitions: + +int max_rel = 255; +std::map rel_tag_map; +std::map> rel_index_map; + +// load all relation inside input database +void load_input_relation(std::string db_dir) { + for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { + // check if ends with table + std::string filename_ss = entry.path().filename().string(); + std::cout << "input database has file " << filename_ss << std::endl; + std::string suffix = ".table"; + int ft = filename_ss.size() - suffix.size(); + if (ft < 0) + ft = 0; + if (filename_ss.rfind(suffix) != ft) { + continue; + } + std::string filename_s = entry.path().stem().string(); + int tag = std::stoi(filename_s.substr(0, filename_s.find("."))); + std::string name_arity = filename_s.substr( + filename_s.find(".") + 1, filename_s.size() - filename_s.find(".") - 1); + std::string name = name_arity.substr(0, name_arity.rfind(".")); + std::string arity_s = + name_arity.substr(name_arity.rfind(".") + 1, name_arity.size()); + int arity = std::stoi(arity_s); + std::stringstream index_stream; + index_stream << name; + for (int i = 1; i <= arity; i++) { + index_stream << "__" << i; + } + if (tag > max_rel) + max_rel = tag; + std::cout << "load " << tag << "." 
<< index_stream.str() << "has arity " + << arity << std::endl; + rel_tag_map[index_stream.str()] = tag; + } +} + +int get_tag_for_rel(std::string relation_name, std::string index_str) { + std::string name_arity = relation_name + "__" + index_str; + if (rel_index_map.find(relation_name) != rel_index_map.end()) { + rel_index_map[relation_name].insert(index_str); + } else { + rel_index_map[relation_name] = {index_str}; + } + + if (rel_tag_map.find(name_arity) != rel_tag_map.end()) { + // std::cout << "rel: " << name_arity << " " << rel_tag_map[name_arity] << + // std::endl; + return rel_tag_map[name_arity]; + } + max_rel++; + rel_tag_map[name_arity] = max_rel; + std::cout << "generate rel tag: " << name_arity << " " << max_rel + << std::endl; + return max_rel; +} + +int main(int argc, char **argv) { + // input dir from compiler + std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; + // output dir from compiler + std::string slog_output_dir = "/home/ubuntu/workspace/slog/out/checkpoints"; + // if (argc 3) { + slog_input_dir = argv[1]; + slog_output_dir = argv[2]; + // } + load_input_relation(slog_input_dir); + mpi_comm mcomm; + mcomm.create(argc, argv); + + // (edge from to) + relation *rel__edge__2__1 = new relation( + 1, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", + FULL); + + relation *rel__cc__2__1 = new relation( + 1, true, 2, get_tag_for_rel("cc", "1"), + std::to_string(get_tag_for_rel("cc", "1")) + ".cc.2.table", + FULL); + rel__cc__2__1->set_dependent_column_update( + {1,2}, + [](const std::vector &old_v, const std::vector &new_v, + const vector &nt) -> std::optional { + if (new_v[0] < old_v[0]) { + return true; + } else { + return false; + } + }); + + relation *rel__node__1__1 = new relation( + 1, true, 1, get_tag_for_rel("node", "1"), + std::to_string(get_tag_for_rel("node", 
"1")) + ".node.1.table", + FULL); + + relation *rel__cc_final__2__1 = new relation( + 1, true, 2, get_tag_for_rel("cc_final", "2"), + std::to_string(get_tag_for_rel("cc_final", "1")) + ".cc_final.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("cc_final", "1")) + + ".cc_final.2.table", + FULL); + + relation *rel__cc_represent__1__1 = new relation( + 1, true, 1, get_tag_for_rel("cc_represent", "1"), + std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table", + FULL); + + RAM *to_undirected_scc = new RAM(false, 0); + to_undirected_scc->add_relation(rel__edge__2__1, false); + to_undirected_scc->add_rule(new parallel_copy_generate( + rel__edge__2__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[1]; + output[1] = data[0]; + return 1; + } + )); + + RAM *cc_init_scc = new RAM(false, 1); + cc_init_scc->add_relation(rel__edge__2__1, false); + cc_init_scc->add_relation(rel__cc__2__1, true); + cc_init_scc->add_relation(rel__node__1__1, true); + cc_init_scc->add_rule(new parallel_copy_generate( + rel__cc__2__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + output[1] = data[0]; + return 1; + } + )); + cc_init_scc->add_rule(new parallel_copy_generate( + rel__node__1__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[0]; + return 1; + } + )); + + RAM* cc_compute_scc = new RAM(true, 2); + cc_compute_scc->add_relation(rel__edge__2__1, false); + cc_compute_scc->add_relation(rel__cc__2__1, true); + parallel_join *cc_pg = new parallel_join( + rel__cc__2__1, rel__edge__2__1, + FULL, rel__cc__2__1, DELTA, + {1, 0} // useless + ); + cc_pg->set_generator_func( + [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { + // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl; + auto target_v = target_vs[0]; + std::vector res(2, 0); + 
res[0] = input_v[1]; + res[1] = target_v[1]; + res_set.push_back(res); + return true; + } + ); + cc_compute_scc->add_rule(cc_pg); + + RAM* cc_agg_scc = new RAM(false, 3); + cc_agg_scc->add_relation(rel__cc__2__1, false); + cc_agg_scc->add_relation(rel__node__1__1, false); + cc_agg_scc->add_relation(rel__cc_final__2__1, true); + cc_agg_scc->add_rule(new parallel_join_aggregate( + rel__cc_final__2__1, rel__cc__2__1, + rel__node__1__1, FULL, + agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce, + nullptr, {0,2})); + + RAM* cc_rep_scc = new RAM(false, 3); + cc_rep_scc->add_relation(rel__cc_final__2__1, false); + cc_rep_scc->add_relation(rel__cc_represent__1__1, true); + cc_rep_scc->add_rule(new parallel_copy( + rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1} + )); + + + LIE *cc_lie = new LIE(); + cc_lie->add_relation(rel__edge__2__1); + cc_lie->add_relation(rel__node__1__1); + cc_lie->add_relation(rel__cc__2__1); + cc_lie->add_relation(rel__cc_final__2__1); + cc_lie->add_relation(rel__cc_represent__1__1); + + cc_lie->add_scc(to_undirected_scc); + cc_lie->add_scc(cc_init_scc); + cc_lie->add_scc(cc_compute_scc); + cc_lie->add_scc(cc_agg_scc); + cc_lie->add_scc(cc_rep_scc); + + cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc); + cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc); + cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc); + cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc); + + cc_lie->enable_all_to_all_dump(); + cc_lie->set_output_dir(slog_output_dir); // Write to this directory + cc_lie->set_comm(mcomm); + cc_lie->set_batch_size(1); + cc_lie->execute(); + cc_lie->print_all_relation_size(); // Continuously print relation sizes + + // rel__node__1__1->print(); + // rel__edge__2__1->print(); + rel__cc__2__1->print(); + // rel__cc_final__2__1->print(); + // rel__cc_represent__1__1->print(); + // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + + mcomm.destroy(); + + return 0; +} diff --git a/backend/tests/cc/ground_truth 
b/backend/tests/cc/ground_truth new file mode 100644 index 00000000..cc2f793f --- /dev/null +++ b/backend/tests/cc/ground_truth @@ -0,0 +1,8 @@ +[{'0', '19', '25', '3'}, + {'13', '2'}, + {'14', '4'}, + {'11', '18', '27', '28', '5'}, + {'26', '6'}, + {'21', '22', '23', '8'}, + {'12', '24'}, + {'17', '20'}] diff --git a/backend/tests/cc/input-data/edge.facts b/backend/tests/cc/input-data/edge.facts new file mode 100644 index 00000000..1530cad3 --- /dev/null +++ b/backend/tests/cc/input-data/edge.facts @@ -0,0 +1,15 @@ +0 19 +0 25 +2 13 +3 19 +4 14 +5 27 +5 28 +6 26 +8 23 +11 27 +12 24 +17 20 +18 28 +21 23 +21 22 \ No newline at end of file diff --git a/examples/datalog-example b/examples/datalog-example index 87266643..be103a21 160000 --- a/examples/datalog-example +++ b/examples/datalog-example @@ -1 +1 @@ -Subproject commit 872666433df43282a408b4e37dad3fcbeafa1891 +Subproject commit be103a21713d3e965fc0d51dd54edf29721187aa From 5ea49614413c7ec87ccfed26fa16d3f48efbeee2 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Thu, 8 Dec 2022 19:43:03 -0500 Subject: [PATCH 20/36] add debug info --- backend/src/RAM/RA_tasks.cpp | 44 +++++++++++---- backend/src/RAM/RA_tasks.h | 2 +- backend/src/btree/btree_container.h | 8 +-- backend/src/compat.h | 2 +- backend/src/lie/lie.cpp | 13 ++++- .../src/relation/balanced_hash_relation.cpp | 21 +++---- backend/src/relation/balanced_hash_relation.h | 2 +- backend/src/relation/shmap_relation.h | 2 +- backend/src/relation/shmap_relation_exp.cpp | 42 +------------- backend/tests/cc/compiled_pre/cc.cpp | 55 +++++++++++-------- .../tests/pagerank/compiled_pre/pagerank.cpp | 2 +- .../pagerank/compiled_pre/pagerank_full.cpp | 2 +- backend/tests/sssp/compiled_pre/sssp_opt.cpp | 10 +++- 13 files changed, 105 insertions(+), 100 deletions(-) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 2712e343..3c8963f8 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -785,6 +785,8 @@ void 
RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) int nprocs = mcomm.get_local_nprocs(); int RA_count = RA_list.size(); u64 relation_id=0, bucket_id=0, intern_key=0, intern_value=0; + double check_time = 0; + double insert_time = 0; for (int k = 0; k < RA_count * nprocs; k++) { @@ -847,19 +849,22 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) } #endif u32 elements_to_read = tuples_to_read * width; + for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) { u32 x = starting + tuple_ind * width; bool insert_flag = true; if (output->get_dependent_column().size() > 1) { - std::vector tt; - for (int i = 0; i < width; i++) { - tt.push_back(cumulative_all_to_allv_buffer[x+i]); - } + std::vector tt(cumulative_all_to_allv_buffer+x, cumulative_all_to_allv_buffer+x+width); + // for (int i = 0; i < width; i++) { + // tt.push_back(cumulative_all_to_allv_buffer[x+i]); + // } // temporary index column just to match size of column tt.push_back(0); + auto _before_i = MPI_Wtime(); insert_flag = output->check_dependent_value_insert_avalible(tt); - + auto _after_i = MPI_Wtime(); + check_time += _after_i - _before_i; } else { insert_flag = output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false && output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false && @@ -885,9 +890,11 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) intern_map[intern_key] = intern_value; tuple[width] = intern_key | intern_value; /// Intern here - + auto _before_ins = MPI_Wtime(); if (output->insert_in_newt(tuple) == true) successful_insert++; + auto _after_ins = MPI_Wtime(); + insert_time += _after_ins - _before_ins; } } starting = starting + elements_to_read; @@ -923,7 +930,8 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) // std::cout << output->get_debug_id() << " successful insert: " << successful_insert << " ; failed insert : " << failed_insert << std::endl; } - + if 
(mcomm.get_rank() == 0) + std::cout << "CHECK TIME: " << check_time << " INSERT_TIME: " << insert_time << " NEW TUPLES: " << successful_insert << std::endl; delete[] cumulative_all_to_allv_recv_process_count_array; delete[] cumulative_all_to_allv_buffer; } @@ -1246,7 +1254,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& -void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num) +void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector& runtime_vector) { int inner_loop = 0; u32 RA_count = RA_list.size(); @@ -1254,6 +1262,11 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s int *offset = new int[RA_count]; for (u32 i =0; i < RA_count; i++) offset[i] = 0; + + double all_local_compute = 0; + double all_insert_newt = 0; + double all_comm = 0; + double all_time = 0; while (batch_size != 0) { @@ -1265,7 +1278,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s std::cout << std::setiosflags(std::ios::fixed); auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); - auto intra_end = MPI_Wtime(); + auto intra_end = MPI_Wtime(); bool local_join_status = false; while (local_join_status == false) @@ -1277,10 +1290,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); auto compute_end = MPI_Wtime(); + all_local_compute += compute_end - compute_start; auto all_to_all_start = MPI_Wtime(); 
comm_compaction_all_to_all(compute_buffer, &cumulative_all_to_allv_recv_process_count_array, &cumulative_all_to_allv_buffer, mcomm.get_local_comm(), *loop_counter, task_id, output_dir, all_to_all_record, sloav_mode, rotate_index_array, send_indexes, sendb_num); auto all_to_all_end = MPI_Wtime(); + all_comm += all_to_all_end - all_to_all_start; auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); @@ -1289,6 +1304,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt_comm_compaction(intern_map); auto insert_in_newt_end = MPI_Wtime(); + all_insert_newt += insert_in_newt_end - insert_in_newt_start; #if 1 @@ -1327,7 +1343,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto insert_in_full_start = MPI_Wtime(); local_insert_in_full(); - auto insert_in_full_end = MPI_Wtime(); + auto insert_in_full_end = MPI_Wtime(); #if 1 if (mcomm.get_rank() == 0) @@ -1349,6 +1365,8 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s std::cout << (intra_end - intra_start) << std::setw(12) << (insert_in_full_end - insert_in_full_start) << std::setw(12) << (insert_in_full_end - intra_start) << std::endl; + + all_time += insert_in_full_end - intra_start; } #endif @@ -1360,6 +1378,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s if (iteration_count == 1) break; } + if (mcomm.get_rank() == 0) { + runtime_vector[0] = runtime_vector[0] + all_comm; + runtime_vector[1] = runtime_vector[1] + all_local_compute; + runtime_vector[2] = runtime_vector[2] + all_insert_newt; + runtime_vector[3] = runtime_vector[3] + all_time; + } delete[] offset; diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h index 8b5d8e0d..ab9ac4a3 100644 --- a/backend/src/RAM/RA_tasks.h +++ b/backend/src/RAM/RA_tasks.h @@ -165,7 +165,7 @@ class RAM /// Start running this SCC (task) for "batck_size" 
iterations void execute_in_batches(std::string name, int batch_size, std::vector& history, std::map& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num); - void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num); + void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector& runtime_vector); }; #endif diff --git a/backend/src/btree/btree_container.h b/backend/src/btree/btree_container.h index fb617abe..9b918ba7 100644 --- a/backend/src/btree/btree_container.h +++ b/backend/src/btree/btree_container.h @@ -58,9 +58,9 @@ class btree_container { // Iterator routines. 
iterator begin() { return tree_.begin(); } - const_iterator begin() const { return tree_.begin(); } + const_iterator cbegin() const { return tree_.begin(); } iterator end() { return tree_.end(); } - const_iterator end() const { return tree_.end(); } + const_iterator cend() const { return tree_.end(); } reverse_iterator rbegin() { return tree_.rbegin(); } const_reverse_iterator rbegin() const { return tree_.rbegin(); } reverse_iterator rend() { return tree_.rend(); } @@ -70,13 +70,13 @@ class btree_container { iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); } - const_iterator lower_bound(const key_type &key) const { + const_iterator clower_bound(const key_type &key) const { return tree_.lower_bound(key); } iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); } - const_iterator upper_bound(const key_type &key) const { + const_iterator cupper_bound(const key_type &key) const { return tree_.upper_bound(key); } std::pair equal_range(const key_type &key) { diff --git a/backend/src/compat.h b/backend/src/compat.h index dbc42cf4..e40be509 100644 --- a/backend/src/compat.h +++ b/backend/src/compat.h @@ -25,7 +25,7 @@ #include "btree/btree_set.h" #include #include -#include +// #include #ifdef __GNUC__ diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index e517288f..ad478a20 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -385,6 +385,7 @@ bool LIE::execute () } } } + std::vector run_time_vector(4,0); //int c = 0; /// Running one task at a time @@ -475,7 +476,7 @@ bool LIE::execute () if (comm_compaction == 0) executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); else - executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, 
all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); + executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); // std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< AFTER ITERATION " << loop_counter <<" <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; // for (u32 i = 0 ; i < scc_relation_count; i++) @@ -523,11 +524,14 @@ bool LIE::execute () if (comm_compaction == 0) executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); else - executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); + executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); //executable_task->print_all_relation(); delta_in_scc = history[history.size()-2]; + if(mcomm.get_rank() == 0) { + std::cout << "DELTA " << delta_in_scc << std::endl; + } //if (delta_in_scc == 0) // executed_scc_id.push_back(executable_task->get_id()); #if 0 @@ -569,6 +573,11 @@ bool LIE::execute () if (mcomm.get_rank() == 0) { std::cout << "<<<<<<<<<<< SCC " << executable_task->get_id() << " finish, " << loop_counter << " iteration in total." 
<< std::endl; + std::cout << "TOTAL STAT >>>>>>>> " << executable_task->get_id() << " >>>>>>>> " + << "COMM TIME: " << run_time_vector[0] << " LCOMPUTE TIME: " << run_time_vector[1] << " INSERT TIME: " << run_time_vector[2] + << " OTHER TIME: " << run_time_vector[3] - run_time_vector[0] - run_time_vector[1] - run_time_vector[2] + << " ALL TIME: " << run_time_vector[3] + << std::endl; // print_all_relation_size(); } full_iteration_count += loop_counter; diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index b3b7a1e7..28b0adf8 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -11,6 +11,7 @@ #include #include #include +#include u32 relation::get_global_delta_element_count() { @@ -1273,6 +1274,8 @@ bool relation::insert_in_full(u64* t) // TODO: use normal insert here! if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true) + // std::vector tp(t, t+arity+1); + // if (full[bucket_id].insert(tp)) { // TODO: change how to deal with element counts full_element_count++; @@ -1410,20 +1413,10 @@ void relation::local_insert_in_delta() } bool relation::check_dependent_value_insert_avalible(const std::vector& tuple) { - uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count(); - // return newt[bucket_id].check_dependent_insertion(tuple); - // if (!(full[bucket_id].check_dependent_insertion(tuple) && delta[bucket_id].check_dependent_insertion(tuple))) { - // for (auto c: tuple) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // std::cout << "current tree >>" << std::endl; - // for (auto t: delta[bucket_id]) { - // for (auto c: t) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // } + // uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count(); + // if (bucket_id != mcomm.get_rank()) { + // std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << 
std::endl; // } + int bucket_id = mcomm.get_rank(); return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; } diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index e34d6e76..d80d3b08 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -215,7 +215,7 @@ class relation #endif void set_delta_element_count(int val) {delta_element_count = val;} - int get_delta_element_count() {return delta_element_count;} + int get_delta_element_count() {return delta[mcomm.get_rank()].count();} u32** get_delta_sub_bucket_element_count() {return delta_sub_bucket_element_count;} u32 get_global_delta_element_count(); diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index 3d1fe9a5..4ed96934 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -45,7 +45,7 @@ struct shmap_relation { // souffle use multi set for some relation using t_ind = btree::btree_set; t_ind ind; - using iterator = t_ind::const_iterator; + using iterator = t_ind::iterator; bool insert(const t_tuple &t) { return ind.insert(t).second; diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index d0fbc176..91934d9e 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -73,13 +73,6 @@ int shmap_relation::insert_tuple_from_array(u64 *t, int width) bool joined = false; for (auto it = exist_tuples_range.first; it != exist_tuples_range.second; it++) { auto cur_tuple = *it; - // if (tp[0] == 59 && tp[1] == 58) { - // std::cout << "tppppp <<<<<< "; - // for (auto c: cur_tuple) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // } std::vector old_t; for (auto i: dependent_column_indices) { @@ -91,16 +84,6 @@ int shmap_relation::insert_tuple_from_array(u64 
*t, int width) } if (compare_res.value()) { need_deletes.push_back(it); - // if (tp[0] == 59 && tp[1] == 58) { - // for (auto c: cur_tuple) { - // std::cout << c << " "; - // } - // std::cout << "update with " << compare_res.value() <<" <<<<<< "; - // for (auto c: tp) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // } } joined = true; } @@ -113,13 +96,9 @@ int shmap_relation::insert_tuple_from_array(u64 *t, int width) } if (!need_deletes.empty()) { for (auto d: need_deletes) { - // std::cout << "delete >>>> "; - // for (auto c: *d) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - ind.erase(*d); + ind.erase(d); } + if (insert(tp)) { return INSERT_SUCCESS; } else { @@ -179,23 +158,6 @@ shmap_relation::check_dependent_insertion(const std::vector &tp) { joined = true; } } - // std::cout << " not adding to lattice with <<<<<< "; - // for (auto c: tp) { - // std::cout << c << " "; - // } - // std::cout << " while lower bound ... "; - // for (auto c: lower_bound) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // std::cout << "The current btree: " << std::endl; - // for (auto& t: ind) { - // std::cout << "Tuple : "; - // for (auto c: t) { - // std::cout << c << " "; - // } - // std::cout << std::endl; - // } if (!joined) { return true; } else { diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 521597a3..232244f7 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -1,8 +1,8 @@ // location of `parallel_RA_inc.h` here -#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h" +#include "/home/ysun67/workspace/slog/backend/src/parallel_RA_inc.h" #include "mpi.h" -#include +// #include #include #include #include @@ -415,6 +415,7 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { } int main(int argc, char **argv) { + double start_time = MPI_Wtime(); // input dir from compiler 
std::string slog_input_dir = "/home/ubuntu/workspace/slog/out/input-data"; // output dir from compiler @@ -467,16 +468,16 @@ int main(int argc, char **argv) { std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table", FULL); - RAM *to_undirected_scc = new RAM(false, 0); - to_undirected_scc->add_relation(rel__edge__2__1, false); - to_undirected_scc->add_rule(new parallel_copy_generate( - rel__edge__2__1, rel__edge__2__1, FULL, - [](const u64 *const data, u64 *const output) -> int { - output[0] = data[1]; - output[1] = data[0]; - return 1; - } - )); + // RAM *to_undirected_scc = new RAM(false, 0); + // to_undirected_scc->add_relation(rel__edge__2__1, false); + // to_undirected_scc->add_rule(new parallel_copy_generate( + // rel__edge__2__1, rel__edge__2__1, FULL, + // [](const u64 *const data, u64 *const output) -> int { + // output[0] = data[1]; + // output[1] = data[0]; + // return 1; + // } + // )); RAM *cc_init_scc = new RAM(false, 1); cc_init_scc->add_relation(rel__edge__2__1, false); @@ -529,12 +530,12 @@ int main(int argc, char **argv) { agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce, nullptr, {0,2})); - RAM* cc_rep_scc = new RAM(false, 3); - cc_rep_scc->add_relation(rel__cc_final__2__1, false); - cc_rep_scc->add_relation(rel__cc_represent__1__1, true); - cc_rep_scc->add_rule(new parallel_copy( - rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1} - )); + // RAM* cc_rep_scc = new RAM(false, 3); + // cc_rep_scc->add_relation(rel__cc_final__2__1, false); + // cc_rep_scc->add_relation(rel__cc_represent__1__1, true); + // cc_rep_scc->add_rule(new parallel_copy( + // rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1} + // )); LIE *cc_lie = new LIE(); @@ -544,27 +545,35 @@ int main(int argc, char **argv) { cc_lie->add_relation(rel__cc_final__2__1); cc_lie->add_relation(rel__cc_represent__1__1); - cc_lie->add_scc(to_undirected_scc); + // cc_lie->add_scc(to_undirected_scc); cc_lie->add_scc(cc_init_scc); 
cc_lie->add_scc(cc_compute_scc); cc_lie->add_scc(cc_agg_scc); - cc_lie->add_scc(cc_rep_scc); + // cc_lie->add_scc(cc_rep_scc); - cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc); + // cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc); cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc); cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc); - cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc); + // cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc); cc_lie->enable_all_to_all_dump(); cc_lie->set_output_dir(slog_output_dir); // Write to this directory cc_lie->set_comm(mcomm); cc_lie->set_batch_size(1); cc_lie->execute(); + + double end_time = MPI_Wtime(); + double rank_running_time = end_time - start_time; + double final_time; + MPI_Reduce(&rank_running_time, &final_time, 1, MPI_DOUBLE_PRECISION, MPI_MAX, 0, mcomm.get_comm()); + if (mcomm.get_rank() == 0) { + std::cout << "RUNNING TIME: >>>>>>>>>>>>>>>>>>>>>> " << final_time << std::endl; + } cc_lie->print_all_relation_size(); // Continuously print relation sizes // rel__node__1__1->print(); // rel__edge__2__1->print(); - rel__cc__2__1->print(); + // rel__cc__2__1->print(); // rel__cc_final__2__1->print(); // rel__cc_represent__1__1->print(); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/backend/tests/pagerank/compiled_pre/pagerank.cpp b/backend/tests/pagerank/compiled_pre/pagerank.cpp index a24bcf25..267a4abc 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank.cpp @@ -2,7 +2,7 @@ #include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include "mpi.h" -#include +// #include #include #include #include diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp index 11055725..7513aa3c 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp @@ -2,7 +2,7 @@ #include 
"/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include "mpi.h" -#include +// #include #include #include #include diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp index 6d41428b..b2a788c3 100644 --- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -1,5 +1,5 @@ // location of `parallel_RA_inc.h` here -#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include #include @@ -385,6 +385,7 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, std::string output_dir, int argc, char **argv) { + double start_time = 0; start_node = sp; load_input_relation(input_dir); @@ -483,6 +484,13 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, lie->set_comm(mcomm); lie->set_batch_size(1); lie->execute(); + double end_time = MPI_Wtime(); + double rank_running_time = end_time - start_time; + double final_time; + MPI_Reduce(&rank_running_time, &final_time, 1, MPI_DOUBLE_PRECISION, MPI_MAX, 0, mcomm.get_comm()); + if (mcomm.get_rank() == 0) { + std::cout << "RUNNING TIME: >>>>>>>>>>>>>>>>>>>>>> " << final_time << std::endl; + } lie->print_all_relation_size(); // Continuously print relation sizes // lie->stat_intermediate(); From fa9941b64633f88523d5093b871a042875a2e7cb Mon Sep 17 00:00:00 2001 From: ysun67 Date: Mon, 12 Dec 2022 01:24:02 -0500 Subject: [PATCH 21/36] freez --- backend/tests/cc/compiled_pre/cc.cpp | 4 +- backend/tests/sssp/compiled_pre/sssp_opt.cpp | 12 +---- backend/tests/sssp/sssp.py | 15 +++--- cluster.yaml | 50 ++++++++++++++++++++ sbatch.sh | 6 +++ 5 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 cluster.yaml create mode 100644 sbatch.sh diff --git a/backend/tests/cc/compiled_pre/cc.cpp 
b/backend/tests/cc/compiled_pre/cc.cpp index 232244f7..a038fc59 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -573,8 +573,8 @@ int main(int argc, char **argv) { // rel__node__1__1->print(); // rel__edge__2__1->print(); - // rel__cc__2__1->print(); - // rel__cc_final__2__1->print(); + // rel__cc__2__1->print(); + // rel__cc_final__2__1->print(); // rel__cc_represent__1__1->print(); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp index b2a788c3..3b9d630a 100644 --- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -420,7 +420,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, auto [data, output] = state; auto head_tuple = output; - bool compatible = true && res_0 < n2d(start_node); + bool compatible = true && res_0 == n2d(start_node); if (!compatible) return state; @@ -444,21 +444,11 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, ); update_spath_j->set_generator_func( [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { - // std::cout << "Joining >>> "; - // for (auto c : input_v) { - // std::cout << c << " "; - // } - // std::cout << " and >>>>>>>"; - // for (auto c : target_v) { - // std::cout << c << " "; - // } - // std::cout << std::endl; auto target_v = target_vs[0]; std::vector res(3, 0); res[0] = input_v[1]; res[1] = target_v[1]; if (res[0] == res[1]) { - // std::cout << "Warning detect a loop for node " << res[0] << std::endl; res[2] = 0; } else { res[2] = target_v[2] + 1; diff --git a/backend/tests/sssp/sssp.py b/backend/tests/sssp/sssp.py index bb7862a1..0259a7d7 100644 --- a/backend/tests/sssp/sssp.py +++ b/backend/tests/sssp/sssp.py @@ -1,7 +1,7 @@ import networkx as nx -data_f = open("/home/ubuntu/workspace/dataset/soc-LiveJournal1.txt") +data_f = 
open("/home/ysun67/workspace/dataset/soc-LiveJournal1.txt") # data_f = open("/home/ubuntu/workspace/slog/backend/tests/sssp/test-input-graph/edge.csv") g = nx.DiGraph() @@ -9,10 +9,11 @@ g.add_edge(*map(int, l.strip().split("\t"))) sssp_nodes = 0 -for i in range(1,10): - reached_map = nx.shortest_path(g, i) - sssp_nodes = sssp_nodes + len(reached_map.keys()) - for k, v in reached_map.items(): - print(f"{k} {i} {len(v)-1}") +#for i in range(1,10): +# reached_map = nx.shortest_path(g, i) +# sssp_nodes = sssp_nodes + len(reached_map.keys()) +# for k, v in reached_map.items(): +# print(f"{k} {i} {len(v)-1}") -print(sssp_nodes) +reached_map = nx.shortest_path(g, 1) +print(len(reached_map.items())) diff --git a/cluster.yaml b/cluster.yaml new file mode 100644 index 00000000..97fd09f0 --- /dev/null +++ b/cluster.yaml @@ -0,0 +1,50 @@ +Region: us-east-2 +Image: + Os: ubuntu2004 +HeadNode: + InstanceType: c6a.xlarge + Networking: + SubnetId: subnet-0b2659c4d572b0d41 + Ssh: + KeyName: us-east-2 + LocalStorage: + RootVolume: + Size: 256 +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: queue1 + ComputeResources: + - Name: m5nmetal + Instances: + - InstanceType: m5n.metal + MinCount: 0 + MaxCount: 4 + Efa: + Enabled: true + - Name: c6a32x + Instances: + - InstanceType: c6a.32xlarge + MinCount: 0 + MaxCount: 4 + Efa: + Enabled: true + - Name: c6imetal + Instances: + - InstanceType: c6i.metal + MinCount: 0 + MaxCount: 4 + Efa: + Enabled: true + - Name: m6imetal + Instances: + - InstanceType: m6i.metal + MinCount: 0 + MaxCount: 4 + Efa: + Enabled: truev + Networking: + PlacementGroup: + Enabled: true + SubnetIds: + - subnet-03f9e3c05f7ec22c3 diff --git a/sbatch.sh b/sbatch.sh new file mode 100644 index 00000000..06bafd3e --- /dev/null +++ b/sbatch.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#SBATCH --nodes=2 +#SBATCH --ntasks=256 +#SBATCH --ntasks-per-node=128 +#SBATCH --cpus-per-task=1 +srun /home/ubuntu/slog/backend/tests/cc/compiled_pre/build/cc /home/ubuntu/dataset/twitter 
/home/ubuntu/srun-out From ba89394c94c81e8662a2cabec497b5d15082ea15 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Thu, 5 Jan 2023 12:49:48 -0500 Subject: [PATCH 22/36] local --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 37d3dfa0..16082615 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ temp-out/ test-input souffle-out local/ +evaluation From f043516321ac6113c86fcb1cb42860596e574c6d Mon Sep 17 00:00:00 2001 From: ysun67 Date: Wed, 28 Dec 2022 14:17:19 -0500 Subject: [PATCH 23/36] theta gcc + mpich --- backend/src/RA/parallel_join.cpp | 72 ++++++++++- backend/src/RA/parallel_join.h | 1 + backend/src/RAM/RA_tasks.cpp | 126 +++++++++++++------ backend/src/RAM/RA_tasks.h | 2 +- backend/src/lie/lie.cpp | 4 +- backend/src/relation/shmap_relation.h | 4 +- backend/src/relation/shmap_relation_exp.cpp | 62 ++++----- backend/tests/cc/compiled_pre/cc.cpp | 16 ++- backend/tests/sssp/compiled_pre/sssp_opt.cpp | 17 +-- 9 files changed, 209 insertions(+), 95 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 59b2fd48..23b2e7b6 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -8,11 +8,13 @@ #include "../parallel_RA_inc.h" #include #include +#include bool parallel_join::local_join(int threshold, int* offset, int join_order, u32 buckets, + shmap_relation *input0, int input0_buffer_size, int input0_buffer_width, u64 *input0_buffer, shmap_relation *input1, u32 i1_size, int input1_buffer_width, std::vector reorder_map_array, @@ -78,6 +80,68 @@ bool parallel_join::local_join(int threshold, int* offset, else if (join_order == RIGHT) { + if (input0->dependent_column_indices.size() > 0 && generator_mode) { + // right lattice join + std::vector> input_ts; + std::vector prev_non_dependent_columns; + for (int k1 = *offset; k1 < input0_buffer_size; k1 = k1 + input0_buffer_width) { + std::vector cur_non_dependent_columns( + input0_buffer+k1, + 
input0_buffer+k1+input0_buffer_width-input0->dependent_column_indices.size() + ); + // std::vector prefix; + // for (int jc=0; jc < join_column_count; jc++) + // prefix.push_back(input0_buffer[k1 + jc]); + + std::vector input_t(input0_buffer+k1, input0_buffer+k1+input0_buffer_width); + // std::cout << "LT >>> "; + // for (auto c: input_t) { + // std::cout << c << " "; + // } + // std::cout << std::endl; + if (cur_non_dependent_columns == prev_non_dependent_columns) { + input_ts.push_back(input_t); + } else { + if (input_ts.size() != 0) { + u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + input_ts.clear(); + } + prev_non_dependent_columns = cur_non_dependent_columns; + input_ts.push_back(input_t); + } + } + if (input_ts.size() != 0) { + u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + input_ts.clear(); + } + } else { + // original code for (int k1 = *offset; k1 < input0_buffer_size; k1 = k1 + 
input0_buffer_width) { std::vector prefix; @@ -85,10 +149,12 @@ bool parallel_join::local_join(int threshold, int* offset, prefix.push_back(input0_buffer[k1 + jc]); u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - + std::vector> input_ts; + input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); input1[bucket_id].as_all_to_allv_right_join_buffer( prefix, join_buffer, - input0_buffer + k1, input0_buffer_width, + // input0_buffer + k1, input0_buffer_width, + input_ts, input1_buffer_width, counter, buckets, output_sub_bucket_count, output_sub_bucket_rank, reorder_map_array, @@ -107,6 +173,8 @@ bool parallel_join::local_join(int threshold, int* offset, return false; } } + + } } deduplicate.remove_tuple(); diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h index 30e15000..900b3b4d 100644 --- a/backend/src/RA/parallel_join.h +++ b/backend/src/RA/parallel_join.h @@ -90,6 +90,7 @@ class parallel_join: public parallel_RA { bool local_join(int threshold, int* offset, int join_order, u32 buckets, + shmap_relation *input0, int input0_buffer_size, int input0_buffer_width, u64 *input0_buffer, shmap_relation *input1, u32 i1_size, int input1_buffer_width, std::vector reorder_map_array, diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 3c8963f8..88b7fa2e 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -259,21 +259,21 @@ u64 RAM::intra_bucket_comm_execute() else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) { // std::cout << "here>>>>>>>>>>>>>" << std::endl; - if (input1->get_dependent_column().size() > 0) { - intra_bucket_comm(get_bucket_count(), - input0->get_full(), - input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), 
input1->get_bucket_map(), - &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - mcomm.get_local_comm()); - } else { + // if (input1->get_dependent_column().size() > 0) { + // intra_bucket_comm(get_bucket_count(), + // input0->get_full(), + // input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + // input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + // &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], + // mcomm.get_local_comm()); + // } else { intra_bucket_comm(get_bucket_count(), input1->get_delta(), input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], mcomm.get_local_comm()); - } + // } total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; } @@ -384,7 +384,9 @@ bool RAM::local_compute(int* offset) u32 total_join_tuples = 0; u32 counter = 0; int threshold = 20000000; - + auto before_compute_time = MPI_Wtime(); + auto ibf_size = 0; + u64 jtarget_size = 0; for (std::vector::iterator it = RA_list.begin() ; it != RA_list.end(); ++it) { // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl; @@ -592,7 +594,7 @@ bool RAM::local_compute(int* offset) else if ((*it)->get_RA_type() == JOIN) { - // auto before_time = MPI_Wtime(); + auto before_join_time = MPI_Wtime(); parallel_join* current_ra = (parallel_join*) *it; relation* output_relation = current_ra->get_join_output(); @@ -607,6 +609,7 @@ bool RAM::local_compute(int* offset) join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, get_bucket_count(), + input0->get_delta(), intra_bucket_buf_output_size[counter], input0->get_arity()+1, 
intra_bucket_buf_output[counter], input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, reorder_map_array, @@ -617,7 +620,7 @@ bool RAM::local_compute(int* offset) &join_tuples_duplicates, &join_tuples); total_join_tuples = total_join_tuples + join_tuples; - + jtarget_size += input1->get_delta_element_count(); } else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) { @@ -625,6 +628,7 @@ bool RAM::local_compute(int* offset) join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, get_bucket_count(), + input0->get_delta(), intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1, reorder_map_array, @@ -635,26 +639,30 @@ bool RAM::local_compute(int* offset) &join_tuples_duplicates, &join_tuples); total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input1->get_full_element_count(); } else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) { - if (input1->get_dependent_column().size() > 0) { - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - LEFT, - get_bucket_count(), - intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, - reorder_map_array, - output_relation, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); - } else { + // if (input1->get_dependent_column().size() > 0) { + // join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + // LEFT, + // get_bucket_count(), + // input0->get_delta(), + // intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], + // input1->get_delta(), 
input1->get_delta_element_count(), input1->get_arity()+1, + // reorder_map_array, + // output_relation, + // compute_buffer, + // counter, + // join_column_count, + // &join_tuples_duplicates, + // &join_tuples); + // jtarget_size += input1->get_delta_element_count(); + // } else { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, get_bucket_count(), + input1->get_delta(), intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, reorder_map_array, @@ -664,14 +672,17 @@ bool RAM::local_compute(int* offset) join_column_count, &join_tuples_duplicates, &join_tuples); - } + jtarget_size += input0->get_full_element_count(); + // } total_join_tuples = total_join_tuples + join_tuples; + } else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, get_bucket_count(), + input1->get_full(), intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, reorder_map_array, @@ -682,14 +693,14 @@ bool RAM::local_compute(int* offset) &join_tuples_duplicates, &join_tuples); total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input0->get_full_element_count(); } - // auto after_time = MPI_Wtime(); - // if (mcomm.get_local_rank() == 0) { - // std::cout << "local join on rank " << mcomm.get_local_rank() << " takes " << after_time - before_time << std::endl; - // } + + ibf_size += intra_bucket_buf_output_size[counter]; } counter++; } + auto after_compute_time = MPI_Wtime(); #if 0 int global_total_join_tuples = 0; @@ -700,12 +711,27 @@ bool RAM::local_compute(int* offset) std::cout << "Joins: " << global_total_join_tuples << " Duplicates " << 
global_join_tuples_duplicates << " " << std::endl; #endif + auto before_sync_time = MPI_Wtime(); int global_synchronizer = 0; int synchronizer = 0; if (join_completed == true) synchronizer = 1; MPI_Allreduce(&synchronizer, &global_synchronizer, 1, MPI_INT, MPI_BAND, mcomm.get_comm()); + auto after_sync_time = MPI_Wtime(); + auto lc_all_time = after_compute_time - before_compute_time; + double slowest_rank_time = lc_all_time; + MPI_Allreduce(&lc_all_time, &slowest_rank_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm()); + if (lc_all_time == slowest_rank_time) { + std::cout << "Slowest Rank >>> " << mcomm.get_rank() + << " Comp Time >>> " << after_compute_time - before_compute_time + << " Sync Time >>> " << after_sync_time - before_sync_time + << " Input Size >>> " << ibf_size + << " Target Count >>> " << jtarget_size + << std::endl; + } + + bool res = false; if (global_synchronizer == 1) { counter = 0; @@ -734,10 +760,19 @@ bool RAM::local_compute(int* offset) delete[] intra_bucket_buf_output_size; delete[] intra_bucket_buf_output; - return true; + res = true; } - else - return false; + + + if (mcomm.get_rank() == 0) { + std::cout << "Rank 0 compute time >>> " << after_compute_time - before_compute_time + << " Sync time >>> " << after_sync_time - before_sync_time + << " Input Size >>> " << ibf_size + << " Target Count >>> " << jtarget_size + << std::endl; + } + + return res; } @@ -850,7 +885,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) #endif u32 elements_to_read = tuples_to_read * width; - for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) + for (u32 tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) { u32 x = starting + tuple_ind * width; bool insert_flag = true; @@ -906,7 +941,7 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) u64 tuple[width]; successful_insert = 0; u32 elements_to_read = tuples_to_read * width; - for (int tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) + for (u32 
tuple_ind = 0; tuple_ind < tuples_to_read; tuple_ind ++) { u32 x = starting + tuple_ind * width; if (output->find_in_full(cumulative_all_to_allv_buffer + x, width) == false && output->find_in_delta(cumulative_all_to_allv_buffer + x, width) == false) @@ -1117,7 +1152,7 @@ void RAM::io_all_relation(int status) } -void RAM::execute_in_batches(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num) +void RAM::execute_in_batches(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector& runtime_vector) { int inner_loop = 0; u32 RA_count = RA_list.size(); @@ -1126,6 +1161,11 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& for (u32 i =0; i < RA_count; i++) offset[i] = 0; + double all_local_compute = 0; + double all_insert_newt = 0; + double all_comm = 0; + double all_time = 0; + while (batch_size != 0) { #if DEBUG_OUTPUT @@ -1148,10 +1188,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); auto compute_end = MPI_Wtime(); + all_local_compute += compute_end - compute_start; auto all_to_all_start = MPI_Wtime(); local_comm(); auto all_to_all_end = MPI_Wtime(); + all_comm += all_to_all_end - all_to_all_start; auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); @@ -1160,6 +1202,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt(intern_map); auto insert_in_newt_end = MPI_Wtime(); + all_insert_newt += insert_in_newt_end - insert_in_newt_start; #if 1 if (mcomm.get_rank() == 0) @@ 
-1230,6 +1273,13 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& break; } + if (mcomm.get_rank() == 0) { + runtime_vector[0] = runtime_vector[0] + all_comm; + runtime_vector[1] = runtime_vector[1] + all_local_compute; + runtime_vector[2] = runtime_vector[2] + all_insert_newt; + runtime_vector[3] = runtime_vector[3] + all_time; + } + delete[] offset; @@ -1408,7 +1458,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s bool RAM::contains_relation(int tag) { for (auto rel : ram_relations) { - if (rel->get_intern_tag() == tag) { + if (rel->get_intern_tag() == (u32)tag) { return true; } } diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h index ab9ac4a3..be90384a 100644 --- a/backend/src/RAM/RA_tasks.h +++ b/backend/src/RAM/RA_tasks.h @@ -163,7 +163,7 @@ class RAM bool contains_relation(int tag); /// Start running this SCC (task) for "batck_size" iterations - void execute_in_batches(std::string name, int batch_size, std::vector& history, std::map& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num); + void execute_in_batches(std::string name, int batch_size, std::vector& history, std::map& intern_map, int *loop_counter,int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector& runtime_vector); void execute_in_batches_comm_compaction(std::string name, int batch_size, std::vector& history, std::map& intern_map, int* loop_counter, int task_id, std::string output_dir, bool all_to_all_record, int sloav_mode, int* rotate_index_array, int** send_indexes, int *sendb_num, std::vector& runtime_vector); }; diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index ad478a20..320e9592 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -474,7 +474,7 @@ bool LIE::execute 
() create_checkpoint_dump(loop_counter, executable_task->get_id()); if (comm_compaction == 0) - executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); + executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); else executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); @@ -522,7 +522,7 @@ bool LIE::execute () create_checkpoint_dump(loop_counter, executable_task->get_id()); if (comm_compaction == 0) - executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num); + executable_task->execute_in_batches(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); else executable_task->execute_in_batches_comm_compaction(app_name, batch_size, history, intern_map, &loop_counter, executable_task->get_id(), output_dir, all_to_all_meta_data_dump, sloav_mode, rotate_index_array, send_indexes, sendb_num, run_time_vector); diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index 4ed96934..09ba11ac 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -11,6 +11,7 @@ #include "../btree/btree_set.h" #include #include +#include struct shmap_relation { @@ -136,7 +137,8 @@ struct 
shmap_relation { void as_all_to_allv_acopy_buffer(all_to_allv_buffer& buffer, std::vector prefix, std::vector reorder_map, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, u32 arity, u32 join_column_count, int head_rel_hash_col_count, bool canonical); void as_all_to_allv_right_join_buffer( std::vector prefix, all_to_allv_buffer& join_buffer, - u64 *input0_buffer, int input0_buffer_width, + // u64 *input0_buffer, int input0_buffer_width, + std::vector> &input_ts, int input1_buffer_width, int ra_id, u32 buckets, u32* output_sub_bucket_count, u32** output_sub_bucket_rank, std::vector reorder_map, diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 91934d9e..19decd31 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -393,8 +393,9 @@ void shmap_relation::as_all_to_allv_copy_generate_buffer( void shmap_relation::as_all_to_allv_right_join_buffer( std::vector prefix, all_to_allv_buffer &join_buffer, - u64 *input0_buffer, - int input0_buffer_width, + // u64 *input0_buffer, + // int input0_buffer_width, + std::vector> &input_ts, int input1_buffer_width, int ra_id, u32 buckets, u32 *output_sub_bucket_count, @@ -435,48 +436,33 @@ void shmap_relation::as_all_to_allv_right_join_buffer( auto joined_range = lowerUpperRange(lower_bound, upper_bound); if (generator_mode) { - std::vector input_t(input0_buffer, input0_buffer+input0_buffer_width); - std::vector> eq_tuple_set; - std::vector> generated_tuple_set; - std::vector prev_non_dependent_columns; - for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it){ + for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) { auto cur_path = *it; - std::vector cur_non_dependent_columns(cur_path.begin(), cur_path.begin()+arity+1-dependent_column_indices.size()); - if (cur_non_dependent_columns == prev_non_dependent_columns) { - 
eq_tuple_set.push_back(cur_path); - continue; - } else { - if (eq_tuple_set.size() != 0) { - gen_func(eq_tuple_set, input_t, generated_tuple_set); - eq_tuple_set.clear(); - } - prev_non_dependent_columns = cur_non_dependent_columns; - eq_tuple_set.push_back(cur_path); - } - } - if (eq_tuple_set.size() != 0) { - gen_func(eq_tuple_set, input_t, generated_tuple_set); - } - for (auto& tp: generated_tuple_set) { - uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets; - uint64_t sub_bucket_id=0; - if (canonical == false) - sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; + std::vector> generated_tuple_set; + gen_func(input_ts, cur_path, generated_tuple_set); + for (auto& tp: generated_tuple_set) { + uint64_t bucket_id = tuple_hash(tp.data(), head_rel_hash_col_count) % buckets; + uint64_t sub_bucket_id=0; + if (canonical == false) + sub_bucket_id = tuple_hash(tp.data() + head_rel_hash_col_count, join_buffer.width[ra_id]-head_rel_hash_col_count) % output_sub_bucket_count[bucket_id]; - int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; + int index = output_sub_bucket_rank[bucket_id][sub_bucket_id]; - join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id]; - join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + join_buffer.width[ra_id]; - join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id]; - join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++; + join_buffer.local_compute_output_size_rel[ra_id] = join_buffer.local_compute_output_size_rel[ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_total = join_buffer.local_compute_output_size_total + 
join_buffer.width[ra_id]; + join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] = join_buffer.local_compute_output_size_flat[index*join_buffer.ra_count + ra_id] + join_buffer.width[ra_id]; + join_buffer.local_compute_output_count_flat[index * join_buffer.ra_count + ra_id] ++; - join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; - join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; - join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]); - (*local_join_inserts)++; - (*local_join_count)++; + join_buffer.local_compute_output_size[ra_id][index] = join_buffer.local_compute_output_size[ra_id][index] + join_buffer.width[ra_id]; + join_buffer.cumulative_tuple_process_map[index] = join_buffer.cumulative_tuple_process_map[index] + join_buffer.width[ra_id]; + join_buffer.local_compute_output[ra_id][index].vector_buffer_append((const unsigned char*)tp.data(), sizeof(u64)*join_buffer.width[ra_id]); + (*local_join_inserts)++; + (*local_join_count)++; + } } } else { + u64* input0_buffer = input_ts[0].data(); + int input0_buffer_width = input_ts[0].size(); for(auto it = joined_range.first; it != joined_range.second && it != ind.end(); ++it) { auto cur_path = *it; diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index a038fc59..8964510d 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -365,7 +365,7 @@ void load_input_relation(std::string db_dir) { for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { // check if ends with table std::string filename_ss = entry.path().filename().string(); - std::cout << "input database has file " << filename_ss << std::endl; + // std::cout << "input database has file " << filename_ss 
<< std::endl; std::string suffix = ".table"; int ft = filename_ss.size() - suffix.size(); if (ft < 0) @@ -388,8 +388,8 @@ void load_input_relation(std::string db_dir) { } if (tag > max_rel) max_rel = tag; - std::cout << "load " << tag << "." << index_stream.str() << "has arity " - << arity << std::endl; + // std::cout << "load " << tag << "." << index_stream.str() << "has arity " + // << arity << std::endl; rel_tag_map[index_stream.str()] = tag; } } @@ -409,8 +409,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { } max_rel++; rel_tag_map[name_arity] = max_rel; - std::cout << "generate rel tag: " << name_arity << " " << max_rel - << std::endl; + // std::cout << "generate rel tag: " << name_arity << " " << max_rel + // << std::endl; return max_rel; } @@ -509,11 +509,15 @@ int main(int argc, char **argv) { ); cc_pg->set_generator_func( [](const depend_val_t& target_vs, const std::vector& input_v, depend_val_t& res_set) -> bool { - // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl; auto target_v = target_vs[0]; std::vector res(2, 0); res[0] = input_v[1]; res[1] = target_v[1]; + // if (target_v[0] == 21) { + // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl; + // std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl; + // std::cout << "res " << res[0] << " " << res[1] << std::endl; + // } res_set.push_back(res); return true; } diff --git a/backend/tests/sssp/compiled_pre/sssp_opt.cpp b/backend/tests/sssp/compiled_pre/sssp_opt.cpp index 3b9d630a..b39254bb 100644 --- a/backend/tests/sssp/compiled_pre/sssp_opt.cpp +++ b/backend/tests/sssp/compiled_pre/sssp_opt.cpp @@ -1,5 +1,6 @@ // location of `parallel_RA_inc.h` here #include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "mpi.h" #include #include @@ -35,6 +36,7 @@ const u64 str_tag = 2; const u64 sign_flip_const = 0x0000200000000000; const u64 signed_num_mask = 0xFFFFE00000000000; int start_node = 1; +int 
end_node = 2; inline bool is_number(u64 datum) { // cout << "is_number(" << datum << "): " << (datum >> tag_position == @@ -383,10 +385,10 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { return max_rel; } -void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, +void compute_sssp_from(mpi_comm &mcomm, int sp, int ep, std::string input_dir, std::string output_dir, int argc, char **argv) { - double start_time = 0; start_node = sp; + end_node = ep; load_input_relation(input_dir); relation *rel__edge__2__1__2 = new relation( @@ -420,7 +422,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, auto [data, output] = state; auto head_tuple = output; - bool compatible = true && res_0 == n2d(start_node); + bool compatible = true && (res_0 < n2d(end_node)) && (res_0 >= n2d(start_node)); if (!compatible) return state; @@ -458,6 +460,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, }); scc1->add_rule(update_spath_j); + double start_time = MPI_Wtime(); LIE *lie = new LIE(); lie->add_relation(rel__edge__2__1__2); lie->add_relation(rel__spath__3__2); @@ -467,9 +470,9 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, // Enable IO lie->enable_all_to_all_dump(); - lie->enable_data_IO(); + //lie->enable_data_IO(); // lie->enable_share_io(); - lie->enable_IO(); + //lie->enable_IO(); lie->set_output_dir(output_dir); // Write to this directory lie->set_comm(mcomm); lie->set_batch_size(1); @@ -488,7 +491,7 @@ void compute_sssp_from(mpi_comm &mcomm, int sp, std::string input_dir, // rel__spath__2__1__2->print(); // rel__edge__2__1__2->print(); - // rel__spath__3__2->print(); + // rel__spath__3__2->print(); // rel__edge__3__1->print(); // rel__edge__3__1__2__3->print(); @@ -522,7 +525,7 @@ int main(int argc, char **argv) { mpi_comm mcomm; mcomm.create(argc, argv); - compute_sssp_from(mcomm, atoi(argv[3]), slog_input_dir, slog_output_dir, argc, + compute_sssp_from(mcomm, 
atoi(argv[3]), atoi(argv[4]), slog_input_dir, slog_output_dir, argc, argv); mcomm.destroy(); From 13a0a419a35765046995379915f8defaa88f55bd Mon Sep 17 00:00:00 2001 From: ysun67 Date: Wed, 28 Dec 2022 18:05:23 -0500 Subject: [PATCH 24/36] add more log --- backend/src/RAM/RA_tasks.cpp | 239 ++++++++++++++++------------------- 1 file changed, 107 insertions(+), 132 deletions(-) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 88b7fa2e..54149941 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -227,68 +227,42 @@ u64 RAM::intra_bucket_comm_execute() parallel_join* current_ra = (parallel_join*) *it; relation* input0 = current_ra->get_join_input0(); relation* input1 = current_ra->get_join_input1(); - - /// Join between delta and delta - if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == DELTA) - { - - intra_bucket_comm(get_bucket_count(), - input0->get_delta(), - input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - mcomm.get_local_comm()); - - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; + shmap_relation* input0_trees = input0->get_full(); + u64 input0_size = input0->get_full_element_count(); + shmap_relation* input1_trees = input1->get_full(); + u64 input1_size = input1->get_full_element_count(); + if (current_ra->get_join_input0_graph_type() == DELTA) { + input0_trees = input0->get_delta(); + input0_size = input0->get_delta_element_count(); + } + if (current_ra->get_join_input1_graph_type() == DELTA) { + input1_trees = input1->get_delta(); + input1_size = input1->get_delta_element_count(); + } + int join_direction = LEFT; + int local_join_direction_count = input0_size < input1_size 
? 0 : 1; // true if size of input0 > input1 + int global_join_direction_count = local_join_direction_count; + MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm()); + if (global_join_direction_count > mcomm.get_nprocs() / 2) { + join_direction = RIGHT; } - /// Join between delta and full - else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) - { - + if (join_direction == LEFT) { intra_bucket_comm(get_bucket_count(), - input0->get_delta(), + input0_trees, input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], mcomm.get_local_comm()); - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; - } - - /// Join between full and delta - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) - { - // std::cout << "here>>>>>>>>>>>>>" << std::endl; - // if (input1->get_dependent_column().size() > 0) { - // intra_bucket_comm(get_bucket_count(), - // input0->get_full(), - // input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - // input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - // &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - // mcomm.get_local_comm()); - // } else { - intra_bucket_comm(get_bucket_count(), - input1->get_delta(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - 
&intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - mcomm.get_local_comm()); - // } - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; - } - - /// Join between full and full - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) - { - + } else { intra_bucket_comm(get_bucket_count(), - input1->get_full(), + input1_trees, input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], mcomm.get_local_comm()); - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; } + total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; } counter++; } @@ -604,33 +578,33 @@ bool RAM::local_compute(int* offset) relation* input1 = current_ra->get_join_input1(); int join_column_count = input0->get_join_column_count(); - if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == DELTA) - { - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - LEFT, - get_bucket_count(), - input0->get_delta(), - intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, - reorder_map_array, - output_relation, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_delta_element_count(); + shmap_relation* input0_trees = input0->get_full(); + u64 input0_size = input0->get_full_element_count(); + shmap_relation* input1_trees = input1->get_full(); + u64 input1_size = input1->get_full_element_count(); + if 
(current_ra->get_join_input0_graph_type() == DELTA) { + input0_trees = input0->get_delta(); + input0_size = input0->get_delta_element_count(); + } + if (current_ra->get_join_input1_graph_type() == DELTA) { + input1_trees = input1->get_delta(); + input1_size = input1->get_delta_element_count(); + } + int join_direction = LEFT; + int local_join_direction_count = input0_size < input1_size ? 0 : 1; // true if size of input0 > input1 + int global_join_direction_count = local_join_direction_count; + MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm()); + if (global_join_direction_count > mcomm.get_nprocs() / 2) { + join_direction = RIGHT; } - else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) - { + if (join_direction == LEFT) { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, get_bucket_count(), - input0->get_delta(), + input0_trees, intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1, + input1_trees, input1_size, input1->get_arity()+1, reorder_map_array, output_relation, compute_buffer, @@ -638,53 +612,14 @@ bool RAM::local_compute(int* offset) join_column_count, &join_tuples_duplicates, &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_full_element_count(); - } - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) - { - // if (input1->get_dependent_column().size() > 0) { - // join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - // LEFT, - // get_bucket_count(), - // input0->get_delta(), - // intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - // input1->get_delta(), 
input1->get_delta_element_count(), input1->get_arity()+1, - // reorder_map_array, - // output_relation, - // compute_buffer, - // counter, - // join_column_count, - // &join_tuples_duplicates, - // &join_tuples); - // jtarget_size += input1->get_delta_element_count(); - // } else { - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - RIGHT, - get_bucket_count(), - input1->get_delta(), - intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], - input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, - reorder_map_array, - output_relation, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); - jtarget_size += input0->get_full_element_count(); - // } - total_join_tuples = total_join_tuples + join_tuples; - - } - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) - { + + } else { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, get_bucket_count(), - input1->get_full(), + input1_trees, intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], - input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, + input0_trees, input0_size, input0->get_arity()+1, reorder_map_array, output_relation, compute_buffer, @@ -692,9 +627,9 @@ bool RAM::local_compute(int* offset) join_column_count, &join_tuples_duplicates, &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input0->get_full_element_count(); } + total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input1->get_delta_element_count(); ibf_size += intra_bucket_buf_output_size[counter]; } @@ -1165,7 +1100,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& double all_insert_newt = 0; double all_comm = 0; double all_time = 0; + double 
all_insert_in_full = 0; + double all_allocate_buf = 0; + double all_intra = 0; + double all_free_buf =0; + // auto before_batch = MPI_Wtime(); while (batch_size != 0) { #if DEBUG_OUTPUT @@ -1175,7 +1115,8 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); - auto intra_end = MPI_Wtime(); + auto intra_end = MPI_Wtime(); + all_intra += intra_end - intra_start; std::cout << std::setiosflags(std::ios::fixed); bool local_join_status = false; @@ -1184,6 +1125,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto allocate_buffers_start = MPI_Wtime(); allocate_compute_buffers(); auto allocate_buffers_end = MPI_Wtime(); + all_allocate_buf += allocate_buffers_end - allocate_buffers_start; auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); @@ -1198,6 +1140,7 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); auto free_buffers_end = MPI_Wtime(); + all_free_buf += free_buffers_end - free_buffers_start; auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt(intern_map); @@ -1240,9 +1183,13 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& auto insert_in_full_start = MPI_Wtime(); local_insert_in_full(); auto insert_in_full_end = MPI_Wtime(); - + all_insert_in_full += insert_in_full_end - insert_in_full_start; + #if 1 - if (mcomm.get_rank() == 0) + double all_l_time = insert_in_full_end - intra_start; + double slowest_all_time = all_l_time; + MPI_Allreduce(&all_l_time, &slowest_all_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm()); + if (mcomm.get_rank() == 0 || slowest_all_time == all_l_time) { #if 0 std::cout << name << " " << mcomm.get_local_nprocs()<< " Current time OUTER LOOP [" << loop_count_tracker << " ] " @@ -1258,13 +1205,21 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& << 
" full " << *running_insert_in_full << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl; #endif - std::cout << (intra_end - intra_start) << std::setw(12) - << (insert_in_full_end - insert_in_full_start) << std::setw(12) - << (insert_in_full_end - intra_start) << std::endl; - + std::cout << "rank" << std::setw(12) << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12) + << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12) + << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ; + std::cout << mcomm.get_rank() << std::setw(12) << loop_count_tracker << std::setprecision(4) << std::setw(12) + << all_allocate_buf << std::setprecision(4) << std::setw(12) + << all_local_compute << std::setprecision(4) << std::setw(12) + << all_comm << std::setprecision(4) << std::setw(12) + << all_free_buf << std::setprecision(4) << std::setw(12) + << all_insert_newt << std::setprecision(4) << std::setw(12); + std::cout << all_intra << std::setw(12) + << all_insert_in_full << std::setw(12) + << all_l_time << std::endl; } #endif - + all_time += all_l_time; batch_size--; loop_count_tracker++; @@ -1317,6 +1272,12 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s double all_insert_newt = 0; double all_comm = 0; double all_time = 0; + double all_insert_in_full = 0; + double all_allocate_buf = 0; + double all_intra = 0; + double all_free_buf =0; + + // auto before_batch = MPI_Wtime(); while (batch_size != 0) { @@ -1329,6 +1290,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto intra_start = MPI_Wtime(); intra_bucket_comm_execute(); auto intra_end = MPI_Wtime(); + all_intra += intra_end - intra_start; bool local_join_status = false; while (local_join_status == false) @@ 
-1336,6 +1298,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto allocate_buffers_start = MPI_Wtime(); allocate_compute_buffers(); auto allocate_buffers_end = MPI_Wtime(); + all_allocate_buf += allocate_buffers_end - allocate_buffers_start; auto compute_start = MPI_Wtime(); local_join_status = local_compute(offset); @@ -1350,6 +1313,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto free_buffers_start = MPI_Wtime(); free_compute_buffers(); auto free_buffers_end = MPI_Wtime(); + all_free_buf += free_buffers_end - free_buffers_start; auto insert_in_newt_start = MPI_Wtime(); local_insert_in_newt_comm_compaction(intern_map); @@ -1357,7 +1321,7 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s all_insert_newt += insert_in_newt_end - insert_in_newt_start; -#if 1 +#if 0 if (mcomm.get_rank() == 0) { #if 0 @@ -1394,9 +1358,13 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s auto insert_in_full_start = MPI_Wtime(); local_insert_in_full(); auto insert_in_full_end = MPI_Wtime(); + all_insert_in_full += insert_in_full_end - insert_in_full_start; #if 1 - if (mcomm.get_rank() == 0) + double all_l_time = insert_in_full_end - intra_start; + double slowest_all_time = all_l_time; + MPI_Allreduce(&all_l_time, &slowest_all_time, 1, MPI_DOUBLE, MPI_MAX, mcomm.get_comm()); + if (mcomm.get_rank() == 0 || slowest_all_time == all_l_time) { #if 0 std::cout << name << " " << mcomm.get_local_nprocs()<< " Current time OUTER LOOP [" << loop_count_tracker << " ] " @@ -1412,15 +1380,22 @@ void RAM::execute_in_batches_comm_compaction(std::string name, int batch_size, s << " full " << *running_insert_in_full << " Total " << *running_intra_bucket_comm + *running_buffer_allocate + *running_local_compute + *running_all_to_all + *running_buffer_free + *running_insert_newt + *running_insert_in_full << std::endl; #endif - std::cout << (intra_end - 
intra_start) << std::setw(12) - << (insert_in_full_end - insert_in_full_start) << std::setw(12) - << (insert_in_full_end - intra_start) << std::endl; - - all_time += insert_in_full_end - intra_start; - + std::cout << "rank" << std::setw(12) << "loop" << std::setw(12) << "alloc_buf" << std::setw(12) << "compute" << std::setw(12) + << "all2all" << std::setw(12) << "free_buf" << std::setw(12) << "insert_newt" << std::setw(12) + << "intra" << std::setw(12) << "insert_full" << std::setw(12) << "total" << "\n" ; + std::cout << mcomm.get_rank()<< std::setw(12) << loop_count_tracker << std::setprecision(4) << std::setw(12) + << all_allocate_buf << std::setprecision(4) << std::setw(12) + << all_local_compute << std::setprecision(4) << std::setw(12) + << all_comm << std::setprecision(4) << std::setw(12) + << all_free_buf << std::setprecision(4) << std::setw(12) + << all_insert_newt << std::setprecision(4) << std::setw(12); + std::cout << all_intra << std::setw(12) + << all_insert_in_full << std::setw(12) + << all_l_time << std::endl; } + #endif - + all_time += all_l_time; batch_size--; loop_count_tracker++; From 3c6221e845affc470264928c9eb4c633397306b2 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Thu, 29 Dec 2022 23:45:09 -0500 Subject: [PATCH 25/36] more stat --- backend/src/RA/parallel_join.cpp | 22 ++++++++++++++++++++-- backend/src/RA/parallel_join.h | 3 ++- backend/src/RAM/RA_tasks.cpp | 25 ++++++++++++++++++------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 23b2e7b6..95ba9660 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include "mpi.h" #include #include #include @@ -23,7 +24,8 @@ bool parallel_join::local_join(int threshold, int* offset, int counter, int join_column_count, u32* global_join_duplicates, - u32* global_join_inserts) + u32* global_join_inserts, + std::vector& time_stat) { 
join_buffer.width[counter] = reorder_map_array.size(); @@ -39,9 +41,12 @@ bool parallel_join::local_join(int threshold, int* offset, u32** output_sub_bucket_rank = output->get_sub_bucket_rank(); // std::cout << "wwwwwwwww " << input0_buffer_size << " " << input0_buffer_size << " " << i1_size << std::endl; - if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0) + if (*offset > input0_buffer_size || input0_buffer_size == 0 || i1_size == 0) { + time_stat.push_back(0); return true; + } + double join_time_total = 0; int local_join_count=0; if (join_order == LEFT) { @@ -56,6 +61,7 @@ bool parallel_join::local_join(int threshold, int* offset, u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + auto before_actual_join = MPI_Wtime(); input1[bucket_id].as_all_to_allv_left_join_buffer( prefix, join_buffer, input0_buffer + k1,input0_buffer_width, @@ -67,6 +73,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(), output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) @@ -103,6 +111,7 @@ bool parallel_join::local_join(int threshold, int* offset, input_ts.push_back(input_t); } else { if (input_ts.size() != 0) { + auto before_actual_join = MPI_Wtime(); u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), @@ -117,6 +126,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + 
join_time_total += after_actual_join - before_actual_join; input_ts.clear(); } prev_non_dependent_columns = cur_non_dependent_columns; @@ -125,6 +136,7 @@ bool parallel_join::local_join(int threshold, int* offset, } if (input_ts.size() != 0) { u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; + auto before_actual_join = MPI_Wtime(); input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), prev_non_dependent_columns.begin()+join_column_count), @@ -138,6 +150,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; input_ts.clear(); } } else { @@ -151,6 +165,7 @@ bool parallel_join::local_join(int threshold, int* offset, u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); + auto before_actual_join = MPI_Wtime(); input1[bucket_id].as_all_to_allv_right_join_buffer( prefix, join_buffer, // input0_buffer + k1, input0_buffer_width, @@ -163,6 +178,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) @@ -177,6 +194,7 @@ bool parallel_join::local_join(int threshold, int* offset, } } + time_stat.push_back(join_time_total); deduplicate.remove_tuple(); return true; } diff --git a/backend/src/RA/parallel_join.h 
b/backend/src/RA/parallel_join.h index 900b3b4d..c30120c5 100644 --- a/backend/src/RA/parallel_join.h +++ b/backend/src/RA/parallel_join.h @@ -99,7 +99,8 @@ class parallel_join: public parallel_RA { int counter, int join_column_count, u32* local_join_duplicates, - u32* local_join_inserts); + u32* local_join_inserts, + std::vector& time_stat); #endif }; diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 54149941..0684d2c6 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -361,6 +361,8 @@ bool RAM::local_compute(int* offset) auto before_compute_time = MPI_Wtime(); auto ibf_size = 0; u64 jtarget_size = 0; + double size_sync_time = 0; + double real_join_time = 0; for (std::vector::iterator it = RA_list.begin() ; it != RA_list.end(); ++it) { // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl; @@ -593,11 +595,15 @@ bool RAM::local_compute(int* offset) int join_direction = LEFT; int local_join_direction_count = input0_size < input1_size ? 
0 : 1; // true if size of input0 > input1 int global_join_direction_count = local_join_direction_count; + + auto before_size_sync = MPI_Wtime(); MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm()); if (global_join_direction_count > mcomm.get_nprocs() / 2) { join_direction = RIGHT; } - + auto after_size_sync = MPI_Wtime(); + size_sync_time += after_size_sync - before_size_sync; + std::vector real_j_time_stat; if (join_direction == LEFT) { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, @@ -611,8 +617,10 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples); - + &join_tuples, + real_j_time_stat); + jtarget_size += input1_size; + ibf_size += input0_size; } else { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, @@ -626,12 +634,13 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples); + &join_tuples, + real_j_time_stat); + jtarget_size += input0_size; + ibf_size += input1_size; } total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_delta_element_count(); - - ibf_size += intra_bucket_buf_output_size[counter]; + real_join_time += real_j_time_stat[0]; } counter++; } @@ -660,7 +669,9 @@ bool RAM::local_compute(int* offset) if (lc_all_time == slowest_rank_time) { std::cout << "Slowest Rank >>> " << mcomm.get_rank() << " Comp Time >>> " << after_compute_time - before_compute_time + << " Real Join >>> " << real_join_time << " Sync Time >>> " << after_sync_time - before_sync_time + << " Size Sync Time >>> " << size_sync_time << " Input Size >>> " << ibf_size << " Target Count >>> " << jtarget_size << std::endl; From 6a68658c20af50e04612e37363d5196ad833920f Mon Sep 17 00:00:00 2001 From: ysun67 Date: Wed, 4 Jan 2023 14:53:44 -0500 Subject: [PATCH 26/36] more hash function stage change --- 
backend/CMakeLists.txt | 2 +- backend/src/RAM/RA_tasks.cpp | 4 + backend/src/hash/fasthash.cpp | 50 + backend/src/hash/fasthash.h | 52 + backend/src/hash/hash.cpp | 29 +- backend/src/hash/hash.h | 128 +- backend/src/hash/spooky-c.cpp | 598 ++ backend/src/hash/spooky-c.h | 94 + backend/src/hash/xxhash.cpp | 5 + backend/src/hash/xxhash.h | 6290 +++++++++++++++++ backend/src/lie/lie.cpp | 12 + .../src/relation/balanced_hash_relation.cpp | 50 +- backend/src/relation/balanced_hash_relation.h | 19 +- .../src/relation/relation_load_balancer.cpp | 10 + backend/tests/cc/compiled_pre/CMakeLists.txt | 2 +- backend/tests/cc/compiled_pre/cc.cpp | 15 + backend/utility/tsv_to_bin.cpp | 127 +- cluster.yaml | 2 +- 18 files changed, 7452 insertions(+), 37 deletions(-) create mode 100644 backend/src/hash/fasthash.cpp create mode 100644 backend/src/hash/fasthash.h create mode 100644 backend/src/hash/spooky-c.cpp create mode 100644 backend/src/hash/spooky-c.h create mode 100644 backend/src/hash/xxhash.cpp create mode 100644 backend/src/hash/xxhash.h diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt index a348cc28..1d331260 100644 --- a/backend/CMakeLists.txt +++ b/backend/CMakeLists.txt @@ -18,7 +18,7 @@ set (tests_dir "${PROJECT_SOURCE_DIR}/tests") set (data_dir "${PROJECT_SOURCE_DIR}/data") set (utility_dir "${PROJECT_SOURCE_DIR}/utility") -file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" 
"${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fashhash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") file (GLOB source_files_ata "${tests_dir}/all_to_all_benchmark.cpp") file (GLOB source_files_tc "${tests_dir}/transitive_closure.cpp") #file (GLOB source_files_builtin "${tests_dir}/builtin.cpp") diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 0684d2c6..3b62572b 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -77,6 +77,9 @@ void RAM::load_balance() for (u32 i=0; i < ram_relation_count; i++) { relation* current_relation = ram_relations[i]; + if (!current_relation->balance_flag) { + continue; + } if (current_relation->load_balance_merge_full_and_delta(refinement_factor) == false) current_relation->load_balance_split_full_and_delta(refinement_factor); @@ -674,6 +677,7 @@ bool RAM::local_compute(int* offset) << " Size Sync Time >>> " << 
size_sync_time << " Input Size >>> " << ibf_size << " Target Count >>> " << jtarget_size + << " Join Count >>> " << total_join_tuples << std::endl; } diff --git a/backend/src/hash/fasthash.cpp b/backend/src/hash/fasthash.cpp new file mode 100644 index 00000000..c60c9501 --- /dev/null +++ b/backend/src/hash/fasthash.cpp @@ -0,0 +1,50 @@ +#include "fasthash.h" + +// Compression function for Merkle-Damgard construction. +// This function is generated using the framework provided. +#define mix(h) ({ \ + (h) ^= (h) >> 23; \ + (h) *= 0x2127599bf4325c37ULL; \ + (h) ^= (h) >> 47; }) + +uint64_t fasthash64(const void *buf, size_t len, uint64_t seed) +{ + const uint64_t m = 0x880355f21e6d1965ULL; + const uint64_t *pos = (const uint64_t *)buf; + const uint64_t *end = pos + (len / 8); + const unsigned char *pos2; + uint64_t h = seed ^ (len * m); + uint64_t v; + + while (pos != end) { + v = *pos++; + h ^= mix(v); + h *= m; + } + + pos2 = (const unsigned char*)pos; + v = 0; + + switch (len & 7) { + case 7: v ^= (uint64_t)pos2[6] << 48; + case 6: v ^= (uint64_t)pos2[5] << 40; + case 5: v ^= (uint64_t)pos2[4] << 32; + case 4: v ^= (uint64_t)pos2[3] << 24; + case 3: v ^= (uint64_t)pos2[2] << 16; + case 2: v ^= (uint64_t)pos2[1] << 8; + case 1: v ^= (uint64_t)pos2[0]; + h ^= mix(v); + h *= m; + } + + return mix(h); +} + +uint32_t fasthash32(const void *buf, size_t len, uint32_t seed) +{ + // the following trick converts the 64-bit hashcode to Fermat + // residue, which shall retain information from both the higher + // and lower parts of hashcode. 
+ uint64_t h = fasthash64(buf, len, seed); + return h - (h >> 32); +} \ No newline at end of file diff --git a/backend/src/hash/fasthash.h b/backend/src/hash/fasthash.h new file mode 100644 index 00000000..042387a9 --- /dev/null +++ b/backend/src/hash/fasthash.h @@ -0,0 +1,52 @@ +/* The MIT License + Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com) + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef _FASTHASH_H +#define _FASTHASH_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * fasthash32 - 32-bit implementation of fasthash + * @buf: data buffer + * @len: data size + * @seed: the seed + */ + uint32_t fasthash32(const void *buf, size_t len, uint32_t seed); + +/** + * fasthash64 - 64-bit implementation of fasthash + * @buf: data buffer + * @len: data size + * @seed: the seed + */ + uint64_t fasthash64(const void *buf, size_t len, uint64_t seed); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/backend/src/hash/hash.cpp b/backend/src/hash/hash.cpp index 9b9dcf1d..364d0efd 100644 --- a/backend/src/hash/hash.cpp +++ b/backend/src/hash/hash.cpp @@ -1 +1,28 @@ -#include "parallel_RA_inc.h" +#include "hash.h" + +#include "fasthash.h" +#include "spooky-c.h" +#include "xxhash.h" +#include + +uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len) +{ + return fnv1a(start_ptr, prefix_len); + // return MurmurHash64A(start_ptr, prefix_len*8, MURMUR_SEED); + // return spooky_hash64(start_ptr, prefix_len*8, MURMUR_SEED); + // return fasthash64(start_ptr, prefix_len*8, 10); + // return XXH64(start_ptr, prefix_len*8, 10); +} + +std::vector tuple_hash_test_all(const uint64_t* start_ptr, uint64_t prefix_len) { + std::vector all_hash_v; + all_hash_v.push_back(start_ptr[0]); + all_hash_v.push_back(fnv1a(start_ptr, prefix_len)); + all_hash_v.push_back(hash64shift(start_ptr)); + all_hash_v.push_back(spooky_hash64(start_ptr, prefix_len*8, 1)); + all_hash_v.push_back(fasthash64(start_ptr, prefix_len*8, 1)); + // all_hash_v.push_back(XXH64(start_ptr, prefix_len*8, 1)); + all_hash_v.push_back(XXH32(start_ptr, prefix_len*4, 1)); + return all_hash_v; +} + diff --git a/backend/src/hash/hash.h b/backend/src/hash/hash.h index ab7f1951..aa9a6b8a 100644 --- a/backend/src/hash/hash.h +++ b/backend/src/hash/hash.h @@ -8,33 +8,140 @@ /// Based on the FNV-1a hash function -inline u64 tuple_hash(const u64* 
start_ptr, u64 prefix_len) + +#include +#include +#include +// #include +#define MURMUR_SEED 7917 + +///FNV-1a +inline uint64_t fnv1a(const uint64_t* start_ptr, uint64_t prefix_len) { - const u64 base = 14695981039346656037ULL; - const u64 prime = 1099511628211ULL; + const uint64_t base = 14695981039346656037ULL; + const uint64_t prime = 1099511628211ULL; - u64 hash = base; - for (u64 i = 0; i < prefix_len; ++i) + uint64_t hash = base; + for (uint64_t i = 0; i < prefix_len; ++i) { - u64 chunk = start_ptr[i]; + uint64_t chunk = start_ptr[i]; hash ^= chunk & 255ULL; hash *= prime; for (char j = 0; j < 7; ++j) { chunk = chunk >> 8; hash ^= chunk & 255ULL; + if ((chunk & 255ULL) == 0) + continue; hash *= prime; } } return hash; } +inline uint64_t nonhash1(const uint64_t* start_ptr, uint64_t prefix_len) +{ + // range base split on first column, + return start_ptr[0]; +} + + +// murmurhash +#if defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +static inline uint64_t getblock ( const uint64_t * p ) +{ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return *p; +#else + const uint8_t *c = (const uint8_t *)p; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; +#endif +} + +inline uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ) +{ + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = getblock(data++); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= 
uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint64_t hash64shift(const uint64_t* keys) +{ + uint64_t key = keys[0]; + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; +} + + +uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len); +std::vector tuple_hash_test_all(const uint64_t* start_ptr, uint64_t prefix_len); + // change this to compile time? -inline u32 string_hash(const std::string& str) { - const u32 base = 2166136261u; - const u32 prime = 16777619u; +inline uint32_t string_hash(const std::string& str) { + const uint32_t base = 2166136261u; + const uint32_t prime = 16777619u; - u32 hash = base; + uint32_t hash = base; for (char c: str) { if ((int)c == 0) continue; @@ -43,3 +150,4 @@ inline u32 string_hash(const std::string& str) { } return hash; } + diff --git a/backend/src/hash/spooky-c.cpp b/backend/src/hash/spooky-c.cpp new file mode 100644 index 00000000..6e8f8c2c --- /dev/null +++ b/backend/src/hash/spooky-c.cpp @@ -0,0 +1,598 @@ + +// A C version of Bob Jenkins' spooky hash +// Spooky Hash +// A 128-bit noncryptographic hash, for checksums and table lookup +// By Bob Jenkins. Bob's version was under Public Domain +// The C version is under the BSD license +// * Copyright (c) 2014, Spooky Contributors +// * All rights reserved. +// * +// * Redistribution and use in source and binary forms, with or without +// * modification, are permitted provided that the following conditions are met: +// * +// * 1. 
Redistributions of source code must retain the above copyright notice, +// * this list of conditions and the following disclaimer. +// * +// * 2. Redistributions in binary form must reproduce the above copyright +// * notice, this list of conditions and the following disclaimer in the +// * documentation and/or other materials provided with the distribution. +// * +// * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +// * OF THE POSSIBILITY OF SUCH DAMAGE. +// Oct 31 2010: published framework, disclaimer ShortHash isn't right +// Nov 7 2010: disabled ShortHash +// Oct 11 2011: C version ported by Andi Kleen (andikleen@github) +// Oct 31 2011: replace End, ShortMix, ShortEnd, enable ShortHash again +// Apr 10 2012: buffer overflow on platforms without unaligned reads +// Apr 27 2012: C version updated by Ziga Zupanec ziga.zupanec@gmail.com (agiz@github) +// Update to spooky V2: d = should be d += in short hash, and remove extra mix from long hash +// (note results have changed from this change) + +// Assumes little endian ness. Caller has to check this case. +// According to Bob it should work on LE too, but just give different results. + + + +/* + * If this is an autoconf build, then use the unaligned access autoconf test to + * determine this. 
Otherwise, fall back on using the arch macros provided by + * the compiler. + */ +#ifdef HAVE_CONFIG_H +# include +# ifndef HAVE_ALIGNED_ACCESS_REQUIRED +# define ALLOW_UNALIGNED_READS 1 +# else +# define ALLOW_UNALIGNED_READS 0 +# endif +#else +# if defined(__i386__) || defined(__x86_64__) // add more architectures here +# define ALLOW_UNALIGNED_READS 1 +# else +# define ALLOW_UNALIGNED_READS 0 +# endif +#endif /* HAVE_CONFIG_H */ + +#include + +#include "spooky-c.h" + +// SC_CONST: a constant which: +// * is not zero +// * is odd +// * is a not-very-regular mix of 1's and 0's +// * does not need any other special mathematical properties +#define SC_CONST 0xdeadbeefdeadbeefLL + +static inline uint64_t rot64(uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +// +// This is used if the input is 96 bytes long or longer. +// +// The internal state is fully overwritten every 96 bytes. +// Every input bit appears to cause at least 128 bits of entropy +// before 96 other bytes are combined, when run forward or backward +// For every input bit, +// Two inputs differing in just that input bit +// Where "differ" means xor or subtraction +// And the base value is random +// When run forward or backwards one Mix +// I tried 3 pairs of each; they all differed by at least 212 bits. 
+// +static inline void mix +( + const uint64_t *data, + uint64_t *s0, uint64_t *s1, uint64_t *s2, uint64_t *s3, + uint64_t *s4, uint64_t *s5, uint64_t *s6, uint64_t *s7, + uint64_t *s8, uint64_t *s9, uint64_t *s10, uint64_t *s11 +) +{ + *s0 += data[0]; *s2 ^= *s10; *s11 ^= *s0; *s0 = rot64(*s0, 11); *s11 += *s1; + *s1 += data[1]; *s3 ^= *s11; *s0 ^= *s1; *s1 = rot64(*s1, 32); *s0 += *s2; + *s2 += data[2]; *s4 ^= *s0; *s1 ^= *s2; *s2 = rot64(*s2, 43); *s1 += *s3; + *s3 += data[3]; *s5 ^= *s1; *s2 ^= *s3; *s3 = rot64(*s3, 31); *s2 += *s4; + *s4 += data[4]; *s6 ^= *s2; *s3 ^= *s4; *s4 = rot64(*s4, 17); *s3 += *s5; + *s5 += data[5]; *s7 ^= *s3; *s4 ^= *s5; *s5 = rot64(*s5, 28); *s4 += *s6; + *s6 += data[6]; *s8 ^= *s4; *s5 ^= *s6; *s6 = rot64(*s6, 39); *s5 += *s7; + *s7 += data[7]; *s9 ^= *s5; *s6 ^= *s7; *s7 = rot64(*s7, 57); *s6 += *s8; + *s8 += data[8]; *s10 ^= *s6; *s7 ^= *s8; *s8 = rot64(*s8, 55); *s7 += *s9; + *s9 += data[9]; *s11 ^= *s7; *s8 ^= *s9; *s9 = rot64(*s9, 54); *s8 += *s10; + *s10 += data[10]; *s0 ^= *s8; *s9 ^= *s10; *s10 = rot64(*s10, 22); *s9 += *s11; + *s11 += data[11]; *s1 ^= *s9; *s10 ^= *s11; *s11 = rot64(*s11, 46); *s10 += *s0; +} + +// +// Mix all 12 inputs together so that h0, h1 are a hash of them all. +// +// For two inputs differing in just the input bits +// Where "differ" means xor or subtraction +// And the base value is random, or a counting value starting at that bit +// The final result will have each bit of h0, h1 flip +// For every input bit, +// with probability 50 +- .3% +// For every pair of input bits, +// with probability 50 +- 3% +// +// This does not rely on the last Mix() call having already mixed some. +// Two iterations was almost good enough for a 64-bit result, but a +// 128-bit result is reported, so End() does three iterations. 
+// +static inline void endPartial +( + uint64_t *h0, uint64_t *h1, uint64_t *h2, uint64_t *h3, + uint64_t *h4, uint64_t *h5, uint64_t *h6, uint64_t *h7, + uint64_t *h8, uint64_t *h9, uint64_t *h10, uint64_t *h11 +) +{ + *h11+= *h1; *h2 ^= *h11; *h1 = rot64(*h1, 44); + *h0 += *h2; *h3 ^= *h0; *h2 = rot64(*h2, 15); + *h1 += *h3; *h4 ^= *h1; *h3 = rot64(*h3, 34); + *h2 += *h4; *h5 ^= *h2; *h4 = rot64(*h4, 21); + *h3 += *h5; *h6 ^= *h3; *h5 = rot64(*h5, 38); + *h4 += *h6; *h7 ^= *h4; *h6 = rot64(*h6, 33); + *h5 += *h7; *h8 ^= *h5; *h7 = rot64(*h7, 10); + *h6 += *h8; *h9 ^= *h6; *h8 = rot64(*h8, 13); + *h7 += *h9; *h10^= *h7; *h9 = rot64(*h9, 38); + *h8 += *h10; *h11^= *h8; *h10= rot64(*h10, 53); + *h9 += *h11; *h0 ^= *h9; *h11= rot64(*h11, 42); + *h10+= *h0; *h1 ^= *h10; *h0 = rot64(*h0, 54); +} + +static inline void end +( + uint64_t *h0, uint64_t *h1, uint64_t *h2, uint64_t *h3, + uint64_t *h4, uint64_t *h5, uint64_t *h6, uint64_t *h7, + uint64_t *h8, uint64_t *h9, uint64_t *h10, uint64_t *h11 +) +{ + endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); + endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); + endPartial(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11); +} + +// +// The goal is for each bit of the input to expand into 128 bits of +// apparent entropy before it is fully overwritten. 
+// n trials both set and cleared at least m bits of h0 h1 h2 h3 +// n: 2 m: 29 +// n: 3 m: 46 +// n: 4 m: 57 +// n: 5 m: 107 +// n: 6 m: 146 +// n: 7 m: 152 +// when run forwards or backwards +// for all 1-bit and 2-bit diffs +// with diffs defined by either xor or subtraction +// with a base of all zeros plus a counter, or plus another bit, or random +// +static inline void short_mix +( + uint64_t *h0, + uint64_t *h1, + uint64_t *h2, + uint64_t *h3 +) +{ + *h2 = rot64(*h2, 50); *h2 += *h3; *h0 ^= *h2; + *h3 = rot64(*h3, 52); *h3 += *h0; *h1 ^= *h3; + *h0 = rot64(*h0, 30); *h0 += *h1; *h2 ^= *h0; + *h1 = rot64(*h1, 41); *h1 += *h2; *h3 ^= *h1; + *h2 = rot64(*h2, 54); *h2 += *h3; *h0 ^= *h2; + *h3 = rot64(*h3, 48); *h3 += *h0; *h1 ^= *h3; + *h0 = rot64(*h0, 38); *h0 += *h1; *h2 ^= *h0; + *h1 = rot64(*h1, 37); *h1 += *h2; *h3 ^= *h1; + *h2 = rot64(*h2, 62); *h2 += *h3; *h0 ^= *h2; + *h3 = rot64(*h3, 34); *h3 += *h0; *h1 ^= *h3; + *h0 = rot64(*h0, 5); *h0 += *h1; *h2 ^= *h0; + *h1 = rot64(*h1, 36); *h1 += *h2; *h3 ^= *h1; +} + +// +// Mix all 4 inputs together so that h0, h1 are a hash of them all. 
+// +// For two inputs differing in just the input bits +// Where "differ" means xor or subtraction +// And the base value is random, or a counting value starting at that bit +// The final result will have each bit of h0, h1 flip +// For every input bit, +// with probability 50 +- .3% (it is probably better than that) +// For every pair of input bits, +// with probability 50 +- .75% (the worst case is approximately that) +// +static inline void short_end +( + uint64_t *h0, + uint64_t *h1, + uint64_t *h2, + uint64_t *h3 +) +{ + *h3 ^= *h2; *h2 = rot64(*h2, 15); *h3 += *h2; + *h0 ^= *h3; *h3 = rot64(*h3, 52); *h0 += *h3; + *h1 ^= *h0; *h0 = rot64(*h0, 26); *h1 += *h0; + *h2 ^= *h1; *h1 = rot64(*h1, 51); *h2 += *h1; + *h3 ^= *h2; *h2 = rot64(*h2, 28); *h3 += *h2; + *h0 ^= *h3; *h3 = rot64(*h3, 9); *h0 += *h3; + *h1 ^= *h0; *h0 = rot64(*h0, 47); *h1 += *h0; + *h2 ^= *h1; *h1 = rot64(*h1, 54); *h2 += *h1; + *h3 ^= *h2; *h2 = rot64(*h2, 32); *h3 += *h2; + *h0 ^= *h3; *h3 = rot64(*h3, 25); *h0 += *h3; + *h1 ^= *h0; *h0 = rot64(*h0, 63); *h1 += *h0; +} + +void spooky_shorthash +( + const void *message, + size_t length, + uint64_t *hash1, + uint64_t *hash2 +) +{ + uint64_t buf[2 * SC_NUMVARS]; + union + { + const uint8_t *p8; + uint32_t *p32; + uint64_t *p64; + size_t i; + } u; + size_t remainder; + uint64_t a, b, c, d; + u.p8 = (const uint8_t *)message; + + if (!ALLOW_UNALIGNED_READS && (u.i & 0x7)) + { + memcpy(buf, message, length); + u.p64 = buf; + } + + remainder = length % 32; + a = *hash1; + b = *hash2; + c = SC_CONST; + d = SC_CONST; + + if (length > 15) + { + const uint64_t *endp = u.p64 + (length/32)*4; + + // handle all complete sets of 32 bytes + for (; u.p64 < endp; u.p64 += 4) + { + c += u.p64[0]; + d += u.p64[1]; + short_mix(&a, &b, &c, &d); + a += u.p64[2]; + b += u.p64[3]; + } + + // Handle the case of 16+ remaining bytes. 
+ if (remainder >= 16) + { + c += u.p64[0]; + d += u.p64[1]; + short_mix(&a, &b, &c, &d); + u.p64 += 2; + remainder -= 16; + } + } + + // Handle the last 0..15 bytes, and its length + d += ((uint64_t)length) << 56; + switch (remainder) + { + case 15: + d += ((uint64_t)u.p8[14]) << 48; + case 14: + d += ((uint64_t)u.p8[13]) << 40; + case 13: + d += ((uint64_t)u.p8[12]) << 32; + case 12: + d += u.p32[2]; + c += u.p64[0]; + break; + case 11: + d += ((uint64_t)u.p8[10]) << 16; + case 10: + d += ((uint64_t)u.p8[9]) << 8; + case 9: + d += (uint64_t)u.p8[8]; + case 8: + c += u.p64[0]; + break; + case 7: + c += ((uint64_t)u.p8[6]) << 48; + case 6: + c += ((uint64_t)u.p8[5]) << 40; + case 5: + c += ((uint64_t)u.p8[4]) << 32; + case 4: + c += u.p32[0]; + break; + case 3: + c += ((uint64_t)u.p8[2]) << 16; + case 2: + c += ((uint64_t)u.p8[1]) << 8; + case 1: + c += (uint64_t)u.p8[0]; + break; + case 0: + c += SC_CONST; + d += SC_CONST; + } + short_end(&a, &b, &c, &d); + *hash1 = a; + *hash2 = b; +} + +void spooky_init +( + struct spooky_state *state, + uint64_t seed1, + uint64_t seed2 +) +{ + state->m_length = 0; + state->m_remainder = 0; + state->m_state[0] = seed1; + state->m_state[1] = seed2; +} + +void spooky_update +( + struct spooky_state *state, + const void *message, + size_t length +) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + size_t newLength = length + state->m_remainder; + uint8_t remainder; + union + { + const uint8_t *p8; + uint64_t *p64; + size_t i; + } u; + const uint64_t *endp; + + // Is this message fragment too short? If it is, stuff it away. 
+ if (newLength < SC_BUFSIZE) + { + memcpy(&((uint8_t *)state->m_data)[state->m_remainder], message, length); + state->m_length = length + state->m_length; + state->m_remainder = (uint8_t)newLength; + return; + } + + // init the variables + if (state->m_length < SC_BUFSIZE) + { + h0 = h3 = h6 = h9 = state->m_state[0]; + h1 = h4 = h7 = h10 = state->m_state[1]; + h2 = h5 = h8 = h11 = SC_CONST; + } + else + { + h0 = state->m_state[0]; + h1 = state->m_state[1]; + h2 = state->m_state[2]; + h3 = state->m_state[3]; + h4 = state->m_state[4]; + h5 = state->m_state[5]; + h6 = state->m_state[6]; + h7 = state->m_state[7]; + h8 = state->m_state[8]; + h9 = state->m_state[9]; + h10 = state->m_state[10]; + h11 = state->m_state[11]; + } + state->m_length = length + state->m_length; + + // if we've got anything stuffed away, use it now + if (state->m_remainder) + { + uint8_t prefix = SC_BUFSIZE-state->m_remainder; + memcpy(&(((uint8_t *)state->m_data)[state->m_remainder]), message, prefix); + u.p64 = state->m_data; + mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + mix(&u.p64[SC_NUMVARS], &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + u.p8 = ((const uint8_t *)message) + prefix; + length -= prefix; + } + else + { + u.p8 = (const uint8_t *)message; + } + + // handle all whole blocks of SC_BLOCKSIZE bytes + endp = u.p64 + (length/SC_BLOCKSIZE)*SC_NUMVARS; + remainder = (uint8_t)(length-((const uint8_t *)endp - u.p8)); + if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0) + { + while (u.p64 < endp) + { + mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + u.p64 += SC_NUMVARS; + } + } + else + { + while (u.p64 < endp) + { + memcpy(state->m_data, u.p8, SC_BLOCKSIZE); + mix(state->m_data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + u.p64 += SC_NUMVARS; + } + } + + // stuff away the last few bytes + state->m_remainder = remainder; + memcpy(state->m_data, endp, remainder); + + // stuff away the variables + 
state->m_state[0] = h0; + state->m_state[1] = h1; + state->m_state[2] = h2; + state->m_state[3] = h3; + state->m_state[4] = h4; + state->m_state[5] = h5; + state->m_state[6] = h6; + state->m_state[7] = h7; + state->m_state[8] = h8; + state->m_state[9] = h9; + state->m_state[10] = h10; + state->m_state[11] = h11; +} + +void spooky_final +( + struct spooky_state *state, + uint64_t *hash1, + uint64_t *hash2 +) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + const uint64_t *data = (const uint64_t *)state->m_data; + uint8_t remainder = state->m_remainder; + + // init the variables + if (state->m_length < SC_BUFSIZE) + { + spooky_shorthash(state->m_data, state->m_length, hash1, hash2); + return; + } + + h0 = state->m_state[0]; + h1 = state->m_state[1]; + h2 = state->m_state[2]; + h3 = state->m_state[3]; + h4 = state->m_state[4]; + h5 = state->m_state[5]; + h6 = state->m_state[6]; + h7 = state->m_state[7]; + h8 = state->m_state[8]; + h9 = state->m_state[9]; + h10 = state->m_state[10]; + h11 = state->m_state[11]; + + if (remainder >= SC_BLOCKSIZE) + { + // m_data can contain two blocks; handle any whole first block + mix(data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + data += SC_NUMVARS; + remainder -= SC_BLOCKSIZE; + } + + // mix in the last partial block, and the length mod SC_BLOCKSIZE + memset(&((uint8_t *)data)[remainder], 0, (SC_BLOCKSIZE-remainder)); + + ((uint8_t *)data)[SC_BLOCKSIZE-1] = remainder; + mix(data, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + + // do some final mixing + end(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + + *hash1 = h0; + *hash2 = h1; +} + +void spooky_hash128 +( + const void *message, + size_t length, + uint64_t *hash1, + uint64_t *hash2 +) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + uint64_t buf[SC_NUMVARS]; + uint64_t *endp; + union + { + const uint8_t *p8; + uint64_t *p64; + uintptr_t i; + } u; + size_t remainder; + + if (length < 
SC_BUFSIZE) + { + spooky_shorthash(message, length, hash1, hash2); + return; + } + + h0 = h3 = h6 = h9 = *hash1; + h1 = h4 = h7 = h10 = *hash2; + h2 = h5 = h8 = h11 = SC_CONST; + + u.p8 = (const uint8_t *)message; + endp = u.p64 + (length/SC_BLOCKSIZE)*SC_NUMVARS; + + // handle all whole blocks of SC_BLOCKSIZE bytes + if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0) + { + while (u.p64 < endp) + { + mix(u.p64, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + u.p64 += SC_NUMVARS; + } + } + else + { + while (u.p64 < endp) + { + memcpy(buf, u.p64, SC_BLOCKSIZE); + mix(buf, &h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + u.p64 += SC_NUMVARS; + } + } + + // handle the last partial block of SC_BLOCKSIZE bytes + remainder = (length - ((const uint8_t *)endp-(const uint8_t *)message)); + memcpy(buf, endp, remainder); + memset(((uint8_t *)buf)+remainder, 0, SC_BLOCKSIZE-remainder); + ((uint8_t *)buf)[SC_BLOCKSIZE-1] = remainder; + + // do some final mixing + end(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &h8, &h9, &h10, &h11); + *hash1 = h0; + *hash2 = h1; +} + +uint64_t spooky_hash64 +( + const void *message, + size_t length, + uint64_t seed +) +{ + uint64_t hash1 = seed; + spooky_hash128(message, length, &hash1, &seed); + return hash1; +} + +uint32_t spooky_hash32 +( + const void *message, + size_t length, + uint32_t seed +) +{ + uint64_t hash1 = seed, hash2 = seed; + spooky_hash128(message, length, &hash1, &hash2); + return (uint32_t)hash1; +} \ No newline at end of file diff --git a/backend/src/hash/spooky-c.h b/backend/src/hash/spooky-c.h new file mode 100644 index 00000000..9cd60e05 --- /dev/null +++ b/backend/src/hash/spooky-c.h @@ -0,0 +1,94 @@ +// SpookyHash: a 128-bit noncryptographic hash function +// By Bob Jenkins, public domain +// Oct 31 2010: alpha, framework + SpookyHash::Mix appears right +// Oct 11 2011: C version ported by Andi Kleen (andikleen@github) +// Oct 31 2011: alpha again, Mix only good to 2^^69 but rest appears right 
+// Dec 31 2011: beta, improved Mix, tested it for 2-bit deltas +// Feb 2 2012: production, same bits as beta +// Feb 5 2012: adjusted definitions of uint* to be more portable +// Mar 30 2012: 3 bytes/cycle, not 4. Alpha was 4 but wasn't thorough enough. +// Apr 27 2012: C version updated by Ziga Zupanec ziga.zupanec@gmail.com (agiz@github) +// +// Up to 3 bytes/cycle for long messages. Reasonably fast for short messages. +// All 1 or 2 bit deltas achieve avalanche within 1% bias per output bit. +// +// This was developed for and tested on 64-bit x86-compatible processors. +// It assumes the processor is little-endian. There is a macro +// controlling whether unaligned reads are allowed (by default they are). +// This should be an equally good hash on big-endian machines, but it will +// compute different results on them than on little-endian machines. +// +// Google's CityHash has similar specs to SpookyHash, and CityHash is faster +// on some platforms. MD4 and MD5 also have similar specs, but they are orders +// of magnitude slower. CRCs are two or more times slower, but unlike +// SpookyHash, they have nice math for combining the CRCs of pieces to form +// the CRCs of wholes. There are also cryptographic hashes, but those are even +// slower than MD5. 
+// + +#include +#include + +#define SC_NUMVARS 12 +#define SC_BLOCKSIZE (8 * SC_NUMVARS) +#define SC_BUFSIZE (2 * SC_BLOCKSIZE) + +struct spooky_state +{ + uint64_t m_data[2 * SC_NUMVARS]; + uint64_t m_state[SC_NUMVARS]; + size_t m_length; + unsigned char m_remainder; +}; + +void spooky_shorthash +( + const void *message, + size_t length, + uint64_t *hash1, + uint64_t *hash2 +); + +void spooky_init +( + struct spooky_state *state, + uint64_t hash1, + uint64_t hash2 +); + +void spooky_update +( + struct spooky_state *state, + const void *msg, + size_t len +); + +void spooky_final +( + struct spooky_state *state, + uint64_t *hash1, + uint64_t *hash2 +); + +//hash1/2 doubles as input parameter for seed1/2 and output for hash1/2 +void spooky_hash128 +( + const void *message, + size_t length, + uint64_t *hash1, + uint64_t *hash2 +); + +uint64_t spooky_hash64 +( + const void *message, + size_t len, + uint64_t seed +); + +uint32_t spooky_hash32 +( + const void *message, + size_t len, + uint32_t seed +); \ No newline at end of file diff --git a/backend/src/hash/xxhash.cpp b/backend/src/hash/xxhash.cpp new file mode 100644 index 00000000..267cbe79 --- /dev/null +++ b/backend/src/hash/xxhash.cpp @@ -0,0 +1,5 @@ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/backend/src/hash/xxhash.h b/backend/src/hash/xxhash.h new file mode 100644 index 00000000..2a70a8bc --- /dev/null +++ b/backend/src/hash/xxhash.h @@ -0,0 +1,6290 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions 
and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. 
+ * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. + * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. 
+ * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. 
+ * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. 
+ * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. 
+ */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, 
XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define 
XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 1 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. 
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * See @ref single_shot_example "Single Shot Example" for an example. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. 
However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * @see streaming_example at the top of @ref xxhash.h for an example. + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. 
+ */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! 
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This is the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif + +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif + +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif + +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. 
+ */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint64_t XXH64_hash_t; +#else +# include +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. 
+ * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. 
+ * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR + * macro. For the x86 family, an automatic dispatcher is included separately + * in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief 64-bit unseeded variant of XXH3. + * + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see + * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length); + +/*! + * @brief 64-bit seeded variant of XXH3 + * + * This variant generates a custom secret on the fly based on default secret + * altered using the `seed` value. + * + * While this operation is decently fast, note that it's not completely free.
+ * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * @param input The data to hash + * @param length The length + * @param seed The 64-bit seed to alter the state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief 64-bit variant of XXH3 with a custom "secret". + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. 
+ * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + +/* + * XXH3_64bits_reset(): + * Initialize with default parameters. + * digest will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). 
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Unseeded 128-bit variant of XXH3 + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see + * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len); +/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); +/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). 
*/ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* Following helper functions make it possible to compare XXH128_hash_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ * + * @return: >0 if *h128_1 > *h128_2 + * =0 if *h128_1 == *h128_2 + * <0 if *h128_1 < *h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. 
+ * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. 
*/ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. 
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number of stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } + + +/*! + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. 
It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. + * + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1]. + * int main(int argc, char* argv[]) + * { + * char secret[XXH3_SECRET_SIZE_MIN]; + * if (argc != 3) { return 1; } + * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); + * XXH64_hash_t h = XXH3_64bits_withSecret( + * argv[2], strlen(argv[2]), + * secret, sizeof(secret) + * ); + * printf("%016llx\n", (unsigned long long) h); + * } + * @endcode + */ +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize); + +/*! + * @brief Generate the same secret as the _withSeed() variants. + * + * The generated secret can be used in combination with + *`*_withSecret()` and `_withSecretandSeed()` variants. + * + * Example C++ `std::string` hash class: + * @code{.cpp} + * #include <string> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Slow, seeds each time + * class HashSlow { + * XXH64_hash_t seed; + * public: + * HashSlow(XXH64_hash_t s) : seed{s} {} + * size_t operator()(const std::string& x) const { + * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; + * } + * }; + * // Fast, caches the seeded secret for future uses.
+ * class HashFast { + * unsigned char secret[XXH3_SECRET_SIZE_MIN]; + * public: + * HashFast(XXH64_hash_t s) { + * XXH3_generateSecret_fromSeed(secret, s); + * } + * size_t operator()(const std::string& x) const { + * return size_t{ + * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) + * }; + * } + * }; + * @endcode + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes + * @param seed The seed to seed the state. + */ +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed); + +/*! + * These variants generate hash values using either + * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) + * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. + * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(const void* data, size_t len, + const void* secret, size_t secretSize, + XXH64_hash_t seed); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* input, size_t length, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#ifndef XXH_NO_STREAM +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +/*! @copydoc XXH3_64bits_withSecretandSeed() */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, + const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. 
+ * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. 
+ * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy use of forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code.
When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). 
+ */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control over whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only likely to make a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*!
+ * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define XXH32_ENDJMP 0 +#endif + +/*! + * @defgroup impl Implementation + * @{ + */ + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoke malloc / free. + * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL.
+ * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> /* malloc, free */ + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> /* memcpy, memset */ + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# 
define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. + */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# define XXH_ASSERT(c) ((void)0) +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere.
+ * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD_W(var) __asm__ __volatile__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_W(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> /* uint8_t */ + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*!
+ * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. 
+ */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. 
+ */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. 
+ * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation.
+ * + * - Four instructions are required to rotate, + * movdqa tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // v >>= 19 + * por v, tmp // v |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize().
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! 
+ * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const 
xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @ingroup XXH32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! 
@ingroup XXH32_family
+ *
+ * Inverse of XXH32_canonicalFromHash(): returns what XXH_readBE32() reads
+ * from @p src.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ * (No XXH_read64() is defined in this configuration.)
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;   /* plain dereference: result is in native byte order */
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;   /* 1-byte-aligned alias of xxh_u64 */
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ *
+ * Copies sizeof(xxh_u64) bytes from @p memPtr with XXH_memcpy() and returns
+ * them without any byte swapping.
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)   /* portable fallback: reverses the eight bytes of x with shifts and masks */
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)   /* bytePtr[0] is the least significant byte */
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)   /* bytePtr[0] is the most significant byte */
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)   /* native read, byte-swapped when the CPU is not little endian */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)   /* native read, byte-swapped when the CPU is little endian */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else   /* any other align value: ptr is dereferenced directly as a xxh_u64 */
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)   /* one lane step: multiply-add, rotate left by 31, multiply */
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)   /* folds one lane value (val) into the combined hash (acc) */
+{
+    val  = XXH64_round(0, val);   /* run val through a round that starts from a zero accumulator */
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)   /* final mix: alternating xor-shift and multiply steps */
+{
+    hash ^= hash >> 33;
+    hash *= XXH_PRIME64_2;
+    hash ^= hash >> 29;
+    hash *= XXH_PRIME64_3;
+    hash ^= hash >> 32;
+    return hash;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)   /* relies on a variable named `align` being in scope at the use site */
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length. Only the low 5 bits are used: the function
+ *            reduces it modulo 32 itself, so a total length may be passed.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;                /* keep only the tail length (0-31) */
+    while (len >= 8) {        /* consume the tail 8 bytes at a time */
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {           /* at most one 4-byte group can remain */
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {         /* remaining 0-3 single bytes */
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! 
@ingroup XXH64_family
+ *
+ * Starts a new hash: asserts @p statePtr is non-NULL, zero-fills the state,
+ * then derives the four lane accumulators v[0..3] from @p seed.
+ * Always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));   /* also clears total_len, mem64 and memsize */
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[2] = seed + 0;                /* XXH64_digest() reads v[2] back as the seed for short inputs */
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family
+ *
+ * Feeds @p len bytes of @p input into the running hash.
+ *
+ * - A NULL @p input is a no-op returning XXH_OK (with an assertion that
+ *   @p len is 0).
+ * - While fewer than 32 bytes are pending, the input is only appended to the
+ *   state's mem64 buffer.
+ * - Otherwise a partly filled mem64 is first topped up to 32 bytes and
+ *   consumed, then whole 32-byte stripes are consumed straight from
+ *   @p input, and the leftover tail (fewer than 32 bytes) is stored in mem64
+ *   for the next call.
+ *
+ * Always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* fill in tmp buffer: not enough for a full stripe yet */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* tmp buffer holds pending bytes: top it up to a full 32-byte stripe and consume it */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;   /* skip the input bytes that were used to complete the stripe */
+            state->memsize = 0;
+        }
+
+        if (p+32 <= bEnd) {   /* at least one whole stripe is available directly in the input */
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* keep the tail for the next update or for the digest */
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*!
@ingroup XXH64_family
+ *
+ * Returns the hash of everything fed so far. @p state is const and is only
+ * read, so the value can be requested at any point of the stream.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {   /* at least one stripe went through the lanes: combine and merge them */
+        h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+        h64 = XXH64_mergeRound(h64, state->v[0]);
+        h64 = XXH64_mergeRound(h64, state->v[1]);
+        h64 = XXH64_mergeRound(h64, state->v[2]);
+        h64 = XXH64_mergeRound(h64, state->v[3]);
+    } else {
+        h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;
+
+    /* total_len is passed as the length; XXH64_finalize() reduces it to the 0-31 buffered bytes with `len &= 31` */
+    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @ingroup XXH64_family
+ *
+ * Byte-swaps @p hash when XXH_CPU_LITTLE_ENDIAN is true and then copies
+ * sizeof(*dst) bytes of it into @p dst.
+ */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+
+/*! @ingroup XXH64_family
+ *
+ * Inverse of XXH64_canonicalFromHash(): returns what XXH_readBE64() reads
+ * from @p src.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++.
Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || defined(__aarch64__) || defined(_M_ARM) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# endif +#endif + +#if defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. 
+ * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. 
+ */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. 
+ * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/*! 
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+ * {
+ *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ *     outHi = (uint32x2_t)(in >> 32);
+ *     in = UNDEFINED;
+ * }
+ *
+ * Do not rely on the value of `in` afterwards: the VZIP variant below rewrites
+ * it in place (read-write "+w" asm operand), whereas the fallback variant only
+ * reads it.
+ */
+# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+   && (defined(__GNUC__) || defined(__clang__)) \
+   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
+    do {                                                                                    \
+      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
+      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+      __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
+      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
+      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
+   } while (0)
+# else
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                            \
+    do {                                                                                  \
+      (outLo) = vmovn_u64    (in);                                                        \
+      (outHi) = vshrn_n_u64  ((in), 32);                                                  \
+    } while (0)
+# endif
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ *
+ * (The second variant below actually reinterprets the loaded bytes with
+ * `vreinterpretq_u64_u8`.)
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{
+    return *(uint64x2_t const*)ptr;
+}
+#else
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)   /* byte-wise load, then reinterpret as two 64-bit lanes */
+{
+    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
+}
+#endif
+/*!
+ * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and + * 2 lanes on scalar by default (except on Apple platforms, as Apple CPUs benefit + * from only using NEON). + * + * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the + * emulated 64-bit arithmetic is too slow. + * + * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't + * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, + * you are only using 2/3 of the CPU bandwidth. + * + * This is even more noticeable on the more advanced cores like the A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the + * remaining lanes will use scalar instructions. This improves the bandwidth + * and also gives the integer pipelines something to do besides twiddling loop + * counters and pointers. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. 
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6            /* hybrid: 6 lanes on NEON, the rest scalar */
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB   /* every lane on NEON */
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ *
+ * Reverses the bytes inside each 64-bit half of @p val through a vec_perm()
+ * with a fixed index table.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ *
+ * The load is an XXH_memcpy() of sizeof(xxh_u64x2) bytes from @p ptr; the swap
+ * (XXH_vec_revb()) is compiled in only when XXH_VSX_BE is non-zero.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h.
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum 
XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. 
+ * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. 
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. 
+ * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= 0x9FB21C651E98DF25ULL; + h64 ^= (h64 >> 35) + len ; + h64 *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * 
sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
+ */ +/* Hashes a 1 to 3 byte input: packs the first byte, the byte at index len>>1, the last byte and len into one 32-bit word (layout listed below), XORs it with (secret[0..3] ^ secret[4..7]) + seed, and finishes with XXH64_avalanche(). */ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +/* Hashes a 4 to 8 byte input from two 32-bit little-endian reads, one at the start and one ending at the last byte (they overlap when len < 8). The reads are joined into a 64-bit word, XORed with (secret[8..15] ^ secret[16..23]) - seed, and mixed by XXH3_rrmxmx() together with len. */ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + /* fold XXH_swap32() of the low 32 seed bits into the upper half of the seed */ seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +/* Hashes a 9 to 16 byte input from two 64-bit little-endian reads, one at the start and one ending at the last byte (they overlap when len < 16). Each read is XORed with its own secret-derived mask (one with seed added, one with seed subtracted); the result sums len, XXH_swap64(input_lo), input_hi and the folded 128-bit product of the two words, then goes through XXH3_avalanche(). */ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + 
xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +/* Entry point for inputs of at most 16 bytes: dispatches on len to the 9-16, 4-8 or 1-3 byte routine. For len == 0 the input is never read and the result is XXH64_avalanche() of seed XORed with two 64-bit secret words (offsets 56 and 64). */ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. 
+ */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1, acc_end; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); + acc_end = 0; +#else + acc += XXH3_mix16B(input+0, secret+0, seed); + acc_end = XXH3_mix16B(input+len-16, secret+16, seed); + if (len > 32) { + acc += XXH3_mix16B(input+16, secret+32, seed); + acc_end += XXH3_mix16B(input+len-32, secret+48, seed); + if (len > 64) { + acc += XXH3_mix16B(input+32, secret+64, seed); + acc_end += XXH3_mix16B(input+len-48, secret+80, seed); + + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc_end += XXH3_mix16B(input+len-64, secret+112, seed); + } + } + } +#endif + return XXH3_avalanche(acc + acc_end); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). 
+ * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * This macro generates an XXH3_accumulate() variant. + * Its single argument selects the name suffix; target attributes precede the invocation. + * + * The name of this symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. 
+ */ +/* Expands to the definition of XXH3_accumulate_##name: for each of nbStripes stripes of XXH_STRIPE_LEN input bytes it prefetches XXH_PREFETCH_DIST bytes past the stripe start and calls XXH3_accumulate_512_##name on the stripe, with the secret advanced by XXH_SECRET_CONSUME_RATE bytes per stripe. The expansion starts at the return type, so any inline/target attributes go in front of the invocation. */ #define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + +/* Stores v64 at dst in little-endian byte order: the value is byte-swapped first when XXH_CPU_LITTLE_ENDIAN is false, then sizeof(v64) bytes are copied with XXH_memcpy(), so dst needs no particular alignment for the copy itself. */ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + /* C++ or C99 and later (except VMS): use the fixed-width signed type */ typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. 
+ * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. 
+ * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) 
XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = 
_mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + 
XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { + uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + uint64x2_t acc_vec1 = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec1 = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec1 = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_21 = vextq_u64(data_vec1, data_vec1, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key1 = veorq_u64(data_vec1, key_vec1); + + uint64x2_t acc_vec2 = xacc[i+1]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_22 = vextq_u64(data_vec2, data_vec2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key2 = veorq_u64(data_vec2, key_vec2); + + /* data_key_lo = {(data_key1 & 0xFFFFFFFF), (data_key2 & 0xFFFFFFFF)}; + * data_key_hi = {(data_key1 >> 32), (data_key2 >> 32)}; + */ + uint32x4x2_t zipped = vuzpq_u32(vreinterpretq_u32_u64(data_key1), vreinterpretq_u32_u64(data_key2)); + uint32x4_t data_key_lo = zipped.val[0]; + uint32x4_t data_key_hi = zipped.val[1]; + + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_21 = vmlal_u32 (acc_vec_21, vget_low_u32(data_key_lo), vget_low_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_21); + /* xacc[i] += acc_vec_2; */ + acc_vec1 = vaddq_u64 (acc_vec1, acc_vec_21); + xacc[i] = acc_vec1; + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_22 = vmlal_u32 (acc_vec_22, vget_high_u32(data_key_lo), vget_high_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_22); + 
/* xacc[i] += acc_vec_2; */ + acc_vec2 = vaddq_u64 (acc_vec2, acc_vec_22); + xacc[i+1] = acc_vec2; + } + for (; i < XXH3_NEON_LANES / 2; i++) { + uint64x2_t acc_vec = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + data_key = veorq_u64(data_vec, key_vec); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi); + XXH_COMPILER_GUARD_W(acc_vec_2); + /* xacc[i] += acc_vec_2; */ + acc_vec = vaddq_u64 (acc_vec, acc_vec_2); + xacc[i] = acc_vec; + } + + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); + + size_t i; + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1 */ + uint32x2_t 
data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. + * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + prod_hi = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime); + } + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + unsigned int* const xacc = (unsigned int*) acc; + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = 
((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + /* xacc[i] = acc_vec; */ + vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const 
void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = 
svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. 
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (i=0; i < XXH_ACC_NB; i++) {   /* one XXH3_scalarRound() per accumulator lane, lanes 0 .. XXH_ACC_NB-1 */
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)   /* presumably expands to XXH3_accumulate_scalar(), the name bound by the
+                                                     * scalar dispatch entries; the macro is defined outside this chunk -- confirm */
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ *
+ * For the requested lane only: the accumulator is passed through
+ * XXH_xorshift64(acc, 47), XORed with the 64-bit value read at
+ * secret + lane * 8, multiplied by XXH_PRIME32_1 and stored back.
+ *
+ * @param acc    Accumulator array; asserted to be XXH_ACC_ALIGN-aligned.
+ * @param secret Secret bytes; read through XXH_readLE64(), no alignment required.
+ * @param lane   Index of the accumulator to scramble; asserted < XXH_ACC_NB.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ *
+ * Runs XXH3_scalarScrambleRound() on every lane from 0 to XXH_ACC_NB-1,
+ * all against the same @p secret pointer.
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * Builds the seeded secret from the default one: every 16-byte group of
+     * XXH3_kSecret is written to customSecret with seed64 added to its first
+     * 64-bit word and subtracted from its second (see the loop at the end).
+     * XXH_SECRET_DEFAULT_SIZE bytes are written in total.
+     *
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);   /* the loop below works in whole 16-byte groups */
+
+#if defined(__clang__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes Clang to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    /*
+     * Note: in debug mode, this overrides the asm optimization
+     * and Clang will emit MOVK chains again.
+     */
+    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes Clang to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);   /* (acc, input, secret, nbStripes) */
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);   /* (acc, secret) */
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);   /* (customSecret, seed64) */
+
+
+#if (XXH_VECTOR == XXH_AVX512)   /* bind the generic XXH3_* names to the implementation selected by XXH_VECTOR */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar   /* NEON builds reuse the scalar secret initializer */
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar   /* VSX builds reuse the scalar secret initializer */
+
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar   /* SVE builds reuse the scalar scrambler ... */
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar   /* ... and the scalar secret initializer */
+
+#else /* scalar */
+
+#define
XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
+#  undef XXH3_initCustomSecret
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif
+/*
+ * XXH3_hashLong_internal_loop():
+ * Feeds `len` bytes of `input` into the accumulators `acc`.
+ *
+ * The input is cut into blocks of nbStripesPerBlock stripes, i.e. as many
+ * XXH_STRIPE_LEN-byte stripes as the secret can serve when it advances by
+ * XXH_SECRET_CONSUME_RATE bytes per stripe. Each full block is accumulated
+ * with f_acc and then scrambled with f_scramble, keyed by the last
+ * XXH_STRIPE_LEN bytes of the secret. The whole stripes left after the last
+ * full block are accumulated without a scramble, and finally the last
+ * XXH_STRIPE_LEN bytes of input are accumulated once with
+ * XXH3_accumulate_512 against a dedicated secret window.
+ *
+ * Preconditions (asserted only): len > XXH_STRIPE_LEN and
+ * secretSize >= XXH3_SECRET_SIZE_MIN.
+ */
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;   /* len - 1: an input that is an exact multiple of block_len
+                                                       * leaves its last block to the tail handling below */
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe: always the final XXH_STRIPE_LEN bytes of input, wherever they start */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+/*
+ * XXH3_mix2Accs():
+ * Mixes two adjacent accumulators with 16 bytes of secret: acc[0] and acc[1]
+ * are each XORed with a 64-bit value read from `secret` (offsets 0 and 8)
+ * and the pair is handed to XXH3_mul128_fold64().
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+/*
+ * XXH3_mergeAccs():
+ * Collapses the accumulators into the final 64-bit hash: starting from
+ * `start`, adds XXH3_mix2Accs() of four consecutive accumulator pairs
+ * (acc[0..7]), each pair using the next 16 bytes of `secret`, then returns
+ * XXH3_avalanche() of the sum.
+ */
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable this hack */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+/*
+ * XXH3_hashLong_64b_internal():
+ * One-shot long-input hash. Starts from the XXH3_INIT_ACC constants, runs
+ * XXH3_hashLong_internal_loop() over the whole input, then merges the
+ * accumulators with the secret bytes starting at XXH_SECRET_MERGEACCS_START,
+ * using len * XXH_PRIME64_1 as the starting value.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;   /* unused: on this path the caller-supplied secret alone keys the hash */
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier to the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of vector loop.
+ *
+ * seed64, secret and secretLen are all ignored: this variant always hashes
+ * with XXH3_kSecret and sizeof(XXH3_kSecret). The parameters are kept so the
+ * function has the same signature as the other long-input hashers and can be
+ * passed through the same function-pointer type.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0   /* the shortcut below is compiled out when XXH_SIZE_OPT >= 1 */
+    if (seed == 0)   /* hash directly with XXH3_kSecret; no custom secret is built */
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* on-stack secret filled in by f_initSec */
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ *
+ * secret and secretLen are ignored; the secret actually used is chosen or
+ * built inside XXH3_hashLong_64b_withSeed_internal() from `seed`.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+/* Common signature of the long-input hashers: (input, len, seed64, secret, secretLen). */
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+/*
+ * XXH3_64bits_internal():
+ * Dispatches on input length. Inputs of at most 16, at most 128 and at most
+ * XXH3_MIDSIZE_MAX bytes go to the dedicated short-input routines; anything
+ * longer is handed to f_hashLong, which is therefore never called for
+ * len <= XXH3_MIDSIZE_MAX.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* === Public entry point === */
+
+/*! @ingroup XXH3_family
+ * One-shot 64-bit XXH3 hash with seed 0 and the built-in secret XXH3_kSecret. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
+{
+    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family
+ * One-shot 64-bit XXH3 hash keyed by a caller-supplied secret (seed 0).
+ * secretSize is not validated at run time here; XXH3_64bits_internal() only
+ * asserts secretSize >= XXH3_SECRET_SIZE_MIN. */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family
+ * One-shot 64-bit XXH3 hash with the given seed and the built-in secret;
+ * long inputs go through XXH3_hashLong_64b_withSeed(). */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+/*! @ingroup XXH3_family
+ * One-shot 64-bit XXH3 hash taking both a secret and a seed.
+ * Inputs of at most XXH3_MIDSIZE_MAX bytes are hashed with the seed and the
+ * built-in secret; f_hashLong is NULL there because XXH3_64bits_internal()
+ * only calls it for longer inputs. Longer inputs are hashed with the
+ * caller's secret through XXH3_hashLong_64b_withSecret(), which ignores the
+ * seed. */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow; checked by assertion only, not at run time */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             *
+             * The value is in 1..align (never 0), so with align <= 128 it
+             * fits in the single byte written at ptr[-1] below.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;   /* allocation failed */
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ *
+ * Passing NULL is allowed and does nothing.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup XXH3_family
+ * Allocates an XXH3_state_t with 64-byte alignment through XXH_alignedMalloc()
+ * and runs XXH3_INITSTATE() on it. Returns NULL if the allocation fails.
+ * The result is released with XXH3_freeState(). */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*! @ingroup XXH3_family
+ * Releases a state through XXH_alignedFree(), so the pointer must come from
+ * XXH3_createState(). NULL is accepted. Always returns XXH_OK. */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family
+ * Copies the whole state structure byte for byte with XXH_memcpy().
+ * Neither pointer is checked for NULL. */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+/*
+ * XXH3_reset_internal():
+ * Common part of every XXH3 reset. Zeroes the members laid out from
+ * bufferedSize up to (not including) nbStripesPerBlock, reloads the eight
+ * accumulators with their initial constants, records seed, useSeed
+ * (seed != 0) and the external secret pointer, then derives secretLimit and
+ * nbStripesPerBlock from secretSize.
+ *
+ * No run-time validation happens here: statePtr and secretSize are only
+ * asserted, so callers must check them first. In builds where the assertion
+ * is inactive, a secretSize below XXH_STRIPE_LEN makes the secretLimit
+ * subtraction wrap around.
+ */
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;   /* pointer is stored as-is; the secret bytes are not copied */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +XXH_FORCE_INLINE void +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { + /* need a scrambling operation */ + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock); + f_scramble(acc, secret + secretLimit); + f_acc(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } else { + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes); + *nbStripesSoFarPtr += nbStripes; + } +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* 
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. + * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. 
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + + /* large input to consume : ingest per full block */ + if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); + /* join to current block's end */ + { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; + XXH_ASSERT(nbStripesToEnd <= nbStripes); + f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd); + f_scramble(acc, secret + state->secretLimit); + state->nbStripesSoFar = 0; + input += nbStripesToEnd * XXH_STRIPE_LEN; + nbStripes -= nbStripesToEnd; + } + /* consume per entire blocks */ + while(nbStripes >= state->nbStripesPerBlock) { + f_acc(acc, input, secret, state->nbStripesPerBlock); + f_scramble(acc, secret + state->secretLimit); + input += state->nbStripesPerBlock * XXH_STRIPE_LEN; + nbStripes -= state->nbStripesPerBlock; + } + /* consume last partial block */ + f_acc(acc, input, secret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + XXH_ASSERT(input < bEnd); /* at least some bytes left */ + state->nbStripesSoFar = nbStripes; + /* buffer predecessor of last partial stripe */ + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); + } else { + /* content to consume <= block size */ + /* Consume input by a multiple of internal buffer size */ + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + const xxh_u8* const 
limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + input += XXH3_INTERNALBUFFER_SIZE; + } while (inputbuffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + } + } + + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. 
+ */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + /* last stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - XXH_STRIPE_LEN, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } else { /* bufferedSize < XXH_STRIPE_LEN */ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); + } +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. 
*/ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. */ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 
+ + { XXH128_hash_t acc; + unsigned int const nbRounds = (unsigned int)len / 32; + unsigned int i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. 
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
+ */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; nset_separate_io(separate_io); lie_relations[i]->set_offset_io(offset_io); lie_relations[i]->initialize_relation(mcomm, intern_map); + // if (lie_relations[i]->get_intern_tag() == 258) { + // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << lie_relations[i]->get_full_element_count() << std::endl; + // } } #if DEBUG_OUTPUT //lie_relations[i]->print(); @@ -320,6 +323,11 @@ bool LIE::execute () print_all_relation_size(); + // balance all relation before program run + // for (u32 i = 0 ; i < lie_relations.size(); i++) { + + // } + //if (mcomm.get_local_rank() == 0) // std::cout << "Done initializing " << lie_relation_count << std::endl; @@ -423,6 +431,10 @@ bool LIE::execute () print_relation_size(scc_relation[i]); std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<< BEFORE COMPUTATION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl; #endif + + // load balance 
before a SCC executed + executable_task->load_balance(); + if (restart_flag == false) { for (u32 i=0; i < scc_relation_count; i++) diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 28b0adf8..da398ffa 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -7,8 +7,10 @@ #include "../parallel_RA_inc.h" #include "balanced_hash_relation.h" +#include "mpi.h" #include #include +#include #include #include #include @@ -1417,6 +1419,50 @@ bool relation::check_dependent_value_insert_avalible(const std::vector& tup // if (bucket_id != mcomm.get_rank()) { // std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; // } - int bucket_id = mcomm.get_rank(); - return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; + // int bucket_id = mcomm.get_rank(); + bool res = true; + for (int i = 0 ; i < mcomm.get_nprocs(); i ++) { + res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple); + } + // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; + return res; +} + +void relation::test_calc_hash_rank(u64 rank_n) { + int hash_types = 6; + std::vector> tuple_cnts(hash_types, std::vector(rank_n, 0)); + std::vector hash_names{"nohash", "fnv1a", "murmur", "spooky", "fasthash", "xxhash"}; + + for (auto t: full[mcomm.get_rank()]) { + // std::vector compressed; + // for (auto c: t) { + // compressed.push_back(c % rank_n); + // } + auto hashes = tuple_hash_test_all(t.data(), get_join_column_count()); + for (int i = 0; i < hash_types; i++) { + // u64 hashv_main = hashes[i]; + // u64 rk_main = hashes[i] % rank_n; + // u64 rk_sub = tuple_hash_test_all(&hashv_main, 1)[1] % rank_n; + // u64 rk_final = rk_main * 64 + rk_sub; + // tuple_cnts[i][rk_final]++; + tuple_cnts[i][hashes[i] % (rank_n-1)]++; + // u64 rkv = 
rank_n; + // u64 p = UINT64_MAX / rank_n; + // tuple_cnts[i][hashes[i] / p]++; + } + } + // std::cout << mcomm.get_rank() << std::endl; + for (int i = 0; i < hash_types; i++) { + // for (auto cnt: tuple_cnts[i]) { + // std::cout << hash_names[i] << ", " << mcomm.get_rank() << ", " << cnt << std::endl; + // } + for (u64 rk = 0; rk < rank_n; rk++) { + u64 local_cnt = tuple_cnts[i][rk]; + u64 global_cnt = local_cnt; + MPI_Reduce(&local_cnt, &global_cnt, 1, MPI_UINT64_T, MPI_SUM, 0, mcomm.get_comm()); + if (mcomm.get_rank() == 0) { + std::cout << hash_names[i] << ", " << rk << ", " << global_cnt << std::endl; + } + } + } } diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index d80d3b08..1be454a2 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -97,6 +97,8 @@ class relation public: + bool balance_flag = false; + /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL); /// 2: arity (Internally one extra id (intern id) column is added to every relation) /// true: arity == join column count @@ -189,7 +191,13 @@ class relation void set_full_element_count(int val) {full_element_count = val;} - int get_full_element_count() {return full[mcomm.get_rank()].count();} + int get_full_element_count() { + u64 res = 0; + for (int i = 0; i < get_bucket_count(); i++) { + res += full[i].size(); + } + return res; + } u32** get_full_sub_bucket_element_count() {return full_sub_bucket_element_count;} u32 get_global_full_element_count(); @@ -215,7 +223,13 @@ class relation #endif void set_delta_element_count(int val) {delta_element_count = val;} - int get_delta_element_count() {return delta[mcomm.get_rank()].count();} + int get_delta_element_count() { + u64 res = 0; + for (int i = 0; i < get_bucket_count(); i++) { + res += delta[i].size(); + } + return res; + } u32** get_delta_sub_bucket_element_count() 
{return delta_sub_bucket_element_count;} u32 get_global_delta_element_count(); @@ -296,4 +310,5 @@ class relation void enable_initialization() { init_flag = true; } bool need_init_huh() { return init_flag; } + void test_calc_hash_rank(u64 rank_n); }; diff --git a/backend/src/relation/relation_load_balancer.cpp b/backend/src/relation/relation_load_balancer.cpp index e278ec36..5918b658 100644 --- a/backend/src/relation/relation_load_balancer.cpp +++ b/backend/src/relation/relation_load_balancer.cpp @@ -6,6 +6,7 @@ #include "../parallel_RA_inc.h" +#include @@ -270,8 +271,17 @@ bool relation::load_balance_split_full_and_delta(float rf) MPI_Allreduce(&min_sub_bucket_size, &global_min, 1, MPI_INT, MPI_MIN, mcomm.get_local_comm()); MPI_Allreduce(&total_sub_bucket_size, &global_total_sub_bucket_size, 1, MPI_INT, MPI_SUM, mcomm.get_local_comm()); delete[] max_sub_bucket_size; + if (mcomm.get_rank() == 0) { + std::cout << "Max sub buckets "; + for (u32 i = 0; i < buckets; i++) { + std::cout << max_sub_bucket_size[i] << " "; + } + std::cout << std::endl; + std::cout << "Total Sub buckect size : " << total_sub_bucket_size << std::endl; + } average_sub_bucket_size = global_total_sub_bucket_size / total_sub_bucket_count; + // std::cout << "Total Sub buckect size : " << global_total_sub_bucket_size << std::endl; u32 global_new_sub_bucket[buckets]; memcpy(global_new_sub_bucket, sub_bucket_per_bucket_count, buckets * sizeof(u32)); diff --git a/backend/tests/cc/compiled_pre/CMakeLists.txt b/backend/tests/cc/compiled_pre/CMakeLists.txt index 36be513b..79276742 100644 --- a/backend/tests/cc/compiled_pre/CMakeLists.txt +++ b/backend/tests/cc/compiled_pre/CMakeLists.txt @@ -18,7 +18,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla # set (base_dir "${PROJECT_SOURCE_DIR}/../backend") set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") -file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" 
"${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") file (GLOB source_files_cc "${PROJECT_SOURCE_DIR}/cc.cpp") ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 8964510d..43c04d90 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp 
+++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -436,6 +436,15 @@ int main(int argc, char **argv) { ".edge.2.table", FULL); + rel__edge__2__1->balance_flag = true; + + // relation *rel__edge__2__1__2 = new relation( + // 2, true, 2, get_tag_for_rel("edge", "1__2"), + // std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + // slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + // ".edge.2.table", + // FULL); + relation *rel__cc__2__1 = new relation( 1, true, 2, get_tag_for_rel("cc", "1"), std::to_string(get_tag_for_rel("cc", "1")) + ".cc.2.table", @@ -480,6 +489,7 @@ int main(int argc, char **argv) { // )); RAM *cc_init_scc = new RAM(false, 1); + // cc_init_scc->add_relation(rel__edge__2__1__2, false); cc_init_scc->add_relation(rel__edge__2__1, false); cc_init_scc->add_relation(rel__cc__2__1, true); cc_init_scc->add_relation(rel__node__1__1, true); @@ -548,6 +558,7 @@ int main(int argc, char **argv) { cc_lie->add_relation(rel__cc__2__1); cc_lie->add_relation(rel__cc_final__2__1); cc_lie->add_relation(rel__cc_represent__1__1); + // cc_lie->add_relation(rel__edge__2__1__2); // cc_lie->add_scc(to_undirected_scc); cc_lie->add_scc(cc_init_scc); @@ -575,6 +586,10 @@ int main(int argc, char **argv) { } cc_lie->print_all_relation_size(); // Continuously print relation sizes + + // rel__edge__2__1__2->test_calc_hash_rank(4096); + // rel__edge__2__1->test_calc_hash_rank(4096); + // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; // rel__node__1__1->print(); // rel__edge__2__1->print(); // rel__cc__2__1->print(); diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp index 25de07b6..9f88afb5 100644 --- a/backend/utility/tsv_to_bin.cpp +++ b/backend/utility/tsv_to_bin.cpp @@ -56,25 +56,114 @@ unsigned buckets; string string_intern_file_path; string mode = "slog"; -// hash a tuple n values long using our hashing algorithm -u64 hash_tuple(u64 
*fact, unsigned num) +/// Based on the FNV-1a hash function +#include +// #include +#define MURMUR_SEED 7917 + +///FNV-1a +uint64_t fnv1a(const uint64_t* start_ptr, uint64_t prefix_len) { - u64 prime = 1099511628211ull; - u64 hash = 14695981039346656037ull; - u64 chunk, h0; - for (unsigned i = 0; i < num; i++) - { - chunk = fact[i]; - h0 = hash ^ (chunk & 255); - hash = h0 * prime; - for (unsigned j = 0; j < 7; j++) - { - chunk = chunk >> 8; - h0 = hash ^ (chunk & 255); - hash = h0 * prime; - } - } - return hash; + const uint64_t base = 14695981039346656037ULL; + const uint64_t prime = 1099511628211ULL; + + uint64_t hash = base; + for (uint64_t i = 0; i < prefix_len; ++i) + { + uint64_t chunk = start_ptr[i]; + hash ^= chunk & 255ULL; + hash *= prime; + for (char j = 0; j < 7; ++j) + { + chunk = chunk >> 8; + hash ^= chunk & 255ULL; + hash *= prime; + } + } + return hash; +} + + + +// murmurhash +#if defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +static inline uint64_t getblock ( const uint64_t * p ) +{ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return *p; +#else + const uint8_t *c = (const uint8_t *)p; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; +#endif +} + +uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ) +{ + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = getblock(data++); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= 
uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + + + +uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len) +{ + // return fnv1a(start_ptr, prefix_len); + return MurmurHash64A(start_ptr, (int)prefix_len, MURMUR_SEED); } u32 string_hash(const std::string& str) { @@ -284,7 +373,7 @@ void file_to_slog(char *input_file, char *output_file, col_count++; } - u64 t_hash = hash_tuple(tuple_buffer, arity); + u64 t_hash = tuple_hash(tuple_buffer, arity); if (tuple_hash_set.find(t_hash) == tuple_hash_set.end()){ tuple_hash_set.insert(t_hash); u64 tid = rel_tag; diff --git a/cluster.yaml b/cluster.yaml index 97fd09f0..7fbe091b 100644 --- a/cluster.yaml +++ b/cluster.yaml @@ -42,7 +42,7 @@ Scheduling: MinCount: 0 MaxCount: 4 Efa: - Enabled: truev + Enabled: true Networking: PlacementGroup: Enabled: true From 61a4d7f6c8ad7e674b150af35388bb7133ea30f6 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Thu, 5 Jan 2023 21:08:03 -0500 Subject: [PATCH 27/36] add manual sub rank split fix insert check/sub rank split dyn try opt --- backend/src/RA/parallel_join.cpp | 112 ++++++++++-------- backend/src/RA/parallel_join.h | 2 + backend/src/RAM/RA_tasks.cpp | 4 + backend/src/RAM/RA_tasks.h | 2 +- backend/src/lie/lie.cpp | 2 + .../src/relation/balanced_hash_relation.cpp | 22 ++-- backend/src/relation/balanced_hash_relation.h | 3 +- backend/src/relation/shmap_relation.h | 4 +- backend/tests/cc/compiled_pre/cc.cpp | 4 +- 9 files changed, 88 insertions(+), 67 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 95ba9660..61630f4d 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -20,6 +20,8 @@ bool 
parallel_join::local_join(int threshold, int* offset, shmap_relation *input1, u32 i1_size, int input1_buffer_width, std::vector reorder_map_array, relation* output, + relation* input0_rel, + relation* input1_rel, all_to_allv_buffer& join_buffer, int counter, int join_column_count, @@ -60,19 +62,21 @@ bool parallel_join::local_join(int threshold, int* offset, } u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - + auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_left_join_buffer( - prefix, join_buffer, - input0_buffer + k1,input0_buffer_width, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, output->get_join_column_count(), - output->get_is_canonical(), - generator_mode, generator_func); + for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[input1_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_left_join_buffer( + prefix, join_buffer, + input0_buffer + k1,input0_buffer_width, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, output->get_join_column_count(), + output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; @@ -112,20 +116,22 @@ bool parallel_join::local_join(int threshold, int* offset, } else { if (input_ts.size() != 0) { auto before_actual_join = MPI_Wtime(); - u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - join_buffer, - input_ts, 
- input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -137,19 +143,21 @@ bool parallel_join::local_join(int threshold, int* offset, if (input_ts.size() != 0) { u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - join_buffer, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer( + 
std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -166,18 +174,20 @@ bool parallel_join::local_join(int threshold, int* offset, std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_right_join_buffer( - prefix, join_buffer, - // input0_buffer + k1, input0_buffer_width, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer( + prefix, join_buffer, + // input0_buffer + k1, input0_buffer_width, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; diff --git a/backend/src/RA/parallel_join.h b/backend/src/RA/parallel_join.h 
index c30120c5..25aafeae 100644 --- a/backend/src/RA/parallel_join.h +++ b/backend/src/RA/parallel_join.h @@ -95,6 +95,8 @@ class parallel_join: public parallel_RA { shmap_relation *input1, u32 i1_size, int input1_buffer_width, std::vector reorder_map_array, relation* output, + relation* input0_rel, + relation* input1_rel, all_to_allv_buffer& join_buffer, int counter, int join_column_count, diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 3b62572b..546efd21 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -616,6 +616,8 @@ bool RAM::local_compute(int* offset) input1_trees, input1_size, input1->get_arity()+1, reorder_map_array, output_relation, + input0, + input1, compute_buffer, counter, join_column_count, @@ -633,6 +635,8 @@ bool RAM::local_compute(int* offset) input0_trees, input0_size, input0->get_arity()+1, reorder_map_array, output_relation, + input0, + input1, compute_buffer, counter, join_column_count, diff --git a/backend/src/RAM/RA_tasks.h b/backend/src/RAM/RA_tasks.h index be90384a..3f9718c9 100644 --- a/backend/src/RAM/RA_tasks.h +++ b/backend/src/RAM/RA_tasks.h @@ -55,7 +55,7 @@ class RAM u32 loop_count_tracker; public: - + bool balance_flag = false; double all_to_all_time = 0; ~RAM(); diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 88d85b4e..6e4de91a 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -433,7 +433,9 @@ bool LIE::execute () #endif // load balance before a SCC executed + if (executable_task->balance_flag) { executable_task->load_balance(); + } if (restart_flag == false) { diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index da398ffa..88af9322 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -416,11 +416,13 @@ void relation::print() full[i].as_vector_buffer_recursive(&(vb_full[i]), prefix); if (vb_full[i].size 
!= 0) - std::cout << get_debug_id() << " " << mcomm.get_rank() << " FULL Rows " << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 << std::endl; + std::cout << get_debug_id() << " " << mcomm.get_rank() << " " << i << " FULL Rows " + << vb_full[i].size/(sizeof(u64) * (arity + 1)) << " columns " << arity + 1 + << std::endl; for (u32 j=0; j < vb_full[i].size/sizeof(u64); j = j + arity+1) { if (j % (arity+1) == 0) - std::cout << "F [" << j/(arity+1) << "] "; + std::cout << "F [" << mcomm.get_rank() << " " << i << " " << j/(arity+1) << "] "; for (u32 k = 0; k < arity+1; k++) { u64 temp; @@ -709,7 +711,7 @@ void relation::initialize_relation(mpi_comm& mcomm, std::map& intern_m u32 buckets = mcomm.get_local_nprocs(); - default_sub_bucket_per_bucket_count = 1; + // default_sub_bucket_per_bucket_count = 1; int rank = mcomm.get_local_rank(); int nprocs = mcomm.get_local_nprocs(); @@ -1415,17 +1417,17 @@ void relation::local_insert_in_delta() } bool relation::check_dependent_value_insert_avalible(const std::vector& tuple) { + // bool res = true; + // for (int i = 0 ; i < mcomm.get_nprocs(); i ++) { + // res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple); + // } + // return res; // uint64_t bucket_id = tuple_hash(tuple.data(), join_column_count) % get_bucket_count(); // if (bucket_id != mcomm.get_rank()) { // std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; // } - // int bucket_id = mcomm.get_rank(); - bool res = true; - for (int i = 0 ; i < mcomm.get_nprocs(); i ++) { - res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple); - } - // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; - return res; + int bucket_id = mcomm.get_rank(); + return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; } void relation::test_calc_hash_rank(u64 
rank_n) { diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index 1be454a2..4b9f5215 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -71,7 +71,6 @@ class relation u32 **delta_sub_bucket_element_count; u32 *delta_bucket_element_count; - u32 default_sub_bucket_per_bucket_count; /// 1 u32 *sub_bucket_per_bucket_count; /// sub_bucket_per_bucket_count[i] holds the total number of sub-buckets at bucket index i u32** sub_bucket_rank; /// target rank of a subbucket @@ -96,7 +95,7 @@ class relation bool init_flag = true; public: - + u32 default_sub_bucket_per_bucket_count = 1; /// 1 bool balance_flag = false; /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL); diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index 09ba11ac..47ff2eb8 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -54,7 +54,7 @@ struct shmap_relation { std::size_t size() const { return ind.size(); } - bool contains(const t_tuple &t) const { + bool contains(const t_tuple &t) { auto res = ind.find(t); return res != ind.end(); } @@ -68,7 +68,7 @@ struct shmap_relation { // I keep this weird name from souffle, actually join helper function // in souffle its index selection function, in slog we don't need select // so only one version of this function - std::pair lowerUpperRange(const t_tuple &lower, const t_tuple &upper) const + std::pair lowerUpperRange(const t_tuple &lower, const t_tuple &upper) { auto lower_it = ind.lower_bound(lower); auto upper_it = ind.upper_bound(upper); diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 43c04d90..94c13b26 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -437,6 +437,7 @@ int main(int argc, char **argv) 
{ FULL); rel__edge__2__1->balance_flag = true; + // rel__edge__2__1->default_sub_bucket_per_bucket_count = 2; // relation *rel__edge__2__1__2 = new relation( // 2, true, 2, get_tag_for_rel("edge", "1__2"), @@ -489,6 +490,7 @@ int main(int argc, char **argv) { // )); RAM *cc_init_scc = new RAM(false, 1); + cc_init_scc->balance_flag = true; // cc_init_scc->add_relation(rel__edge__2__1__2, false); cc_init_scc->add_relation(rel__edge__2__1, false); cc_init_scc->add_relation(rel__cc__2__1, true); @@ -592,7 +594,7 @@ int main(int argc, char **argv) { // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; // rel__node__1__1->print(); // rel__edge__2__1->print(); - // rel__cc__2__1->print(); +// rel__cc__2__1->print(); // rel__cc_final__2__1->print(); // rel__cc_represent__1__1->print(); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> From 0370c184e85837b2d5626f6b3c5a173fceecd7d7 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Tue, 17 Jan 2023 11:55:46 -0500 Subject: [PATCH 28/36] fix bucket in join --- backend/src/RA/parallel_join.cpp | 24 ++++++++++++------------ backend/tests/cc/compiled_pre/cc.cpp | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 61630f4d..18c7acf7 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -64,8 +64,8 @@ bool parallel_join::local_join(int threshold, int* offset, u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; auto before_actual_join = MPI_Wtime(); - for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[input1_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_left_join_buffer( + // for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[bucket_id].as_all_to_allv_left_join_buffer( prefix, join_buffer, input0_buffer + 
k1,input0_buffer_width, input1_buffer_width, counter, @@ -76,7 +76,7 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(), output->get_is_canonical(), generator_mode, generator_func); - } + // } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; @@ -116,8 +116,8 @@ bool parallel_join::local_join(int threshold, int* offset, } else { if (input_ts.size() != 0) { auto before_actual_join = MPI_Wtime(); - // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) { + u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; + // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), prev_non_dependent_columns.begin()+join_column_count), @@ -131,7 +131,7 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); - } + // } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -143,8 +143,8 @@ bool parallel_join::local_join(int threshold, int* offset, if (input_ts.size() != 0) { u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; auto before_actual_join = MPI_Wtime(); - for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer( + // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), prev_non_dependent_columns.begin()+join_column_count), join_buffer, 
@@ -157,7 +157,7 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); - } + // } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -174,8 +174,8 @@ bool parallel_join::local_join(int threshold, int* offset, std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); auto before_actual_join = MPI_Wtime(); - for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[input0_rel->get_sub_bucket_rank()[bucket_id][sb]].as_all_to_allv_right_join_buffer( + // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( prefix, join_buffer, // input0_buffer + k1, input0_buffer_width, input_ts, @@ -187,7 +187,7 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); - } + // } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 94c13b26..1da0f008 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -436,8 +436,8 @@ int main(int argc, char **argv) { ".edge.2.table", FULL); - rel__edge__2__1->balance_flag = true; - // rel__edge__2__1->default_sub_bucket_per_bucket_count = 2; + // rel__edge__2__1->balance_flag = true; + rel__edge__2__1->default_sub_bucket_per_bucket_count = 2; // relation *rel__edge__2__1__2 = new relation( // 2, true, 2, get_tag_for_rel("edge", "1__2"), @@ -490,7 +490,7 @@ int main(int argc, char **argv) { // )); RAM *cc_init_scc = new RAM(false, 1); - cc_init_scc->balance_flag = true; + // 
cc_init_scc->balance_flag = true; // cc_init_scc->add_relation(rel__edge__2__1__2, false); cc_init_scc->add_relation(rel__edge__2__1, false); cc_init_scc->add_relation(rel__cc__2__1, true); From 62b721c2be9035b3b7f219f3d44b29d03de2c198 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Tue, 17 Jan 2023 15:36:50 -0500 Subject: [PATCH 29/36] add sssp opt --- backend/tests/sssp/compiled_pre/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/sssp/compiled_pre/CMakeLists.txt b/backend/tests/sssp/compiled_pre/CMakeLists.txt index 89ee3ea4..eac399d0 100644 --- a/backend/tests/sssp/compiled_pre/CMakeLists.txt +++ b/backend/tests/sssp/compiled_pre/CMakeLists.txt @@ -18,7 +18,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla # set (base_dir "${PROJECT_SOURCE_DIR}/../backend") set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") -file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" 
"${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") file (GLOB source_files_sssp "${PROJECT_SOURCE_DIR}/sssp_opt.cpp") ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") From 94043c5137c28c24963457aa9a3a3f166ee21ec1 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Thu, 26 Jan 2023 13:55:44 -0500 Subject: [PATCH 30/36] try change insert --- backend/src/RAM/RA_tasks.cpp | 10 +++- .../src/relation/balanced_hash_relation.cpp | 18 +++++-- backend/src/relation/shmap_relation.h | 1 + backend/src/relation/shmap_relation_exp.cpp | 4 ++ backend/tests/cc/compiled_pre/cc.cpp | 50 ++++++++++--------- .../pagerank/compiled_pre/CMakeLists.txt | 3 +- .../pagerank/compiled_pre/pagerank_full.cpp | 26 ++++++---- backend/utility/tsv_to_bin.cpp | 4 +- 8 files changed, 73 insertions(+), 43 deletions(-) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 546efd21..c57ad061 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -242,6 +242,7 @@ u64 RAM::intra_bucket_comm_execute() input1_trees = input1->get_delta(); input1_size = input1->get_delta_element_count(); } + double before_reduce_time = MPI_Wtime(); int join_direction = LEFT; int local_join_direction_count = input0_size < input1_size ? 
0 : 1; // true if size of input0 > input1 int global_join_direction_count = local_join_direction_count; @@ -249,6 +250,10 @@ u64 RAM::intra_bucket_comm_execute() if (global_join_direction_count > mcomm.get_nprocs() / 2) { join_direction = RIGHT; } + double after_reduce_time = MPI_Wtime(); + if (mcomm.get_rank() == 0) { + std::cout << "Reduced time : " << after_reduce_time - before_reduce_time << std::endl; + } if (join_direction == LEFT) { intra_bucket_comm(get_bucket_count(), @@ -851,7 +856,8 @@ void RAM::local_insert_in_newt_comm_compaction(std::map& intern_map) // temporary index column just to match size of column tt.push_back(0); auto _before_i = MPI_Wtime(); - insert_flag = output->check_dependent_value_insert_avalible(tt); + // insert_flag = output->chmeck_dependent_value_insert_avalible(tt); + insert_flag = true; auto _after_i = MPI_Wtime(); check_time += _after_i - _before_i; } else { @@ -1039,8 +1045,8 @@ void RAM::local_insert_in_full() for (u32 i=0; i < ram_relation_count; i++) { relation* current_r = ram_relations[i]; - current_r->insert_delta_in_full(); current_r->local_insert_in_delta(); + current_r->insert_delta_in_full(); } return; } diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index 88af9322..ea9671e3 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -8,6 +8,7 @@ #include "../parallel_RA_inc.h" #include "balanced_hash_relation.h" #include "mpi.h" +#include "shmap_relation.h" #include #include #include @@ -1276,8 +1277,7 @@ bool relation::insert_in_full(u64* t) #endif // std::cout << "inserting full for " << intern_tag << std::endl; - // TODO: use normal insert here! 
- if (full[bucket_id].insert_tuple_from_array(t, arity+1) == true) + if (full[bucket_id].insert_tuple_from_array(t, arity+1) != INSERT_FAIL) // std::vector tp(t, t+arity+1); // if (full[bucket_id].insert(tp)) { @@ -1315,24 +1315,31 @@ int relation::insert_delta_in_full() // if (insert_in_full ( (u64*)( (input_buffer[i].buffer) + (j*sizeof(u64)) )) == true) // insert_success++; // } + std::vector> tuples_to_del; for(auto it=delta[i].begin(); it != delta[i].end(); ++it) { auto tuple_d = *it; - // std::cout << "inserting into delta "; + // std::cout << "inserting into full "; // for (auto v: tuple_d) { // std::cout << v << " "; // } // std::cout << std::endl; if (insert_in_full(tuple_d.data()) == true) insert_success++; + else { + tuples_to_del.push_back(tuple_d); + } + } + for (auto t: tuples_to_del) { + delta[i].delete_tuple(t); } - delta[i].remove_tuple(); + // delta[i].remove_tuple(); // input_buffer[i].vector_buffer_free(); } } - set_delta_element_count(0); + // set_delta_element_count(0); // delete[] input_buffer; return insert_success; @@ -1396,6 +1403,7 @@ void relation::local_insert_in_delta() // newt_element_count = 0; // } // } else { + delete[] delta; delta = newt; delta_element_count = newt_element_count; diff --git a/backend/src/relation/shmap_relation.h b/backend/src/relation/shmap_relation.h index 47ff2eb8..ea7af922 100644 --- a/backend/src/relation/shmap_relation.h +++ b/backend/src/relation/shmap_relation.h @@ -126,6 +126,7 @@ struct shmap_relation { void remove_tuple(); bool find_tuple_from_array(u64* t, int arity); bool check_dependent_insertion(const std::vector &v); + void delete_tuple(std::vector& t); void as_vector_buffer_recursive(vector_buffer* vb, std::vector prefix); diff --git a/backend/src/relation/shmap_relation_exp.cpp b/backend/src/relation/shmap_relation_exp.cpp index 19decd31..8e1ff0f6 100644 --- a/backend/src/relation/shmap_relation_exp.cpp +++ b/backend/src/relation/shmap_relation_exp.cpp @@ -20,6 +20,10 @@ #include #include +void 
shmap_relation::delete_tuple(std::vector& t) { + ind.erase(t); +} + shmap_relation::shmap_relation(int arity, bool id_flag) { this->arity = arity; diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 1da0f008..65f8e0c8 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -336,11 +336,13 @@ agg_minimum_local(std::pair local_agg_res_t min_res = std::numeric_limits::max(); for (auto it = joined_range.first; it != joined_range.second; ++it) { auto tuple = (*it); - auto current_v = tuple[tuple.size() - 1]; + auto current_v = tuple[tuple.size() - 2]; + // std::cout << tuple[0] << " " << tuple[1] << " " << tuple.size() << std::endl; if (current_v < min_res) { min_res = current_v; } } + // std::cout << "Min : " << min_res << std::endl; return min_res; } @@ -476,18 +478,20 @@ int main(int argc, char **argv) { relation *rel__cc_represent__1__1 = new relation( 1, true, 1, get_tag_for_rel("cc_represent", "1"), std::to_string(get_tag_for_rel("cc_represent", "1")) + ".cc_represent.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("cc_represent", "1")) + + ".cc_represent.1.table", FULL); - // RAM *to_undirected_scc = new RAM(false, 0); - // to_undirected_scc->add_relation(rel__edge__2__1, false); - // to_undirected_scc->add_rule(new parallel_copy_generate( - // rel__edge__2__1, rel__edge__2__1, FULL, - // [](const u64 *const data, u64 *const output) -> int { - // output[0] = data[1]; - // output[1] = data[0]; - // return 1; - // } - // )); + RAM *to_undirected_scc = new RAM(false, 0); + to_undirected_scc->add_relation(rel__edge__2__1, false); + to_undirected_scc->add_rule(new parallel_copy_generate( + rel__edge__2__1, rel__edge__2__1, FULL, + [](const u64 *const data, u64 *const output) -> int { + output[0] = data[1]; + output[1] = data[0]; + return 1; + } + )); RAM *cc_init_scc = new RAM(false, 1); // cc_init_scc->balance_flag = true; @@ -546,12 +550,12 @@ int main(int argc, char 
**argv) { agg_minimum_local, SpecialAggregator::minimum, agg_minimum_reduce, nullptr, {0,2})); - // RAM* cc_rep_scc = new RAM(false, 3); - // cc_rep_scc->add_relation(rel__cc_final__2__1, false); - // cc_rep_scc->add_relation(rel__cc_represent__1__1, true); - // cc_rep_scc->add_rule(new parallel_copy( - // rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1} - // )); + RAM* cc_rep_scc = new RAM(false, 3); + cc_rep_scc->add_relation(rel__cc_final__2__1, false); + cc_rep_scc->add_relation(rel__cc_represent__1__1, true); + cc_rep_scc->add_rule(new parallel_copy( + rel__cc_represent__1__1, rel__cc_final__2__1, FULL, {1} + )); LIE *cc_lie = new LIE(); @@ -562,16 +566,16 @@ int main(int argc, char **argv) { cc_lie->add_relation(rel__cc_represent__1__1); // cc_lie->add_relation(rel__edge__2__1__2); - // cc_lie->add_scc(to_undirected_scc); + cc_lie->add_scc(to_undirected_scc); cc_lie->add_scc(cc_init_scc); cc_lie->add_scc(cc_compute_scc); cc_lie->add_scc(cc_agg_scc); - // cc_lie->add_scc(cc_rep_scc); + cc_lie->add_scc(cc_rep_scc); - // cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc); + cc_lie->add_scc_dependance(to_undirected_scc, cc_init_scc); cc_lie->add_scc_dependance(cc_init_scc, cc_compute_scc); cc_lie->add_scc_dependance(cc_compute_scc, cc_agg_scc); - // cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc); + cc_lie->add_scc_dependance(cc_agg_scc, cc_rep_scc); cc_lie->enable_all_to_all_dump(); cc_lie->set_output_dir(slog_output_dir); // Write to this directory @@ -595,8 +599,8 @@ int main(int argc, char **argv) { // rel__node__1__1->print(); // rel__edge__2__1->print(); // rel__cc__2__1->print(); - // rel__cc_final__2__1->print(); - // rel__cc_represent__1__1->print(); + rel__cc_final__2__1->print(); + rel__cc_represent__1__1->print(); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mcomm.destroy(); diff --git a/backend/tests/pagerank/compiled_pre/CMakeLists.txt b/backend/tests/pagerank/compiled_pre/CMakeLists.txt index 38953a06..3b12bed5 100644 --- 
a/backend/tests/pagerank/compiled_pre/CMakeLists.txt +++ b/backend/tests/pagerank/compiled_pre/CMakeLists.txt @@ -18,7 +18,8 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++17 -lstdc++fs -Werror=cla # set (base_dir "${PROJECT_SOURCE_DIR}/../backend") set (source_dir "${PROJECT_SOURCE_DIR}/../../../src") -file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") +# file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" 
"${source_dir}/lie/lie.cpp") +file (GLOB source_files_parallel_RA "${source_dir}/parallel_RA_inc.h" "${source_dir}/log/logger.cpp" "${source_dir}/hash/hash.h" "${source_dir}/hash/xxhash.cpp" "${source_dir}/hash/hash.cpp" "${source_dir}/hash/spooky-c.cpp" "${source_dir}/hash/fasthash.cpp" "${source_dir}/buffer/vector_buffer.cpp" "${source_dir}/comm/comm.cpp" "${source_dir}/relation/shmap_relation_exp.cpp" "${source_dir}/comm/all_to_all_comm.cpp" "${source_dir}/comm/all_to_allv_comm.cpp" "${source_dir}/IO/parallel_io.cpp" "${source_dir}/RA/parallel_join.cpp" "${source_dir}/RA/parallel_agg.cpp" "${source_dir}/comm/intra_bucket_comm.cpp" "${source_dir}/RA/parallel_copy.cpp" "${source_dir}/RA/parallel_copy_filter.cpp" "${source_dir}/RA/parallel_copy_generate.cpp" "${source_dir}/RA/parallel_RA.h" "${source_dir}/RA/parallel_acopy.cpp" "${source_dir}/relation/balanced_hash_relation.cpp" "${source_dir}/relation/relation_load_balancer.cpp" "${source_dir}/RAM/RA_tasks.cpp" "${source_dir}/lie/lie.cpp") file (GLOB source_files_pagerank "${PROJECT_SOURCE_DIR}/pagerank_full.cpp") ADD_LIBRARY(parallel_RA "${source_files_parallel_RA}") diff --git a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp index 7513aa3c..e2087ca8 100644 --- a/backend/tests/pagerank/compiled_pre/pagerank_full.cpp +++ b/backend/tests/pagerank/compiled_pre/pagerank_full.cpp @@ -1,5 +1,5 @@ // location of `parallel_RA_inc.h` here -#include "/home/ubuntu/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" +#include "/home/ysun67/workspace/slog/compiler/../backend/src/parallel_RA_inc.h" #include "mpi.h" // #include @@ -373,7 +373,7 @@ void load_input_relation(std::string db_dir) { for (const auto &entry : std::filesystem::directory_iterator(db_dir)) { // check if ends with table std::string filename_ss = entry.path().filename().string(); - std::cout << "input database has file " << filename_ss << std::endl; + // std::cout << "input database has 
file " << filename_ss << std::endl; std::string suffix = ".table"; int ft = filename_ss.size() - suffix.size(); if (ft < 0) @@ -396,8 +396,8 @@ void load_input_relation(std::string db_dir) { } if (tag > max_rel) max_rel = tag; - std::cout << "load " << tag << "." << index_stream.str() << "has arity " - << arity << std::endl; + // std::cout << "load " << tag << "." << index_stream.str() << "has arity " + // << arity << std::endl; rel_tag_map[index_stream.str()] = tag; } } @@ -417,8 +417,8 @@ int get_tag_for_rel(std::string relation_name, std::string index_str) { } max_rel++; rel_tag_map[name_arity] = max_rel; - std::cout << "generate rel tag: " << name_arity << " " << max_rel - << std::endl; + // std::cout << "generate rel tag: " << name_arity << " " << max_rel + // << std::endl; return max_rel; } @@ -582,10 +582,12 @@ int main(int argc, char **argv) { dangling_node_cnt = rel__dangling_node->get_global_full_element_count(); dangling_value = FLOAT_SCALE_CONST / total_node_size; + if (mcomm.get_rank() == 0) { std::cout << ">>>>>>>>> Number of nodes: " << total_node_size << " >>>>>>>>> Dangling node count: " << dangling_node_cnt << " >>>>>>>>> Dangling value: " << dangling_value * 1.0 / FLOAT_SCALE_CONST << std::endl; + } rel__edge__2__1->disable_initialization(); rel__node__1__1->disable_initialization(); @@ -616,8 +618,10 @@ int main(int argc, char **argv) { std::vector pg_lie_list; for (int i = 0; i < MAX_PG_ITERATION; i++) { + if (mcomm.get_rank() == 0) { std::cout << ">>>>>>>>>>>>>>>>>>>>> Compute pagerank iter " << current_iter << std::endl; + } LIE *pg_lie = new LIE(); RAM *scc_init = new RAM(false, 0); @@ -687,7 +691,9 @@ int main(int argc, char **argv) { // MPI_Barrier(mcomm.get_comm()); } + if (mcomm.get_rank() == 0) { std::cout << "Aggregating Page Rank Result ..." 
<< std::endl; + } relation *rel__result__2__1__2 = new relation( 2, true, 2, get_tag_for_rel("result", "1__2"), std::to_string(get_tag_for_rel("result", "1__2")) + @@ -734,10 +740,10 @@ int main(int argc, char **argv) { final_lie->execute(); final_lie->print_all_relation_size(); // Continuously print relation sizes - rel__result__2__1__2->print([](const std::vector &tp) { - u32 pg_v = tp[1]; - std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; - }); + // rel__result__2__1__2->print([](const std::vector &tp) { + // u32 pg_v = tp[1]; + // std::cout << tp[0] << " " << pg_v * 1.0 / FLOAT_SCALE_CONST << std::endl; + // }); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/backend/utility/tsv_to_bin.cpp b/backend/utility/tsv_to_bin.cpp index 9f88afb5..49a22b93 100644 --- a/backend/utility/tsv_to_bin.cpp +++ b/backend/utility/tsv_to_bin.cpp @@ -162,8 +162,8 @@ uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ) uint64_t tuple_hash(const uint64_t* start_ptr, uint64_t prefix_len) { - // return fnv1a(start_ptr, prefix_len); - return MurmurHash64A(start_ptr, (int)prefix_len, MURMUR_SEED); + return fnv1a(start_ptr, prefix_len); + // return MurmurHash64A(start_ptr, (int)prefix_len, MURMUR_SEED); } u32 string_hash(const std::string& str) { From 3b929d118910397b820ff56529ac496ca0d2d419 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Wed, 28 Dec 2022 14:17:19 -0500 Subject: [PATCH 31/36] theta gcc + mpich --- backend/src/RA/parallel_join.cpp | 93 ++++++++++------------ backend/src/RAM/RA_tasks.cpp | 129 +++++++++++++++++++++++++------ 2 files changed, 143 insertions(+), 79 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index 18c7acf7..cdea6c8f 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -115,25 +115,20 @@ bool parallel_join::local_join(int threshold, int* offset, input_ts.push_back(input_t); } else { if (input_ts.size() != 0) { - auto 
before_actual_join = MPI_Wtime(); - u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; - // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - join_buffer, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); - // } - auto after_actual_join = MPI_Wtime(); - join_time_total += after_actual_join - before_actual_join; + u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); input_ts.clear(); } prev_non_dependent_columns = cur_non_dependent_columns; @@ -142,24 +137,19 @@ bool parallel_join::local_join(int threshold, int* offset, } if (input_ts.size() != 0) { u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; - auto before_actual_join = MPI_Wtime(); - // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - join_buffer, - 
input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); - // } - auto after_actual_join = MPI_Wtime(); - join_time_total += after_actual_join - before_actual_join; + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); input_ts.clear(); } } else { @@ -173,23 +163,18 @@ bool parallel_join::local_join(int threshold, int* offset, u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); - auto before_actual_join = MPI_Wtime(); - // for (u32 sb = 0; sb < input0_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[bucket_id].as_all_to_allv_right_join_buffer( - prefix, join_buffer, - // input0_buffer + k1, input0_buffer_width, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); - // } - auto after_actual_join = MPI_Wtime(); - join_time_total += after_actual_join - before_actual_join; + input1[bucket_id].as_all_to_allv_right_join_buffer( + prefix, join_buffer, + // 
input0_buffer + k1, input0_buffer_width, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index c57ad061..418614b5 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -264,7 +264,42 @@ u64 RAM::intra_bucket_comm_execute() mcomm.get_local_comm()); } else { intra_bucket_comm(get_bucket_count(), - input1_trees, + input0->get_delta(), + input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], + mcomm.get_local_comm()); + total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; + } + + /// Join between full and delta + else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) + { + // std::cout << "here>>>>>>>>>>>>>" << std::endl; + // if (input1->get_dependent_column().size() > 0) { + // intra_bucket_comm(get_bucket_count(), + // input0->get_full(), + // input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + // input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + // &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], + // 
mcomm.get_local_comm()); + // } else { + intra_bucket_comm(get_bucket_count(), + input1->get_delta(), + input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), + input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), + &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], + mcomm.get_local_comm()); + // } + total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; + } + + /// Join between full and full + else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) + { + + intra_bucket_comm(get_bucket_count(), + input1->get_full(), input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], @@ -369,8 +404,6 @@ bool RAM::local_compute(int* offset) auto before_compute_time = MPI_Wtime(); auto ibf_size = 0; u64 jtarget_size = 0; - double size_sync_time = 0; - double real_join_time = 0; for (std::vector::iterator it = RA_list.begin() ; it != RA_list.end(); ++it) { // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl; @@ -616,7 +649,7 @@ bool RAM::local_compute(int* offset) join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, get_bucket_count(), - input0_trees, + input0->get_delta(), intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], input1_trees, input1_size, input1->get_arity()+1, reorder_map_array, @@ -627,15 +660,71 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples, - real_j_time_stat); - jtarget_size += input1_size; - ibf_size += input0_size; - } else { + &join_tuples); + 
total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input1->get_delta_element_count(); + } + else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) + { + + join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + LEFT, + get_bucket_count(), + input0->get_delta(), + intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], + input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1, + reorder_map_array, + output_relation, + compute_buffer, + counter, + join_column_count, + &join_tuples_duplicates, + &join_tuples); + total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input1->get_full_element_count(); + } + else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) + { + // if (input1->get_dependent_column().size() > 0) { + // join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + // LEFT, + // get_bucket_count(), + // input0->get_delta(), + // intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], + // input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, + // reorder_map_array, + // output_relation, + // compute_buffer, + // counter, + // join_column_count, + // &join_tuples_duplicates, + // &join_tuples); + // jtarget_size += input1->get_delta_element_count(); + // } else { + join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), + RIGHT, + get_bucket_count(), + input1->get_delta(), + intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], + input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, + reorder_map_array, + output_relation, + compute_buffer, + counter, + join_column_count, + &join_tuples_duplicates, + &join_tuples); + 
jtarget_size += input0->get_full_element_count(); + // } + total_join_tuples = total_join_tuples + join_tuples; + + } + else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) + { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, get_bucket_count(), - input1_trees, + input1->get_full(), intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], input0_trees, input0_size, input0->get_arity()+1, reorder_map_array, @@ -646,13 +735,12 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples, - real_j_time_stat); - jtarget_size += input0_size; - ibf_size += input1_size; + &join_tuples); + total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input0->get_full_element_count(); } - total_join_tuples = total_join_tuples + join_tuples; - real_join_time += real_j_time_stat[0]; + + ibf_size += intra_bucket_buf_output_size[counter]; } counter++; } @@ -681,12 +769,9 @@ bool RAM::local_compute(int* offset) if (lc_all_time == slowest_rank_time) { std::cout << "Slowest Rank >>> " << mcomm.get_rank() << " Comp Time >>> " << after_compute_time - before_compute_time - << " Real Join >>> " << real_join_time << " Sync Time >>> " << after_sync_time - before_sync_time - << " Size Sync Time >>> " << size_sync_time << " Input Size >>> " << ibf_size << " Target Count >>> " << jtarget_size - << " Join Count >>> " << total_join_tuples << std::endl; } @@ -1125,12 +1210,6 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& double all_insert_newt = 0; double all_comm = 0; double all_time = 0; - double all_insert_in_full = 0; - double all_allocate_buf = 0; - double all_intra = 0; - double all_free_buf =0; - - // auto before_batch = MPI_Wtime(); while (batch_size != 0) { #if DEBUG_OUTPUT From 08dce3c512701bb7ff8e20eb2b65875217f40303 Mon Sep 17 00:00:00 2001 From: ysun67 
Date: Wed, 28 Dec 2022 18:05:23 -0500 Subject: [PATCH 32/36] add more log --- backend/src/RAM/RA_tasks.cpp | 126 ++++------------------------------- 1 file changed, 14 insertions(+), 112 deletions(-) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 418614b5..3c3060a9 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -242,7 +242,6 @@ u64 RAM::intra_bucket_comm_execute() input1_trees = input1->get_delta(); input1_size = input1->get_delta_element_count(); } - double before_reduce_time = MPI_Wtime(); int join_direction = LEFT; int local_join_direction_count = input0_size < input1_size ? 0 : 1; // true if size of input0 > input1 int global_join_direction_count = local_join_direction_count; @@ -250,10 +249,6 @@ u64 RAM::intra_bucket_comm_execute() if (global_join_direction_count > mcomm.get_nprocs() / 2) { join_direction = RIGHT; } - double after_reduce_time = MPI_Wtime(); - if (mcomm.get_rank() == 0) { - std::cout << "Reduced time : " << after_reduce_time - before_reduce_time << std::endl; - } if (join_direction == LEFT) { intra_bucket_comm(get_bucket_count(), @@ -264,42 +259,7 @@ u64 RAM::intra_bucket_comm_execute() mcomm.get_local_comm()); } else { intra_bucket_comm(get_bucket_count(), - input0->get_delta(), - input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - mcomm.get_local_comm()); - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; - } - - /// Join between full and delta - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) - { - // std::cout << "here>>>>>>>>>>>>>" << std::endl; - // if (input1->get_dependent_column().size() > 0) { - // 
intra_bucket_comm(get_bucket_count(), - // input0->get_full(), - // input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - // input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - // &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - // mcomm.get_local_comm()); - // } else { - intra_bucket_comm(get_bucket_count(), - input1->get_delta(), - input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), - input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), - &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], - mcomm.get_local_comm()); - // } - total_data_moved = total_data_moved + intra_bucket_buf_output_size[counter]; - } - - /// Join between full and full - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) - { - - intra_bucket_comm(get_bucket_count(), - input1->get_full(), + input1_trees, input1->get_distinct_sub_bucket_rank_count(), input1->get_distinct_sub_bucket_rank(), input1->get_bucket_map(), input0->get_distinct_sub_bucket_rank_count(), input0->get_distinct_sub_bucket_rank(), input0->get_bucket_map(), &intra_bucket_buf_output_size[counter], &intra_bucket_buf_output[counter], @@ -636,95 +596,31 @@ bool RAM::local_compute(int* offset) int join_direction = LEFT; int local_join_direction_count = input0_size < input1_size ? 
0 : 1; // true if size of input0 > input1 int global_join_direction_count = local_join_direction_count; - - auto before_size_sync = MPI_Wtime(); MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm()); if (global_join_direction_count > mcomm.get_nprocs() / 2) { join_direction = RIGHT; } - auto after_size_sync = MPI_Wtime(); - size_sync_time += after_size_sync - before_size_sync; - std::vector real_j_time_stat; + if (join_direction == LEFT) { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, get_bucket_count(), - input0->get_delta(), + input0_trees, intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], input1_trees, input1_size, input1->get_arity()+1, reorder_map_array, output_relation, - input0, - input1, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_delta_element_count(); - } - else if (current_ra->get_join_input0_graph_type() == DELTA && current_ra->get_join_input1_graph_type() == FULL) - { - - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - LEFT, - get_bucket_count(), - input0->get_delta(), - intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - input1->get_full(), input1->get_full_element_count(), input1->get_arity()+1, - reorder_map_array, - output_relation, compute_buffer, counter, join_column_count, &join_tuples_duplicates, &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_full_element_count(); - } - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == DELTA) - { - // if (input1->get_dependent_column().size() > 0) { - // join_completed = join_completed & current_ra->local_join(threshold, 
&(offset[counter]), - // LEFT, - // get_bucket_count(), - // input0->get_delta(), - // intra_bucket_buf_output_size[counter], input0->get_arity()+1, intra_bucket_buf_output[counter], - // input1->get_delta(), input1->get_delta_element_count(), input1->get_arity()+1, - // reorder_map_array, - // output_relation, - // compute_buffer, - // counter, - // join_column_count, - // &join_tuples_duplicates, - // &join_tuples); - // jtarget_size += input1->get_delta_element_count(); - // } else { - join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), - RIGHT, - get_bucket_count(), - input1->get_delta(), - intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], - input0->get_full(), input0->get_full_element_count(), input0->get_arity()+1, - reorder_map_array, - output_relation, - compute_buffer, - counter, - join_column_count, - &join_tuples_duplicates, - &join_tuples); - jtarget_size += input0->get_full_element_count(); - // } - total_join_tuples = total_join_tuples + join_tuples; - - } - else if (current_ra->get_join_input0_graph_type() == FULL && current_ra->get_join_input1_graph_type() == FULL) - { + + } else { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, get_bucket_count(), - input1->get_full(), + input1_trees, intra_bucket_buf_output_size[counter], input1->get_arity()+1, intra_bucket_buf_output[counter], input0_trees, input0_size, input0->get_arity()+1, reorder_map_array, @@ -736,9 +632,9 @@ bool RAM::local_compute(int* offset) join_column_count, &join_tuples_duplicates, &join_tuples); - total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input0->get_full_element_count(); } + total_join_tuples = total_join_tuples + join_tuples; + jtarget_size += input1->get_delta_element_count(); ibf_size += intra_bucket_buf_output_size[counter]; } @@ -1210,6 +1106,12 @@ void RAM::execute_in_batches(std::string name, int batch_size, std::vector& 
double all_insert_newt = 0; double all_comm = 0; double all_time = 0; + double all_insert_in_full = 0; + double all_allocate_buf = 0; + double all_intra = 0; + double all_free_buf =0; + + // auto before_batch = MPI_Wtime(); while (batch_size != 0) { #if DEBUG_OUTPUT From 2b34dcba09a6d29d09d73696b44c7c19966cee61 Mon Sep 17 00:00:00 2001 From: ysun67 Date: Thu, 29 Dec 2022 23:45:09 -0500 Subject: [PATCH 33/36] more stat --- backend/src/RA/parallel_join.cpp | 35 +++++++++++++++++++------------- backend/src/RAM/RA_tasks.cpp | 25 ++++++++++++++++------- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index cdea6c8f..c04806c8 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -62,21 +62,19 @@ bool parallel_join::local_join(int threshold, int* offset, } u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - + auto before_actual_join = MPI_Wtime(); - // for (u32 sb = 0; sb < input1_rel->get_sub_bucket_per_bucket_count()[bucket_id]; sb++) { - input1[bucket_id].as_all_to_allv_left_join_buffer( - prefix, join_buffer, - input0_buffer + k1,input0_buffer_width, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, output->get_join_column_count(), - output->get_is_canonical(), - generator_mode, generator_func); - // } + input1[bucket_id].as_all_to_allv_left_join_buffer( + prefix, join_buffer, + input0_buffer + k1,input0_buffer_width, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, output->get_join_column_count(), + output->get_is_canonical(), + generator_mode, generator_func); auto after_actual_join = MPI_Wtime(); 
join_time_total += after_actual_join - before_actual_join; @@ -115,6 +113,7 @@ bool parallel_join::local_join(int threshold, int* offset, input_ts.push_back(input_t); } else { if (input_ts.size() != 0) { + auto before_actual_join = MPI_Wtime(); u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), @@ -129,6 +128,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; input_ts.clear(); } prev_non_dependent_columns = cur_non_dependent_columns; @@ -137,6 +138,7 @@ bool parallel_join::local_join(int threshold, int* offset, } if (input_ts.size() != 0) { u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; + auto before_actual_join = MPI_Wtime(); input1[bucket_id].as_all_to_allv_right_join_buffer( std::vector(prev_non_dependent_columns.begin(), prev_non_dependent_columns.begin()+join_column_count), @@ -150,6 +152,8 @@ bool parallel_join::local_join(int threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; input_ts.clear(); } } else { @@ -163,6 +167,7 @@ bool parallel_join::local_join(int threshold, int* offset, u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); + auto before_actual_join = MPI_Wtime(); input1[bucket_id].as_all_to_allv_right_join_buffer( prefix, join_buffer, // input0_buffer + k1, input0_buffer_width, @@ -175,6 +180,8 @@ bool parallel_join::local_join(int 
threshold, int* offset, global_join_inserts, output->get_join_column_count(),output->get_is_canonical(), generator_mode, generator_func); + auto after_actual_join = MPI_Wtime(); + join_time_total += after_actual_join - before_actual_join; // std::cout << "local_join_count " << local_join_count << " Threshold " << threshold << " k1 " << k1 << " offset " << *offset << " " << input0_buffer_width << std::endl; if (local_join_count > threshold) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 3c3060a9..805b8423 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -364,6 +364,8 @@ bool RAM::local_compute(int* offset) auto before_compute_time = MPI_Wtime(); auto ibf_size = 0; u64 jtarget_size = 0; + double size_sync_time = 0; + double real_join_time = 0; for (std::vector::iterator it = RA_list.begin() ; it != RA_list.end(); ++it) { // std::cout << "RA type : " << (*it)->get_RA_type() << std::endl; @@ -596,11 +598,15 @@ bool RAM::local_compute(int* offset) int join_direction = LEFT; int local_join_direction_count = input0_size < input1_size ? 
0 : 1; // true if size of input0 > input1 int global_join_direction_count = local_join_direction_count; + + auto before_size_sync = MPI_Wtime(); MPI_Allreduce(&local_join_direction_count, &global_join_direction_count, 1, MPI_INT, MPI_SUM, mcomm.get_comm()); if (global_join_direction_count > mcomm.get_nprocs() / 2) { join_direction = RIGHT; } - + auto after_size_sync = MPI_Wtime(); + size_sync_time += after_size_sync - before_size_sync; + std::vector real_j_time_stat; if (join_direction == LEFT) { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), LEFT, @@ -614,8 +620,10 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples); - + &join_tuples, + real_j_time_stat); + jtarget_size += input1_size; + ibf_size += input0_size; } else { join_completed = join_completed & current_ra->local_join(threshold, &(offset[counter]), RIGHT, @@ -631,12 +639,13 @@ bool RAM::local_compute(int* offset) counter, join_column_count, &join_tuples_duplicates, - &join_tuples); + &join_tuples, + real_j_time_stat); + jtarget_size += input0_size; + ibf_size += input1_size; } total_join_tuples = total_join_tuples + join_tuples; - jtarget_size += input1->get_delta_element_count(); - - ibf_size += intra_bucket_buf_output_size[counter]; + real_join_time += real_j_time_stat[0]; } counter++; } @@ -665,7 +674,9 @@ bool RAM::local_compute(int* offset) if (lc_all_time == slowest_rank_time) { std::cout << "Slowest Rank >>> " << mcomm.get_rank() << " Comp Time >>> " << after_compute_time - before_compute_time + << " Real Join >>> " << real_join_time << " Sync Time >>> " << after_sync_time - before_sync_time + << " Size Sync Time >>> " << size_sync_time << " Input Size >>> " << ibf_size << " Target Count >>> " << jtarget_size << std::endl; From 271e9c4cc756a76b38b0e8e3c1990a8732e1652b Mon Sep 17 00:00:00 2001 From: ysun67 Date: Wed, 4 Jan 2023 14:53:44 -0500 Subject: [PATCH 34/36] more hash function --- 
backend/src/RAM/RA_tasks.cpp | 1 + .../src/relation/balanced_hash_relation.cpp | 1 - backend/tests/cc/compiled_pre/cc.cpp | 22 ++++++++----------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/backend/src/RAM/RA_tasks.cpp b/backend/src/RAM/RA_tasks.cpp index 805b8423..4bb90015 100644 --- a/backend/src/RAM/RA_tasks.cpp +++ b/backend/src/RAM/RA_tasks.cpp @@ -679,6 +679,7 @@ bool RAM::local_compute(int* offset) << " Size Sync Time >>> " << size_sync_time << " Input Size >>> " << ibf_size << " Target Count >>> " << jtarget_size + << " Join Count >>> " << total_join_tuples << std::endl; } diff --git a/backend/src/relation/balanced_hash_relation.cpp b/backend/src/relation/balanced_hash_relation.cpp index ea9671e3..da06df04 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -8,7 +8,6 @@ #include "../parallel_RA_inc.h" #include "balanced_hash_relation.h" #include "mpi.h" -#include "shmap_relation.h" #include #include #include diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 65f8e0c8..4b53c04e 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -438,15 +438,12 @@ int main(int argc, char **argv) { ".edge.2.table", FULL); - // rel__edge__2__1->balance_flag = true; - rel__edge__2__1->default_sub_bucket_per_bucket_count = 2; - - // relation *rel__edge__2__1__2 = new relation( - // 2, true, 2, get_tag_for_rel("edge", "1__2"), - // std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", - // slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + - // ".edge.2.table", - // FULL); + relation *rel__edge__2__1__2 = new relation( + 2, true, 2, get_tag_for_rel("edge", "1__2"), + std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + ".edge.2.table", + FULL); relation *rel__cc__2__1 = new relation( 
1, true, 2, get_tag_for_rel("cc", "1"), @@ -494,8 +491,7 @@ int main(int argc, char **argv) { )); RAM *cc_init_scc = new RAM(false, 1); - // cc_init_scc->balance_flag = true; - // cc_init_scc->add_relation(rel__edge__2__1__2, false); + cc_init_scc->add_relation(rel__edge__2__1__2, false); cc_init_scc->add_relation(rel__edge__2__1, false); cc_init_scc->add_relation(rel__cc__2__1, true); cc_init_scc->add_relation(rel__node__1__1, true); @@ -564,7 +560,7 @@ int main(int argc, char **argv) { cc_lie->add_relation(rel__cc__2__1); cc_lie->add_relation(rel__cc_final__2__1); cc_lie->add_relation(rel__cc_represent__1__1); - // cc_lie->add_relation(rel__edge__2__1__2); + cc_lie->add_relation(rel__edge__2__1__2); cc_lie->add_scc(to_undirected_scc); cc_lie->add_scc(cc_init_scc); @@ -594,7 +590,7 @@ int main(int argc, char **argv) { // rel__edge__2__1__2->test_calc_hash_rank(4096); - // rel__edge__2__1->test_calc_hash_rank(4096); + rel__edge__2__1->test_calc_hash_rank(4096); // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; // rel__node__1__1->print(); // rel__edge__2__1->print(); From 1c8f027009af2699aab17c56385ace15f4ff6f0d Mon Sep 17 00:00:00 2001 From: ysun67 Date: Thu, 5 Jan 2023 12:46:37 -0500 Subject: [PATCH 35/36] stage change --- backend/src/lie/lie.cpp | 2 -- .../src/relation/balanced_hash_relation.cpp | 9 +++++++-- backend/src/relation/balanced_hash_relation.h | 2 ++ backend/tests/cc/compiled_pre/cc.cpp | 20 ++++++++++--------- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 6e4de91a..88d85b4e 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -433,9 +433,7 @@ bool LIE::execute () #endif // load balance before a SCC executed - if (executable_task->balance_flag) { executable_task->load_balance(); - } if (restart_flag == false) { diff --git a/backend/src/relation/balanced_hash_relation.cpp 
b/backend/src/relation/balanced_hash_relation.cpp index da06df04..1934af27 100644 --- a/backend/src/relation/balanced_hash_relation.cpp +++ b/backend/src/relation/balanced_hash_relation.cpp @@ -1433,8 +1433,13 @@ bool relation::check_dependent_value_insert_avalible(const std::vector& tup // if (bucket_id != mcomm.get_rank()) { // std::cout << "wwwwwwwwwwwwwwwwwwwwwwwwwwwwww " << std::endl; // } - int bucket_id = mcomm.get_rank(); - return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; + // int bucket_id = mcomm.get_rank(); + bool res = true; + for (int i = 0 ; i < mcomm.get_nprocs(); i ++) { + res = (res && delta[i].check_dependent_insertion(tuple)) && full[i].check_dependent_insertion(tuple); + } + // return delta[bucket_id].check_dependent_insertion(tuple) && full[bucket_id].check_dependent_insertion(tuple) ; + return res; } void relation::test_calc_hash_rank(u64 rank_n) { diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index 4b9f5215..5368811f 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -98,6 +98,8 @@ class relation u32 default_sub_bucket_per_bucket_count = 1; /// 1 bool balance_flag = false; + bool balance_flag = false; + /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL); /// 2: arity (Internally one extra id (intern id) column is added to every relation) /// true: arity == join column count diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 4b53c04e..19cad2c9 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -438,12 +438,14 @@ int main(int argc, char **argv) { ".edge.2.table", FULL); - relation *rel__edge__2__1__2 = new relation( - 2, true, 2, get_tag_for_rel("edge", "1__2"), - std::to_string(get_tag_for_rel("edge", 
"1__2")) + ".edge.2.table", - slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + - ".edge.2.table", - FULL); + rel__edge__2__1->balance_flag = true; + + // relation *rel__edge__2__1__2 = new relation( + // 2, true, 2, get_tag_for_rel("edge", "1__2"), + // std::to_string(get_tag_for_rel("edge", "1__2")) + ".edge.2.table", + // slog_input_dir + "/" + std::to_string(get_tag_for_rel("edge", "1__2")) + + // ".edge.2.table", + // FULL); relation *rel__cc__2__1 = new relation( 1, true, 2, get_tag_for_rel("cc", "1"), @@ -491,7 +493,7 @@ int main(int argc, char **argv) { )); RAM *cc_init_scc = new RAM(false, 1); - cc_init_scc->add_relation(rel__edge__2__1__2, false); + // cc_init_scc->add_relation(rel__edge__2__1__2, false); cc_init_scc->add_relation(rel__edge__2__1, false); cc_init_scc->add_relation(rel__cc__2__1, true); cc_init_scc->add_relation(rel__node__1__1, true); @@ -560,7 +562,7 @@ int main(int argc, char **argv) { cc_lie->add_relation(rel__cc__2__1); cc_lie->add_relation(rel__cc_final__2__1); cc_lie->add_relation(rel__cc_represent__1__1); - cc_lie->add_relation(rel__edge__2__1__2); + // cc_lie->add_relation(rel__edge__2__1__2); cc_lie->add_scc(to_undirected_scc); cc_lie->add_scc(cc_init_scc); @@ -590,7 +592,7 @@ int main(int argc, char **argv) { // rel__edge__2__1__2->test_calc_hash_rank(4096); - rel__edge__2__1->test_calc_hash_rank(4096); + // rel__edge__2__1->test_calc_hash_rank(4096); // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; // rel__node__1__1->print(); // rel__edge__2__1->print(); From b4d1b7f2ff859409e8a7ec85b33c0803fc12ce67 Mon Sep 17 00:00:00 2001 From: Yihao Sun Date: Thu, 5 Jan 2023 21:08:03 -0500 Subject: [PATCH 36/36] add manual sub rank split --- backend/src/RA/parallel_join.cpp | 116 ++++++++++-------- backend/src/lie/lie.cpp | 2 +- backend/src/relation/balanced_hash_relation.h | 2 - backend/tests/cc/compiled_pre/cc.cpp | 19 +-- 4 files changed, 
73 insertions(+), 66 deletions(-) diff --git a/backend/src/RA/parallel_join.cpp b/backend/src/RA/parallel_join.cpp index c04806c8..1333dcdc 100644 --- a/backend/src/RA/parallel_join.cpp +++ b/backend/src/RA/parallel_join.cpp @@ -61,20 +61,22 @@ bool parallel_join::local_join(int threshold, int* offset, //std::cout << "PREFIX " << input0_buffer[k1 + jc] << std::endl; } - u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - + // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_left_join_buffer( - prefix, join_buffer, - input0_buffer + k1,input0_buffer_width, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, output->get_join_column_count(), - output->get_is_canonical(), - generator_mode, generator_func); + for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) { + input1[bucket_id].as_all_to_allv_left_join_buffer( + prefix, join_buffer, + input0_buffer + k1,input0_buffer_width, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, output->get_join_column_count(), + output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; @@ -114,20 +116,22 @@ bool parallel_join::local_join(int threshold, int* offset, } else { if (input_ts.size() != 0) { auto before_actual_join = MPI_Wtime(); - u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - 
join_buffer, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -137,21 +141,23 @@ bool parallel_join::local_join(int threshold, int* offset, } } if (input_ts.size() != 0) { - u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; + // u64 bucket_id = tuple_hash(prev_non_dependent_columns.data(), join_column_count) % buckets; auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_right_join_buffer( - std::vector(prev_non_dependent_columns.begin(), - prev_non_dependent_columns.begin()+join_column_count), - join_buffer, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + for (u32 bucket_id = 0; bucket_id < buckets; 
bucket_id++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( + std::vector(prev_non_dependent_columns.begin(), + prev_non_dependent_columns.begin()+join_column_count), + join_buffer, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; input_ts.clear(); @@ -164,22 +170,24 @@ bool parallel_join::local_join(int threshold, int* offset, for (int jc=0; jc < join_column_count; jc++) prefix.push_back(input0_buffer[k1 + jc]); - u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; + // u64 bucket_id = tuple_hash(input0_buffer + k1, join_column_count) % buckets; std::vector> input_ts; input_ts.push_back(std::vector(input0_buffer+k1, input0_buffer+k1+input0_buffer_width)); auto before_actual_join = MPI_Wtime(); - input1[bucket_id].as_all_to_allv_right_join_buffer( - prefix, join_buffer, - // input0_buffer + k1, input0_buffer_width, - input_ts, - input1_buffer_width, counter, - buckets, output_sub_bucket_count, - output_sub_bucket_rank, reorder_map_array, - join_column_count, deduplicate, - &local_join_count, global_join_duplicates, - global_join_inserts, - output->get_join_column_count(),output->get_is_canonical(), - generator_mode, generator_func); + for (u32 bucket_id = 0; bucket_id < buckets; bucket_id++) { + input1[bucket_id].as_all_to_allv_right_join_buffer( + prefix, join_buffer, + // input0_buffer + k1, input0_buffer_width, + input_ts, + input1_buffer_width, counter, + buckets, output_sub_bucket_count, + output_sub_bucket_rank, reorder_map_array, + join_column_count, deduplicate, + &local_join_count, global_join_duplicates, + global_join_inserts, + 
output->get_join_column_count(),output->get_is_canonical(), + generator_mode, generator_func); + } auto after_actual_join = MPI_Wtime(); join_time_total += after_actual_join - before_actual_join; diff --git a/backend/src/lie/lie.cpp b/backend/src/lie/lie.cpp index 88d85b4e..c9ea272e 100644 --- a/backend/src/lie/lie.cpp +++ b/backend/src/lie/lie.cpp @@ -433,7 +433,7 @@ bool LIE::execute () #endif // load balance before a SCC executed - executable_task->load_balance(); + // executable_task->load_balance(); if (restart_flag == false) { diff --git a/backend/src/relation/balanced_hash_relation.h b/backend/src/relation/balanced_hash_relation.h index 5368811f..4b9f5215 100644 --- a/backend/src/relation/balanced_hash_relation.h +++ b/backend/src/relation/balanced_hash_relation.h @@ -98,8 +98,6 @@ class relation u32 default_sub_bucket_per_bucket_count = 1; /// 1 bool balance_flag = false; - bool balance_flag = false; - /// Example: relation* rel_path_2_1_2 = new relation(2, true, 2, 257, "rel_path_2_1_2", "../data/g5955/path_2_1_2", FULL); /// 2: arity (Internally one extra id (intern id) column is added to every relation) /// true: arity == join column count diff --git a/backend/tests/cc/compiled_pre/cc.cpp b/backend/tests/cc/compiled_pre/cc.cpp index 19cad2c9..d27afb1a 100644 --- a/backend/tests/cc/compiled_pre/cc.cpp +++ b/backend/tests/cc/compiled_pre/cc.cpp @@ -1,5 +1,5 @@ // location of `parallel_RA_inc.h` here -#include "/home/ysun67/workspace/slog/backend/src/parallel_RA_inc.h" +#include "/home/stargazermiao/workspace/PL/slog/backend/src/parallel_RA_inc.h" #include "mpi.h" // #include @@ -439,6 +439,7 @@ int main(int argc, char **argv) { FULL); rel__edge__2__1->balance_flag = true; + rel__edge__2__1->default_sub_bucket_per_bucket_count = 2; // relation *rel__edge__2__1__2 = new relation( // 2, true, 2, get_tag_for_rel("edge", "1__2"), @@ -527,11 +528,11 @@ int main(int argc, char **argv) { std::vector res(2, 0); res[0] = input_v[1]; res[1] = target_v[1]; - // if 
(target_v[0] == 21) { - // std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl; - // std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl; - // std::cout << "res " << res[0] << " " << res[1] << std::endl; - // } + if (target_v[0] == 21) { + std::cout << "ww " << input_v[0] << " " << input_v[1] << std::endl; + std::cout << "cc " << target_v[0] << " " << target_v[1] << std::endl; + std::cout << "res " << res[0] << " " << res[1] << std::endl; + } res_set.push_back(res); return true; } @@ -595,10 +596,10 @@ int main(int argc, char **argv) { // rel__edge__2__1->test_calc_hash_rank(4096); // std::cout << "Edge size on rank " << mcomm.get_rank() << " is " << rel__edge__2__1->get_full_element_count() << std::endl; // rel__node__1__1->print(); - // rel__edge__2__1->print(); + rel__edge__2__1->print(); // rel__cc__2__1->print(); - rel__cc_final__2__1->print(); - rel__cc_represent__1__1->print(); + // rel__cc_final__2__1->print(); + // rel__cc_represent__1__1->print(); // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mcomm.destroy();