Skip to content

Commit

Permalink
Starting to allow container columns in binary format
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Jun 5, 2024
1 parent 8308740 commit c1a83db
Show file tree
Hide file tree
Showing 6 changed files with 378 additions and 59 deletions.
91 changes: 90 additions & 1 deletion data/AAPL_10dBucketWithMaps.csv

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion docs/HTML/DateTime.html
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@ <H2><font color="blue">Member Functions</font></H2>
<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">void</span> add_months<span style="color:#808030; ">(</span><span style="color:#800000; font-weight:bold; ">long</span> months<span style="color:#808030; ">)</span> <span style="color:#800000; font-weight:bold; ">noexcept</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">void</span> add_years<span style="color:#808030; ">(</span><span style="color:#800000; font-weight:bold; ">long</span> years<span style="color:#808030; ">)</span> <span style="color:#800000; font-weight:bold; ">noexcept</span><span style="color:#800080; ">;</span></span></pre>


<BR><HR COLOR="Gray" SIZE="1">
<pre class="code_syntax" style="color:#000000;background:#ffffff00;"><span class="line_wrapper"> <span style="color:#696969; ">// These methods format the date/time into a string based on the format parameter</span></span>
<span class="line_wrapper"> <span style="color:#696969; ">//</span></span>
Expand Down
19 changes: 19 additions & 0 deletions include/DataFrame/Internals/DataFrame_misc.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,25 @@ DataFrame<I, H>::print_binary_functor_<Ts ...>::operator() (const T &vec) {
_write_binary_string_(os, vec, start_row, end_row);
else if constexpr (std::is_same_v<ValueType, DateTime>)
_write_binary_datetime_(os, vec, start_row, end_row);
else if constexpr (std::is_same_v<ValueType, ColumnVecType<double>>)
_write_binary_dbl_vec_(os, vec, start_row, end_row);
else if constexpr (std::is_same_v<ValueType, ColumnVecType<std::string>> ||
std::is_same_v<ValueType, ColumnVecType<const char *>>)
_write_binary_str_vec_(os, vec, start_row, end_row);
else if constexpr (std::is_same_v<ValueType, std::set<double>> ||
std::is_same_v<ValueType, DFSet<double>>)
_write_binary_dbl_set_(os, vec, start_row, end_row);
else if constexpr (std::is_same_v<ValueType, std::set<std::string>> ||
std::is_same_v<ValueType, DFSet<std::string>> ||
std::is_same_v<ValueType, std::set<const char *>> ||
std::is_same_v<ValueType, DFSet<const char *>>)
_write_binary_str_set_(os, vec, start_row, end_row);
else if constexpr (
std::is_same_v<ValueType, std::map<std::string, double>> ||
std::is_same_v<ValueType, DFMap<std::string, double>> ||
std::is_same_v<ValueType, std::unordered_map<std::string, double>> ||
std::is_same_v<ValueType, DFUnorderedMap<std::string, double>>)
_write_binary_str_dbl_map(os, vec, start_row, end_row);
else
_write_binary_data_(os, vec, start_row, end_row);

Expand Down
70 changes: 25 additions & 45 deletions include/DataFrame/Internals/DataFrame_read.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -1316,53 +1316,9 @@ read_binary_(std::istream &stream,
else if constexpr (std::is_same_v<IndexType, DateTime>)
_read_binary_datetime_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, float>)
else
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, double>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, short int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, unsigned short int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, unsigned int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, long int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, unsigned long int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, long long int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, unsigned long long int>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, char>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, unsigned char>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else if constexpr (std::is_same_v<IndexType, bool>)
_read_binary_data_(stream, idx_vec, needs_flipping,
starting_row, num_rows);
else {
String1K err;

err.printf(
"read_binary_(): ERROR: Type '%s' is not supported for index",
col_type);
throw DataFrameError(err.c_str());
}
load_index(std::move(idx_vec));
}

Expand Down Expand Up @@ -1490,6 +1446,30 @@ read_binary_(std::istream &stream,
load_column(col_name, std::move(vec),
nan_policy::dont_pad_with_nans);
}
else if ( ! std::strcmp(col_type, "dbl_vec")) {
ColumnVecType<ColumnVecType<double>> vec;

}
else if ( ! std::strcmp(col_type, "str_vec")) {
ColumnVecType<ColumnVecType<std::string>> vec;

}
else if ( ! std::strcmp(col_type, "dbl_set")) {
ColumnVecType<DFSet<double>> vec;

}
else if ( ! std::strcmp(col_type, "str_set")) {
ColumnVecType<DFSet<std::string>> vec;

}
else if ( ! std::strcmp(col_type, "str_dbl_map")) {
ColumnVecType<DFMap<std::string, double>> vec;

}
else if ( ! std::strcmp(col_type, "str_dbl_unomap")) {
ColumnVecType<DFUnorderedMap<std::string, double>> vec;

}
else {
String1K err;

Expand Down
240 changes: 237 additions & 3 deletions include/DataFrame/Internals/DataFrame_standalone.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,16 @@ using _TypeInfoRef_ = std::reference_wrapper<const std::type_info>;
// Hash functor for _TypeInfoRef_ keys in unordered containers.
// It forwards to the hash_code() of the referenced std::type_info object.
//
struct _TypeinfoHasher_ {

    std::size_t
    operator()(const _TypeInfoRef_ item) const {

        return (item.get().hash_code());
    }
};

struct _TypeinfoEqualTo_ {

bool
operator()(_TypeInfoRef_ lhs, _TypeInfoRef_ rhs) const {
operator()(const _TypeInfoRef_ lhs, const _TypeInfoRef_ rhs) const {

return (lhs.get() == rhs.get());
}
Expand All @@ -74,7 +77,8 @@ static const
std::unordered_map<_TypeInfoRef_,
const char *const,
_TypeinfoHasher_,
_TypeinfoEqualTo_> _typeinfo_name_ {
_TypeinfoEqualTo_>
_typeinfo_name_ {
{ typeid(float), "float" },
{ typeid(double), "double" },
{ typeid(long double), "longdouble" },
Expand Down Expand Up @@ -990,6 +994,122 @@ _write_binary_datetime_(STRM &strm, const V &dt_vec,

// ----------------------------------------------------------------------------

// Vector of double vectors
//
// Writes a column whose rows are themselves vectors of doubles.
// _write_binary_common_() is called once for the outer column, then every
// row in [start_row, end_row) is handed in full to _write_binary_data_().
//
template<typename STRM, typename V>
inline static STRM &
_write_binary_dbl_vec_(STRM &strm, const V &vecs,
                       std::size_t start_row, std::size_t end_row) {

    _write_binary_common_(strm, vecs, start_row, end_row);

    uint64_t    row = start_row;

    while (row < end_row) {
        // Each inner vector is always written from its first to its last
        // element, independent of the outer row range.
        //
        _write_binary_data_(strm, vecs[row], 0, vecs[row].size());
        ++row;
    }

    return (strm);
}

// ----------------------------------------------------------------------------

// Vector of string vectors
//
// Writes a column whose rows are themselves vectors of strings.
// _write_binary_common_() is called once for the outer column, then every
// row in [start_row, end_row) is handed in full to _write_binary_string_().
//
template<typename STRM, typename V>
inline static STRM &
_write_binary_str_vec_(STRM &strm, const V &vecs,
                       std::size_t start_row, std::size_t end_row) {

    _write_binary_common_(strm, vecs, start_row, end_row);

    uint64_t    row = start_row;

    while (row < end_row) {
        // Inner vectors are never sliced; the whole row is emitted
        //
        _write_binary_string_(strm, vecs[row], 0, vecs[row].size());
        ++row;
    }

    return (strm);
}

// ----------------------------------------------------------------------------

// Vector of double sets
//
// Writes a column whose rows are sets of doubles. After the call to
// _write_binary_common_() for the outer column, each row in
// [start_row, end_row) is written as:
//   1. a uint64_t holding the number of elements in that row's set
//   2. that many raw doubles, in the set's iteration order
// Values are written in host byte order; no flipping is done here.
//
template<typename STRM, typename S>
inline static STRM &
_write_binary_dbl_set_(STRM &strm, const S &dbl_sets,
                       std::size_t start_row, std::size_t end_row) {

    _write_binary_common_(strm, dbl_sets, start_row, end_row);

    for (uint64_t i = start_row; i < end_row; ++i) {
        const auto      &row_set = dbl_sets[i];
        // The count must describe this row's set, not the number of rows
        // in the column, otherwise it does not match the payload that
        // follows it.
        //
        const uint64_t  sz = row_set.size();

        strm.write(reinterpret_cast<const char *>(&sz), sizeof(sz));
        for (const double val : row_set)
            strm.write(reinterpret_cast<const char *>(&val), sizeof(val));
    }

    return (strm);
}

// ----------------------------------------------------------------------------

// Vector of string sets
//
// Writes a column whose rows are sets of strings. After the call to
// _write_binary_common_() for the outer column, each row in
// [start_row, end_row) is written as:
//   1. a uint64_t holding the number of strings in that row's set
//   2. one uint16_t length per string, in the set's iteration order
//   3. the bytes of all the strings back to back, in the same order
// Values are written in host byte order; no flipping is done here.
//
// NOTE: Lengths are narrowed to uint16_t while the full string is written,
//       so a string longer than 65535 bytes gets a length that does not
//       match its payload.
//
template<typename STRM, typename S>
inline static STRM &
_write_binary_str_set_(STRM &strm, const S &str_sets,
                       std::size_t start_row, std::size_t end_row) {

    _write_binary_common_(strm, str_sets, start_row, end_row);

    for (uint64_t i = start_row; i < end_row; ++i) {
        const auto      &row_set = str_sets[i];
        // The count must describe this row's set, not the number of rows
        // in the column, otherwise it does not match the lengths and bytes
        // that follow it.
        //
        const uint64_t  sz = row_set.size();

        strm.write(reinterpret_cast<const char *>(&sz), sizeof(sz));
        for (const auto &str : row_set) {
            const uint16_t  str_sz = static_cast<uint16_t>(str.size());

            strm.write(reinterpret_cast<const char *>(&str_sz),
                       sizeof(str_sz));
        }

        for (const auto &str : row_set)
            strm.write(str.data(), str.size() * sizeof(char));
    }

    return (strm);
}

// ----------------------------------------------------------------------------

// Vector of string to double [unordered] maps
//
// Writes a column whose rows are maps from string to double. After the call
// to _write_binary_common_() for the outer column, each row in
// [start_row, end_row) is written as:
//   1. a uint64_t holding the number of entries in that row's map
//   2. one uint16_t key length per entry, in the map's iteration order
//   3. for each entry, in the same order, the key bytes immediately
//      followed by the raw double value
// Values are written in host byte order; no flipping is done here.
//
// NOTE: Key lengths are narrowed to uint16_t while the full key is written,
//       so a key longer than 65535 bytes gets a length that does not match
//       its payload.
//
template<typename STRM, typename M>
inline static STRM &
_write_binary_str_dbl_map(STRM &strm, const M &sd_maps,
                          std::size_t start_row, std::size_t end_row) {

    _write_binary_common_(strm, sd_maps, start_row, end_row);

    for (uint64_t i = start_row; i < end_row; ++i) {
        const auto      &row_map = sd_maps[i];
        // The count must describe this row's map, not the number of rows
        // in the column, otherwise it does not match the entries that
        // follow it.
        //
        const uint64_t  sz = row_map.size();

        strm.write(reinterpret_cast<const char *>(&sz), sizeof(sz));
        for (const auto &[str, dbl] : row_map) {
            const uint16_t  str_sz = static_cast<uint16_t>(str.size());

            strm.write(reinterpret_cast<const char *>(&str_sz),
                       sizeof(str_sz));
        }

        for (const auto &[str, dbl] : row_map) {
            strm.write(str.data(), str.size() * sizeof(char));
            strm.write(reinterpret_cast<const char *>(&dbl), sizeof(dbl));
        }
    }

    return (strm);
}

// ----------------------------------------------------------------------------

template<typename STRM>
inline static uint64_t
_read_binary_common_(STRM &strm, bool needs_flipping, std::size_t start_row) {
Expand Down Expand Up @@ -1136,6 +1256,120 @@ _read_binary_datetime_(STRM &strm, V &dt_vec, bool needs_flipping,
}

// ----------------------------------------------------------------------------














// Vector of double vectors
//
// Reads a column whose rows are themselves vectors of doubles and appends
// the requested rows to vec. The outer row count comes from
// _read_binary_common_(). Rows in [start_row, start_row + num_rows) --
// clamped to the stored row count, or everything from start_row when
// num_rows is the maximum std::size_t value -- are read with
// _read_binary_data_(). All other rows are stepped over with seekg().
//
template<typename STRM, typename V>
inline static STRM &
_read_binary_dbl_vec(STRM &strm, V &vec, bool needs_flipping,
                     std::size_t start_row, std::size_t num_rows) {

    using ColumnType = typename std::remove_reference<V>::type;
    using RowType = typename ColumnType::value_type;

    const uint64_t  row_count =
        _read_binary_common_(strm, needs_flipping, start_row);
    uint64_t        stop_row = row_count;

    // Only an explicit row count that stays within the stored rows narrows
    // the range being read
    //
    if (num_rows != std::numeric_limits<std::size_t>::max() &&
        (start_row + num_rows) <= row_count)
        stop_row = uint64_t(start_row + num_rows);

    vec.reserve(stop_row - start_row);

    uint64_t    row = 0;

    while (row < row_count) {
        // Skip type name
        //
        strm.seekg(32 * sizeof(char), std::ios_base::cur);

        const bool  wanted = row >= start_row && row < stop_row;

        ++row;
        if (wanted) [[likely]] {
            RowType one_row;

            _read_binary_data_(strm, one_row, needs_flipping,
                               0, std::numeric_limits<std::size_t>::max());
            vec.push_back(std::move(one_row));
            continue;
        }

        // Row is outside the requested range: read its element count, then
        // jump over that many doubles
        //
        const uint64_t  skipped_size =
            _read_binary_common_(strm, needs_flipping, 0);

        strm.seekg(skipped_size * sizeof(double), std::ios_base::cur);
    }
    return (strm);
}

// ----------------------------------------------------------------------------

// Vector of string vectors
//
// Reads a column whose rows are themselves vectors of strings and appends
// the requested rows to vec. The outer row count comes from
// _read_binary_common_(). Every stored row is read with
// _read_binary_string_(). Only rows in [start_row, start_row + num_rows) --
// clamped to the stored row count, or everything from start_row when
// num_rows is the maximum std::size_t value -- are kept.
//
template<typename STRM, typename V>
inline static STRM &
_read_binary_str_vec(STRM &strm, V &vec, bool needs_flipping,
                     std::size_t start_row, std::size_t num_rows) {

    using ColumnType = typename std::remove_reference<V>::type;
    using RowType = typename ColumnType::value_type;

    const uint64_t  row_count =
        _read_binary_common_(strm, needs_flipping, start_row);
    uint64_t        stop_row = row_count;

    // Only an explicit row count that stays within the stored rows narrows
    // the range being kept
    //
    if (num_rows != std::numeric_limits<std::size_t>::max() &&
        (start_row + num_rows) <= row_count)
        stop_row = uint64_t(start_row + num_rows);

    // One scratch row reused across iterations; it is cleared before each
    // read, which also resets it after it has been moved from
    //
    RowType     scratch;

    vec.reserve(stop_row - start_row);

    uint64_t    row = 0;

    while (row < row_count) {
        // Skip type name
        //
        strm.seekg(32 * sizeof(char), std::ios_base::cur);

        scratch.clear();
        _read_binary_string_(strm, scratch, needs_flipping,
                             0, std::numeric_limits<std::size_t>::max());

        const bool  wanted = row >= start_row && row < stop_row;

        if (wanted) [[likely]]
            vec.push_back(std::move(scratch));
        ++row;
    }
    return (strm);
}

// ----------------------------------------------------------------------------

























//
// Specializing std::hash for tuples
Expand Down
Loading

0 comments on commit c1a83db

Please sign in to comment.