Skip to content

Commit

Permalink
Now in csv2 format you can read/write columns of type std::map and st…
Browse files Browse the repository at this point in the history
…d::unordered_map of string to double
  • Loading branch information
hosseinmoein committed Aug 23, 2023
1 parent 2620830 commit 9044819
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 27 deletions.
6 changes: 6 additions & 0 deletions data/AAPL_10dBucketWithMaps.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
INDEX:4:<DateTimeAME>,Open:4:<double>,High:4:<double>,Low:4:<double>,Close:4:<double>,Mean:4:<double>,Median:4:<double>,25% Quantile:4:<double>,Std:4:<double>,MAD:4:<double>,Map 1:4:<str_dbl_map>,Unordered Map:4:<str_dbl_unomap>,Volume:4:<long>
01/14/2000 00:00:00.000,0.999442,0.999442,0.77846,0.896763,0.8819754,0.8805805,0.8560265,0.0600931968588,0.0436942,3{label one 1:123.0|label one 2:-782.5|label one 3:444.44},3{Key one 1:123.0|Key one 2:-782.5|Key one 3:444.44},6400945600
01/31/2000 00:00:00.000,0.928013,1.013393,0.907366,0.926339,0.9637277,0.966797,0.938337,0.0362519289143,0.0313617,3{label two 1:123.0|label two 2:-782.5|label two 3:444.44},3{Key two 1:123.0|Key two 2:-782.5|Key two 3:444.44},6154232000
02/14/2000 00:00:00.000,0.895089,1.03404,0.882254,1.03404,0.9732142,0.988281,0.9433595,0.0559733198384,0.0462054,3{label three 1:123.0|label three 2:-782.5|label three 3:444.44},3{Key three 1:123.0|Key three 2:-782.5|Key three 3:444.44},3714592000
02/29/2000 00:00:00.000,1.0625,1.0625,0.985491,1.023438,1.0203265,1.0212055,1.013672,0.0217113745778,0.0153041,3{label four 1:123.0|label four 2:-782.5|label four 3:444.44},3{Key four 1:123.0|Key four 2:-782.5|Key four 3:444.44},3605190400

14 changes: 9 additions & 5 deletions docs/HTML/read.html
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,15 @@
</PRE>
In case of io_format::csv2 the following additional types are also supported:
<PRE>
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
where s is the size of the vector and d's are the double values.
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
dbl_vec -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
where s is the size of the vector and d's are the double values.
str_dbl_map -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
where s is the size of the map and k's and v's are keys and values.
str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
where s is the size of the map and k's and v's are keys and values.
</PRE>

<B>NOTE:</B> This version of read() can be substantially faster, especially for larger files, than if you open the file yourself and use the read() version below.
Expand Down
14 changes: 9 additions & 5 deletions docs/HTML/write.html
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,15 @@
</PRE>
In case of io_format::csv2 the following additional types are also supported:
<PRE>
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
dbl_vector -- A vector of double precision values, The vector is printed as "s[d1|d2|...]"
where s is the size of the vector and d's are the double values.
DateTimeAME -- DateTime string printed in American style (MM/DD/YYYY HH:MM:SS.mmm)
DateTimeEUR -- DateTime string printed in European style (YYYY/MM/DD HH:MM:SS.mmm)
DateTimeISO -- DateTime string printed in ISO style (YYYY-MM-DD HH:MM:SS.mmm)
dbl_vec -- A vector of double precision values. The vector is printed as "s[d1|d2|...]"
where s is the size of the vector and d's are the double values.
str_dbl_map -- A map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
where s is the size of the map and k's and v's are keys and values.
str_dbl_unomap -- An unordered map of string keys to double precision values. The map is printed as "s{k1:v1|k2:v2|...}"
where s is the size of the map and k's and v's are keys and values.
</PRE>
</td>
<td>
Expand Down
93 changes: 79 additions & 14 deletions include/DataFrame/Internals/DataFrame_read.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -606,10 +606,22 @@ read_csv2_(std::istream &stream,
col_name.c_str(),
nrows);
else if (type_str == "dbl_vec")
spec_vec.emplace_back(StlVecType<StlVecType<double>>(),
spec_vec.emplace_back(StlVecType<StlVecType<double>>{ },
type_str.c_str(),
col_name.c_str(),
nrows);
else if (type_str == "str_dbl_map")
spec_vec.emplace_back(
StlVecType<std::map<std::string, double>>{ },
type_str.c_str(),
col_name.c_str(),
nrows);
else if (type_str == "str_dbl_unomap")
spec_vec.emplace_back(
StlVecType<std::unordered_map<std::string, double>>{ },
type_str.c_str(),
col_name.c_str(),
nrows);
else
throw DataFrameError("DataFrame::read_csv2_(): ERROR: "
"Unknown column type");
Expand Down Expand Up @@ -734,6 +746,28 @@ read_csv2_(std::istream &stream,
value.c_str())));
}
}
else if (col_spec.type_spec == "str_dbl_map") {
using map_t = std::map<std::string, double>;

if (! value.empty()) {
StlVecType<map_t> &vec =
std::any_cast<StlVecType<map_t> &>(col_spec.col_vec);

vec.push_back(std::move(_get_str_dbl_map_from_value_<map_t>(
value.c_str())));
}
}
else if (col_spec.type_spec == "str_dbl_unomap") {
using map_t = std::unordered_map<std::string, double>;

if (! value.empty()) {
StlVecType<map_t> &vec =
std::any_cast<StlVecType<map_t> &> (col_spec.col_vec);

vec.push_back(std::move(_get_str_dbl_map_from_value_<map_t>(
value.c_str())));
}
}
col_index += 1;
}
}
Expand All @@ -756,70 +790,101 @@ read_csv2_(std::istream &stream,
_col_data_spec_ col_spec = spec_vec[i];

if (col_spec.type_spec == "float")
load_column<float>(col_spec.col_name.c_str(),
load_column<float>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<float> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "double") [[likely]]
load_column<double>(col_spec.col_name.c_str(),
load_column<double>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<double> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "longdouble")
load_column<long double>(col_spec.col_name.c_str(),
load_column<long double>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<long double> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "int")
load_column<int>(col_spec.col_name.c_str(),
load_column<int>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<int> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "uint")
load_column<unsigned int>(col_spec.col_name.c_str(),
load_column<unsigned int>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<unsigned int> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "long")
load_column<long>(col_spec.col_name.c_str(),
load_column<long>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<long> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "longlong")
load_column<long long>(col_spec.col_name.c_str(),
load_column<long long>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<long long> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "ulong")
load_column<unsigned long>(col_spec.col_name.c_str(),
load_column<unsigned long>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<unsigned long>&>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "ulonglong")
load_column<unsigned long long>(col_spec.col_name.c_str(),
load_column<unsigned long long>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<unsigned long long> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "string")
load_column<std::string>(col_spec.col_name.c_str(),
load_column<std::string>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<std::string> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (! ::strncmp(col_spec.type_spec.c_str(), "DateTime", 8))
load_column<DateTime>(col_spec.col_name.c_str(),
load_column<DateTime>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<DateTime> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "bool")
load_column<bool>(col_spec.col_name.c_str(),
load_column<bool>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<bool> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "dbl_vec")
load_column<StlVecType<double>>(col_spec.col_name.c_str(),
load_column<StlVecType<double>>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<StlVecType<double>> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
else if (col_spec.type_spec == "str_dbl_map") {
using map_t = std::map<std::string, double>;

load_column<map_t>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<map_t> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
}
else if (col_spec.type_spec == "str_dbl_unomap") {
using map_t = std::unordered_map<std::string, double>;

load_column<map_t>(
col_spec.col_name.c_str(),
std::move(std::any_cast<StlVecType<map_t> &>
(col_spec.col_vec)),
nan_policy::dont_pad_with_nans);
}
}
}
}
Expand Down
70 changes: 67 additions & 3 deletions include/DataFrame/Internals/DataFrame_standalone.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <map>
#include <sstream>
#include <tuple>
#include <unordered_map>
#include <utility>

// ----------------------------------------------------------------------------
Expand Down Expand Up @@ -78,7 +79,22 @@ static S &operator << (S &stream, const std::map<K, V> &data) {
if (! data.empty()) {
stream << data.size() << '{'
<< data.cbegin()->first << ':' << data.cbegin()->second;
for (auto citer = data.cbegin() + 1; citer < data.cend(); ++citer)
for (auto citer = ++(data.cbegin()); citer != data.cend(); ++citer)
stream << '|' << citer->first << ':' << citer->second;
stream << '}';
}
return (stream);
}

// ----------------------------------------------------------------------------

template<typename S, typename K, typename V>
static S &operator << (S &stream, const std::unordered_map<K, V> &data) {

if (! data.empty()) {
stream << data.size() << '{'
<< data.cbegin()->first << ':' << data.cbegin()->second;
for (auto citer = ++(data.cbegin()); citer != data.cend(); ++citer)
stream << '|' << citer->first << ':' << citer->second;
stream << '}';
}
Expand Down Expand Up @@ -505,15 +521,15 @@ _get_dbl_vec_from_value_(const char *value) {

using vec_t = typename DF::template StlVecType<double>;

std::size_t vcnt = 0;
std::size_t vcnt = 0;
char buffer[128];

while (value[vcnt] != '[')
buffer[vcnt] = value[vcnt++];
buffer[vcnt] = '\0';

vec_t data;
std::size_t bcnt;
std::size_t bcnt;

data.reserve(std::strtol(buffer, nullptr, 10));
vcnt += 1; // skip [
Expand All @@ -530,6 +546,50 @@ _get_dbl_vec_from_value_(const char *value) {

// ----------------------------------------------------------------------------

// Parses one csv2 map cell of the form "N{key1:val1|key2:val2|...}" into a
// MAP (std::map<std::string, double> or std::unordered_map<std::string,
// double>).
//
//   value -- NUL-terminated text of the cell.
//
// Returns the populated map. N is only used as a capacity hint for
// unordered maps; the number of entries actually returned is determined by
// the text between the braces. Keys are taken verbatim (no trimming) and
// values are converted with std::strtod(). If a key appears more than once
// the first occurrence is kept (emplace semantics).
//
// Malformed input never reads past the terminating NUL: if the opening '{'
// is missing an empty map is returned, and parsing stops at the first entry
// that has no ':' separator. Keys and values of any length are accepted.
//
template<typename MAP>
inline static MAP
_get_str_dbl_map_from_value_(const char *value) {

    using map_t = MAP;
    using unomap_t = std::unordered_map<std::string, double>;

    map_t       data;
    std::size_t vcnt = 0;
    std::string buffer;

    // Collect the leading element count. The index is advanced in its own
    // expression so that source and destination positions can never get out
    // of step because of evaluation order.
    while (value[vcnt] && value[vcnt] != '{')
        buffer.push_back(value[vcnt++]);
    if (value[vcnt] != '{')  // No opening brace -- nothing to parse
        return (data);

    if constexpr (std::is_base_of_v<unomap_t, map_t>) {
        const long  count = std::strtol(buffer.c_str(), nullptr, 10);

        // A negative count would turn into a huge unsigned reservation
        if (count > 0)
            data.reserve(static_cast<std::size_t>(count));
    }
    vcnt += 1;  // skip {

    while (value[vcnt] && value[vcnt] != '}') {
        std::string key;

        while (value[vcnt] && value[vcnt] != ':')
            key.push_back(value[vcnt++]);
        if (value[vcnt] != ':')  // Truncated entry -- stop here
            break;
        vcnt += 1;  // skip :

        buffer.clear();
        while (value[vcnt] && value[vcnt] != '|' && value[vcnt] != '}')
            buffer.push_back(value[vcnt++]);

        data.emplace(std::move(key), std::strtod(buffer.c_str(), nullptr));
        if (value[vcnt])
            vcnt += 1;  // skip separator (| or })
    }
    return (data);
}

// ----------------------------------------------------------------------------

template<typename S, typename T>
inline static S &
_write_csv_df_header_base_(S &o, const char *col_name, std::size_t col_size) {
Expand Down Expand Up @@ -590,6 +650,10 @@ _write_csv2_df_header_(S &o, const char *col_name, std::size_t col_size) {
o << "<DateTimeAME>";
else if (typeid(T) == typeid(std::vector<double>))
o << "<dbl_vec>";
else if (typeid(T) == typeid(std::map<std::string, double>))
o << "<str_dbl_map>";
else if (typeid(T) == typeid(std::unordered_map<std::string, double>))
o << "<str_dbl_unomap>";
return (o);
}

Expand Down
38 changes: 38 additions & 0 deletions test/dataframe_tester_3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2192,6 +2192,43 @@ static void test_read_csv_with_vector() {

// -----------------------------------------------------------------------------

static void test_read_csv_with_maps() {

std::cout << "\nTesting test_read_csv_with_maps ..." << std::endl;

using DT_DataFrame = StdDataFrame<DateTime>;
using map_t = std::map<std::string, double>;
using unomap_t = std::unordered_map<std::string, double>;

DT_DataFrame df;

try {
df.read("data/AAPL_10dBucketWithMaps.csv", io_format::csv2);

// df.write<std::ostream, double, long, map_t, unomap_t>
// (std::cout, io_format::csv2);
assert(df.get_index().size() == 4);
assert((std::fabs(df.get_column<double>("Close")[3] - 1.0234) < 0.0001));
assert((df.get_column<long>("Volume")[3] == 3605190400));

assert((std::fabs(
df.get_column<map_t>("Map 1")[3]["label four 2"] - -782.5) < 0.001));
assert((std::fabs(
df.get_column<map_t>("Map 1")[0]["label one 1"] - 123.0) < 0.001));
assert((std::fabs(
df.get_column<unomap_t>
("Unordered Map")[3]["Key four 3"] - 444.44) < 0.001));
assert((std::fabs(
df.get_column<unomap_t>
("Unordered Map")[0]["Key one 2"] - -782.5) < 0.001));
}
catch (const DataFrameError &ex) {
std::cout << ex.what() << std::endl;
}
}

// -----------------------------------------------------------------------------

int main(int, char *[]) {

test_groupby_edge();
Expand Down Expand Up @@ -2239,6 +2276,7 @@ int main(int, char *[]) {
test_EldersForceIndexVisitor();
test_EaseOfMovementVisitor();
test_read_csv_with_vector();
test_read_csv_with_maps();

return (0);
}
Expand Down

0 comments on commit 9044819

Please sign in to comment.