Parse a typed CSV file with boost::spirit::qi
I want to parse a CSV file with typed values. The type of each column is declared in the header line, for example:
int double double int unsigned
12 1.3 23445 1 42
45 46 47 48 49
The result data structure can be something like this 2D vector:
// One column of the table: a vector whose element type is one of the
// supported cell types (int, unsigned or double).
using ColumnType = boost::variant<
std::vector<int>,
std::vector<unsigned>,
std::vector<double>
>;
// The whole result: a sequence of ColumnType values.
using ResultType = std::vector<ColumnType>;
My working code:
// Short aliases for the Boost namespaces used by the grammar.
namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
// One column of the table: a vector whose element type is one of the
// supported cell types (int, unsigned or double).
using ColumnType = boost::variant<
std::vector<int>,
std::vector<unsigned>,
std::vector<double>
>;
// The whole result: a sequence of ColumnType values.
using ResultType = std::vector<ColumnType>;
/// Tag identifying the cell type of a column.
enum class CSVDataType { Int, UInt, Double };
/// Qi grammar for a blank-separated table whose first line declares the type
/// of every column ("int", "unsigned" or "double").
///
/// Parsed values are collected column by column in the `result` member and
/// are also assigned to the grammar's synthesized attribute (ResultType), so
/// the object passed to qi::phrase_parse receives the same data.
///
/// The semantic actions bind `this`; the grammar object has to stay at the
/// same address for as long as its rules are in use.
///
/// @tparam Iterator  iterator type of the parsed input.
template<typename Iterator>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), ascii::blank_type> {
    /// Symbol table translating the header keywords into CSVDataType tags.
    struct types_: qi::symbols<char, CSVDataType> {
        types_() {
            add
                ("int", CSVDataType::Int)
                ("unsigned", CSVDataType::UInt)
                ("double", CSVDataType::Double);
        }
    } types;

    TypedCSVGrammar() :
        TypedCSVGrammar::base_type(csv, "csv")
    {
        using namespace qi::labels;

        // Header line: any number of type keywords.
        header %= *(types);

        // A single cell of column _r1. Each eps guard compares the declared
        // type of that column with one CSVDataType value, so only the numeric
        // parser of the matching branch runs; its action appends the value to
        // the column.
        cell =
            (
                qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Int))
                >> qi::int_ [phoenix::bind(&TypedCSVGrammar::add_int, this, _r1, _1)]
            ) | (
                qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::UInt))
                >> qi::uint_ [phoenix::bind(&TypedCSVGrammar::add_uint, this, _r1, _1)]
            ) | (
                qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Double))
                >> qi::double_ [phoenix::bind(&TypedCSVGrammar::add_double, this, _r1, _1)]
            );

        // A row is exactly one cell per declared column; `column` is reset at
        // the start of the row and advanced after every cell.
        row =
            qi::eps [phoenix::ref(column) = phoenix::val(0)]
            >> qi::repeat(phoenix::size(phoenix::ref(column_types))) [
                cell(phoenix::ref(column))
                >> qi::eps [phoenix::ref(column)++]
            ];

        csv =
            header [phoenix::bind(&TypedCSVGrammar::construct_columns, this, _1)]
            > qi::eol
            > row % qi::eol
            > *qi::eol
            // Publish the collected columns once parsing is complete.
            // qi::attr(result) would store a copy of `result` taken while this
            // constructor runs (still empty), and a rule defined with `=` that
            // contains semantic actions does not propagate attributes
            // automatically, so the assignment is made explicitly.
            > qi::eps [_val = phoenix::ref(result)];

        qi::on_error<qi::fail>
        (
            csv
          , std::cout
                << phoenix::val("Error! Expecting ")
                << _4 // what failed?
                << phoenix::val(" here: \"")
                << phoenix::construct<std::string>(_3, _2) // iterators to error-pos, end
                << phoenix::val("\"")
                << std::endl
        );
    }

    /// Appends `i` to column `c`; that column must hold std::vector<int>,
    /// otherwise boost::get throws boost::bad_get.
    void add_int(std::size_t c, int i) {
        boost::get<std::vector<int>>(result[c]).push_back(i);
    }

    /// Appends `i` to column `c`; that column must hold std::vector<unsigned>.
    void add_uint(std::size_t c, unsigned i) {
        boost::get<std::vector<unsigned>>(result[c]).push_back(i);
    }

    /// Appends `i` to column `c`; that column must hold std::vector<double>.
    void add_double(std::size_t c, double i) {
        boost::get<std::vector<double>>(result[c]).push_back(i);
    }

    /// Records the column types read from the header and creates one empty,
    /// correctly typed column per entry.
    ///
    /// Columns left over from an earlier parse are dropped first, so a grammar
    /// object can be used for more than one input.
    void construct_columns(const std::vector<CSVDataType>& columns) {
        column_types = columns;
        result.clear();
        result.reserve(columns.size());
        for (const auto& c : columns) {
            switch (c) {
                case CSVDataType::Int:
                    result.push_back(std::vector<int>());
                    break;
                case CSVDataType::UInt:
                    result.push_back(std::vector<unsigned>());
                    break;
                case CSVDataType::Double:
                    result.push_back(std::vector<double>());
                    break;
            }
        }
    }

    std::vector<CSVDataType> column_types; ///< declared type of every column
    std::size_t column = 0;                ///< index of the cell being parsed in the current row
    ResultType result;                     ///< parsed values, one entry per column

    qi::rule<Iterator, ResultType(), ascii::blank_type> csv;
    qi::rule<Iterator, std::vector<CSVDataType>(), ascii::blank_type> header;
    qi::rule<Iterator, void(std::size_t), ascii::blank_type> cell;
    qi::rule<Iterator, void(), ascii::blank_type> row;
};
Is there a better solution? I want to support more than 3 types (maybe more than 10), and with this approach that would be a lot of code to write.
source to share
I don't understand why you came up with such a contrived target data structure. It seems to invite bugs caused by mismatched indexes.
May I suggest the Nabialek Trick.
This works well if you change the AST around:
using ValueType = boost::variant<int, unsigned, double>;
using ResultType = std::vector<std::vector<ValueType>>;
(This seems like a more desirable approach anyway)
In short, you translate the column types into a vector of parser rules (a std::vector<dynamic>).
#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
// Short aliases for the Boost namespaces used below.
namespace px = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
// A single cell value: one of the supported numeric types.
using ValueType = boost::variant<int, unsigned, double>;
// The parsed table: a vector of rows, each row a vector of cell values.
using ResultType = std::vector<std::vector<ValueType>>;
// Column type tags; the grammar in this listing does not reference them.
enum class CSVDataType { Int, UInt, Double };
// Debug support only: stream inserters so that attributes and locals that
// hold rule pointers can be written to an output stream. They emit fixed
// placeholders instead of pointer values.
namespace boost { namespace spirit { namespace qi {

    /// Writes the placeholder "(lazy rule)" for a pointer to any rule.
    template <typename... Params>
    std::ostream& operator<<(std::ostream& out, rule<Params...> const*) {
        out << "(lazy rule)";
        return out;
    }

    /// Writes the placeholder "(column mappings)" for a vector of rule pointers.
    template <typename... Params>
    std::ostream& operator<<(std::ostream& out, std::vector<rule<Params...> const*> const&) {
        out << "(column mappings)";
        return out;
    }

} } }
/// Qi grammar for a blank-separated table whose first line declares the type
/// of every column ("int", "unsigned" or "double").
///
/// The header is translated into a vector of pointers to cell rules; every
/// data row is then parsed by invoking, in order, the rule stored for each
/// column. The synthesized attribute is a vector of rows of ValueType.
///
/// Every data row must contain exactly as many cells as the header declares,
/// and at least one data row is required.
///
/// @tparam Iterator  iterator type of the parsed input.
/// @tparam Skipper   skipper type; defaults to ascii::blank_type.
template<typename Iterator, typename Skipper = ascii::blank_type>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), Skipper> {
    TypedCSVGrammar() : TypedCSVGrammar::base_type(start, "csv")
    {
        using namespace qi::labels;

        // Header line: any number of type keywords, each mapped to the
        // address of the rule that parses a cell of that type.
        header = *types;

        // The column index is reset in an eps action and the one-argument
        // repeat(n) demands exactly n cells per row. The two-argument form
        // repeat(min, max) with min == 0 would also accept short rows and
        // would match an empty row after a trailing end-of-line.
        // Because the rule now contains a semantic action outside omit[],
        // it is defined with %= so attribute propagation stays enabled.
        csv %= qi::omit[ header [ _cols = _1 ] ] > qi::eol
            > ( qi::eps [ _current = 0 ]
                >> qi::repeat(px::size(_cols)) [ qi::lazy(*_cols[_current++]) ]
              ) % qi::eol
            > *qi::eol
            ;

        start = csv;

        BOOST_SPIRIT_DEBUG_NODES((start)(csv)(header));

        qi::on_error<qi::fail> (csv, px::ref(std::cout)
                << "Error! Expecting " << _4 // what failed?
                << " here: \"" << px::construct<std::string>(_3, _2) // iterators to error-pos, end
                << "\"\n"
            );
    }

  private:
    using cell_parser_t = qi::rule<Iterator, ValueType(), Skipper>;
    using dynamic = cell_parser_t const*;

    /// Symbol table mapping a type keyword to the address of the cell rule
    /// that parses values of that type. The rules are members of the table,
    /// so the stored pointers refer to objects owned by the table itself.
    struct types_: qi::symbols<char, dynamic> {
        cell_parser_t
            int_cell = qi::int_,
            uint_cell = qi::uint_,
            double_cell = qi::double_;

        types_() {
            this->add
                ("int", &int_cell)
                ("unsigned", &uint_cell)
                ("double", &double_cell);

            BOOST_SPIRIT_DEBUG_NODES((int_cell)(uint_cell)(double_cell))
        }
    } types;

    // Rule locals of `csv`: the per-column cell rules and the column index.
    using state = qi::locals<std::vector<dynamic>, size_t>;
    qi::_a_type _cols;    // first local: cell rule for each column
    qi::_b_type _current; // second local: index of the next column in the row

    qi::rule<Iterator, ResultType(), Skipper> start;
    qi::rule<Iterator, std::vector<dynamic>(), Skipper> header;
    qi::rule<Iterator, ResultType(), Skipper, state> csv;
};
int main() {
using It = boost::spirit::istream_iterator;
It f(std::cin >> std::noskipws), l;
TypedCSVGrammar<It> g;
ResultType data;
bool ok = qi::phrase_parse(f, l, g, ascii::blank, data);
if (ok) {
std::cout << "Parse success\n";
for(auto& row: data) {
for(auto& cell: row) std::cout << cell << "\t";
std::cout << "\n";
}
}
else
std::cout << "Parse failed\n";
if (f!=l)
std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}
So, for the input shown, it prints
Parse success
12 1.3 23445 1 42
45 46 47 48 49
And the debug information, if BOOST_SPIRIT_DEBUG is defined:
<start>
<try>int double double in</try>
<csv>
<try>int double double in</try>
<header>
<try>int double double in</try>
<success>\n12 1.3 23445 1</success>
<attributes>[[(lazy rule), (lazy rule), (lazy rule), (lazy rule), (lazy rule)]]</attributes>
</header>
<int_cell>
<try>12 1.3 23445 1 </try>
<success> 1.3 23445 1 </success>
<attributes>[12]</attributes>
</int_cell>
<double_cell>
<try> 1.3 23445 1 </try>
<success> 23445 1 42\n45</success>
<attributes>[1.3]</attributes>
</double_cell>
<double_cell>
<try> 23445 1 42\n45</try>
<success> 1 42\n45 46 </success>
<attributes>[23445]</attributes>
</double_cell>
<int_cell>
<try> 1 42\n45 46 </try>
<success> 42\n45 46 47 </success>
<attributes>[1]</attributes>
</int_cell>
<uint_cell>
<try> 42\n45 46 47 </try>
<success>\n45 46 47 4</success>
<attributes>[42]</attributes>
</uint_cell>
<int_cell>
<try>45 46 47 48</try>
<success> 46 47 48 </success>
<attributes>[45]</attributes>
</int_cell>
<double_cell>
<try> 46 47 48 </try>
<success> 47 48 49\n</success>
<attributes>[46]</attributes>
</double_cell>
<double_cell>
<try> 47 48 49\n</try>
<success> 48 49\n</success>
<attributes>[47]</attributes>
</double_cell>
<int_cell>
<try> 48 49\n</try>
<success> 49\n</success>
<attributes>[48]</attributes>
</int_cell>
<uint_cell>
<try> 49\n</try>
<success>\n</success>
<attributes>[49]</attributes>
</uint_cell>
<int_cell>
<try></try>
<fail/>
</int_cell>
<success></success>
<attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes><locals>((column mappings) 1)</locals>
</csv>
<success></success>
<attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes>
</start>
source to share