diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp index 07310628cc802aa80c06217bfbe910c68448bfbb..811d3f9626b01372e49dcc5daee1af5b410b2b6e 100644 --- a/include/orcus/sax_parser_base.hpp +++ b/include/orcus/sax_parser_base.hpp @@ -65,6 +65,19 @@ struct doctype_declaration */ ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n); +/** + * Given a numeric character reference name (such as #x20A9 in hexadecimal + * or #8361 in decimal), return a std::string holding the UTF-8 encoding of + * that code point. The name shouldn't include the leading '&' and + * trailing ';'. + * + * @param p pointer to the first character of encoded name + * @param n length of encoded name + * + * @return string that corresponds with the encoded name. empty string is + * returned if decoding fails. + */ +ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n); + /** * Element properties passed by sax_parser to its handler's open_element() * and close_element() calls. diff --git a/src/liborcus/odf_styles_context_test.cpp b/src/liborcus/odf_styles_context_test.cpp index d2a9b20444ef887acda750d03f5e9f842054bcb1..769d0341c2751e51ac88e18430e4c7cef230207c 100644 --- a/src/liborcus/odf_styles_context_test.cpp +++ b/src/liborcus/odf_styles_context_test.cpp @@ -208,9 +208,9 @@ void test_odf_number_formatting(orcus::spreadsheet::import_styles& styles) cell_format = styles.get_cell_style_format(xf); assert(cell_format); - // number_format = cell_format->number_format; - // cell_number_format = styles.get_number_format(number_format); - // assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00"); + number_format = cell_format->number_format; + cell_number_format = styles.get_number_format(number_format); + assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00"); style = find_cell_style_by_name("Name12", &styles); xf = style->xf; diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp index 
/**
 * Decode a numeric XML character reference into its UTF-8 byte sequence.
 *
 * The name must not include the leading '&' or the trailing ';'.  Both the
 * decimal form ("#8377") and the hexadecimal form ("#x20B9") are accepted.
 * Only the first n characters of p are examined, so p does not need to be
 * null-terminated; in practice it points into the middle of the XML stream.
 *
 * @param p pointer to the first character of the encoded name.
 * @param n length of the encoded name.
 *
 * @return UTF-8 encoding of the referenced code point.  An empty string is
 *         returned when the name is not a well-formed numeric reference or
 *         when the code point cannot be used (zero, a UTF-16 surrogate, or
 *         a value above U+10FFFF).  This function never throws on
 *         malformed input.
 */
std::string decode_xml_unicode_char(const char* p, size_t n)
{
    // Shortest valid name is '#' followed by one digit.
    if (!p || n < 2 || p[0] != '#')
        return std::string();

    const char* cur = p + 1;
    const char* const end = p + n;

    unsigned int base = 10;
    if (*cur == 'x')
    {
        base = 16;
        ++cur;
        if (cur == end)
            return std::string(); // "#x" with no digits
    }

    unsigned long point = 0;
    for (; cur != end; ++cur)
    {
        const char c = *cur;
        unsigned int digit = 0;

        if (c >= '0' && c <= '9')
            digit = static_cast<unsigned int>(c - '0');
        else if (base == 16 && c >= 'a' && c <= 'f')
            digit = static_cast<unsigned int>(c - 'a') + 10;
        else if (base == 16 && c >= 'A' && c <= 'F')
            digit = static_cast<unsigned int>(c - 'A') + 10;
        else
            return std::string(); // not a digit in the selected base

        point = point * base + digit;

        // Bail out as soon as the value leaves the Unicode range; this also
        // keeps the accumulator from ever overflowing on long digit runs.
        if (point > 0x10FFFF)
            return std::string();
    }

    // U+0000 is not a legal XML character, and surrogate code points have
    // no valid UTF-8 representation.
    if (point == 0 || (point >= 0xD800 && point <= 0xDFFF))
        return std::string();

    std::string utf8;

    if (point < 0x80)
    {
        // 1 byte: 0xxxxxxx
        utf8 += static_cast<char>(point);
    }
    else if (point < 0x800)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        utf8 += static_cast<char>(0xC0 | (point >> 6));
        utf8 += static_cast<char>(0x80 | (point & 0x3F));
    }
    else if (point < 0x10000)
    {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        utf8 += static_cast<char>(0xE0 | (point >> 12));
        utf8 += static_cast<char>(0x80 | ((point >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (point & 0x3F));
    }
    else
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        utf8 += static_cast<char>(0xF0 | (point >> 18));
        utf8 += static_cast<char>(0x80 | ((point >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((point >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (point & 0x3F));
    }

    return utf8;
}
<< endl; diff --git a/src/parser/sax_token_parser_test.cpp b/src/parser/sax_token_parser_test.cpp index 07c5e1007c961d83f88fde7e186dc40cb89188f5..dfc97d7513b6507b6d24313b2495c9a56c513914 100644 --- a/src/parser/sax_token_parser_test.cpp +++ b/src/parser/sax_token_parser_test.cpp @@ -11,6 +11,7 @@ #include "orcus/xml_namespace.hpp" #include +#include using namespace std; using namespace orcus; @@ -106,9 +107,58 @@ void test_sax_token_parser_1() assert(hdl.get_token_count() == ORCUS_N_ELEMENTS(checks)); } +/** Runs sax_token_parser over three inputs and checks, through the local handler, that the character data delivered equals U+0021, U+00B6 and U+20B9 respectively. NOTE(review): the content literals, the include targets and the parser template arguments look truncated in this view -- confirm against the repository before relying on them. */ void test_unicode_string() +{ + const char* content1 = "!"; + const char* content2 = ""; + const char* content3 = ""; + + /* Handler that ignores element events and asserts that every characters() payload equals the expected string it was constructed with. */ class handler + { + orcus::pstring str; + public: + handler(orcus::pstring str): + str(str) + {} + + void start_element(const orcus::xml_token_element_t& /*elem*/) + { + } + + void end_element(const orcus::xml_token_element_t& /*elem*/) + { + } + + void characters(const orcus::pstring& val, bool transient) + { + /* Echo the payload to stdout, then compare it with the expected text. */ std::cout << "charachters:" << std::endl; + std::cout << val << std::endl; + assert(val == str); + } + }; + + /* No element tokens are registered for this test; the token table is empty. */ const char* token_names[] = { + }; + size_t token_count = ORCUS_N_ELEMENTS(token_names); + + tokens token_map(token_names, token_count); + xmlns_repository ns_repo; + xmlns_context ns_cxt = ns_repo.create_context(); + /* hdl is re-assigned with the next expected string before each subsequent parse. */ handler hdl("\u0021"); + sax_token_parser parser1(content1, strlen(content1), token_map, ns_cxt, hdl); + parser1.parse(); + hdl = handler("\u00B6"); + sax_token_parser parser2(content2, strlen(content2), token_map, ns_cxt, hdl); + parser2.parse(); + hdl = handler("\u20B9"); + sax_token_parser parser3(content3, strlen(content3), token_map, ns_cxt, hdl); + parser3.parse(); +} + int main() { test_sax_token_parser_1(); + test_unicode_string(); return EXIT_SUCCESS; }