From eae7ac36bc938a7a12b3fa08ece57d14b3ad85dc Mon Sep 17 00:00:00 2001 From: Markus Mohrhard Date: Mon, 25 Jul 2016 02:10:25 +0200 Subject: [PATCH 1/3] add initial code for handling escaped unicode characters --- src/parser/sax_parser_base.cpp | 7 ++++- src/parser/sax_token_parser_test.cpp | 38 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp index 3cd4f2520..f7a3c4ead 100644 --- a/src/parser/sax_parser_base.cpp +++ b/src/parser/sax_parser_base.cpp @@ -21,7 +21,12 @@ malformed_xml_error::~malformed_xml_error() throw() {} char decode_xml_encoded_char(const char* p, size_t n) { - if (n == 2) + if (*p == '#') + { + // this is an escaped unicode character + // what should we do here? + } + else if (n == 2) { if (!std::strncmp(p, "lt", n)) return '<'; diff --git a/src/parser/sax_token_parser_test.cpp b/src/parser/sax_token_parser_test.cpp index 07c5e1007..7ec125091 100644 --- a/src/parser/sax_token_parser_test.cpp +++ b/src/parser/sax_token_parser_test.cpp @@ -11,6 +11,7 @@ #include "orcus/xml_namespace.hpp" #include +#include using namespace std; using namespace orcus; @@ -106,9 +107,46 @@ void test_sax_token_parser_1() assert(hdl.get_token_count() == ORCUS_N_ELEMENTS(checks)); } +void test_unicode_string() +{ + const char* content = ""; + size_t content_size = strlen(content); + + class handler + { + public: + + void start_element(const orcus::xml_token_element_t& /*elem*/) + { + } + + void end_element(const orcus::xml_token_element_t& /*elem*/) + { + } + + void characters(const orcus::pstring& val, bool transient) + { + std::cout << "charachters:" << std::endl; + std::cout << val << std::endl; + } + }; + + const char* token_names[] = { + }; + size_t token_count = ORCUS_N_ELEMENTS(token_names); + + handler hdl; + tokens token_map(token_names, token_count); + xmlns_repository ns_repo; + xmlns_context ns_cxt = ns_repo.create_context(); + sax_token_parser 
parser(content, content_size, token_map, ns_cxt, hdl); + parser.parse(); +} + int main() { test_sax_token_parser_1(); + test_unicode_string(); return EXIT_SUCCESS; } -- GitLab From 37c865e34dbdfb558290366a43e5be80113b2069 Mon Sep 17 00:00:00 2001 From: Jaskaran Singh Date: Tue, 16 Aug 2016 21:33:38 +0530 Subject: [PATCH 2/3] Add code for character reference in sax parser and re-enable test related to it --- include/orcus/sax_parser_base.hpp | 13 ++++++ src/liborcus/odf_styles_context_test.cpp | 6 +-- src/parser/sax_parser_base.cpp | 53 ++++++++++++++++++++---- 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp index 07310628c..811d3f962 100644 --- a/include/orcus/sax_parser_base.hpp +++ b/include/orcus/sax_parser_base.hpp @@ -65,6 +65,19 @@ struct doctype_declaration */ ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n); +/** + * Given a numeric character reference (such as #8361 or #x20A9), return + * the UTF-8 encoding of that code point as a std::string. The reference + * shouldn't include the leading '&' and trailing ';'. + * + * @param p pointer to the first character of encoded name + * @param n length of encoded name + * + * @return string that corresponds with the encoded name. empty string is + * returned if decoding fails. + */ +ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n); + /** * Element properties passed by sax_parser to its handler's open_element() * and close_element() calls. 
diff --git a/src/liborcus/odf_styles_context_test.cpp b/src/liborcus/odf_styles_context_test.cpp index d2a9b2044..769d0341c 100644 --- a/src/liborcus/odf_styles_context_test.cpp +++ b/src/liborcus/odf_styles_context_test.cpp @@ -208,9 +208,9 @@ void test_odf_number_formatting(orcus::spreadsheet::import_styles& styles) cell_format = styles.get_cell_style_format(xf); assert(cell_format); - // number_format = cell_format->number_format; - // cell_number_format = styles.get_number_format(number_format); - // assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00"); + number_format = cell_format->number_format; + cell_number_format = styles.get_number_format(number_format); + assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00"); style = find_cell_style_by_name("Name12", &styles); xf = style->xf; diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp index f7a3c4ead..3e5321c05 100644 --- a/src/parser/sax_parser_base.cpp +++ b/src/parser/sax_parser_base.cpp @@ -21,12 +21,7 @@ malformed_xml_error::~malformed_xml_error() throw() {} char decode_xml_encoded_char(const char* p, size_t n) { - if (*p == '#') - { - // this is an escaped unicode character - // what should we do here? 
-    }
-    else if (n == 2)
+    if (n == 2)
     {
         if (!std::strncmp(p, "lt", n))
             return '<';
@@ -55,6 +50,74 @@ char decode_xml_encoded_char(const char* p, size_t n)
     return '\0';
 }
 
+std::string decode_xml_unicode_char(const char* p, size_t n)
+{
+    // Decodes the body of a numeric character reference ("#<decimal>" or
+    // "#x<hex>", without the leading '&' and trailing ';') into UTF-8.
+    // An empty string signals failure so that the caller can fall back to
+    // the original text.
+    //
+    // Only the first n bytes of p belong to the reference, so the digits are
+    // parsed in place and nothing at or past p + n is ever read.
+    if (!p || n < 2 || p[0] != '#')
+        return std::string();
+
+    const bool hex = (p[1] == 'x');
+    const unsigned long base = hex ? 16 : 10;
+    size_t pos = hex ? 2 : 1;
+    if (pos == n)
+        return std::string(); // "#x" with no digits after it
+
+    unsigned long point = 0;
+    for (; pos < n; ++pos)
+    {
+        const char c = p[pos];
+        unsigned long digit = 0;
+        if (c >= '0' && c <= '9')
+            digit = c - '0';
+        else if (hex && c >= 'a' && c <= 'f')
+            digit = c - 'a' + 10;
+        else if (hex && c >= 'A' && c <= 'F')
+            digit = c - 'A' + 10;
+        else
+            return std::string(); // not a digit in this base
+
+        // Checking the range after every digit also keeps the accumulator
+        // from overflowing on absurdly long references.
+        point = point * base + digit;
+        if (point >= 0x110000)
+            return std::string(); // beyond the last Unicode code point
+    }
+
+    if (point >= 0xD800 && point <= 0xDFFF)
+        return std::string(); // UTF-16 surrogates have no UTF-8 encoding
+
+    std::string s;
+    if (point < 0x80)
+    {
+        s += static_cast<char>(point);
+    }
+    else if (point < 0x0800)
+    {
+        s += static_cast<char>((point >> 6) | 0xC0);
+        s += static_cast<char>((point & 0x3F) | 0x80);
+    }
+    else if (point < 0x010000)
+    {
+        s += static_cast<char>((point >> 12) | 0xE0);
+        s += static_cast<char>(((point >> 6) & 0x3F) | 0x80);
+        s += static_cast<char>((point & 0x3F) | 0x80);
+    }
+    else
+    {
+        s += static_cast<char>((point >> 18) | 0xF0);
+        s += static_cast<char>(((point >> 12) & 0x3F) | 0x80);
+        s += static_cast<char>(((point >> 6) & 0x3F) | 0x80);
+        s += static_cast<char>((point & 0x3F) | 0x80);
+    }
+    return s;
+}
+
 struct parser_base::impl
 {
     std::vector> m_cell_buffers;
@@ -182,14 +245,21 @@ void parser_base::parse_encoded_char(cell_buffer& buf)
     cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl;
 #endif
 
+    std::string utf8;
+
     char c = decode_xml_encoded_char(p0, n);
     if (c)
         buf.append(&c, 1);
+    else
+        utf8 = decode_xml_unicode_char(p0, n);
+
+    if (!utf8.empty())
+        buf.append(utf8.c_str(), utf8.size() / sizeof(utf8[0]));
 
     // Move to the character past ';' before returning to the parent call.
     next();
 
-    if (!c)
+    if (!c && utf8.empty())
     {
 #if ORCUS_DEBUG_SAX_PARSER
         cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." 
<< endl; -- GitLab From 2e466f2a4d61fb1503d3ef02168756d7bc21dd32 Mon Sep 17 00:00:00 2001 From: Jaskaran Singh Date: Thu, 18 Aug 2016 18:23:58 +0530 Subject: [PATCH 3/3] Add test for decoding a unicode string --- src/parser/sax_token_parser_test.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/parser/sax_token_parser_test.cpp b/src/parser/sax_token_parser_test.cpp index 7ec125091..dfc97d751 100644 --- a/src/parser/sax_token_parser_test.cpp +++ b/src/parser/sax_token_parser_test.cpp @@ -109,12 +109,17 @@ void test_sax_token_parser_1() void test_unicode_string() { - const char* content = ""; - size_t content_size = strlen(content); + const char* content1 = "!"; + const char* content2 = ""; + const char* content3 = ""; class handler { + orcus::pstring str; public: + handler(orcus::pstring str): + str(str) + {} void start_element(const orcus::xml_token_element_t& /*elem*/) { @@ -128,6 +133,7 @@ void test_unicode_string() { std::cout << "charachters:" << std::endl; std::cout << val << std::endl; + assert(val == str); } }; @@ -135,12 +141,18 @@ void test_unicode_string() }; size_t token_count = ORCUS_N_ELEMENTS(token_names); - handler hdl; tokens token_map(token_names, token_count); xmlns_repository ns_repo; xmlns_context ns_cxt = ns_repo.create_context(); - sax_token_parser parser(content, content_size, token_map, ns_cxt, hdl); - parser.parse(); + handler hdl("\u0021"); + sax_token_parser parser1(content1, strlen(content1), token_map, ns_cxt, hdl); + parser1.parse(); + hdl = handler("\u00B6"); + sax_token_parser parser2(content2, strlen(content2), token_map, ns_cxt, hdl); + parser2.parse(); + hdl = handler("\u20B9"); + sax_token_parser parser3(content3, strlen(content3), token_map, ns_cxt, hdl); + parser3.parse(); } int main() -- GitLab