From eae7ac36bc938a7a12b3fa08ece57d14b3ad85dc Mon Sep 17 00:00:00 2001
From: Markus Mohrhard <markus.mohrhard@googlemail.com>
Date: Mon, 25 Jul 2016 02:10:25 +0200
Subject: [PATCH 1/3] add initial code for handling escaped unicode characters

---
 src/parser/sax_parser_base.cpp       |  7 ++++-
 src/parser/sax_token_parser_test.cpp | 38 ++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
index 3cd4f2520..f7a3c4ead 100644
--- a/src/parser/sax_parser_base.cpp
+++ b/src/parser/sax_parser_base.cpp
@@ -21,7 +21,12 @@ malformed_xml_error::~malformed_xml_error() throw() {}
 
 char decode_xml_encoded_char(const char* p, size_t n)
 {
-    if (n == 2)
+    if (*p == '#')
+    {
+        // this is an escaped unicode character
+        // what should we do here?
+    }
+    else if (n == 2)
     {
         if (!std::strncmp(p, "lt", n))
             return '<';
diff --git a/src/parser/sax_token_parser_test.cpp b/src/parser/sax_token_parser_test.cpp
index 07c5e1007..7ec125091 100644
--- a/src/parser/sax_token_parser_test.cpp
+++ b/src/parser/sax_token_parser_test.cpp
@@ -11,6 +11,7 @@
 #include "orcus/xml_namespace.hpp"
 
 #include <cstring>
+#include <iostream>
 
 using namespace std;
 using namespace orcus;
@@ -106,9 +107,46 @@ void test_sax_token_parser_1()
     assert(hdl.get_token_count() == ORCUS_N_ELEMENTS(checks));
 }
 
+void test_unicode_string()
+{
+    const char* content = "<?xml version=\"1.0\"?><root>&#x20B9;</root>";
+    size_t content_size = strlen(content);
+
+    class handler
+    {
+    public:
+
+        void start_element(const orcus::xml_token_element_t& /*elem*/)
+        {
+        }
+
+        void end_element(const orcus::xml_token_element_t& /*elem*/)
+        {
+        }
+
+        void characters(const orcus::pstring& val, bool transient)
+        {
+            std::cout << "charachters:" << std::endl;
+            std::cout << val << std::endl;
+        }
+    };
+
+    const char* token_names[] = {
+    };
+    size_t token_count = ORCUS_N_ELEMENTS(token_names);
+
+    handler hdl;
+    tokens token_map(token_names, token_count);
+    xmlns_repository ns_repo;
+    xmlns_context ns_cxt = ns_repo.create_context();
+    sax_token_parser<handler> parser(content, content_size, token_map, ns_cxt, hdl);
+    parser.parse();
+}
+
 int main()
 {
     test_sax_token_parser_1();
+    test_unicode_string();
     return EXIT_SUCCESS;
 }
 
-- 
GitLab


From 37c865e34dbdfb558290366a43e5be80113b2069 Mon Sep 17 00:00:00 2001
From: Jaskaran Singh <jvsg1303@gmail.com>
Date: Tue, 16 Aug 2016 21:33:38 +0530
Subject: [PATCH 2/3] Add code for character reference in sax parser and
 re-enable test related to it

---
 include/orcus/sax_parser_base.hpp        | 13 ++++++
 src/liborcus/odf_styles_context_test.cpp |  6 +--
 src/parser/sax_parser_base.cpp           | 53 ++++++++++++++++++++----
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/include/orcus/sax_parser_base.hpp b/include/orcus/sax_parser_base.hpp
index 07310628c..811d3f962 100644
--- a/include/orcus/sax_parser_base.hpp
+++ b/include/orcus/sax_parser_base.hpp
@@ -65,6 +65,19 @@ struct doctype_declaration
  */
 ORCUS_PSR_DLLPUBLIC char decode_xml_encoded_char(const char* p, size_t n);
 
+/**
+ * Given a numeric character reference (such as #x20B9 or #8377), return
+ * a std::string holding its UTF-8 encoding.  The reference shouldn't
+ * include the leading '&' and trailing ';'.
+ *
+ * @param p pointer to the first character of encoded name
+ * @param n length of encoded name
+ *
+ * @return string that corresponds with the encoded name.  empty string is
+ *         returned if decoding fails.
+ */
+ORCUS_PSR_DLLPUBLIC std::string decode_xml_unicode_char(const char* p, size_t n);
+
 /**
  * Element properties passed by sax_parser to its handler's open_element()
  * and close_element() calls.
diff --git a/src/liborcus/odf_styles_context_test.cpp b/src/liborcus/odf_styles_context_test.cpp
index d2a9b2044..769d0341c 100644
--- a/src/liborcus/odf_styles_context_test.cpp
+++ b/src/liborcus/odf_styles_context_test.cpp
@@ -208,9 +208,9 @@ void test_odf_number_formatting(orcus::spreadsheet::import_styles& styles)
     cell_format = styles.get_cell_style_format(xf);
     assert(cell_format);
 
-    // number_format = cell_format->number_format;
-    // cell_number_format = styles.get_number_format(number_format);
-    // assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00");
+    number_format = cell_format->number_format;
+    cell_number_format = styles.get_number_format(number_format);
+    assert(cell_number_format->format_string.str() == "[$₹]#,##0.00;[RED]-[$₹]#,##0.00");
 
     style = find_cell_style_by_name("Name12", &styles);
     xf = style->xf;
diff --git a/src/parser/sax_parser_base.cpp b/src/parser/sax_parser_base.cpp
index f7a3c4ead..3e5321c05 100644
--- a/src/parser/sax_parser_base.cpp
+++ b/src/parser/sax_parser_base.cpp
@@ -21,12 +21,7 @@ malformed_xml_error::~malformed_xml_error() throw() {}
 
 char decode_xml_encoded_char(const char* p, size_t n)
 {
-    if (*p == '#')
-    {
-        // this is an escaped unicode character
-        // what should we do here?
-    }
-    else if (n == 2)
+    if (n == 2)
     {
         if (!std::strncmp(p, "lt", n))
             return '<';
@@ -55,6 +50,43 @@ char decode_xml_encoded_char(const char* p, size_t n)
     return '\0';
 }
 
+std::string decode_xml_unicode_char(const char* p, size_t n)
+{
+    // Decode a numeric character reference ("#NNN" or "#xHHH") into UTF-8,
+    // reading only the first n bytes of p (p need not be null-terminated).
+    // An empty string signals failure so callers can fall back to raw text.
+    const bool hex = n > 1 && p[1] == 'x';
+    const unsigned int base = hex ? 16 : 10;
+    size_t i = hex ? 2 : 1;
+    if (!n || *p != '#' || i >= n)
+        return std::string();
+
+    unsigned int point = 0;
+    for (; i < n; ++i)
+    {
+        const char c = p[i];
+        unsigned int digit = base; // stays out of range for a non-digit
+        if (c >= '0' && c <= '9')
+            digit = c - '0';
+        else if (c >= 'a' && c <= 'f')
+            digit = c - 'a' + 10;
+        else if (c >= 'A' && c <= 'F')
+            digit = c - 'A' + 10;
+        point = point * base + digit;
+        if (digit >= base || point >= 0x110000)
+            return std::string(); // bad digit, or beyond the Unicode range
+    }
+    if (!point || (point >= 0xD800 && point <= 0xDFFF))
+        return std::string(); // NUL and surrogates are not XML characters
+    // The lead byte encodes the length; trailing bytes are 10xxxxxx.
+    static const unsigned char lead[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+    const int tail = (point >= 0x80) + (point >= 0x0800) + (point >= 0x010000);
+    std::string s(1, static_cast<char>((point >> (6 * tail)) | lead[tail]));
+    for (int k = tail - 1; k >= 0; --k)
+        s += static_cast<char>(((point >> (6 * k)) & 0x3F) | 0x80);
+    return s;
+}
+
 struct parser_base::impl
 {
     std::vector<std::unique_ptr<cell_buffer>> m_cell_buffers;
@@ -182,14 +214,21 @@ void parser_base::parse_encoded_char(cell_buffer& buf)
         cout << "sax_parser::parse_encoded_char: raw='" << std::string(p0, n) << "'" << endl;
 #endif
 
+        std::string utf8;
+
         char c = decode_xml_encoded_char(p0, n);
         if (c)
             buf.append(&c, 1);
+        else
+            utf8 = decode_xml_unicode_char(p0, n);
+
+        if (!utf8.empty())
+            buf.append(utf8.c_str(), utf8.size() / sizeof(utf8[0]));
 
         // Move to the character past ';' before returning to the parent call.
         next();
 
-        if (!c)
+        if (!c && utf8.empty())
         {
 #if ORCUS_DEBUG_SAX_PARSER
             cout << "sax_parser::parse_encoded_char: not a known encoding name. Use the original." << endl;
-- 
GitLab


From 2e466f2a4d61fb1503d3ef02168756d7bc21dd32 Mon Sep 17 00:00:00 2001
From: Jaskaran Singh <jvsg1303@gmail.com>
Date: Thu, 18 Aug 2016 18:23:58 +0530
Subject: [PATCH 3/3] Add test for decoding a unicode string

---
 src/parser/sax_token_parser_test.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/parser/sax_token_parser_test.cpp b/src/parser/sax_token_parser_test.cpp
index 7ec125091..dfc97d751 100644
--- a/src/parser/sax_token_parser_test.cpp
+++ b/src/parser/sax_token_parser_test.cpp
@@ -109,12 +109,17 @@ void test_sax_token_parser_1()
 
 void test_unicode_string()
 {
-    const char* content = "<?xml version=\"1.0\"?><root>&#x20B9;</root>";
-    size_t content_size = strlen(content);
+    const char* content1 = "<?xml version=\"1.0\"?><root>&#x0021;</root>";
+    const char* content2 = "<?xml version=\"1.0\"?><root>&#x00B6;</root>";
+    const char* content3 = "<?xml version=\"1.0\"?><root>&#x20B9;</root>";
 
     class handler
     {
+        orcus::pstring str;
     public:
+        handler(orcus::pstring str):
+            str(str)
+            {}
 
         void start_element(const orcus::xml_token_element_t& /*elem*/)
         {
@@ -128,6 +133,7 @@ void test_unicode_string()
         {
             std::cout << "charachters:" << std::endl;
             std::cout << val << std::endl;
+            assert(val == str);
         }
     };
 
@@ -135,12 +141,18 @@ void test_unicode_string()
     };
     size_t token_count = ORCUS_N_ELEMENTS(token_names);
 
-    handler hdl;
     tokens token_map(token_names, token_count);
     xmlns_repository ns_repo;
     xmlns_context ns_cxt = ns_repo.create_context();
-    sax_token_parser<handler> parser(content, content_size, token_map, ns_cxt, hdl);
-    parser.parse();
+    handler hdl("\u0021");
+    sax_token_parser<handler> parser1(content1, strlen(content1), token_map, ns_cxt, hdl);
+    parser1.parse();
+    hdl = handler("\u00B6");
+    sax_token_parser<handler> parser2(content2, strlen(content2), token_map, ns_cxt, hdl);
+    parser2.parse();
+    hdl = handler("\u20B9");
+    sax_token_parser<handler> parser3(content3, strlen(content3), token_map, ns_cxt, hdl);
+    parser3.parse();
 }
 
 int main()
-- 
GitLab