From 9d7868493b5c0407a9d7d0c67e416cbf845f6341 Mon Sep 17 00:00:00 2001 From: HenrikVDestia Date: Mon, 28 Jul 2025 15:02:25 +0300 Subject: [PATCH 1/2] Handling inline value by asking backend document. Implements xlsx inline string support for data. Does not handle style information. Implementation adds an import_sheet::set_string(row_t, col_t, std::string_view) which is called when an inline string is found in xlsx. The string is not pushed to the string pool because string pool indexes and sheet references to the string pool might get messed up in a mixed use case. Since this is an edge case, handling is pushed to the document backend for those who need it. --- .../orcus/spreadsheet/import_interface.hpp | 15 +++++ src/liborcus/spreadsheet_interface.cpp | 2 + src/liborcus/xlsx_sheet_context.cpp | 58 ++++++++++++------- 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/include/orcus/spreadsheet/import_interface.hpp b/include/orcus/spreadsheet/import_interface.hpp index ecdb33ec1..6f4395818 100644 --- a/include/orcus/spreadsheet/import_interface.hpp +++ b/include/orcus/spreadsheet/import_interface.hpp @@ -952,6 +952,21 @@ public: */ virtual void set_string(row_t row, col_t col, string_id_t sindex) = 0; + /** + * Set string value to a cell when string is not in the pool. + * + * Used in cases when the parser cannot guarantee that string pool indexes + * and in-document references stay consistent if the string is pushed to the pool. + * + * Implementing this allows the document backend to handle such strings in the best way possible, + * but primarily strings are pushed to the string pool. + * + * @param row row ID + * @param col column ID + * @param s inline string found while parsing. + */ + virtual void set_string(row_t row, col_t col, std::string_view s); + /** * Set numerical value to a cell. 
* diff --git a/src/liborcus/spreadsheet_interface.cpp b/src/liborcus/spreadsheet_interface.cpp index 585644a7d..8e97e81d3 100644 --- a/src/liborcus/spreadsheet_interface.cpp +++ b/src/liborcus/spreadsheet_interface.cpp @@ -114,6 +114,8 @@ import_array_formula::~import_array_formula() {} import_sheet::~import_sheet() {} +void import_sheet::set_string(row_t /*row*/, col_t /*col*/, std::string_view /*s*/) {} + import_sheet_view* import_sheet::get_sheet_view() { return nullptr; diff --git a/src/liborcus/xlsx_sheet_context.cpp b/src/liborcus/xlsx_sheet_context.cpp index 2b6192aaf..b61b5620b 100644 --- a/src/liborcus/xlsx_sheet_context.cpp +++ b/src/liborcus/xlsx_sheet_context.cpp @@ -244,6 +244,12 @@ void xlsx_sheet_context::start_element(xmlns_id_t ns, xml_token_t name, const xm case XML_v: xml_element_expected(parent, NS_ooxml_xlsx, XML_c); break; + case XML_is: + xml_element_expected(parent, NS_ooxml_xlsx, XML_c); + break; + case XML_t: + xml_element_expected(parent, NS_ooxml_xlsx, XML_is); + break; case XML_tableParts: xml_element_expected(parent, NS_ooxml_xlsx, XML_worksheet); break; @@ -278,6 +284,9 @@ bool xlsx_sheet_context::end_element(xmlns_id_t ns, xml_token_t name) case XML_c: end_element_cell(); break; + case XML_t: + m_cur_value = m_cur_str; + break; case XML_f: m_cur_formula.str = m_cur_str; break; @@ -791,29 +800,34 @@ void xlsx_sheet_context::push_raw_cell_value() switch (m_cur_cell_type) { - case xlsx_ct_shared_string: - { - // string cell - size_t str_id = to_long(m_cur_value); - m_sheet.set_string(m_cur_row, m_cur_col, str_id); - } - break; - case xlsx_ct_numeric: - { - // value cell - double val = to_double(m_cur_value); - m_sheet.set_value(m_cur_row, m_cur_col, val); - } + case xlsx_ct_inline_string: + // For the rare case of an inline string we cannot tell whether it is safe to push to the pool. + // Hence, pass it to the document backend, which may handle it or not. 
+ m_sheet.set_string(m_cur_row, m_cur_col, m_cur_value); break; - case xlsx_ct_boolean: - { - // boolean cell - bool val = to_long(m_cur_value) != 0; - m_sheet.set_bool(m_cur_row, m_cur_col, val); - } - break; - default: - warn("unhanlded cell content type"); + case xlsx_ct_shared_string: + { + // string cell + size_t str_id = to_long(m_cur_value); + m_sheet.set_string(m_cur_row, m_cur_col, str_id); + } + break; + case xlsx_ct_numeric: + { + // value cell + double val = to_double(m_cur_value); + m_sheet.set_value(m_cur_row, m_cur_col, val); + } + break; + case xlsx_ct_boolean: + { + // boolean cell + bool val = to_long(m_cur_value) != 0; + m_sheet.set_bool(m_cur_row, m_cur_col, val); + } + break; + default: + warn("unhanlded cell content type"); } } -- GitLab From 7ad4e7924eef11e0242ff01b49dfd9d1bc0306cb Mon Sep 17 00:00:00 2001 From: HenrikVDestia Date: Mon, 28 Jul 2025 15:22:02 +0300 Subject: [PATCH 2/2] Small unit test for the xlsx inline strings. --- src/include/mock_spreadsheet.hpp | 2 ++ src/liborcus/xlsx_sheet_context_test.cpp | 44 ++++++++++++++++++++++++ src/test/mock_spreadsheet.cpp | 5 +++ 3 files changed, 51 insertions(+) diff --git a/src/include/mock_spreadsheet.hpp b/src/include/mock_spreadsheet.hpp index 07a72f702..d6c32b203 100644 --- a/src/include/mock_spreadsheet.hpp +++ b/src/include/mock_spreadsheet.hpp @@ -135,6 +135,8 @@ public: virtual void set_string(orcus::spreadsheet::row_t row, orcus::spreadsheet::col_t col, orcus::spreadsheet::string_id_t sindex) override; + virtual void set_string(orcus::spreadsheet::row_t row, orcus::spreadsheet::col_t col, std::string_view s) override; + virtual void set_value(orcus::spreadsheet::row_t row, orcus::spreadsheet::col_t col, double value) override; virtual void set_bool(orcus::spreadsheet::row_t row, orcus::spreadsheet::col_t col, bool value) override; diff --git a/src/liborcus/xlsx_sheet_context_test.cpp b/src/liborcus/xlsx_sheet_context_test.cpp index a116b7855..50b49ad87 100644 --- 
a/src/liborcus/xlsx_sheet_context_test.cpp +++ b/src/liborcus/xlsx_sheet_context_test.cpp @@ -85,6 +85,14 @@ public: } }; +const string_view str_value = "Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque " + "faucibus ex sapien vitae pellentesque sem placerat. In id cursus" + " mi pretium tellus duis convallis. Tempus leo eu aenean sed diam" + " urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendu" + "m egestas. Iaculis massa nisl malesuada lacinia integer nunc pos" + "uere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad l" + "itora torquent per conubia nostra inceptos himenaeos."; + class mock_sheet : public import_sheet { mock_array_formula m_array_formula; @@ -104,6 +112,12 @@ public: assert(val == true); } + virtual void set_string(row_t row, col_t col, std::string_view s) override { + assert(row == -1); + assert(col == 0); + assert(s == str_value); + } + virtual iface::import_array_formula* get_array_formula() override { return &m_array_formula; @@ -227,6 +241,35 @@ void test_array_formula() context.end_element(ns, elem); } +void test_cell_string() +{ + mock_sheet sheet; + mock_ref_resolver resolver; + session_context cxt(std::make_unique<xlsx_session_data>()); + config opt(format_t::xlsx); + opt.structure_check = false; + + orcus::xlsx_sheet_context context(cxt, orcus::ooxml_tokens, 0, resolver, sheet); + context.set_config(opt); + + orcus::xmlns_id_t ns = NS_ooxml_xlsx; + + orcus::xml_token_attrs_t inline_attrs; + inline_attrs.push_back(xml_token_attr_t(ns, XML_t, "inlineStr", false)); + context.start_element(ns, XML_c, inline_attrs); + + { + xml_token_attrs_t val_attrs; + context.start_element(ns, XML_is, val_attrs); + context.start_element(ns, XML_t, val_attrs); + context.characters(str_value, false); + context.end_element(ns, XML_t); + context.end_element(ns, XML_is); + } + + context.end_element(ns, XML_c); +} + void test_hidden_col() { mock_sheet2 sheet; @@ -274,6 +317,7 @@ int main() { test_cell_value(); test_cell_bool(); + 
test_cell_string(); test_array_formula(); test_hidden_col(); test_hidden_row(); diff --git a/src/test/mock_spreadsheet.cpp b/src/test/mock_spreadsheet.cpp index 38b3c731e..c5e07eeb7 100644 --- a/src/test/mock_spreadsheet.cpp +++ b/src/test/mock_spreadsheet.cpp @@ -297,6 +297,11 @@ void import_sheet::set_string(row_t, col_t, string_id_t) assert(false); } +void import_sheet::set_string(row_t, col_t, std::string_view) +{ + assert(false); +} + void import_sheet::set_format(row_t, col_t, size_t) { assert(false); -- GitLab