DocWire SDK
DocWire SDK: Award-winning modern data processing in C++20. SourceForge Community Choice & Microsoft support. AI-driven processing. Supports nearly 100 data formats, including email boxes and OCR. Boost efficiency in text extraction, web data extraction, data mining, document analysis. Offline processing possible for security and confidentiality
common_xml_document_parser.h
1 /*********************************************************************************************************************************************/
2 /* DocWire SDK: Award-winning modern data processing in C++20. SourceForge Community Choice & Microsoft support. AI-driven processing. */
3 /* Supports nearly 100 data formats, including email boxes and OCR. Boost efficiency in text extraction, web data extraction, data mining, */
4 /* document analysis. Offline processing possible for security and confidentiality */
5 /* */
6 /* Copyright (c) SILVERCODERS Ltd, http://silvercoders.com */
7 /* Project homepage: https://github.com/docwire/docwire */
8 /* */
9 /* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-DocWire-Commercial */
10 /*********************************************************************************************************************************************/
11 
12 #ifndef DOCWIRE_COMMON_XML_PARSER_H
13 #define DOCWIRE_COMMON_XML_PARSER_H
14 
15 #include "attributes.h"
16 #include "chain_element.h"
17 #include "pimpl.h"
18 #include "xml_children.h"
19 #include <string>
20 #include <vector>
21 #include <map>
22 
23 namespace docwire
24 {
25  class zip_reader;
26  class Metadata;
27 
/// Parse mode passed as the `mode` argument to the XML parsing entry points
/// declared in this header (parseXmlData, parseXmlChildren, extractText) and
/// forwarded to every CommandHandler.
/// NOTE(review): what each enumerator does is not visible in this header;
/// presumably PARSE_XML parses as-is, FIX_XML attempts repair first and
/// STRIP_XML only strips markup -- confirm against the implementation.
28 enum XmlParseMode { PARSE_XML, FIX_XML, STRIP_XML };
29 
40 template <safety_policy safety_level = default_safety_level>
41 class common_xml_document_parser: public chain_element, public with_pimpl<common_xml_document_parser<safety_level>>
42 {
43  private:
46 
47  //public interface for derived classes (and its components)
48  public:
51  {
52  number,
53  bullet
54  };
55 
/// Represents a comment with author, time, and text.
/// Stored by value in CommentMap, keyed by an integer comment ID.
57  struct comment
58  {
59  std::string m_author; ///< Author of the comment.
60  std::string m_time; ///< Time of the comment, kept as a string (format not specified here).
61  std::string m_text; ///< Text of the comment.
/// Default constructor: all three strings start out empty.
62  comment() {}
/// Constructs a comment, copying each argument into the matching member.
/// @param author Value stored in m_author.
/// @param time Value stored in m_time.
/// @param text Value stored in m_text.
69  comment(const std::string& author, const std::string& time, const std::string& text)
70  : m_author(author), m_time(time), m_text(text) {}
71  };
72 
/// Represents a relationship, typically for hyperlinks or embedded objects.
/// Stored by value in RelationshipMap, keyed by a string ID.
74  struct relationship
75  {
76  std::string m_target; ///< Target of the relationship. NOTE(review): presumably a path or URL -- confirm.
77  };
78 
81  {
82  std::string m_text;
83  };
84 
/// Type alias for a vector of list styles (ODFOOXMLListStyle values: number or bullet).
86  typedef std::vector<ODFOOXMLListStyle> ListStyleVector;
/// Type alias for a map of list style names to their definitions.
88  using ListStyleMap = std::map<std::string, common_xml_document_parser<safety_level>::ListStyleVector>;
/// Type alias for a map of comment IDs to comment objects.
90  using CommentMap = std::map<int, common_xml_document_parser<safety_level>::comment>;
/// Type alias for a map of relationship IDs to relationship objects.
92  using RelationshipMap = std::map<std::string, common_xml_document_parser<safety_level>::relationship>;
/// Type alias for a vector of shared strings.
94  using SharedStringVector = std::vector<shared_string>;
95 
/// Defines the function signature for an XML tag command handler
/// (the callable type accepted by registerODFOOXMLCommandHandler).
/// The handler receives the XML node, the parse mode and the zip reader, plus
/// three mutable references it can write to: text, children_processed and level_suffix.
/// NOTE(review): presumably `text` accumulates extracted output and
/// `children_processed` tells the caller not to descend into the node's
/// children again; the roles of `level_suffix` and `first_on_level` are not
/// visible in this header -- confirm against the implementation.
/// NOTE(review): std::function needs <functional>, which is not among the
/// includes visible at the top of this header; presumably it arrives
/// transitively through one of the project headers -- confirm.
99  typedef std::function<void(xml::node_ref<safety_level>& xml_node, XmlParseMode mode,
100  zip_reader* zipfile, std::string& text,
101  bool& children_processed, std::string& level_suffix, bool first_on_level)> CommandHandler;
102 
/// Registers a handler for a specific XML tag.
/// @param xml_tag Name of the XML tag the handler is registered for.
/// @param handler Callable matching the CommandHandler signature.
111  void registerODFOOXMLCommandHandler(const std::string& xml_tag, const CommandHandler& handler);
112 
/// Parses XML data from a view of nodes.
/// @param xml_nodes View over the XML nodes to parse (taken by value).
/// @param mode Parse mode.
/// @param zipfile Pointer to a zip_reader. NOTE(review): whether nullptr is accepted is not visible here -- confirm.
/// @return A std::string result; presumably the extracted text -- confirm.
123  std::string parseXmlData(xml::children_view<safety_level> xml_nodes, XmlParseMode mode, zip_reader* zipfile);
124 
/// Parses the children of a given XML node.
/// @param xml_node Node whose children are parsed (non-const reference).
/// @param mode Parse mode.
/// @param zipfile Pointer to a zip_reader. NOTE(review): whether nullptr is accepted is not visible here -- confirm.
/// @return A std::string result; presumably the extracted text -- confirm.
133  std::string parseXmlChildren(xml::node_ref<safety_level>& xml_node, XmlParseMode mode, zip_reader* zipfile);
134 
/// Extracts text from raw XML content.
/// @param xml_contents Raw XML content (non-owning view).
/// @param mode Parse mode.
/// @param zipfile Pointer to a zip_reader. NOTE(review): whether nullptr is accepted is not visible here -- confirm.
/// @param text Output string. NOTE(review): whether it is appended to or overwritten is not visible here -- confirm.
145  void extractText(std::string_view xml_contents, XmlParseMode mode, zip_reader* zipfile, std::string& text);
146 
/// Parses ODF metadata from XML content.
/// Const member function: does not modify the parser object itself.
/// @param xml_content XML content holding the metadata (non-owning view).
/// @param metadata Output object that receives the parsed metadata.
152  void parseODFMetadata(std::string_view xml_content, attributes::metadata& metadata) const;
153 
/// Formats a comment for output.
/// @param author Comment author.
/// @param time Comment time, as a string.
/// @param text Comment text.
/// @return The formatted comment, returned by value. The exact layout is not visible in this header.
161  const std::string formatComment(const std::string& author, const std::string& time, const std::string& text);
162 
/// Returns the current nesting depth of lists.
/// The result is a mutable reference, so callers can change the depth through it.
164  size_t& getListDepth();
165 
168 
171 
174 
177 
/// Checks if text extraction is currently disabled.
/// @return true when text extraction is disabled.
179  bool disabledText() const;
180 
183 
/// Enables or disables text extraction.
/// @param disable true to disable text extraction, false to enable it.
185  void disableText(bool disable);
186 
189 
/// Controls whether signal emission (callbacks) is active.
/// @param flag NOTE(review): presumably true enables emission and false suppresses it -- confirm.
191  void activeEmittingSignals(bool flag);
192 
193  //public interface
194  public:
199 
200  protected:
206  {
207  public:
216  private:
217  common_xml_document_parser& m_parser;
218  };
219 };
220 
221 } // namespace docwire
222 
223 #endif
Helper class to manage the context stack scope. Pushes a new context on construction and pops it on destruction.
scoped_context_stack_push(common_xml_document_parser &parser, const message_callbacks &emit_message)
Constructs the helper and pushes a new context onto the parser's stack.
~scoped_context_stack_push()
Destructor that pops the context from the parser's stack.
Base class for XML-based document parsers (ODF, OOXML, etc.).
RelationshipMap & getRelationships()
Gets the map of relationships.
size_t & getListDepth()
Returns the current nesting depth of lists.
CommentMap & getComments()
Gets the map of comments.
std::map< std::string, common_xml_document_parser< safety_level >::ListStyleVector > ListStyleMap
Type alias for a map of list style names to their definitions.
void registerODFOOXMLCommandHandler(const std::string &xml_tag, const CommandHandler &handler)
Registers a handler for a specific XML tag.
void activeEmittingSignals(bool flag)
Controls whether signal emission (callbacks) is active.
void disableText(bool disable)
Enables or disables text extraction.
common_xml_document_parser()
Default constructor.
SharedStringVector & getSharedStrings()
Gets the vector of shared strings.
ListStyleMap & getListStyles()
Gets the map of list styles.
const std::string formatComment(const std::string &author, const std::string &time, const std::string &text)
Formats a comment for output.
std::map< int, common_xml_document_parser< safety_level >::comment > CommentMap
Type alias for a map of comment IDs to Comment objects.
bool disabledText() const
Checks if text extraction is currently disabled.
void set_blanks(xml::reader_blanks blanks)
Sets the blank node handling policy for the XML reader.
std::vector< shared_string > SharedStringVector
Type alias for a vector of shared strings.
void parseODFMetadata(std::string_view xml_content, attributes::metadata &metadata) const
Parses ODF metadata from XML content.
ODFOOXMLListStyle
Enum for list styles (e.g., numbered or bulleted).
void extractText(std::string_view xml_contents, XmlParseMode mode, zip_reader *zipfile, std::string &text)
Extracts text from raw XML content.
std::string parseXmlData(xml::children_view< safety_level > xml_nodes, XmlParseMode mode, zip_reader *zipfile)
Parses XML data from a view of nodes.
std::function< void(xml::node_ref< safety_level > &xml_node, XmlParseMode mode, zip_reader *zipfile, std::string &text, bool &children_processed, std::string &level_suffix, bool first_on_level)> CommandHandler
Defines the function signature for an XML tag command handler.
std::map< std::string, common_xml_document_parser< safety_level >::relationship > RelationshipMap
Type alias for a map of relationship IDs to Relationship objects.
std::string parseXmlChildren(xml::node_ref< safety_level > &xml_node, XmlParseMode mode, zip_reader *zipfile)
Parses the children of a given XML node.
std::vector< ODFOOXMLListStyle > ListStyleVector
Type alias for a vector of list styles.
xml::reader_blanks blanks() const
Gets the current blank node handling policy.
A view over the direct children of an XML node.
Definition: xml_children.h:33
A reference to the current XML node in the reader.
Definition: xml_node_ref.h:33
reader_blanks
Options for handling blank nodes in the XML reader.
Definition: xml_reader.h:26
The main namespace for the DocWire SDK.
Definition: ai_elements.h:19
Represents a comment with author, time, and text.
comment(const std::string &author, const std::string &time, const std::string &text)
Constructs a Comment.
Represents a relationship, typically for hyperlinks or embedded objects.
Represents a shared string, a common optimization in OOXML formats.