DocWire SDK
DocWire SDK: Award-winning modern data processing in C++20. SourceForge Community Choice & Microsoft support. AI-driven processing. Supports nearly 100 data formats, including email boxes and OCR. Boost efficiency in text extraction, web data extraction, data mining, document analysis. Offline processing possible for security and confidentiality
data_source.h
1 /*********************************************************************************************************************************************/
2 /* DocWire SDK: Award-winning modern data processing in C++20. SourceForge Community Choice & Microsoft support. AI-driven processing. */
3 /* Supports nearly 100 data formats, including email boxes and OCR. Boost efficiency in text extraction, web data extraction, data mining, */
4 /* document analysis. Offline processing possible for security and confidentiality */
5 /* */
6 /* Copyright (c) SILVERCODERS Ltd, http://silvercoders.com */
7 /* Project homepage: https://github.com/docwire/docwire */
8 /* */
9 /* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-DocWire-Commercial */
10 /*********************************************************************************************************************************************/
11 
12 #ifndef DOCWIRE_DATA_SOURCE_H
13 #define DOCWIRE_DATA_SOURCE_H
14 
#include "core_export.h"
#include "file_extension.h"
#include "memory_buffer.h"
#include "unique_identifier.h"
#include <algorithm>
#include <cstddef>
#include <filesystem>
#include <functional>
#include <istream>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>
27 
28 namespace docwire
29 {
30 
/// Wrapper for a shared pointer to a seekable input stream.
struct seekable_stream_ptr
{
    /// The wrapped stream pointer.
    std::shared_ptr<std::istream> v;
};
36 
/// Wrapper for a shared pointer to an unseekable input stream.
struct unseekable_stream_ptr
{
    /// The wrapped stream pointer.
    std::shared_ptr<std::istream> v;
};
42 
/// Wrapper for a length limit value.
struct length_limit
{
    /// The wrapped limit value.
    size_t v;
};
48 
50 struct mime_type
51 {
52  std::string v;
53  bool operator==(const mime_type& rhs) const = default;
54 };
55 
56 }
57 
58 namespace std {
59 template <>
63 struct hash<docwire::mime_type>
64 {
65  size_t operator()(const docwire::mime_type& mt) const
66  {
67  return hash<std::string>{}(mt.v);
68  }
69 };
70 } // namespace std
71 
72 namespace docwire
73 {
74 
/// Represents the confidence level of a detected MIME type.
/// Enumerators are declared in ascending order, so relational comparison
/// (e.g. `confidence::low < confidence::high`) orders levels from weakest to strongest.
enum class confidence
{
    none,
    low,
    medium,
    high,
    very_high,
    highest
};
87 
91 template <typename T>
93  std::is_same_v<T, std::filesystem::path> ||
94  std::is_same_v<T, std::vector<std::byte>> ||
95  std::is_same_v<T, std::span<const std::byte>> ||
96  std::is_same_v<T, std::string> ||
97  std::is_same_v<T, std::string_view> ||
98  std::is_same_v<T, seekable_stream_ptr> ||
99  std::is_same_v<T, unseekable_stream_ptr>;
100 
104 template <typename T>
105 concept data_source_compatible_type_ref_qualified = data_source_compatible_type<std::remove_reference_t<T>>;
106 
/// A helper for creating a visitor from a set of lambdas, used for visiting std::variant.
/// It inherits from every callable passed in and exposes all of their call operators.
template<class... Ts>
struct overloaded : Ts... { using Ts::operator()...; };
// Deduction guide for `overloaded`.
template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
115 
127 class DOCWIRE_CORE_EXPORT data_source
128 {
129  public:
134  template <data_source_compatible_type T>
135  explicit data_source(const T& source)
136  : m_source{source}
137  {}
138 
143  template <data_source_compatible_type T>
144  explicit data_source(T&& source)
145  : m_source{std::move(source)}
146  {}
147 
153  template <data_source_compatible_type T>
154  explicit data_source(const T& source, file_extension file_extension)
155  : m_source{source}, m_file_extension{file_extension}
156  {}
157 
163  template <data_source_compatible_type T>
165  : m_source{std::move(source)}, m_file_extension{file_extension}
166  {}
167 
174  template <data_source_compatible_type T>
175  explicit data_source(const T& source, mime_type mime_type, confidence mime_type_confidence)
176  : m_source{source}
177  {
178  add_mime_type(mime_type, mime_type_confidence);
179  }
180 
187  template <data_source_compatible_type T>
188  explicit data_source(T&& source, mime_type mime_type, confidence mime_type_confidence)
189  : m_source{std::move(source)}
190  {
191  add_mime_type(mime_type, mime_type_confidence);
192  }
193 
199  std::span<const std::byte> span(std::optional<length_limit> limit = std::nullopt) const;
200 
206  std::string string(std::optional<length_limit> limit = std::nullopt) const;
207 
216  std::string_view string_view(std::optional<length_limit> limit = std::nullopt) const;
217 
219  std::shared_ptr<std::istream> istream() const;
220 
222  std::optional<std::filesystem::path> path() const;
223 
225  std::optional<docwire::file_extension> file_extension() const;
226 
229  {
230  return m_id;
231  }
232 
240  std::optional<std::pair<mime_type, confidence>> highest_confidence_mime_type_info() const
241  {
242  auto hc_mt_it = std::max_element(mime_types.begin(), mime_types.end(),
243  [](const auto& p1, const auto& p2)
244  {
245  if (p1.second != p2.second)
246  return p1.second < p2.second;
247  // Deterministic tie-breaker: alphabetically earlier string wins.
248  // Since max_element looks for the "largest" element, we invert the string comparison.
249  return p1.first.v > p2.first.v;
250  });
251  if (hc_mt_it != mime_types.end())
252  return *hc_mt_it;
253  else
254  return std::nullopt;
255  }
256 
258  std::optional<mime_type> highest_confidence_mime_type() const
259  {
260  auto hc_mt = highest_confidence_mime_type_info();
261  if (hc_mt)
262  return hc_mt->first;
263  else
264  return std::nullopt;
265  }
266 
269  {
270  auto hc_mt = highest_confidence_mime_type_info();
271  if (hc_mt)
272  return hc_mt->second;
273  else
274  return confidence::none;
275  }
276 
281  bool has_highest_confidence_mime_type_in(const std::vector<mime_type>& mts) const;
283  void assert_not_encrypted() const;
284 
287  {
288  auto mt_iter = mime_types.find(mt);
289  if (mt_iter == mime_types.end())
290  return confidence::none;
291  else
292  return mt_iter->second;
293  }
294 
301  {
302  auto [existing_it, inserted] = mime_types.try_emplace(mt, c);
303  if (!inserted && existing_it->second < c)
304  existing_it->second = c;
305  }
306 
308  std::unordered_map<mime_type, confidence> mime_types;
309 
310  private:
311  std::variant<std::filesystem::path, std::vector<std::byte>, std::span<const std::byte>, std::string, std::string_view, seekable_stream_ptr, unseekable_stream_ptr> m_source;
312  std::optional<docwire::file_extension> m_file_extension;
313  mutable std::shared_ptr<memory_buffer> m_memory_cache;
314  mutable std::shared_ptr<std::istream> m_path_stream;
315  mutable std::optional<size_t> m_stream_size;
316  unique_identifier m_id;
317 
318  void fill_memory_cache(std::optional<length_limit> limit) const;
319 };
320 
321 } // namespace docwire
322 
323 #endif // DOCWIRE_DATA_SOURCE_H
std::string string(std::optional< length_limit > limit=std::nullopt) const
Returns the content as a string.
data_source(T &&source, file_extension file_extension)
Constructs a data_source by moving, with an explicit file extension.
Definition: data_source.h:164
data_source(T &&source)
Constructs a data_source by moving from a compatible type.
Definition: data_source.h:144
data_source(const T &source, file_extension file_extension)
Constructs a data_source with an explicit file extension.
Definition: data_source.h:154
confidence highest_mime_type_confidence() const
Returns the highest confidence level found among detected MIME types.
Definition: data_source.h:268
bool has_highest_confidence_mime_type_in(const std::vector< mime_type > &mts) const
Checks if the highest confidence mime type is present in the given list.
confidence mime_type_confidence(mime_type mt) const
Returns the confidence level for a specific MIME type.
Definition: data_source.h:286
std::span< const std::byte > span(std::optional< length_limit > limit=std::nullopt) const
Returns the content as a span of bytes.
void assert_not_encrypted() const
Asserts that the data source is not encrypted.
void add_mime_type(mime_type mt, confidence c)
Adds a mime type with a confidence level.
Definition: data_source.h:300
unique_identifier id() const
Returns the unique identifier for this data source.
Definition: data_source.h:228
std::optional< std::pair< mime_type, confidence > > highest_confidence_mime_type_info() const
Returns the MIME type with the highest confidence and its confidence level.
Definition: data_source.h:240
std::optional< docwire::file_extension > file_extension() const
Returns the file extension if available.
std::shared_ptr< std::istream > istream() const
Returns an input stream for reading the data.
data_source(const T &source, mime_type mime_type, confidence mime_type_confidence)
Constructs a data_source with an initial MIME type and confidence.
Definition: data_source.h:175
std::unordered_map< mime_type, confidence > mime_types
Map of detected MIME types and their confidence levels.
Definition: data_source.h:308
std::optional< std::filesystem::path > path() const
Returns the file path if the source is a file, otherwise std::nullopt.
std::string_view string_view(std::optional< length_limit > limit=std::nullopt) const
Returns the content as a string_view.
data_source(const T &source)
Constructs a data_source from a compatible type.
Definition: data_source.h:135
std::optional< mime_type > highest_confidence_mime_type() const
Returns the MIME type with the highest confidence.
Definition: data_source.h:258
data_source(T &&source, mime_type mime_type, confidence mime_type_confidence)
Constructs a data_source by moving, with an initial MIME type and confidence.
Definition: data_source.h:188
A class representing a file extension.
The class represents a unique (for a single program run) identifier of an object.
The main namespace for the DocWire SDK.
Definition: ai_elements.h:19
concept data_source_compatible_type
Concept matching types that can be used to initialize a data_source.
Definition: data_source.h:92
concept data_source_compatible_type_ref_qualified
Concept matching reference-qualified types compatible with data_source.
Definition: data_source.h:105
confidence
Represents the confidence level of a detected MIME type.
Definition: data_source.h:79
Wrapper for a length limit value.
Definition: data_source.h:45
Wrapper for a MIME type string.
Definition: data_source.h:51
A helper for creating a visitor from a set of lambdas, used for visiting std::variant.
Definition: data_source.h:112
Wrapper for a shared pointer to a seekable input stream.
Definition: data_source.h:33
Wrapper for a shared pointer to an unseekable input stream.
Definition: data_source.h:39