This example shows how to perform file type detection (with or without document processing) by file extension, by file signature, or by relying on a provided MIME type.
#include "docwire.h"
#include <fstream>
#include <iostream>
#include <sstream>
int main()
{
std::stringstream out_stream;
const std::string expected_output_1 = "Data processing refers to the activities performed on raw data to convert it into meaningful information. It involves collecting, organizing, analyzing, and interpreting data to extract useful insights and support decision-making. This can include tasks such as sorting, filtering, summarizing, and transforming data through various computational and statistical methods. Data processing is essential in various fields, including business, science, and technology, as it enables organizations to derive valuable knowledge from large datasets, make informed decisions, and improve overall efficiency.\n\n";
const std::string expected_output_2 = "<http://www.silvercoders.com/>hyperlink test\n\n";
try
{
data_source data{seekable_stream_ptr{
std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)
}};
std::optional<mime_type> mt = data.highest_confidence_mime_type();
ensure(mt.has_value()) ==
true;
ensure(*mt) == mime_type {
"application/msword" };
data_source
{
seekable_stream_ptr{std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)},
mime_type{"application/msword"},
confidence::high
} |
office_formats_parser{} |
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
data_source
{
seekable_stream_ptr{std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)},
file_extension{".doc"}
} |
content_type::by_file_extension::detector{} |
office_formats_parser{} |
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
std::filesystem::path{"data_processing_definition.doc"} |
content_type::by_file_extension::detector{} |
office_formats_parser{} |
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
data_source zip_data{seekable_stream_ptr{
std::make_shared<std::ifstream>("test.zip", std::ios::binary)
}};
std::optional<mime_type> zip_data_mt = zip_data.highest_confidence_mime_type();
ensure(zip_data_mt.has_value()) ==
true;
ensure(*zip_data_mt) == mime_type {
"application/zip" };
std::filesystem::path{"test.zip"} |
content_type::detector{} |
archives_parser{} |
[](message_ptr msg, const message_callbacks& emit_message)
{
if (!msg->is<data_source>())
return emit_message(std::move(msg));
data_source& data = msg->get<data_source>();
std::optional<mime_type> mt = data.highest_confidence_mime_type();
if (!mt || *mt != mime_type { "application/msword" })
return continuation::skip;
return emit_message(std::move(msg));
} |
office_formats_parser{} |
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_2;
out_stream.str("");
std::filesystem::path{"test.zip"} |
content_type::detector{} |
archives_parser{} |
doc_parser{} |
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_2;
}
catch (const std::exception& e)
{
return 1;
}
return 0;
}
DOCWIRE_CONTENT_TYPE_EXPORT void detect(data_source &data, const by_signature::database &signatures_db_to_use=by_signature::database{})
DOCWIRE_CORE_EXPORT std::string diagnostic_message(const std::exception &e)
Generates a diagnostic message for the given chain of nested exceptions.
The main namespace for the DocWire SDK.
ensure(const T &, const docwire::source_location &) -> ensure< T >
Deduction guide for the ensure class template.