DocWire SDK
DocWire SDK: Award-winning modern data processing in C++20. SourceForge Community Choice & Microsoft support. AI-driven processing. Supports nearly 100 data formats, including email boxes and OCR. Boost efficiency in text extraction, web data extraction, data mining, document analysis. Offline processing possible for security and confidentiality
file_type_determination.cpp

This example shows how to perform file type detection (with or without document processing) by file extension or by file signature, or how to bypass detection and rely on a provided MIME type.

#include "docwire.h"
#include <fstream>
int main()
{
using namespace docwire;
std::stringstream out_stream;
const std::string expected_output_1 = "Data processing refers to the activities performed on raw data to convert it into meaningful information. It involves collecting, organizing, analyzing, and interpreting data to extract useful insights and support decision-making. This can include tasks such as sorting, filtering, summarizing, and transforming data through various computational and statistical methods. Data processing is essential in various fields, including business, science, and technology, as it enables organizations to derive valuable knowledge from large datasets, make informed decisions, and improve overall efficiency.\n\n";
const std::string expected_output_2 = "<http://www.silvercoders.com/>hyperlink test\n\n";
try
{
// Perform file type determination and output the file types without also performing document conversion.
data_source data{seekable_stream_ptr{
std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)
}};
std::optional<mime_type> mt = data.highest_confidence_mime_type();
ensure(mt.has_value()) == true;
ensure(*mt) == mime_type { "application/msword" };
// Bypass file type determination and rely on provided mime-type
data_source
{
seekable_stream_ptr{std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)},
mime_type{"application/msword"},
confidence::high
} |
office_formats_parser{} | // more parsers can be added
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
// Perform file type determination based on provided file extension
data_source
{
seekable_stream_ptr{std::make_shared<std::ifstream>("data_processing_definition.doc", std::ios::binary)},
file_extension{".doc"}
} |
content_type::by_file_extension::detector{} |
office_formats_parser{} | // more parsers can be added
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
// Perform file type determination based on file extension extracted from file name
std::filesystem::path{"data_processing_definition.doc"} |
content_type::by_file_extension::detector{} |
office_formats_parser{} | // more parsers can be added
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_1;
out_stream.str("");
// Perform file type determination on a Zip file without also performing it on its contents.
data_source zip_data{seekable_stream_ptr{
std::make_shared<std::ifstream>("test.zip", std::ios::binary)
}};
std::optional<mime_type> zip_data_mt = zip_data.highest_confidence_mime_type();
ensure(zip_data_mt.has_value()) == true;
ensure(*zip_data_mt) == mime_type { "application/zip" };
// Perform file type determination on all files within a zip and selectively performing document conversion based on the returned file types
std::filesystem::path{"test.zip"} |
content_type::detector{} |
archives_parser{} |
[](message_ptr msg, const message_callbacks& emit_message)
{
if (!msg->is<data_source>())
return emit_message(std::move(msg));
data_source& data = msg->get<data_source>();
std::optional<mime_type> mt = data.highest_confidence_mime_type();
if (!mt || *mt != mime_type { "application/msword" })
return continuation::skip;
return emit_message(std::move(msg));
} |
office_formats_parser{} | // more parsers can be added
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_2;
out_stream.str("");
// ... or
std::filesystem::path{"test.zip"} |
content_type::detector{} |
archives_parser{} |
doc_parser{} | // other formats will be skipped
plain_text_exporter() | out_stream;
ensure(out_stream.str()) == expected_output_2;
}
catch (const std::exception& e)
{
std::cerr << "[ERROR] " << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
DOCWIRE_CONTENT_TYPE_EXPORT void detect(data_source &data, const by_signature::database &signatures_db_to_use=by_signature::database{})
DOCWIRE_CORE_EXPORT std::string diagnostic_message(const std::exception &e)
Generates a diagnostic message for the given chain of nested exceptions.
The main namespace for the DocWire SDK.
Definition: ai_elements.h:19
ensure(const T &, const docwire::source_location &) -> ensure< T >
Deduction guide for the ensure class template.