diff options
author | suzuki toshiya <mpsuzuki@hiroshima-u.ac.jp> | 2020-05-16 04:54:55 +0000 |
---|---|---|
committer | Albert Astals Cid <tsdgeos@yahoo.es> | 2020-05-19 21:06:58 +0000 |
commit | 3189332012ca46998f8ffb872e7ed81c630c4c7a (patch) | |
tree | ef630b5f5fc7512cd9f6373077ca9e43d56b8e69 /cpp | |
parent | 437553ecb26948f77c3dbf7ad29bca86ffff7f6e (diff) |
[cpp] separate the font info in text_box to another struct.
* add new API, page::text_list(int opt_flag). The old one
taking no argument is kept for ABI compatibility.
The opt_flag is a bitmask: a bitwise OR of values from the new enum,
page::text_list_option_enum.
* text_box.m_data->text_box_font is a unique pointer to
the storage (if text_list() requests the font info), or
just a null pointer (if text_list() does not request the
font info).
* add a new option "--show-text-list-with-font", which shows font
info, to tests/poppler-dump.cpp. The existing "--show-text-list"
option does not load the font info at all.
Co-authored-by: Adam Reichold <adam.reichold@t-online.de>
Co-authored-by: Albert Astals Cid <aacid@kde.org>
Diffstat (limited to 'cpp')
-rw-r--r-- | cpp/poppler-page.cpp | 101 | ||||
-rw-r--r-- | cpp/poppler-page.h | 22 | ||||
-rw-r--r-- | cpp/poppler-private.h | 30 | ||||
-rw-r--r-- | cpp/tests/poppler-dump.cpp | 18 |
4 files changed, 121 insertions, 50 deletions
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index f274ca5b..01b0409d 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) { ustring page::text(const rectf &r, text_layout_enum layout_mode) const { std::unique_ptr<GooString> out(new GooString()); - const bool use_raw_order = (layout_mode == raw_order_layout); + const bool use_raw_order = (layout_mode == raw_order_layout); const bool use_physical_layout = (layout_mode == physical_layout); TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false); if (r.is_empty()) { @@ -312,6 +312,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const } /* + * text_box_font_info object for text_box + */ +text_box_font_info_data::~text_box_font_info_data() = default; + +/* * text_box object for page::text_list() */ text_box_data::~text_box_data() = default; @@ -352,30 +357,41 @@ bool text_box::has_space_after() const return m_data->has_space_after; } +bool text_box::has_font_info() const +{ + return (m_data->text_box_font != nullptr); +} + text_box::writing_mode_enum text_box::get_wmode(int i) const { - return m_data->wmodes[i]; + if (this->has_font_info()) + return m_data->text_box_font->wmodes[i]; + else + return text_box::invalid_wmode; } double text_box::get_font_size() const { - return m_data->font_size; + if (this->has_font_info()) + return m_data->text_box_font->font_size; + else + return -1; } std::string text_box::get_font_name(int i) const { - int j = m_data->glyph_to_cache_index[i]; + if (!this->has_font_info()) + return std::string("*ignored*"); + + int j = m_data->text_box_font->glyph_to_cache_index[i]; if (j < 0) { return std::string(""); } - return m_data->font_info_cache[j].name(); + return m_data->text_box_font->font_info_cache[j].name(); } - -std::vector<text_box> page::text_list() const +std::vector<text_box> page::text_list(int opt_flag) const { - 
d->init_font_info_cache(); - std::vector<text_box> output_list; /* config values are same with Qt5 Page::TextList() */ @@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const word->getRotation(), {}, word->hasSpaceAfter() == true, - {}, - word->getFontSize(), - d->font_info_cache, - {} + nullptr }}; + std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr; + if (opt_flag & page::text_list_include_font) { + d->init_font_info_cache(); + + std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{ + word->getFontSize(), // double font_size + {}, // std::vector<text_box::writing_mode> wmodes; + d->font_info_cache, // std::vector<font_info> font_info_cache; + {} // std::vector<int> glyph_to_cache_index; + }}; + + tb_font_info = std::move(tb_font); + }; + tb.m_data->char_bboxes.reserve(word->getLength()); for (int j = 0; j < word->getLength(); j ++) { word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax); tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin); } - tb.m_data->glyph_to_cache_index.reserve(word->getLength()); - for (int j = 0; j < word->getLength(); j++) { - const TextFontInfo* cur_text_font_info = word->getFontInfo(j); - - // filter-out the invalid WMode value here. 
- switch (cur_text_font_info->getWMode()) { - case 0: - tb.m_data->wmodes.push_back(text_box::horizontal_wmode); - break; - case 1: - tb.m_data->wmodes.push_back(text_box::vertical_wmode); - break; - default: - tb.m_data->wmodes.push_back(text_box::invalid_wmode); - }; - - tb.m_data->glyph_to_cache_index[j] = -1; - for (size_t k = 0; k < d->font_info_cache.size(); k++) { - if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) { - tb.m_data->glyph_to_cache_index[j] = k; + if (tb_font_info && d->font_info_cache_initialized) { + tb_font_info->glyph_to_cache_index.reserve(word->getLength()); + for (int j = 0; j < word->getLength(); j++) { + const TextFontInfo* cur_text_font_info = word->getFontInfo(j); + + // filter-out the invalid WMode value here. + switch (cur_text_font_info->getWMode()) { + case 0: + tb_font_info->wmodes.push_back(text_box::horizontal_wmode); break; + case 1: + tb_font_info->wmodes.push_back(text_box::vertical_wmode); + break; + default: + tb_font_info->wmodes.push_back(text_box::invalid_wmode); + }; + + tb_font_info->glyph_to_cache_index[j] = -1; + for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) { + if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) { + tb_font_info->glyph_to_cache_index[j] = k; + break; + } } } + tb.m_data->text_box_font = std::move(tb_font_info); } output_list.push_back(std::move(tb)); @@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const return output_list; } + +std::vector<text_box> page::text_list() const +{ + return text_list(0); +} diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index ca5be2fd..dd6ebf2c 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -66,6 +66,12 @@ public: rectf char_bbox(size_t i) const; bool has_space_after() const; + + /** + \since 0.89 + */ + bool has_font_info() const; + /** Get a writing mode for the i-th glyph @@ -186,6 +192,22 @@ public: */ std::vector<text_box> text_list() const; + /* + * text_list_option_enum is 
a bitmask-style flags for text_list(), + * 0 means the default & simplest behaviour. + */ + enum text_list_option_enum { + text_list_include_font = 1 // \since 0.89 + }; + + /** + Extended version of text_list() taking an option flag. + The option flag should be the multiple of text_list_option_enum. + + \since 0.89 + */ + std::vector<text_box> text_list(int opt_flag) const; + private: page(document_private *doc, int index); diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index 83e46319..4ec159a8 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -73,23 +73,17 @@ void delete_all(const Collection &c) } class font_info; -struct text_box_data +struct text_box_font_info_data { - ~text_box_data(); - - ustring text; - rectf bbox; - int rotation; - std::vector<rectf> char_bboxes; - bool has_space_after; + ~text_box_font_info_data(); - std::vector<text_box::writing_mode_enum> wmodes; double font_size; + std::vector<text_box::writing_mode_enum> wmodes; /* * a duplication of the font_info_cache created by the * poppler::font_iterator and owned by the poppler::page - * object. Its lifetime might differ from that of text_box + * object. Its lifetime might differ from that of text_box * object (think about collecting all text_box objects * from all pages), so we have to duplicate it into all * text_box instances. @@ -97,7 +91,7 @@ struct text_box_data std::vector<font_info> font_info_cache; /* - * a std::vector from the glyph index in the current + * a std::vector from the glyph index in the owner * text_box to the font_info index in font_info_cache. * The "-1" means no corresponding fonts found in the * cache. 
@@ -105,6 +99,20 @@ struct text_box_data std::vector<int> glyph_to_cache_index; }; +class font_info; +struct text_box_data +{ + ~text_box_data(); + + ustring text; + rectf bbox; + int rotation; + std::vector<rectf> char_bboxes; + bool has_space_after; + + std::unique_ptr<text_box_font_info_data> text_box_font; +}; + } #endif diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index 7864979e..ef391d78 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -60,6 +60,7 @@ bool show_help = false; bool show_version = false; char show_text[32]; bool show_text_list = false; +bool show_text_list_with_font = false; poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout; static const ArgDesc the_args[] = { @@ -85,6 +86,8 @@ static const ArgDesc the_args[] = { "show text (physical|raw|none) extracted from all pages" }, { "--show-text-list", argFlag, &show_text_list, 0, "show text list (experimental)" }, + { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0, + "show text list with font info (experimental)" }, { "-h", argFlag, &show_help, 0, "print usage information" }, { "--help", argFlag, &show_help, 0, @@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p) std::cout << std::endl; } -static void print_page_text_list(poppler::page *p) +static void print_page_text_list(poppler::page *p, int opt_flag = 0) { if (!p) { std::cout << std::setw(out_width) << "Broken Page. 
Could not be parsed" << std::endl; std::cout << std::endl; return; } - auto text_list = p->text_list(); + auto text_list = p->text_list(opt_flag); std::cout << "---" << std::endl; for (const poppler::text_box &text : text_list) { @@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p) std::string font_name = text.get_font_name(); std::cout << "[" << ustr << "] @ "; std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )"; - std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )"; + if (text.has_font_info()) + std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )"; std::cout << std::endl; - } std::cout << "---" << std::endl; } @@ -538,12 +541,15 @@ int main(int argc, char *argv[]) print_page_text(p.get()); } } - if (show_text_list) { + if (show_text_list || show_text_list_with_font) { const int pages = doc->pages(); for (int i = 0; i < pages; ++i) { std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl; std::unique_ptr<poppler::page> p(doc->create_page(i)); - print_page_text_list(p.get()); + if (show_text_list_with_font) + print_page_text_list(p.get(), poppler::page::text_list_include_font); + else + print_page_text_list(p.get(), 0); } } |