summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorsuzuki toshiya <mpsuzuki@hiroshima-u.ac.jp>2020-05-16 04:54:55 +0000
committerAlbert Astals Cid <tsdgeos@yahoo.es>2020-05-19 21:06:58 +0000
commit3189332012ca46998f8ffb872e7ed81c630c4c7a (patch)
treeef630b5f5fc7512cd9f6373077ca9e43d56b8e69
parent437553ecb26948f77c3dbf7ad29bca86ffff7f6e (diff)
[cpp] separate the font info in text_box to another struct.
* add new API, page::text_list(int opt_flag). The old one taking no argument is kept for ABI compatibility. The opt_flag is a bitwise-OR combination of values of the new enum, page::text_list_option_enum. * text_box.m_data->text_box_font is a unique pointer to the storage (if text_list() requests the font info), or just a null pointer (if text_list() does not request the font info). * add new option "--show-text-list-with-font", which shows the font info, to tests/poppler-dump.cpp. "--show-text-list" does not load the font info at all. Co-authored-by: Adam Reichold <adam.reichold@t-online.de> Co-authored-by: Albert Astals Cid <aacid@kde.org>
-rw-r--r--cpp/poppler-page.cpp101
-rw-r--r--cpp/poppler-page.h22
-rw-r--r--cpp/poppler-private.h30
-rw-r--r--cpp/tests/poppler-dump.cpp18
4 files changed, 121 insertions, 50 deletions
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index f274ca5b..01b0409d 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) {
ustring page::text(const rectf &r, text_layout_enum layout_mode) const
{
std::unique_ptr<GooString> out(new GooString());
- const bool use_raw_order = (layout_mode == raw_order_layout);
+ const bool use_raw_order = (layout_mode == raw_order_layout);
const bool use_physical_layout = (layout_mode == physical_layout);
TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false);
if (r.is_empty()) {
@@ -312,6 +312,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
}
/*
+ * text_box_font_info object for text_box
+ */
+text_box_font_info_data::~text_box_font_info_data() = default;
+
+/*
* text_box object for page::text_list()
*/
text_box_data::~text_box_data() = default;
@@ -352,30 +357,41 @@ bool text_box::has_space_after() const
return m_data->has_space_after;
}
+bool text_box::has_font_info() const
+{
+ return (m_data->text_box_font != nullptr);
+}
+
text_box::writing_mode_enum text_box::get_wmode(int i) const
{
- return m_data->wmodes[i];
+ if (this->has_font_info())
+ return m_data->text_box_font->wmodes[i];
+ else
+ return text_box::invalid_wmode;
}
double text_box::get_font_size() const
{
- return m_data->font_size;
+ if (this->has_font_info())
+ return m_data->text_box_font->font_size;
+ else
+ return -1;
}
std::string text_box::get_font_name(int i) const
{
- int j = m_data->glyph_to_cache_index[i];
+ if (!this->has_font_info())
+ return std::string("*ignored*");
+
+ int j = m_data->text_box_font->glyph_to_cache_index[i];
if (j < 0) {
return std::string("");
}
- return m_data->font_info_cache[j].name();
+ return m_data->text_box_font->font_info_cache[j].name();
}
-
-std::vector<text_box> page::text_list() const
+std::vector<text_box> page::text_list(int opt_flag) const
{
- d->init_font_info_cache();
-
std::vector<text_box> output_list;
/* config values are same with Qt5 Page::TextList() */
@@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const
word->getRotation(),
{},
word->hasSpaceAfter() == true,
- {},
- word->getFontSize(),
- d->font_info_cache,
- {}
+ nullptr
}};
+ std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr;
+ if (opt_flag & page::text_list_include_font) {
+ d->init_font_info_cache();
+
+ std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{
+ word->getFontSize(), // double font_size
+ {}, // std::vector<text_box::writing_mode> wmodes;
+ d->font_info_cache, // std::vector<font_info> font_info_cache;
+ {} // std::vector<int> glyph_to_cache_index;
+ }};
+
+ tb_font_info = std::move(tb_font);
+ };
+
tb.m_data->char_bboxes.reserve(word->getLength());
for (int j = 0; j < word->getLength(); j ++) {
word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax);
tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin);
}
- tb.m_data->glyph_to_cache_index.reserve(word->getLength());
- for (int j = 0; j < word->getLength(); j++) {
- const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
-
- // filter-out the invalid WMode value here.
- switch (cur_text_font_info->getWMode()) {
- case 0:
- tb.m_data->wmodes.push_back(text_box::horizontal_wmode);
- break;
- case 1:
- tb.m_data->wmodes.push_back(text_box::vertical_wmode);
- break;
- default:
- tb.m_data->wmodes.push_back(text_box::invalid_wmode);
- };
-
- tb.m_data->glyph_to_cache_index[j] = -1;
- for (size_t k = 0; k < d->font_info_cache.size(); k++) {
- if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) {
- tb.m_data->glyph_to_cache_index[j] = k;
+ if (tb_font_info && d->font_info_cache_initialized) {
+ tb_font_info->glyph_to_cache_index.reserve(word->getLength());
+ for (int j = 0; j < word->getLength(); j++) {
+ const TextFontInfo* cur_text_font_info = word->getFontInfo(j);
+
+ // filter-out the invalid WMode value here.
+ switch (cur_text_font_info->getWMode()) {
+ case 0:
+ tb_font_info->wmodes.push_back(text_box::horizontal_wmode);
break;
+ case 1:
+ tb_font_info->wmodes.push_back(text_box::vertical_wmode);
+ break;
+ default:
+ tb_font_info->wmodes.push_back(text_box::invalid_wmode);
+ };
+
+ tb_font_info->glyph_to_cache_index[j] = -1;
+ for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) {
+ if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) {
+ tb_font_info->glyph_to_cache_index[j] = k;
+ break;
+ }
}
}
+ tb.m_data->text_box_font = std::move(tb_font_info);
}
output_list.push_back(std::move(tb));
@@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const
return output_list;
}
+
+std::vector<text_box> page::text_list() const
+{
+ return text_list(0);
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index ca5be2fd..dd6ebf2c 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -66,6 +66,12 @@ public:
rectf char_bbox(size_t i) const;
bool has_space_after() const;
+
+ /**
+ \since 0.89
+ */
+ bool has_font_info() const;
+
/**
Get a writing mode for the i-th glyph
@@ -186,6 +192,22 @@ public:
*/
std::vector<text_box> text_list() const;
+ /*
+ * text_list_option_enum is a set of bitmask-style flags for text_list();
+ * 0 means the default & simplest behaviour.
+ */
+ enum text_list_option_enum {
+ text_list_include_font = 1 // \since 0.89
+ };
+
+ /**
+ Extended version of text_list() taking an option flag.
+ The option flag should be a bitwise OR of text_list_option_enum values (or 0).
+
+ \since 0.89
+ */
+ std::vector<text_box> text_list(int opt_flag) const;
+
private:
page(document_private *doc, int index);
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 83e46319..4ec159a8 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -73,23 +73,17 @@ void delete_all(const Collection &c)
}
class font_info;
-struct text_box_data
+struct text_box_font_info_data
{
- ~text_box_data();
-
- ustring text;
- rectf bbox;
- int rotation;
- std::vector<rectf> char_bboxes;
- bool has_space_after;
+ ~text_box_font_info_data();
- std::vector<text_box::writing_mode_enum> wmodes;
double font_size;
+ std::vector<text_box::writing_mode_enum> wmodes;
/*
* a duplication of the font_info_cache created by the
* poppler::font_iterator and owned by the poppler::page
- * object. Its lifetime might differ from that of text_box
+ * object. Its lifetime might differ from that of text_box
* object (think about collecting all text_box objects
* from all pages), so we have to duplicate it into all
* text_box instances.
@@ -97,7 +91,7 @@ struct text_box_data
std::vector<font_info> font_info_cache;
/*
- * a std::vector from the glyph index in the current
+ * a std::vector from the glyph index in the owner
* text_box to the font_info index in font_info_cache.
* The "-1" means no corresponding fonts found in the
* cache.
@@ -105,6 +99,20 @@ struct text_box_data
std::vector<int> glyph_to_cache_index;
};
+class font_info;
+struct text_box_data
+{
+ ~text_box_data();
+
+ ustring text;
+ rectf bbox;
+ int rotation;
+ std::vector<rectf> char_bboxes;
+ bool has_space_after;
+
+ std::unique_ptr<text_box_font_info_data> text_box_font;
+};
+
}
#endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 7864979e..ef391d78 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -60,6 +60,7 @@ bool show_help = false;
bool show_version = false;
char show_text[32];
bool show_text_list = false;
+bool show_text_list_with_font = false;
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
static const ArgDesc the_args[] = {
@@ -85,6 +86,8 @@ static const ArgDesc the_args[] = {
"show text (physical|raw|none) extracted from all pages" },
{ "--show-text-list", argFlag, &show_text_list, 0,
"show text list (experimental)" },
+ { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0,
+ "show text list with font info (experimental)" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
@@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p)
std::cout << std::endl;
}
-static void print_page_text_list(poppler::page *p)
+static void print_page_text_list(poppler::page *p, int opt_flag = 0)
{
if (!p) {
std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
std::cout << std::endl;
return;
}
- auto text_list = p->text_list();
+ auto text_list = p->text_list(opt_flag);
std::cout << "---" << std::endl;
for (const poppler::text_box &text : text_list) {
@@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p)
std::string font_name = text.get_font_name();
std::cout << "[" << ustr << "] @ ";
std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
- std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
+ if (text.has_font_info())
+ std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
std::cout << std::endl;
-
}
std::cout << "---" << std::endl;
}
@@ -538,12 +541,15 @@ int main(int argc, char *argv[])
print_page_text(p.get());
}
}
- if (show_text_list) {
+ if (show_text_list || show_text_list_with_font) {
const int pages = doc->pages();
for (int i = 0; i < pages; ++i) {
std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
std::unique_ptr<poppler::page> p(doc->create_page(i));
- print_page_text_list(p.get());
+ if (show_text_list_with_font)
+ print_page_text_list(p.get(), poppler::page::text_list_include_font);
+ else
+ print_page_text_list(p.get(), 0);
}
}