summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Berland <ken@hero.com>2010-10-17 14:24:16 +0100
committerAlbert Astals Cid <aacid@kde.org>2010-10-17 16:15:11 +0100
commit3b45c66de91243d7419657512cf726c9e1c26dfb (patch)
treef84d8d72f13923869ba88cf7b4123d1853489de1
parent71ec4c7a6715a40d536d1e4b911dae48d1bba7b9 (diff)
Add -bbox option to pdftotext
-rw-r--r--utils/pdftotext.14
-rw-r--r--utils/pdftotext.cc125
2 files changed, 103 insertions, 26 deletions
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 4e3cb0b7..eb3aa8e7 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -63,6 +63,10 @@ Generate a simple HTML file, including the meta information. This
simply wraps the text in <pre> and </pre> and prepends the meta
headers.
.TP
+.B \-bbox
+Generate an XHTML file containing bounding box information for each
+word in the file.
+.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. This defaults to "UTF-8".
.TP
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index d4e004ba..b189e846 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -19,6 +19,7 @@
// Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2009 Jan Jockusch <jan@jockusch.de>
// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
+// Copyright (C) 2010 Kenneth Berland <ken@hero.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -49,6 +50,7 @@
#include "CharTypes.h"
#include "UnicodeMap.h"
#include "Error.h"
+#include <string>
static void printInfoString(FILE *f, Dict *infoDict, char *key,
char *text1, char *text2, UnicodeMap *uMap);
@@ -61,6 +63,7 @@ static int x = 0;
static int y = 0;
static int w = 0;
static int h = 0;
+static GBool bbox = gFalse;
static GBool physLayout = gFalse;
static GBool rawOrder = gFalse;
static GBool htmlMeta = gFalse;
@@ -103,6 +106,8 @@ static const ArgDesc argDesc[] = {
"output end-of-line convention (unix, dos, or mac)"},
{"-nopgbrk", argFlag, &noPageBreaks, 0,
"don't insert page breaks between pages"},
+ {"-bbox", argFlag, &bbox, 0,
+ "output bounding box for each word and page size to html. Sets -htmlmeta"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
@@ -122,6 +127,30 @@ static const ArgDesc argDesc[] = {
{NULL}
};
+static std::string myStringReplace(const std::string &inString, const std::string &oldToken, const std::string &newToken) {
+ std::string result = inString;
+ size_t foundLoc;
+ int advance = 0;
+ do {
+ foundLoc = result.find(oldToken, advance);
+ if (foundLoc != std::string::npos){
+ result.replace(foundLoc, oldToken.length(), newToken);
+ advance = foundLoc + newToken.length();
+ }
+ } while (foundLoc != std::string::npos );
+ return result;
+}
+
+static std::string myXmlTokenReplace(const char *inString){
+ std::string myString(inString);
+ myString = myStringReplace(myString, "&", "&amp;" );
+ myString = myStringReplace(myString, "'", "&apos;" );
+ myString = myStringReplace(myString, "\"", "&quot;" );
+ myString = myStringReplace(myString, "<", "&lt;" );
+ myString = myStringReplace(myString, ">", "&gt;" );
+ return myString;
+}
+
int main(int argc, char *argv[]) {
PDFDoc *doc;
GooString *fileName;
@@ -139,6 +168,9 @@ int main(int argc, char *argv[]) {
// parse args
ok = parseArgs(argDesc, &argc, argv);
+ if (bbox) {
+ htmlMeta = gTrue;
+ }
if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
fprintf(stderr, "%s\n", popplerCopyright);
@@ -257,57 +289,98 @@ int main(int argc, char *argv[]) {
goto err3;
}
}
- fputs("<html>\n", f);
+ fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", f);
+ fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n", f);
fputs("<head>\n", f);
doc->getDocInfo(&info);
if (info.isDict()) {
- printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
- uMap);
+ Object obj;
+ if (info.getDict()->lookup("Title", &obj)->isString()) {
+ printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n", uMap);
+ } else {
+ fputs("<title></title>\n", f);
+ }
+ obj.free();
printInfoString(f, info.getDict(), "Subject",
- "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+ "<meta name=\"Subject\" content=\"", "\"/>\n", uMap);
printInfoString(f, info.getDict(), "Keywords",
- "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+ "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap);
printInfoString(f, info.getDict(), "Author",
- "<meta name=\"Author\" content=\"", "\">\n", uMap);
+ "<meta name=\"Author\" content=\"", "\"/>\n", uMap);
printInfoString(f, info.getDict(), "Creator",
- "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+ "<meta name=\"Creator\" content=\"", "\"/>\n", uMap);
printInfoString(f, info.getDict(), "Producer",
- "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+ "<meta name=\"Producer\" content=\"", "\"/>\n", uMap);
printInfoDate(f, info.getDict(), "CreationDate",
- "<meta name=\"CreationDate\" content=\"\">\n");
+ "<meta name=\"CreationDate\" content=\"\"/>\n");
printInfoDate(f, info.getDict(), "LastModifiedDate",
- "<meta name=\"ModDate\" content=\"\">\n");
+ "<meta name=\"ModDate\" content=\"\"/>\n");
}
info.free();
fputs("</head>\n", f);
fputs("<body>\n", f);
- fputs("<pre>\n", f);
+ if (!bbox) fputs("<pre>\n", f);
if (f != stdout) {
fclose(f);
}
}
// write text file
- textOut = new TextOutputDev(textFileName->getCString(),
- physLayout, rawOrder, htmlMeta);
- if (textOut->isOk()) {
- if ((w==0) && (h==0) && (x==0) && (y==0)) {
- doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
- gTrue, gFalse, gFalse);
- } else {
- int page;
-
- for (page = firstPage; page <= lastPage; ++page) {
- doc->displayPageSlice(textOut, page, resolution, resolution, 0,
+ if (bbox) {
+ textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
+ if (!(f = fopen(textFileName->getCString(), "ab"))) {
+ error(-1, "Couldn't open text file '%s' for append", textFileName->getCString());
+ exitCode = 2;
+ delete textOut;
+ goto err3;
+ }
+
+ if (textOut->isOk()) {
+ fprintf(f, "<doc>\n");
+ for (int page = firstPage; page <= lastPage; ++page) {
+ fprintf(f, " <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page));
+ doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
+ TextWordList *wordlist = textOut->makeWordList();
+ const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
+ TextWord *word;
+ double xMinA, yMinA, xMaxA, yMaxA;
+ if (word_length == 0)
+ fprintf(stderr, "no word list\n");
+
+ for (int i = 0; i < word_length; ++i) {
+ word = wordlist->get(i);
+ word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
+ const std::string myString = myXmlTokenReplace(word->getText()->getCString());
+ fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
+ }
+ fprintf(f, " </page>\n");
+ delete wordlist;
+ }
+ fprintf(f, "</doc>\n");
+ }
+ fclose(f);
+ delete textOut;
+ } else {
+ textOut = new TextOutputDev(textFileName->getCString(),
+ physLayout, rawOrder, htmlMeta);
+ if (textOut->isOk()) {
+ if ((w==0) && (h==0) && (x==0) && (y==0)) {
+ doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
+ gTrue, gFalse, gFalse);
+ } else {
+
+ for (int page = firstPage; page <= lastPage; ++page) {
+ doc->displayPageSlice(textOut, page, resolution, resolution, 0,
gTrue, gFalse, gFalse,
x, y, w, h);
- }
- }
+ }
+ }
- } else {
+ } else {
delete textOut;
exitCode = 2;
goto err3;
+ }
}
delete textOut;
@@ -322,7 +395,7 @@ int main(int argc, char *argv[]) {
goto err3;
}
}
- fputs("</pre>\n", f);
+ if (!bbox) fputs("</pre>\n", f);
fputs("</body>\n", f);
fputs("</html>\n", f);
if (f != stdout) {