diff options
author | Kenneth Berland <ken@hero.com> | 2010-10-17 14:24:16 +0100 |
---|---|---|
committer | Albert Astals Cid <aacid@kde.org> | 2010-10-17 16:15:11 +0100 |
commit | 3b45c66de91243d7419657512cf726c9e1c26dfb (patch) | |
tree | f84d8d72f13923869ba88cf7b4123d1853489de1 | |
parent | 71ec4c7a6715a40d536d1e4b911dae48d1bba7b9 (diff) |
Add -bbox option to pdftotext
-rw-r--r-- | utils/pdftotext.1 | 4 | ||||
-rw-r--r-- | utils/pdftotext.cc | 125 |
2 files changed, 103 insertions, 26 deletions
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1 index 4e3cb0b7..eb3aa8e7 100644 --- a/utils/pdftotext.1 +++ b/utils/pdftotext.1 @@ -63,6 +63,10 @@ Generate a simple HTML file, including the meta information. This simply wraps the text in <pre> and </pre> and prepends the meta headers. .TP +.B \-bbox +Generate an XHTML file containing bounding box information for each +word in the file. +.TP .BI \-enc " encoding-name" Sets the encoding to use for text output. This defaults to "UTF-8". .TP diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index d4e004ba..b189e846 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -19,6 +19,7 @@ // Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid@kde.org> // Copyright (C) 2009 Jan Jockusch <jan@jockusch.de> // Copyright (C) 2010 Hib Eris <hib@hiberis.nl> +// Copyright (C) 2010 Kenneth Berland <ken@hero.com> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -49,6 +50,7 @@ #include "CharTypes.h" #include "UnicodeMap.h" #include "Error.h" +#include <string> static void printInfoString(FILE *f, Dict *infoDict, char *key, char *text1, char *text2, UnicodeMap *uMap); @@ -61,6 +63,7 @@ static int x = 0; static int y = 0; static int w = 0; static int h = 0; +static GBool bbox = gFalse; static GBool physLayout = gFalse; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; @@ -103,6 +106,8 @@ static const ArgDesc argDesc[] = { "output end-of-line convention (unix, dos, or mac)"}, {"-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages"}, + {"-bbox", argFlag, &bbox, 0, + "output bounding box for each word and page size to html. 
Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -122,6 +127,30 @@ static const ArgDesc argDesc[] = { {NULL} }; +static std::string myStringReplace(const std::string &inString, const std::string &oldToken, const std::string &newToken) { + std::string result = inString; + size_t foundLoc; + int advance = 0; + do { + foundLoc = result.find(oldToken, advance); + if (foundLoc != std::string::npos){ + result.replace(foundLoc, oldToken.length(), newToken); + advance = foundLoc + newToken.length(); + } + } while (foundLoc != std::string::npos ); + return result; +} + +static std::string myXmlTokenReplace(const char *inString){ + std::string myString(inString); + myString = myStringReplace(myString, "&", "&amp;" ); + myString = myStringReplace(myString, "'", "&apos;" ); + myString = myStringReplace(myString, "\"", "&quot;" ); + myString = myStringReplace(myString, "<", "&lt;" ); + myString = myStringReplace(myString, ">", "&gt;" ); + return myString; +} + int main(int argc, char *argv[]) { PDFDoc *doc; GooString *fileName; @@ -139,6 +168,9 @@ int main(int argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); + if (bbox) { + htmlMeta = gTrue; + } if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) { fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION); fprintf(stderr, "%s\n", popplerCopyright); @@ -257,57 +289,98 @@ int main(int argc, char *argv[]) { goto err3; } } - fputs("<html>\n", f); + fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", f); + fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n", f); fputs("<head>\n", f); doc->getDocInfo(&info); if (info.isDict()) { - printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n", - uMap); + Object obj; + if (info.getDict()->lookup("Title", &obj)->isString()) { + 
printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n", uMap); + } else { + fputs("<title></title>\n", f); + } + obj.free(); printInfoString(f, info.getDict(), "Subject", - "<meta name=\"Subject\" content=\"", "\">\n", uMap); + "<meta name=\"Subject\" content=\"", "\"/>\n", uMap); printInfoString(f, info.getDict(), "Keywords", - "<meta name=\"Keywords\" content=\"", "\">\n", uMap); + "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap); printInfoString(f, info.getDict(), "Author", - "<meta name=\"Author\" content=\"", "\">\n", uMap); + "<meta name=\"Author\" content=\"", "\"/>\n", uMap); printInfoString(f, info.getDict(), "Creator", - "<meta name=\"Creator\" content=\"", "\">\n", uMap); + "<meta name=\"Creator\" content=\"", "\"/>\n", uMap); printInfoString(f, info.getDict(), "Producer", - "<meta name=\"Producer\" content=\"", "\">\n", uMap); + "<meta name=\"Producer\" content=\"", "\"/>\n", uMap); printInfoDate(f, info.getDict(), "CreationDate", - "<meta name=\"CreationDate\" content=\"\">\n"); + "<meta name=\"CreationDate\" content=\"\"/>\n"); printInfoDate(f, info.getDict(), "LastModifiedDate", - "<meta name=\"ModDate\" content=\"\">\n"); + "<meta name=\"ModDate\" content=\"\"/>\n"); } info.free(); fputs("</head>\n", f); fputs("<body>\n", f); - fputs("<pre>\n", f); + if (!bbox) fputs("<pre>\n", f); if (f != stdout) { fclose(f); } } // write text file - textOut = new TextOutputDev(textFileName->getCString(), - physLayout, rawOrder, htmlMeta); - if (textOut->isOk()) { - if ((w==0) && (h==0) && (x==0) && (y==0)) { - doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, - gTrue, gFalse, gFalse); - } else { - int page; - - for (page = firstPage; page <= lastPage; ++page) { - doc->displayPageSlice(textOut, page, resolution, resolution, 0, + if (bbox) { + textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta); + if (!(f = fopen(textFileName->getCString(), "ab"))) { + error(-1, "Couldn't open text file '%s' for append", 
textFileName->getCString()); + exitCode = 2; + delete textOut; + goto err3; + } + + if (textOut->isOk()) { + fprintf(f, "<doc>\n"); + for (int page = firstPage; page <= lastPage; ++page) { + fprintf(f, " <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page)); + doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse); + TextWordList *wordlist = textOut->makeWordList(); + const int word_length = wordlist != NULL ? wordlist->getLength() : 0; + TextWord *word; + double xMinA, yMinA, xMaxA, yMaxA; + if (word_length == 0) + fprintf(stderr, "no word list\n"); + + for (int i = 0; i < word_length; ++i) { + word = wordlist->get(i); + word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA); + const std::string myString = myXmlTokenReplace(word->getText()->getCString()); + fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str()); + } + fprintf(f, " </page>\n"); + delete wordlist; + } + fprintf(f, "</doc>\n"); + } + fclose(f); + delete textOut; + } else { + textOut = new TextOutputDev(textFileName->getCString(), + physLayout, rawOrder, htmlMeta); + if (textOut->isOk()) { + if ((w==0) && (h==0) && (x==0) && (y==0)) { + doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, + gTrue, gFalse, gFalse); + } else { + + for (int page = firstPage; page <= lastPage; ++page) { + doc->displayPageSlice(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse, x, y, w, h); - } - } + } + } - } else { + } else { delete textOut; exitCode = 2; goto err3; + } } delete textOut; @@ -322,7 +395,7 @@ int main(int argc, char *argv[]) { goto err3; } } - fputs("</pre>\n", f); + if (!bbox) fputs("</pre>\n", f); fputs("</body>\n", f); fputs("</html>\n", f); if (f != stdout) { |