summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog31
-rw-r--r--Makefile.am6
-rw-r--r--configure.ac25
-rw-r--r--goo/GooVector.h101
-rw-r--r--utils/HtmlFonts.cc326
-rw-r--r--utils/HtmlFonts.h85
-rw-r--r--utils/HtmlLinks.cc101
-rw-r--r--utils/HtmlLinks.h49
-rw-r--r--utils/HtmlOutputDev.cc1569
-rw-r--r--utils/HtmlOutputDev.h302
-rw-r--r--utils/ImageOutputDev.cc195
-rw-r--r--utils/ImageOutputDev.h76
-rw-r--r--utils/Makefile.am18
-rw-r--r--utils/parseargs.c190
-rw-r--r--utils/parseargs.h71
-rw-r--r--utils/pdffonts.1128
-rw-r--r--utils/pdffonts.cc294
-rw-r--r--utils/pdfimages.196
-rw-r--r--utils/pdfimages.cc159
-rw-r--r--utils/pdfinfo.1157
-rw-r--r--utils/pdfinfo.cc376
-rw-r--r--utils/pdftohtml.185
-rw-r--r--utils/pdftohtml.cc429
-rw-r--r--utils/pdftoppm.1113
-rw-r--r--utils/pdftoppm.cc189
-rw-r--r--utils/pdftops.1224
-rw-r--r--utils/pdftops.cc336
-rw-r--r--utils/pdftotext.1135
-rw-r--r--utils/pdftotext.cc337
29 files changed, 6194 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index 97f611a6..a556ecdc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,34 @@
+2005-12-12 Kristian Høgsberg <krh@redhat.com>
+
+ * Makefile.am:
+ * configure.ac:
+ * goo/GooVector.h:
+ * utils/HtmlFonts.cc:
+ * utils/HtmlFonts.h:
+ * utils/HtmlLinks.cc:
+ * utils/HtmlLinks.h:
+ * utils/HtmlOutputDev.cc:
+ * utils/HtmlOutputDev.h:
+ * utils/ImageOutputDev.cc:
+ * utils/ImageOutputDev.h:
+ * utils/Makefile.am:
+ * utils/parseargs.c:
+ * utils/parseargs.h:
+ * utils/pdffonts.1:
+ * utils/pdffonts.cc:
+ * utils/pdfimages.1:
+ * utils/pdfimages.cc:
+ * utils/pdfinfo.1:
+ * utils/pdfinfo.cc:
+ * utils/pdftohtml.1:
+ * utils/pdftohtml.cc:
+ * utils/pdftoppm.1:
+ * utils/pdftoppm.cc:
+ * utils/pdftops.1:
+ * utils/pdftops.cc:
+ * utils/pdftotext.1:
+ * utils/pdftotext.cc: Add command line utilities from xpdf.
+
2005-12-10 Albert Astals Cid <aacid@kde.org>
* qt4/src/poppler-page.cc:
diff --git a/Makefile.am b/Makefile.am
index a8d53613..9bcf97e8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,7 +22,11 @@ qt4_subdir = qt4
qt4_pc_file = poppler-qt4.pc
endif
-SUBDIRS = goo fofi $(splash_subdir) poppler $(glib_subdir) $(qt_subdir) test $(qt4_subdir)
+if BUILD_UTILS
+utils_subdir = utils
+endif
+
+SUBDIRS = goo fofi $(splash_subdir) poppler $(utils_subdir) $(glib_subdir) $(qt_subdir) test $(qt4_subdir)
EXTRA_DIST = \
README-XPDF \
diff --git a/configure.ac b/configure.ac
index e7f6ee36..b7b5961b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -479,6 +479,13 @@ fi
AM_CONDITIONAL(BUILD_GTK_TEST, test x$enable_gtk_test = xyes)
+AC_ARG_ENABLE(utils,
+ AC_HELP_STRING([--disable-utils],
+ [Don't compile poppler command line utils.]),
+ enable_utils=$enableval,
+ enable_utils="yes")
+AM_CONDITIONAL(BUILD_UTILS, test x$enable_utils = xyes)
+
AC_ARG_ENABLE(compile-warnings,
AC_HELP_STRING([--enable-compile-warnings=@<:@no/yes/kde@:>@]
[Turn on compiler warnings.]),,
@@ -505,6 +512,7 @@ goo/Makefile
fofi/Makefile
splash/Makefile
poppler/Makefile
+utils/Makefile
glib/Makefile
test/Makefile
qt/Makefile
@@ -521,11 +529,12 @@ poppler-qt4.pc])
echo ""
echo "Building poppler with support for:"
-echo " splash output: $enable_splash_output"
-echo " cairo output: $enable_cairo_output"
-echo " qt wrapper: $enable_poppler_qt"
-echo " qt4 wrapper: $enable_poppler_qt4"
-echo " qt4 unittests: $enable_poppler_qt4testlib"
-echo " glib wrapper: $enable_poppler_glib"
-echo " use libjpeg: $enable_libjpeg"
-echo " use zlib: $enable_zlib"
+echo " splash output: $enable_splash_output"
+echo " cairo output: $enable_cairo_output"
+echo " qt wrapper: $enable_poppler_qt"
+echo " qt4 wrapper: $enable_poppler_qt4"
+echo " qt4 unittests: $enable_poppler_qt4testlib"
+echo " glib wrapper: $enable_poppler_glib"
+echo " use libjpeg: $enable_libjpeg"
+echo " use zlib: $enable_zlib"
+echo " command line utils: $enable_utils"
diff --git a/goo/GooVector.h b/goo/GooVector.h
new file mode 100644
index 00000000..3cd551b9
--- /dev/null
+++ b/goo/GooVector.h
@@ -0,0 +1,101 @@
+#ifndef _VECTOR_H
+#define _VECTOR_H
+#include "goo/gtypes.h"
+
+
+// GooVector<T>: a minimal growable array exposing raw-pointer iterators.
+//
+// T must be default-constructible and assignable: storage is allocated with
+// new T[] and elements are copied by assignment.  The class defines no copy
+// constructor or assignment operator, so a copied GooVector would share (and
+// double-delete) its storage; hold it by pointer or reference instead.
+template<class T>
+class GooVector{
+private:
+
+  int _size;     // allocated capacity, in elements
+  T* last;       // one past the last stored element
+  T* storage;    // start of the allocation; 0 until the first push_back
+
+  // Grow the allocation (2 slots at first, doubling afterwards) and carry the
+  // stored elements over to the new array.
+  void resize(){
+    if (_size==0) _size=2;else _size=2*_size;
+    T *tmp=new T[_size];
+    if (storage){
+      last=copy(storage,last,tmp);
+      delete [] storage;
+    }
+    else last=tmp;
+    storage=tmp;
+  }
+
+  // Assign the range [src1, scr2) onto dest; returns one past the last
+  // element written.
+  T* copy(T* src1,T* scr2,T* dest){
+    T* tmp=src1;
+    T* d=dest;
+    while(tmp!=scr2){
+      *d=*tmp;
+      d++;tmp++;
+    }
+    return d;
+  }
+
+public:
+  typedef T* iterator;
+
+  GooVector(){
+    _size=0;
+    last=0;
+    storage=0;
+  }
+
+  virtual ~GooVector(){
+    delete[] storage ;
+  }
+
+  // Forget all elements but keep the allocation for reuse.
+  void reset(){
+    last=storage;
+  }
+
+  // Number of stored elements.
+  int size() const{
+    return (last-storage);
+  }
+
+  // Append a copy of elem, growing the allocation when it is full.
+  void push_back(const T& elem){
+    if (!storage||(size() >=_size)) resize();
+    *last=elem;
+    last++;
+  }
+
+  // Remove and return the last element.  An empty vector has nothing to
+  // return (and before the first push_back there is no storage at all), so a
+  // default-constructed T is returned instead of reading through `last`.
+  T pop_back() {
+    if (last==storage) return T();
+    last--;
+    return *last;
+  }
+
+  // Unchecked element access; returns a copy of element i.
+  T operator[](unsigned int i){
+    return *(storage+i);
+  }
+
+  GBool isEmpty() const{
+    return !_size || (last==storage) ;
+  }
+
+  iterator begin() const{
+    return storage;
+  }
+
+  iterator end() const {
+    return last;
+  }
+};
+#endif
+
+
+
+
+
+
+
+
+
diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc
new file mode 100644
index 00000000..c77683b0
--- /dev/null
+++ b/utils/HtmlFonts.cc
@@ -0,0 +1,326 @@
+#include "HtmlFonts.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include <stdio.h>
+
+ struct Fonts{
+ char *Fontname;
+ char *name;
+ };
+
+const int font_num=13;
+
+static Fonts fonts[font_num+1]={
+ {"Courier", "Courier" },
+ {"Courier-Bold", "Courier"},
+ {"Courier-BoldOblique", "Courier"},
+ {"Courier-Oblique", "Courier"},
+ {"Helvetica", "Helvetica"},
+ {"Helvetica-Bold", "Helvetica"},
+ {"Helvetica-BoldOblique", "Helvetica"},
+ {"Helvetica-Oblique", "Helvetica"},
+ {"Symbol", "Symbol" },
+ {"Times-Bold", "Times" },
+ {"Times-BoldItalic", "Times" },
+ {"Times-Italic", "Times" },
+ {"Times-Roman", "Times" },
+ {" " , "Times" },
+};
+
+#define xoutRound(x) ((int)(x + 0.5))
+extern GBool xml;
+
+GooString* HtmlFont::DefaultFont=new GooString("Times"); // Arial,Helvetica,sans-serif
+
+HtmlFontColor::HtmlFontColor(GfxRGB rgb){
+ r=static_cast<int>(255*rgb.r);
+ g=static_cast<int>(255*rgb.g);
+ b=static_cast<int>(255*rgb.b);
+ if (!(Ok(r)&&Ok(b)&&Ok(g))) {printf("Error : Bad color \n");r=0;g=0;b=0;}
+}
+
+GooString *HtmlFontColor::convtoX(unsigned int xcol) const{
+ GooString *xret=new GooString();
+ char tmp;
+ unsigned int k;
+ k = (xcol/16);
+ if ((k>=0)&&(k<10)) tmp=(char) ('0'+k); else tmp=(char)('a'+k-10);
+ xret->append(tmp);
+ k = (xcol%16);
+ if ((k>=0)&&(k<10)) tmp=(char) ('0'+k); else tmp=(char)('a'+k-10);
+ xret->append(tmp);
+ return xret;
+}
+
+GooString *HtmlFontColor::toString() const{
+ GooString *tmp=new GooString("#");
+ GooString *tmpr=convtoX(r);
+ GooString *tmpg=convtoX(g);
+ GooString *tmpb=convtoX(b);
+ tmp->append(tmpr);
+ tmp->append(tmpg);
+ tmp->append(tmpb);
+ delete tmpr;
+ delete tmpg;
+ delete tmpb;
+ return tmp;
+}
+
+// Build a font record from a PDF font name, a size and a fill colour.
+//
+// ftname may be NULL.  `pos` indexes the `fonts` table: the entry whose
+// Fontname matches ftname exactly, or font_num (the trailing catch-all entry)
+// when the name is unknown or absent.  Bold/italic are guessed from the
+// lower-cased name.
+HtmlFont::HtmlFont(GooString* ftname,int _size, GfxRGB rgb){
+  color=HtmlFontColor(rgb);
+
+  lineSize = -1;   // -1 means "no explicit line height" (see CSStyle)
+  size=(_size-1);
+  italic = gFalse;
+  bold = gFalse;
+  // Always give pos a valid table index: it used to stay uninitialised when
+  // ftname was NULL, yet getFontName() and isEqual*() read it.
+  pos = font_num;
+  FontName = NULL;
+
+  if (ftname){
+    FontName=new GooString(ftname);
+
+    // lowerCase() is called on a scratch copy (presumably because it changes
+    // the string in place); ftname itself is still compared unmodified below.
+    GooString *fontname = new GooString(ftname);
+    char *lower = fontname->lowerCase()->getCString();
+    if (strstr(lower,"bold")) bold=gTrue;
+    if (strstr(lower,"italic")||strstr(lower,"oblique")) italic=gTrue;
+    delete fontname;
+
+    // Check the bound before touching fonts[i].
+    int i=0;
+    while ((i<font_num)&&strcmp(ftname->getCString(),fonts[i].Fontname))
+      {
+        i++;
+      }
+    pos=i;
+  }
+  if (!DefaultFont) DefaultFont=new GooString(fonts[font_num].name);
+
+}
+
+// Copy constructor: copies every scalar field and deep-copies FontName.
+HtmlFont::HtmlFont(const HtmlFont& x){
+  size=x.size;
+  lineSize=x.lineSize;
+  italic=x.italic;
+  bold=x.bold;
+  pos=x.pos;
+  color=x.color;
+  // FontName must be set even when the source has none: the destructor
+  // deletes it whenever it is non-NULL, so it cannot be left uninitialised.
+  FontName = x.FontName ? new GooString(x.FontName) : NULL;
+ }
+
+
+HtmlFont::~HtmlFont(){
+ if (FontName) delete FontName;
+}
+
+// Assignment: copies every scalar field and replaces FontName with a deep
+// copy of x.FontName (or NULL when x has none).
+HtmlFont& HtmlFont::operator=(const HtmlFont& x){
+  if (this==&x) return *this;
+  size=x.size;
+  lineSize=x.lineSize;
+  italic=x.italic;
+  bold=x.bold;
+  pos=x.pos;
+  color=x.color;
+  delete FontName;
+  // Reset the pointer in every case: leaving the freed pointer in place when
+  // x.FontName is NULL would make the destructor delete it a second time.
+  FontName = x.FontName ? new GooString(x.FontName) : NULL;
+  return *this;
+}
+
+void HtmlFont::clear(){
+ if(DefaultFont) delete DefaultFont;
+ DefaultFont = NULL;
+}
+
+
+
+/*
+ This function is used to compare fonts uniquely for insertion into
+ the list of all encountered fonts
+*/
+GBool HtmlFont::isEqual(const HtmlFont& x) const{
+ return ((size==x.size) &&
+ (lineSize==x.lineSize) &&
+ (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
+ (color.isEqual(x.getColor())));
+}
+
+/*
+ This one is used to decide whether two pieces of text can be joined together
+ and therefore we don't care about bold/italics properties
+*/
+GBool HtmlFont::isEqualIgnoreBold(const HtmlFont& x) const{
+ return ((size==x.size) &&
+ (!strcmp(fonts[pos].name, fonts[x.pos].name)) &&
+ (color.isEqual(x.getColor())));
+}
+
+GooString* HtmlFont::getFontName(){
+ if (pos!=font_num) return new GooString(fonts[pos].name);
+ else return new GooString(DefaultFont);
+}
+
+GooString* HtmlFont::getFullName(){
+ if (FontName)
+ return new GooString(FontName);
+ else return new GooString(DefaultFont);
+}
+
+void HtmlFont::setDefaultFont(GooString* defaultFont){
+ if (DefaultFont) delete DefaultFont;
+ DefaultFont=new GooString(defaultFont);
+}
+
+
+GooString* HtmlFont::getDefaultFont(){
+ return DefaultFont;
+}
+
+// TODO: this method is plain wrong
+GooString* HtmlFont::HtmlFilter(Unicode* u, int uLen) {
+ GooString *tmp = new GooString();
+ UnicodeMap *uMap;
+ char buf[8];
+ int n;
+
+ // get the output encoding
+ if (!(uMap = globalParams->getTextEncoding())) {
+ return tmp;
+ }
+
+ for (int i = 0; i < uLen; ++i) {
+ switch (u[i])
+ {
+ case '"': tmp->append("&quot;"); break;
+ case '&': tmp->append("&amp;"); break;
+ case '<': tmp->append("&lt;"); break;
+ case '>': tmp->append("&gt;"); break;
+ default:
+ {
+ // convert unicode to string
+ if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf))) > 0) {
+ tmp->append(buf, n);
+ }
+ }
+ }
+ }
+
+ uMap->decRefCnt();
+ return tmp;
+}
+
+GooString* HtmlFont::simple(HtmlFont* font, Unicode* content, int uLen){
+ GooString *cont=HtmlFilter (content, uLen);
+
+ /*if (font.isBold()) {
+ cont->insert(0,"<b>",3);
+ cont->append("</b>",4);
+ }
+ if (font.isItalic()) {
+ cont->insert(0,"<i>",3);
+ cont->append("</i>",4);
+ } */
+
+ return cont;
+}
+
+HtmlFontAccu::HtmlFontAccu(){
+ accu=new GooVector<HtmlFont>();
+}
+
+HtmlFontAccu::~HtmlFontAccu(){
+ if (accu) delete accu;
+}
+
+int HtmlFontAccu::AddFont(const HtmlFont& font){
+ GooVector<HtmlFont>::iterator i;
+ for (i=accu->begin();i!=accu->end();i++)
+ {
+ if (font.isEqual(*i))
+ {
+ return (int)(i-(accu->begin()));
+ }
+ }
+
+ accu->push_back(font);
+ return (accu->size()-1);
+}
+
+// get CSS font name for font #i
+GooString* HtmlFontAccu::getCSStyle(int i, GooString* content){
+ GooString *tmp;
+ GooString *iStr=GooString::fromInt(i);
+
+ if (!xml) {
+ tmp = new GooString("<span class=\"ft");
+ tmp->append(iStr);
+ tmp->append("\">");
+ tmp->append(content);
+ tmp->append("</span>");
+ } else {
+ tmp = new GooString("");
+ tmp->append(content);
+ }
+
+ delete iStr;
+ return tmp;
+}
+
+// get CSS font definition for font #i
+GooString* HtmlFontAccu::CSStyle(int i){
+ GooString *tmp=new GooString();
+ GooString *iStr=GooString::fromInt(i);
+
+ GooVector<HtmlFont>::iterator g=accu->begin();
+ g+=i;
+ HtmlFont font=*g;
+ GooString *Size=GooString::fromInt(font.getSize());
+ GooString *colorStr=font.getColor().toString();
+ GooString *fontName=font.getFontName();
+ GooString *lSize;
+
+ if(!xml){
+ tmp->append(".ft");
+ tmp->append(iStr);
+ tmp->append("{font-size:");
+ tmp->append(Size);
+ if( font.getLineSize() != -1 )
+ {
+ lSize = GooString::fromInt(font.getLineSize());
+ tmp->append("px;line-height:");
+ tmp->append(lSize);
+ delete lSize;
+ }
+ tmp->append("px;font-family:");
+ tmp->append(fontName); //font.getFontName());
+ tmp->append(";color:");
+ tmp->append(colorStr);
+ tmp->append(";}");
+ }
+ if (xml) {
+ tmp->append("<fontspec id=\"");
+ tmp->append(iStr);
+ tmp->append("\" size=\"");
+ tmp->append(Size);
+ tmp->append("\" family=\"");
+ tmp->append(fontName); //font.getFontName());
+ tmp->append("\" color=\"");
+ tmp->append(colorStr);
+ tmp->append("\"/>");
+ }
+
+ delete fontName;
+ delete colorStr;
+ delete iStr;
+ delete Size;
+ return tmp;
+}
+
+
diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
new file mode 100644
index 00000000..3ff5b81a
--- /dev/null
+++ b/utils/HtmlFonts.h
@@ -0,0 +1,85 @@
+#ifndef _HTML_FONTS_H
+#define _HTML_FONTS_H
+#include "goo/GooVector.h"
+#include "goo/GooString.h"
+#include "GfxState.h"
+#include "CharTypes.h"
+
+
+class HtmlFontColor{
+ private:
+ unsigned int r;
+ unsigned int g;
+ unsigned int b;
+ GBool Ok(unsigned int xcol){ return ((xcol<=255)&&(xcol>=0));}
+ GooString *convtoX(unsigned int xcol) const;
+ public:
+ HtmlFontColor():r(0),g(0),b(0){}
+ HtmlFontColor(GfxRGB rgb);
+ HtmlFontColor(const HtmlFontColor& x){r=x.r;g=x.g;b=x.b;}
+ HtmlFontColor& operator=(const HtmlFontColor &x){
+ r=x.r;g=x.g;b=x.b;
+ return *this;
+ }
+ ~HtmlFontColor(){};
+ GooString* toString() const;
+ GBool isEqual(const HtmlFontColor& col) const{
+ return ((r==col.r)&&(g==col.g)&&(b==col.b));
+ }
+} ;
+
+
+class HtmlFont{
+ private:
+ unsigned int size;
+ int lineSize;
+ GBool italic;
+ GBool bold;
+ int pos; // position of the font name in the fonts array
+ static GooString *DefaultFont;
+ GooString *FontName;
+ HtmlFontColor color;
+ static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
+public:
+
+ HtmlFont(){FontName=NULL;};
+ HtmlFont(GooString* fontname,int _size, GfxRGB rgb);
+ HtmlFont(const HtmlFont& x);
+ HtmlFont& operator=(const HtmlFont& x);
+ HtmlFontColor getColor() const {return color;}
+ ~HtmlFont();
+ static void clear();
+ GooString* getFullName();
+ GBool isItalic() const {return italic;}
+ GBool isBold() const {return bold;}
+ unsigned int getSize() const {return size;}
+ int getLineSize() const {return lineSize;}
+ void setLineSize(int _lineSize) { lineSize = _lineSize; }
+ GooString* getFontName();
+ static GooString* getDefaultFont();
+ static void setDefaultFont(GooString* defaultFont);
+ GBool isEqual(const HtmlFont& x) const;
+ GBool isEqualIgnoreBold(const HtmlFont& x) const;
+ static GooString* simple(HtmlFont *font, Unicode *content, int uLen);
+ void print() const {printf("font: %s %d %s%spos: %d\n", FontName->getCString(), size, bold ? "bold " : "", italic ? "italic " : "", pos);};
+};
+
+class HtmlFontAccu{
+private:
+ GooVector<HtmlFont> *accu;
+
+public:
+ HtmlFontAccu();
+ ~HtmlFontAccu();
+ int AddFont(const HtmlFont& font);
+ HtmlFont* Get(int i){
+ GooVector<HtmlFont>::iterator g=accu->begin();
+ g+=i;
+ return g;
+ }
+ GooString* getCSStyle (int i, GooString* content);
+ GooString* CSStyle(int i);
+ int size() const {return accu->size();}
+
+};
+#endif
diff --git a/utils/HtmlLinks.cc b/utils/HtmlLinks.cc
new file mode 100644
index 00000000..3010be5e
--- /dev/null
+++ b/utils/HtmlLinks.cc
@@ -0,0 +1,101 @@
+#include "HtmlLinks.h"
+
+// Copy constructor: copies the rectangle and deep-copies the destination.
+HtmlLink::HtmlLink(const HtmlLink& x){
+  Xmin=x.Xmin;
+  Ymin=x.Ymin;
+  Xmax=x.Xmax;
+  Ymax=x.Ymax;
+  // A default-constructed link has dest == NULL; keep it NULL instead of
+  // handing a null pointer to the GooString copy constructor.
+  dest = x.dest ? new GooString(x.dest) : NULL;
+}
+
+HtmlLink::HtmlLink(double xmin,double ymin,double xmax,double ymax,GooString * _dest)
+{
+ if (xmin < xmax) {
+ Xmin=xmin;
+ Xmax=xmax;
+ } else {
+ Xmin=xmax;
+ Xmax=xmin;
+ }
+ if (ymin < ymax) {
+ Ymin=ymin;
+ Ymax=ymax;
+ } else {
+ Ymin=ymax;
+ Ymax=ymin;
+ }
+ dest=new GooString(_dest);
+}
+
+HtmlLink::~HtmlLink(){
+ if (dest) delete dest;
+}
+
+GBool HtmlLink::isEqualDest(const HtmlLink& x) const{
+ return (!strcmp(dest->getCString(), x.dest->getCString()));
+}
+
+GBool HtmlLink::inLink(double xmin,double ymin,double xmax,double ymax) const {
+ double y=(ymin+ymax)/2;
+ if (y>Ymax) return gFalse;
+ return (y>Ymin)&&(xmin<Xmax)&&(xmax>Xmin);
+ }
+
+
+// Assignment: copies the rectangle and replaces dest with a deep copy of
+// x.dest (or NULL when x has no destination).
+HtmlLink& HtmlLink::operator=(const HtmlLink& x){
+  if (this==&x) return *this;
+  if (dest) {delete dest;dest=NULL;}
+  Xmin=x.Xmin;
+  Ymin=x.Ymin;
+  Xmax=x.Xmax;
+  Ymax=x.Ymax;
+  // x may be a default-constructed link whose dest is NULL; do not pass a
+  // null pointer to the GooString copy constructor.
+  if (x.dest) dest=new GooString(x.dest);
+  return *this;
+}
+
+GooString* HtmlLink::getLinkStart() {
+ GooString *res = new GooString("<A href=\"");
+ res->append(dest);
+ res->append("\">");
+ return res;
+}
+
+/*GooString* HtmlLink::Link(GooString* content){
+ //GooString* _dest=new GooString(dest);
+ GooString *tmp=new GooString("<a href=\"");
+ tmp->append(dest);
+ tmp->append("\">");
+ tmp->append(content);
+ tmp->append("</a>");
+ //delete _dest;
+ return tmp;
+ }*/
+
+
+
+HtmlLinks::HtmlLinks(){
+ accu=new GooVector<HtmlLink>();
+}
+
+HtmlLinks::~HtmlLinks(){
+ delete accu;
+ accu=NULL;
+}
+
+GBool HtmlLinks::inLink(double xmin,double ymin,double xmax,double ymax,int& p)const {
+
+ for(GooVector<HtmlLink>::iterator i=accu->begin();i!=accu->end();i++){
+ if (i->inLink(xmin,ymin,xmax,ymax)) {
+ p=(i - accu->begin());
+ return 1;
+ }
+ }
+ return 0;
+}
+
+HtmlLink* HtmlLinks::getLink(int i) const{
+ GooVector<HtmlLink>::iterator g=accu->begin();
+ g+=i;
+ return g;
+}
+
diff --git a/utils/HtmlLinks.h b/utils/HtmlLinks.h
new file mode 100644
index 00000000..71f8065e
--- /dev/null
+++ b/utils/HtmlLinks.h
@@ -0,0 +1,49 @@
+#ifndef _HTML_LINKS
+#define _HTML_LINKS
+
+#include <stdlib.h>
+#include <string.h>
+#include "goo/GooVector.h"
+#include "goo/GooString.h"
+
+class HtmlLink{
+
+private:
+ double Xmin;
+ double Ymin;
+ double Xmax;
+ double Ymax;
+ GooString* dest;
+
+public:
+ HtmlLink(){dest=NULL;}
+ HtmlLink(const HtmlLink& x);
+ HtmlLink& operator=(const HtmlLink& x);
+ HtmlLink(double xmin,double ymin,double xmax,double ymax,GooString *_dest);
+ ~HtmlLink();
+ GBool isEqualDest(const HtmlLink& x) const;
+ GooString *getDest(){return new GooString(dest);}
+ double getX1() const {return Xmin;}
+ double getX2() const {return Xmax;}
+ double getY1() const {return Ymin;}
+ double getY2() const {return Ymax;}
+ GBool inLink(double xmin,double ymin,double xmax,double ymax) const ;
+ //GooString *Link(GooString *content);
+ GooString* getLinkStart();
+
+};
+
+class HtmlLinks{
+private:
+ GooVector<HtmlLink> *accu;
+public:
+ HtmlLinks();
+ ~HtmlLinks();
+ void AddLink(const HtmlLink& x) {accu->push_back(x);}
+ GBool inLink(double xmin,double ymin,double xmax,double ymax,int& p) const;
+ HtmlLink* getLink(int i) const;
+
+};
+
+#endif
+
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
new file mode 100644
index 00000000..fb8b66aa
--- /dev/null
+++ b/utils/HtmlOutputDev.cc
@@ -0,0 +1,1569 @@
+//========================================================================
+//
+// HtmlOutputDev.cc
+//
+// Copyright 1997-2002 Glyph & Cog, LLC
+//
+// Changed 1999-2000 by G.Ovtcharov
+//
+// Changed 2002 by Mikhail Kruk
+//
+//========================================================================
+
+#ifdef __GNUC__
+#pragma implementation
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <ctype.h>
+#include <math.h>
+#include "goo/GooString.h"
+#include "goo/GooList.h"
+#include "UnicodeMap.h"
+#include "goo/gmem.h"
+#include "config.h"
+#include "Error.h"
+#include "GfxState.h"
+#include "DCTStream.h"
+#include "GlobalParams.h"
+#include "HtmlOutputDev.h"
+#include "HtmlFonts.h"
+
+int HtmlPage::pgNum=0;
+int HtmlOutputDev::imgNum=1;
+
+extern double scale;
+extern GBool complexMode;
+extern GBool ignore;
+extern GBool printCommands;
+extern GBool printHtml;
+extern GBool noframes;
+extern GBool stout;
+extern GBool xml;
+extern GBool showHidden;
+extern GBool noMerge;
+
+// Return a newly allocated string holding the part of str after its last
+// SLASH, or a copy of the whole string when it contains no SLASH.
+static GooString* basename(GooString* str){
+
+  char *chars=str->getCString();
+  int n=str->getLength();
+  int idx=n-1;
+
+  // Scan backwards for the last separator.
+  while (idx>=0 && chars[idx]!=SLASH)
+    --idx;
+
+  if (idx<0)
+    return new GooString(str);
+  return new GooString(chars+idx+1,n-idx-1);
+}
+
+// Return a newly allocated string holding str up to and including its last
+// SLASH, or an empty string when it contains no SLASH.
+static GooString* Dirname(GooString* str){
+
+  char *chars=str->getCString();
+  int idx=str->getLength()-1;
+
+  // Scan backwards for the last separator.
+  while (idx>=0 && chars[idx]!=SLASH)
+    --idx;
+
+  if (idx<0)
+    return new GooString();
+  return new GooString(chars,idx+1);
+}
+
+//------------------------------------------------------------------------
+// HtmlString
+//------------------------------------------------------------------------
+
+HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts) {
+ GfxFont *font;
+ double x, y;
+
+ state->transform(state->getCurX(), state->getCurY(), &x, &y);
+ if ((font = state->getFont())) {
+ yMin = y - font->getAscent() * fontSize;
+ yMax = y - font->getDescent() * fontSize;
+ GfxRGB rgb;
+ state->getFillRGB(&rgb);
+ GooString *name = state->getFont()->getName();
+ if (!name) name = HtmlFont::getDefaultFont(); //new GooString("default");
+ HtmlFont hfont=HtmlFont(name, static_cast<int>(fontSize-1), rgb);
+ fontpos = fonts->AddFont(hfont);
+ } else {
+ // this means that the PDF file draws text without a current font,
+ // which should never happen
+ yMin = y - 0.95 * fontSize;
+ yMax = y + 0.35 * fontSize;
+ fontpos=0;
+ }
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ col = 0;
+ text = NULL;
+ xRight = NULL;
+ link = NULL;
+ len = size = 0;
+ yxNext = NULL;
+ xyNext = NULL;
+ htext=new GooString();
+ dir = textDirUnknown;
+}
+
+
+// Release the buffers owned by the string.
+HtmlString::~HtmlString() {
+  // text and xRight are both grown with grealloc(), so both are released
+  // with gfree(); releasing text with `delete` mismatched its allocator.
+  gfree(text);
+  delete htext;
+  gfree(xRight);
+}
+
+void HtmlString::addChar(GfxState *state, double x, double y,
+ double dx, double dy, Unicode u) {
+ if (dir == textDirUnknown) {
+ //dir = UnicodeMap::getDirection(u);
+ dir = textDirLeftRight;
+ }
+
+ if (len == size) {
+ size += 16;
+ text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+ xRight = (double *)grealloc(xRight, size * sizeof(double));
+ }
+ text[len] = u;
+ if (len == 0) {
+ xMin = x;
+ }
+ xMax = xRight[len] = x + dx;
+//printf("added char: %f %f xright = %f\n", x, dx, x+dx);
+ ++len;
+}
+
+void HtmlString::endString()
+{
+ if( dir == textDirRightLeft && len > 1 )
+ {
+ //printf("will reverse!\n");
+ for (int i = 0; i < len / 2; i++)
+ {
+ Unicode ch = text[i];
+ text[i] = text[len - i - 1];
+ text[len - i - 1] = ch;
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// HtmlPage
+//------------------------------------------------------------------------
+
+HtmlPage::HtmlPage(GBool rawOrder, char *imgExtVal) {
+ this->rawOrder = rawOrder;
+ curStr = NULL;
+ yxStrings = NULL;
+ xyStrings = NULL;
+ yxCur1 = yxCur2 = NULL;
+ fonts=new HtmlFontAccu();
+ links=new HtmlLinks();
+ pageWidth=0;
+ pageHeight=0;
+ fontsPageMarker = 0;
+ DocName=NULL;
+ firstPage = -1;
+ imgExt = new GooString(imgExtVal);
+}
+
+HtmlPage::~HtmlPage() {
+ clear();
+ if (DocName) delete DocName;
+ if (fonts) delete fonts;
+ if (links) delete links;
+ if (imgExt) delete imgExt;
+}
+
+void HtmlPage::updateFont(GfxState *state) {
+ GfxFont *font;
+ double *fm;
+ char *name;
+ int code;
+ double w;
+
+ // adjust the font size
+ fontSize = state->getTransformedFontSize();
+ if ((font = state->getFont()) && font->getType() == fontType3) {
+ // This is a hack which makes it possible to deal with some Type 3
+ // fonts. The problem is that it's impossible to know what the
+ // base coordinate system used in the font is without actually
+ // rendering the font. This code tries to guess by looking at the
+ // width of the character 'm' (which breaks if the font is a
+ // subset that doesn't contain 'm').
+ for (code = 0; code < 256; ++code) {
+ if ((name = ((Gfx8BitFont *)font)->getCharName(code)) &&
+ name[0] == 'm' && name[1] == '\0') {
+ break;
+ }
+ }
+ if (code < 256) {
+ w = ((Gfx8BitFont *)font)->getWidth(code);
+ if (w != 0) {
+ // 600 is a generic average 'm' width -- yes, this is a hack
+ fontSize *= w / 0.6;
+ }
+ }
+ fm = font->getFontMatrix();
+ if (fm[0] != 0) {
+ fontSize *= fabs(fm[3] / fm[0]);
+ }
+ }
+}
+
+void HtmlPage::beginString(GfxState *state, GooString *s) {
+ curStr = new HtmlString(state, fontSize, fonts);
+}
+
+
+void HtmlPage::conv(){
+ HtmlString *tmp;
+
+ int linkIndex = 0;
+ HtmlFont* h;
+ for(tmp=yxStrings;tmp;tmp=tmp->yxNext){
+ int pos=tmp->fontpos;
+ // printf("%d\n",pos);
+ h=fonts->Get(pos);
+
+ if (tmp->htext) delete tmp->htext;
+ tmp->htext=HtmlFont::simple(h,tmp->text,tmp->len);
+
+ if (links->inLink(tmp->xMin,tmp->yMin,tmp->xMax,tmp->yMax, linkIndex)){
+ tmp->link = links->getLink(linkIndex);
+ /*GooString *t=tmp->htext;
+ tmp->htext=links->getLink(k)->Link(tmp->htext);
+ delete t;*/
+ }
+ }
+
+}
+
+
+void HtmlPage::addChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double ox, double oy, Unicode *u, int uLen) {
+ double x1, y1, w1, h1, dx2, dy2;
+ int n, i;
+ state->transform(x, y, &x1, &y1);
+ n = curStr->len;
+
+ // check that new character is in the same direction as current string
+ // and is not too far away from it before adding
+ //if ((UnicodeMap::getDirection(u[0]) != curStr->dir) ||
+ // XXX
+ if (
+ (n > 0 &&
+ fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin))) {
+ endString();
+ beginString(state, NULL);
+ }
+ state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
+ 0, &dx2, &dy2);
+ dx -= dx2;
+ dy -= dy2;
+ state->transformDelta(dx, dy, &w1, &h1);
+ if (uLen != 0) {
+ w1 /= uLen;
+ h1 /= uLen;
+ }
+ for (i = 0; i < uLen; ++i) {
+ curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
+ }
+}
+
+void HtmlPage::endString() {
+ HtmlString *p1, *p2;
+ double h, y1, y2;
+
+ // throw away zero-length strings -- they don't have valid xMin/xMax
+ // values, and they're useless anyway
+ if (curStr->len == 0) {
+ delete curStr;
+ curStr = NULL;
+ return;
+ }
+
+ curStr->endString();
+
+#if 0 //~tmp
+ if (curStr->yMax - curStr->yMin > 20) {
+ delete curStr;
+ curStr = NULL;
+ return;
+ }
+#endif
+
+ // insert string in y-major list
+ h = curStr->yMax - curStr->yMin;
+ y1 = curStr->yMin + 0.5 * h;
+ y2 = curStr->yMin + 0.8 * h;
+ if (rawOrder) {
+ p1 = yxCur1;
+ p2 = NULL;
+ } else if ((!yxCur1 ||
+ (y1 >= yxCur1->yMin &&
+ (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) &&
+ (!yxCur2 ||
+ (y1 < yxCur2->yMin ||
+ (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
+ p1 = yxCur1;
+ p2 = yxCur2;
+ } else {
+ for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
+ if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin))
+ break;
+ }
+ yxCur2 = p2;
+ }
+ yxCur1 = curStr;
+ if (p1)
+ p1->yxNext = curStr;
+ else
+ yxStrings = curStr;
+ curStr->yxNext = p2;
+ curStr = NULL;
+}
+
+void HtmlPage::coalesce() {
+ HtmlString *str1, *str2;
+ HtmlFont *hfont1, *hfont2;
+ double space, horSpace, vertSpace, vertOverlap;
+ GBool addSpace, addLineBreak;
+ int n, i;
+ double curX, curY;
+
+#if 0 //~ for debugging
+ for (str1 = yxStrings; str1; str1 = str1->yxNext) {
+ printf("x=%f..%f y=%f..%f size=%2d '",
+ str1->xMin, str1->xMax, str1->yMin, str1->yMax,
+ (int)(str1->yMax - str1->yMin));
+ for (i = 0; i < str1->len; ++i) {
+ fputc(str1->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
+ str1 = yxStrings;
+
+ if( !str1 ) return;
+
+ //----- discard duplicated text (fake boldface, drop shadows)
+ if( !complexMode )
+ { /* if not in complex mode get rid of duplicate strings */
+ HtmlString *str3;
+ GBool found;
+ while (str1)
+ {
+ double size = str1->yMax - str1->yMin;
+ double xLimit = str1->xMin + size * 0.2;
+ found = gFalse;
+ for (str2 = str1, str3 = str1->yxNext;
+ str3 && str3->xMin < xLimit;
+ str2 = str3, str3 = str2->yxNext)
+ {
+ if (str3->len == str1->len &&
+ !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) &&
+ fabs(str3->yMin - str1->yMin) < size * 0.2 &&
+ fabs(str3->yMax - str1->yMax) < size * 0.2 &&
+ fabs(str3->xMax - str1->xMax) < size * 0.2)
+ {
+ found = gTrue;
+ //printf("found duplicate!\n");
+ break;
+ }
+ }
+ if (found)
+ {
+ str2->xyNext = str3->xyNext;
+ str2->yxNext = str3->yxNext;
+ delete str3;
+ }
+ else
+ {
+ str1 = str1->yxNext;
+ }
+ }
+ } /*- !complexMode */
+
+ str1 = yxStrings;
+
+ hfont1 = getFont(str1);
+ if( hfont1->isBold() )
+ str1->htext->insert(0,"<b>",3);
+ if( hfont1->isItalic() )
+ str1->htext->insert(0,"<i>",3);
+ if( str1->getLink() != NULL ) {
+ GooString *ls = str1->getLink()->getLinkStart();
+ str1->htext->insert(0, ls);
+ delete ls;
+ }
+ curX = str1->xMin; curY = str1->yMin;
+
+ while (str1 && (str2 = str1->yxNext)) {
+ hfont2 = getFont(str2);
+ space = str1->yMax - str1->yMin;
+ horSpace = str2->xMin - str1->xMax;
+ addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
+ vertSpace = str2->yMin - str1->yMax;
+
+//printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
+
+ if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
+ {
+ vertOverlap = str1->yMax - str2->yMin;
+ } else
+ if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
+ {
+ vertOverlap = str2->yMax - str1->yMin;
+ } else
+ {
+ vertOverlap = 0;
+ }
+
+ if (
+ (
+ (
+ (
+ (rawOrder && vertOverlap > 0.5 * space)
+ ||
+ (!rawOrder && str2->yMin < str1->yMax)
+ ) &&
+ (horSpace > -0.5 * space && horSpace < space)
+ ) ||
+ (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
+ ) &&
+ (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter
+ str1->dir == str2->dir // text direction the same
+ )
+ {
+// printf("yes\n");
+ n = str1->len + str2->len;
+ if ((addSpace = horSpace > 0.1 * space)) {
+ ++n;
+ }
+ if (addLineBreak) {
+ ++n;
+ }
+
+ str1->size = (n + 15) & ~15;
+ str1->text = (Unicode *)grealloc(str1->text,
+ str1->size * sizeof(Unicode));
+ str1->xRight = (double *)grealloc(str1->xRight,
+ str1->size * sizeof(double));
+ if (addSpace) {
+ str1->text[str1->len] = 0x20;
+ str1->htext->append(" ");
+ str1->xRight[str1->len] = str2->xMin;
+ ++str1->len;
+ }
+ if (addLineBreak) {
+ str1->text[str1->len] = '\n';
+ str1->htext->append("<br>");
+ str1->xRight[str1->len] = str2->xMin;
+ ++str1->len;
+ str1->yMin = str2->yMin;
+ str1->yMax = str2->yMax;
+ str1->xMax = str2->xMax;
+ int fontLineSize = hfont1->getLineSize();
+ int curLineSize = (int)(vertSpace + space);
+ if( curLineSize != fontLineSize )
+ {
+ HtmlFont *newfnt = new HtmlFont(*hfont1);
+ newfnt->setLineSize(curLineSize);
+ str1->fontpos = fonts->AddFont(*newfnt);
+ delete newfnt;
+ hfont1 = getFont(str1);
+ // we have to reget hfont2 because it's location could have
+ // changed on resize
+ hfont2 = getFont(str2);
+ }
+ }
+ for (i = 0; i < str2->len; ++i) {
+ str1->text[str1->len] = str2->text[i];
+ str1->xRight[str1->len] = str2->xRight[i];
+ ++str1->len;
+ }
+
+ /* fix <i> and <b> if str1 and str2 differ */
+ if( hfont1->isBold() && !hfont2->isBold() )
+ str1->htext->append("</b>", 4);
+ if( hfont1->isItalic() && !hfont2->isItalic() )
+ str1->htext->append("</i>", 4);
+ if( !hfont1->isBold() && hfont2->isBold() )
+ str1->htext->append("<b>", 3);
+ if( !hfont1->isItalic() && hfont2->isItalic() )
+ str1->htext->append("<i>", 3);
+
+ /* now handle switch of links */
+ HtmlLink *hlink1 = str1->getLink();
+ HtmlLink *hlink2 = str2->getLink();
+ if( !hlink1 || !hlink2 || !hlink1->isEqualDest(*hlink2) ) {
+ if(hlink1 != NULL )
+ str1->htext->append("</a>");
+ if(hlink2 != NULL ) {
+ GooString *ls = hlink2->getLinkStart();
+ str1->htext->append(ls);
+ delete ls;
+ }
+ }
+
+ str1->htext->append(str2->htext);
+ // str1 now contains href for link of str2 (if it is defined)
+ str1->link = str2->link;
+ hfont1 = hfont2;
+ if (str2->xMax > str1->xMax) {
+ str1->xMax = str2->xMax;
+ }
+ if (str2->yMax > str1->yMax) {
+ str1->yMax = str2->yMax;
+ }
+ str1->yxNext = str2->yxNext;
+ delete str2;
+ } else { // keep strings separate
+// printf("no\n");
+ if( hfont1->isBold() )
+ str1->htext->append("</b>",4);
+ if( hfont1->isItalic() )
+ str1->htext->append("</i>",4);
+ if(str1->getLink() != NULL )
+ str1->htext->append("</a>");
+
+ str1->xMin = curX; str1->yMin = curY;
+ str1 = str2;
+ curX = str1->xMin; curY = str1->yMin;
+ hfont1 = hfont2;
+ if( hfont1->isBold() )
+ str1->htext->insert(0,"<b>",3);
+ if( hfont1->isItalic() )
+ str1->htext->insert(0,"<i>",3);
+ if( str1->getLink() != NULL ) {
+ GooString *ls = str1->getLink()->getLinkStart();
+ str1->htext->insert(0, ls);
+ delete ls;
+ }
+ }
+ }
+ str1->xMin = curX; str1->yMin = curY;
+ if( hfont1->isBold() )
+ str1->htext->append("</b>",4);
+ if( hfont1->isItalic() )
+ str1->htext->append("</i>",4);
+ if(str1->getLink() != NULL )
+ str1->htext->append("</a>");
+
+#if 0 //~ for debugging
+ for (str1 = yxStrings; str1; str1 = str1->yxNext) {
+ printf("x=%3d..%3d y=%3d..%3d size=%2d ",
+ (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
+ (int)(str1->yMax - str1->yMin));
+ printf("'%s'\n", str1->htext->getCString());
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
+
+}
+
+// Write the current page to <f> as a pdf2xml <page> element: first the
+// font specifications added since fontsPageMarker, then one <text>
+// element per string on the yxStrings list.
+//
+//   f    - already opened output stream
+//   page - page number written into the "number" attribute
+void HtmlPage::dumpAsXML(FILE* f,int page){
+  fprintf(f, "<page number=\"%d\" position=\"absolute\"", page);
+  fprintf(f," top=\"0\" left=\"0\" height=\"%d\" width=\"%d\">\n", pageHeight,pageWidth);
+
+  for(int i=fontsPageMarker;i < fonts->size();i++) {
+    GooString *fontCSStyle = fonts->CSStyle(i);
+    fprintf(f,"\t%s\n",fontCSStyle->getCString());
+    delete fontCSStyle;
+  }
+
+  for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
+    if (!tmp->htext)
+      continue;
+
+    GooString *raw=new GooString(tmp->htext);
+    fprintf(f,"<text top=\"%d\" left=\"%d\" ",xoutRound(tmp->yMin),xoutRound(tmp->xMin));
+    fprintf(f,"width=\"%d\" height=\"%d\" ",xoutRound(tmp->xMax-tmp->xMin),xoutRound(tmp->yMax-tmp->yMin));
+    fprintf(f,"font=\"%d\">", tmp->fontpos);
+
+    // A string without a font (fontpos == -1) has no styled form; fall back
+    // to the raw text.  The styled pointer used to be left uninitialized
+    // (or dangling from the previous iteration) in that case and was then
+    // both printed and deleted.
+    GooString *styled = NULL;
+    if (tmp->fontpos!=-1){
+      styled=fonts->getCSStyle(tmp->fontpos, raw);
+    }
+    fputs((styled ? styled : raw)->getCString(),f);
+    delete styled;
+    delete raw;
+    fputs("</text>\n",f);
+  }
+  fputs("</page>\n",f);
+}
+
+
+// Write the current page as absolutely positioned HTML.
+//
+// With frames (!noframes) the page goes to its own "<DocName>-<page>.html"
+// file, which is opened and closed here; otherwise it is appended to <file>.
+// The first page number seen is remembered in firstPage and used to number
+// the background images.
+//
+//   file - output stream used only in noframes mode
+//   page - number of the page being written
+void HtmlPage::dumpComplex(FILE *file, int page){
+  FILE* pageFile;
+  GooString* tmp;
+  char* htmlEncoding;
+
+  if( firstPage == -1 ) firstPage = page;
+
+  if( !noframes )
+  {
+    GooString* pgNum=GooString::fromInt(page);
+    tmp = new GooString(DocName);
+    tmp->append('-')->append(pgNum)->append(".html");
+    delete pgNum;
+
+    // getFileNameFromPath() returns a malloc'ed copy that the caller owns;
+    // it used to be passed straight to fopen() and leaked.
+    char* pageFileName = getFileNameFromPath(tmp->getCString(),tmp->getLength());
+    pageFile = pageFileName ? fopen(pageFileName, "w") : NULL;
+    free(pageFileName);
+    if (!pageFile) {
+      error(-1, "Couldn't open html file '%s'", tmp->getCString());
+      delete tmp;
+      return;
+    }
+    delete tmp;
+
+    fprintf(pageFile,"%s\n<HTML>\n<HEAD>\n<TITLE>Page %d</TITLE>\n\n",
+            DOCTYPE, page);
+
+    htmlEncoding = HtmlOutputDev::mapEncodingToHtml
+        (globalParams->getTextEncodingName());
+    fprintf(pageFile, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+  }
+  else
+  {
+    pageFile = file;
+    fprintf(pageFile,"<!-- Page %d -->\n", page);
+    fprintf(pageFile,"<a name=\"%d\"></a>\n", page);
+  }
+
+  fprintf(pageFile,"<DIV style=\"position:relative;width:%d;height:%d;\">\n",
+          pageWidth, pageHeight);
+
+  tmp=basename(DocName);
+
+  fputs("<STYLE type=\"text/css\">\n<!--\n",pageFile);
+  for(int i=fontsPageMarker;i!=fonts->size();i++) {
+    GooString *fontCSStyle = fonts->CSStyle(i);
+    fprintf(pageFile,"\t%s\n",fontCSStyle->getCString());
+    delete fontCSStyle;
+  }
+
+  fputs("-->\n</STYLE>\n",pageFile);
+
+  if( !noframes )
+  {
+    fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile);
+  }
+
+  if( !ignore )
+  {
+    fprintf(pageFile,
+            "<IMG width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\">\n",
+            pageWidth, pageHeight, tmp->getCString(),
+            (page-firstPage+1), imgExt->getCString());
+  }
+
+  delete tmp;
+
+  for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
+    if (!tmp1->htext)
+      continue;
+
+    GooString *raw=new GooString(tmp1->htext);
+    fprintf(pageFile,
+            "<DIV style=\"position:absolute;top:%d;left:%d\">",
+            xoutRound(tmp1->yMin),
+            xoutRound(tmp1->xMin));
+    fputs("<nobr>",pageFile);
+
+    // Strings without a font (fontpos == -1) are written unstyled.  The
+    // styled pointer used to be left uninitialized (or dangling) in that
+    // case and was then both printed and deleted.
+    GooString *styled = NULL;
+    if (tmp1->fontpos!=-1){
+      styled=fonts->getCSStyle(tmp1->fontpos, raw);
+    }
+    fputs((styled ? styled : raw)->getCString(),pageFile);
+
+    delete styled;
+    delete raw;
+    fputs("</nobr></DIV>\n",pageFile);
+  }
+
+  fputs("</DIV>\n", pageFile);
+
+  if( !noframes )
+  {
+    fputs("</BODY>\n</HTML>\n",pageFile);
+    fclose(pageFile);
+  }
+}
+
+
+// Write the current page to <f>.  Complex mode delegates to the XML or
+// positioned-HTML writer; simple mode emits a page anchor, one <IMG> per
+// image counted in HtmlOutputDev::imgNum (which is then reset to 1), and
+// every string followed by <br>.
+void HtmlPage::dump(FILE *f, int pageNum)
+{
+  if (complexMode) {
+    if (xml)
+      dumpAsXML(f, pageNum);
+    else
+      dumpComplex(f, pageNum);
+    return;
+  }
+
+  fprintf(f,"<A name=%d></a>",pageNum);
+
+  // image references: numbering starts at 1
+  GooString* imgBase=basename(DocName);
+  int imgIdx = 1;
+  while (imgIdx < HtmlOutputDev::imgNum) {
+    fprintf(f,"<IMG src=\"%s-%d_%d.jpg\"><br>\n",imgBase->getCString(),pageNum,imgIdx);
+    ++imgIdx;
+  }
+  HtmlOutputDev::imgNum=1;
+  delete imgBase;
+
+  // text, one string per line
+  HtmlString *cur = yxStrings;
+  while (cur) {
+    if (cur->htext) {
+      GooString* line=new GooString(cur->htext);
+      fputs(line->getCString(),f);
+      delete line;
+      fputs("<br>\n",f);
+    }
+    cur = cur->yxNext;
+  }
+  fputs("<hr>\n",f);
+}
+
+
+
+// Drop everything collected for the current page: the pending string, the
+// string list and the links.  With frames the font accumulator is replaced
+// by a fresh one; without frames the fonts are kept and fontsPageMarker is
+// moved to the end of the accumulator.
+void HtmlPage::clear() {
+  delete curStr;
+  curStr = NULL;
+
+  // free the y-major list node by node
+  while (yxStrings) {
+    HtmlString *following = yxStrings->yxNext;
+    delete yxStrings;
+    yxStrings = following;
+  }
+  xyStrings = NULL;
+  yxCur1 = NULL;
+  yxCur2 = NULL;
+
+  if( noframes )
+  {
+    fontsPageMarker = fonts->size();
+  }
+  else
+  {
+    delete fonts;
+    fonts=new HtmlFontAccu();
+    fontsPageMarker = 0;
+  }
+
+  delete links;
+  links=new HtmlLinks();
+}
+
+// Store a copy of <fname> as the document name used to build output file
+// names.
+// NOTE(review): any GooString previously held in DocName is not deleted
+// here -- confirm this is only called once per HtmlPage.
+void HtmlPage::setDocName(char *fname){
+ DocName=new GooString(fname);
+}
+
+//------------------------------------------------------------------------
+// HtmlMetaVar
+//------------------------------------------------------------------------
+
+// Build a META variable from C strings; both are copied into GooStrings
+// owned by this object.
+HtmlMetaVar::HtmlMetaVar(char *_name, char *_content)
+  : name(new GooString(_name)),
+    content(new GooString(_content))
+{
+}
+
+// Release the name and content strings allocated by the constructor.
+HtmlMetaVar::~HtmlMetaVar()
+{
+ delete name;
+ delete content;
+}
+
+// Render this variable as an HTML <META name="..." content="..."> tag.
+// Returns a newly allocated string that the caller must delete.
+GooString* HtmlMetaVar::toString()
+{
+  GooString *tag = new GooString("<META name=\"");
+  tag->append(name)->append("\" content=\"");
+  tag->append(content)->append("\">");
+  return tag;
+}
+
+//------------------------------------------------------------------------
+// HtmlOutputDev
+//------------------------------------------------------------------------
+
+// Lookup table used by mapEncodingToHtml(): first column is the encoding
+// name to match, second column is the charset name written into the
+// generated HTML.  The table ends with a {NULL, NULL} row.
+static char* HtmlEncodings[][2] = {
+ {"Latin1", "ISO-8859-1"},
+ {NULL, NULL}
+};
+
+
+// Translate an encoding name into the charset name to put in HTML output.
+// Names not listed in HtmlEncodings are returned unchanged; in that case
+// the result points into <encoding>'s own buffer.
+char* HtmlOutputDev::mapEncodingToHtml(GooString* encoding)
+{
+  char* requested = encoding->getCString();
+  int row = 0;
+
+  while (HtmlEncodings[row][0] != NULL) {
+    if (!strcmp(requested, HtmlEncodings[row][0]))
+      return HtmlEncodings[row][1];
+    ++row;
+  }
+  return requested;
+}
+
+// Write the top-level frameset document "<Docname>.html": a "links" frame
+// showing "<base>_ind.html" and a "contents" frame showing either the first
+// complex-mode page or the simple-mode "<base>s.html".
+//
+// fContentsFrame is used as the stream for this file, closed again before
+// returning and left NULL.
+//
+//   firstPage - page number used for the initial complex-mode contents URL
+void HtmlOutputDev::doFrame(int firstPage){
+  GooString* fName=new GooString(Docname);
+  char* htmlEncoding;
+  fName->append(".html");
+
+  // getFileNameFromPath() returns a malloc'ed copy that the caller owns.
+  char* frameFileName = getFileNameFromPath(fName->getCString(),fName->getLength());
+  fContentsFrame = frameFileName ? fopen(frameFileName, "w") : NULL;
+  free(frameFileName);
+
+  if (!fContentsFrame){
+    // report first, then release: the name used to be deleted before it
+    // was formatted into the message
+    error(-1, "Couldn't open html file '%s'", fName->getCString());
+    delete fName;
+    return;
+  }
+
+  delete fName;
+
+  fName=basename(Docname);
+  fputs(DOCTYPE_FRAMES, fContentsFrame);
+  fputs("\n<HTML>",fContentsFrame);
+  fputs("\n<HEAD>",fContentsFrame);
+  fprintf(fContentsFrame,"\n<TITLE>%s</TITLE>",docTitle->getCString());
+  htmlEncoding = mapEncodingToHtml(globalParams->getTextEncodingName());
+  fprintf(fContentsFrame, "\n<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+  dumpMetaVars(fContentsFrame);
+  fprintf(fContentsFrame, "</HEAD>\n");
+  fputs("<FRAMESET cols=\"100,*\">\n",fContentsFrame);
+  fprintf(fContentsFrame,"<FRAME name=\"links\" src=\"%s_ind.html\">\n",fName->getCString());
+  fputs("<FRAME name=\"contents\" src=",fContentsFrame);
+  if (complexMode)
+    fprintf(fContentsFrame,"\"%s-%d.html\"",fName->getCString(), firstPage);
+  else
+    fprintf(fContentsFrame,"\"%ss.html\"",fName->getCString());
+
+  fputs(">\n</FRAMESET>\n</HTML>\n",fContentsFrame);
+
+  delete fName;
+  fclose(fContentsFrame);
+  // do not leave a pointer to the closed stream behind
+  fContentsFrame = NULL;
+}
+
+// Open the file named by the last path component of <path> with the given
+// fopen() mode.  Returns NULL when the file cannot be opened.
+static FILE *htmlDevOpenFile(GooString *path, const char *mode)
+{
+  char *name = getFileNameFromPath(path->getCString(), path->getLength());
+  FILE *f = name ? fopen(name, mode) : NULL;
+  free(name);  // getFileNameFromPath() returns a malloc'ed copy
+  return f;
+}
+
+// Set up HTML/XML output for one document and write the document-level
+// headers.
+//
+//   fileName  - output name root; ".html", ".xml", "_ind.html" or "s.html"
+//               is appended depending on the mode
+//   title     - document title
+//   author, keywords, subject, date
+//             - optional META values; NULL entries are skipped
+//   extension - passed to HtmlPage as the image extension
+//   rawOrder  - stored and passed to HtmlPage
+//   firstPage - page number used for the frameset's initial contents URL
+//   outline   - whether a link to the document outline is written
+//
+// With frames (and no XML) this writes the frameset file, opens the index
+// frame in fContentsFrame and, in simple mode, the "s.html" contents file
+// in page.  Without frames, page is stdout or "<fileName>.html"/".xml".
+// ok stays gFalse if any file cannot be opened.
+HtmlOutputDev::HtmlOutputDev(char *fileName, char *title,
+                             char *author, char *keywords, char *subject, char *date,
+                             char *extension,
+                             GBool rawOrder, int firstPage, GBool outline)
+{
+  char *htmlEncoding;
+
+  fContentsFrame = NULL;
+  // page is not assigned on every path below (e.g. complex mode with
+  // frames), so give it a defined value
+  page = NULL;
+  docTitle = new GooString(title);
+  pages = NULL;
+  dumpJPEG=gTrue;
+  this->rawOrder = rawOrder;
+  this->doOutline = outline;
+  ok = gFalse;
+  imgNum=1;
+  needClose = gFalse;
+  pages = new HtmlPage(rawOrder, extension);
+
+  glMetaVars = new GooList();
+  glMetaVars->append(new HtmlMetaVar("generator", "pdftohtml 0.36"));
+  if( author ) glMetaVars->append(new HtmlMetaVar("author", author));
+  if( keywords ) glMetaVars->append(new HtmlMetaVar("keywords", keywords));
+  if( date ) glMetaVars->append(new HtmlMetaVar("date", date));
+  if( subject ) glMetaVars->append(new HtmlMetaVar("subject", subject));
+
+  maxPageWidth = 0;
+  maxPageHeight = 0;
+
+  pages->setDocName(fileName);
+  Docname=new GooString (fileName);
+
+  // for non-xml output (complex or simple) with frames generate the left frame
+  if(!xml && !noframes)
+  {
+    GooString* left=new GooString(fileName);
+    left->append("_ind.html");
+
+    doFrame(firstPage);
+
+    if (!(fContentsFrame = htmlDevOpenFile(left, "w")))
+    {
+      error(-1, "Couldn't open html file '%s'", left->getCString());
+      delete left;
+      return;
+    }
+    delete left;
+    fputs(DOCTYPE, fContentsFrame);
+    fputs("<HTML>\n<HEAD>\n<TITLE></TITLE>\n</HEAD>\n<BODY>\n",fContentsFrame);
+
+    if (doOutline)
+    {
+      GooString *str = basename(Docname);
+      fprintf(fContentsFrame, "<A href=\"%s%s\" target=\"contents\">Outline</a><br>", str->getCString(), complexMode ? "-outline.html" : "s.html#outline");
+      delete str;
+    }
+
+    if (!complexMode)
+    { /* not in complex mode */
+
+      GooString* right=new GooString(fileName);
+      right->append("s.html");
+
+      if (!(page=htmlDevOpenFile(right, "w"))){
+        error(-1, "Couldn't open html file '%s'", right->getCString());
+        delete right;
+        return;
+      }
+      delete right;
+      fputs(DOCTYPE, page);
+      fputs("<HTML>\n<HEAD>\n<TITLE></TITLE>\n</HEAD>\n<BODY>\n",page);
+    }
+  }
+
+  if (noframes) {
+    if (stout) page=stdout;
+    else {
+      GooString* right=new GooString(fileName);
+      if (!xml) right->append(".html");
+      if (xml) right->append(".xml");
+      if (!(page=htmlDevOpenFile(right, "w"))){
+        // report before releasing: the name used to be deleted first and
+        // then read for the message
+        error(-1, "Couldn't open html file '%s'", right->getCString());
+        delete right;
+        return;
+      }
+      delete right;
+    }
+
+    htmlEncoding = mapEncodingToHtml(globalParams->getTextEncodingName());
+    if (xml)
+    {
+      fprintf(page, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", htmlEncoding);
+      fputs("<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n\n", page);
+      fputs("<pdf2xml>\n",page);
+    }
+    else
+    {
+      fprintf(page,"%s\n<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n",
+              DOCTYPE, docTitle->getCString());
+
+      fprintf(page, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+
+      dumpMetaVars(page);
+      fprintf(page,"</HEAD>\n");
+      fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
+    }
+  }
+  ok = gTrue;
+}
+
+// Finish and close the output files and release everything the constructor
+// allocated.
+//
+// The index frame (if open) and the main output stream get their closing
+// tags here.  The main stream is only touched in the modes where the
+// constructor opens it, and only if it is non-NULL: fopen() can fail in the
+// constructor, and the destructor used to write to the NULL stream.
+HtmlOutputDev::~HtmlOutputDev() {
+  HtmlFont::clear();
+
+  delete Docname;
+  delete docTitle;
+
+  deleteGooList(glMetaVars, HtmlMetaVar);
+
+  if (fContentsFrame){
+    fputs("</BODY>\n</HTML>\n",fContentsFrame);
+    fclose(fContentsFrame);
+  }
+  if (xml) {
+    if (page) {
+      fputs("</pdf2xml>\n",page);
+      fclose(page);
+    }
+  } else if ( !complexMode || noframes ) {
+    if (page) {
+      fputs("</BODY>\n</HTML>\n",page);
+      fclose(page);
+    }
+  }
+  if (pages)
+    delete pages;
+}
+
+
+
+// Begin a new page: remember its number, reset the page collector, add a
+// "Page N" entry to the index frame when one is open, and record the page
+// size (truncated to int) from <state>.
+void HtmlOutputDev::startPage(int pageNum, GfxState *state) {
+  this->pageNum = pageNum;
+  GooString *docBase=basename(Docname);
+  pages->clear();
+
+  if (!noframes && fContentsFrame) {
+    const char *base = docBase->getCString();
+    if (complexMode)
+      fprintf(fContentsFrame,"<A href=\"%s-%d.html\"",base,pageNum);
+    else
+      fprintf(fContentsFrame,"<A href=\"%ss.html#%d\"",base,pageNum);
+    fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br>\n",pageNum);
+  }
+  delete docBase;
+
+  pages->pageWidth=static_cast<int>(state->getPageWidth());
+  pages->pageHeight=static_cast<int>(state->getPageHeight());
+}
+
+
+// Finish the current page: convert and coalesce the collected strings,
+// write them out, and print a progress line unless output goes to stdout
+// or error output is quieted.
+void HtmlOutputDev::endPage() {
+  pages->conv();
+  pages->coalesce();
+  pages->dump(page, pageNum);
+
+  // Pages of different sizes are not handled for complex output; the size
+  // of the most recent page simply wins.
+  maxPageHeight = pages->pageHeight;
+  maxPageWidth = pages->pageWidth;
+
+  if (stout || globalParams->getErrQuiet())
+    return;
+  printf("Page-%d\n",(pageNum));
+}
+
+// OutputDev hook: forward the font change to the page collector.
+void HtmlOutputDev::updateFont(GfxState *state) {
+ pages->updateFont(state);
+}
+
+// OutputDev hook: forward the start of a text string to the page collector.
+void HtmlOutputDev::beginString(GfxState *state, GooString *s) {
+ pages->beginString(state, s);
+}
+
+// OutputDev hook: tell the page collector the current string is complete.
+// <state> is not used.
+void HtmlOutputDev::endString(GfxState *state) {
+ pages->endString();
+}
+
+// OutputDev hook: hand one character to the page collector.  Characters
+// whose render mode has both low bits set are dropped unless showHidden is
+// set.  <code> is not used.
+void HtmlOutputDev::drawChar(GfxState *state, double x, double y,
+                             double dx, double dy,
+                             double originX, double originY,
+                             CharCode code, Unicode *u, int uLen)
+{
+  if ( showHidden || (state->getRender() & 3) != 3) {
+    pages->addChar(state, x, y, dx, dy, originX, originY, u, uLen);
+  }
+}
+
+// OutputDev hook for image masks.
+//
+// In simple mode (neither ignore nor complexMode), a DCT-encoded stream is
+// copied verbatim to "<Docname>-<page>_<n>.jpg" and imgNum is advanced.
+// Every other case is passed to OutputDev::drawImageMask().
+//
+// Changes from the previous version: the file-name strings are released on
+// the fopen() failure path too, the malloc'ed name returned by
+// getFileNameFromPath() is freed, and the image geometry that was computed
+// but never used has been removed.
+void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
+                                  int width, int height, GBool invert,
+                                  GBool inlineImg) {
+
+  if (ignore || complexMode || !dumpJPEG || str->getKind() != strDCT) {
+    OutputDev::drawImageMask(state, ref, str, width, height, invert, inlineImg);
+    return;
+  }
+
+  // build "<Docname>-<page>_<n>.jpg"
+  GooString *fName=new GooString(Docname);
+  fName->append("-");
+  GooString *pgNum=GooString::fromInt(pageNum);
+  GooString *imgnum=GooString::fromInt(imgNum);
+  fName->append(pgNum)->append("_")->append(imgnum)->append(".jpg");
+  delete pgNum;
+  delete imgnum;
+  ++imgNum;
+
+  // open the image file
+  char *imgFileName = getFileNameFromPath(fName->getCString(),fName->getLength());
+  FILE *f1 = imgFileName ? fopen(imgFileName, "wb") : NULL;
+  free(imgFileName);
+  if (!f1) {
+    error(-1, "Couldn't open image file '%s'", fName->getCString());
+    delete fName;
+    return;
+  }
+  delete fName;
+
+  // initialize stream
+  str = ((DCTStream *)str)->getRawStream();
+  str->reset();
+
+  // copy the stream
+  int c;
+  while ((c = str->getChar()) != EOF)
+    fputc(c, f1);
+
+  fclose(f1);
+}
+
+// OutputDev hook for images.
+//
+// In simple mode (neither ignore nor complexMode), a DCT-encoded stream is
+// copied verbatim to "<Docname>-<page>_<n>.jpg" and imgNum is advanced.
+// Every other case is passed to OutputDev::drawImage().
+//
+// Changes from the previous version: the file-name strings are released on
+// the fopen() failure path too, the malloc'ed name returned by
+// getFileNameFromPath() is freed, and the image geometry that was computed
+// but never used has been removed.
+void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
+                              int width, int height, GfxImageColorMap *colorMap,
+                              int *maskColors, GBool inlineImg) {
+
+  if (ignore || complexMode || !dumpJPEG || str->getKind() != strDCT) {
+    OutputDev::drawImage(state, ref, str, width, height, colorMap,
+                         maskColors, inlineImg);
+    return;
+  }
+
+  // build "<Docname>-<page>_<n>.jpg"
+  GooString *fName=new GooString(Docname);
+  fName->append("-");
+  GooString *pgNum= GooString::fromInt(pageNum);
+  GooString *imgnum= GooString::fromInt(imgNum);
+  fName->append(pgNum)->append("_")->append(imgnum)->append(".jpg");
+  delete pgNum;
+  delete imgnum;
+  ++imgNum;
+
+  // open the image file
+  char *imgFileName = getFileNameFromPath(fName->getCString(),fName->getLength());
+  FILE *f1 = imgFileName ? fopen(imgFileName, "wb") : NULL;
+  free(imgFileName);
+  if (!f1) {
+    error(-1, "Couldn't open image file '%s'", fName->getCString());
+    delete fName;
+    return;
+  }
+  delete fName;
+
+  // initialize stream
+  str = ((DCTStream *)str)->getRawStream();
+  str->reset();
+
+  // copy the stream
+  int c;
+  while ((c = str->getChar()) != EOF)
+    fputc(c, f1);
+
+  fclose(f1);
+}
+
+
+
+// Record a link annotation for the current page: convert its rectangle to
+// device coordinates, resolve its destination with getLinkDest() and add
+// the resulting HtmlLink to the page.
+//
+// The border width used to be fetched through
+// link->getBorderStyle()->getWidth() and then ignored; that unchecked
+// dereference has been removed.
+void HtmlOutputDev::drawLink(Link* link,Catalog *cat){
+  double userX1, userY1, userX2, userY2;
+  int devX1, devY1, devX2, devY2;
+
+  link->getRect(&userX1,&userY1,&userX2,&userY2);
+  cvtUserToDev(userX1,userY1,&devX1,&devY1);
+  cvtUserToDev(userX2,userY2,&devX2,&devY2);
+
+  GooString* dest=getLinkDest(link,cat);
+  // the y values are passed swapped (y2 first, then y1)
+  // NOTE(review): presumably because device space is upside down -- confirm
+  HtmlLink t((double) devX1,(double) devY2,(double) devX2,(double) devY1,dest);
+  pages->AddLink(t);
+  delete dest;
+}
+
+// If <file> ends in ".pdf" or ".PDF", replace that extension with ".html".
+// Names shorter than the extension are left alone.
+static void htmlDevPdfNameToHtml(GooString *file)
+{
+  const int extLen = 4;
+  if (file->getLength() < extLen)
+    return;
+  char *ext = file->getCString() + file->getLength() - extLen;
+  if (!strcmp(ext, ".pdf") || !strcmp(ext, ".PDF")) {
+    file->del(file->getLength() - extLen, extLen);
+    file->append(".html");
+  }
+}
+
+// Build the href text for <link>.  Always returns a newly allocated string
+// owned by the caller; it is empty when no target can be determined.
+//
+//   GoTo   - a page of this document (see the table below)
+//   GoToR  - the remote file name; with printHtml, ".pdf" becomes ".html"
+//            and "#<page>" is appended
+//   URI    - the URI itself
+//   Launch - with printHtml, the file name with ".pdf" turned into ".html";
+//            otherwise an empty string
+//
+// Changes from the previous version: no string is leaked when a GoTo
+// destination cannot be resolved or when a Launch link is skipped, the
+// page-number temporary in the GoToR case is deleted, names shorter than
+// four characters no longer cause a read before the buffer, and a Launch
+// action without a file name yields an empty string.
+GooString* HtmlOutputDev::getLinkDest(Link *link,Catalog* catalog){
+  switch(link->getAction()->getKind())
+  {
+    case actionGoTo:
+    {
+      LinkGoTo *ha=(LinkGoTo *)link->getAction();
+      LinkDest *dest=NULL;
+      if (ha->getDest()==NULL)
+        dest=catalog->findDest(ha->getNamedDest());
+      else
+        dest=ha->getDest()->copy();
+      if (!dest)
+        return new GooString();
+
+      int page=1;
+      if (dest->isPageRef()){
+        Ref pageref=dest->getPageRef();
+        page=catalog->findPage(pageref.num,pageref.gen);
+      }
+      else {
+        page=dest->getPageNum();
+      }
+      delete dest;
+
+      GooString* file=basename(Docname);
+      GooString *str=GooString::fromInt(page);
+      /*            complex        simple
+         frames     file-4.html    files.html#4
+         noframes   file.html#4    file.html#4
+       */
+      if (noframes)
+      {
+        file->append(".html#");
+        file->append(str);
+      }
+      else if( complexMode )
+      {
+        file->append("-");
+        file->append(str);
+        file->append(".html");
+      }
+      else
+      {
+        file->append("s.html#");
+        file->append(str);
+      }
+
+      if (printCommands) printf(" link to page %d ",page);
+      delete str;
+      return file;
+    }
+    case actionGoToR:
+    {
+      LinkGoToR *ha=(LinkGoToR *) link->getAction();
+      LinkDest *dest=NULL;
+      int page=1;
+      GooString *file;
+      if (ha->getFileName())
+        file=new GooString(ha->getFileName()->getCString());
+      else
+        file=new GooString();
+      if (ha->getDest()!=NULL) dest=ha->getDest()->copy();
+      if (dest){
+        if (!(dest->isPageRef())) page=dest->getPageNum();
+        delete dest;
+
+        if (printCommands) printf(" link to page %d ",page);
+        if (printHtml){
+          htmlDevPdfNameToHtml(file);
+          file->append('#');
+          GooString *pageStr=GooString::fromInt(page);
+          file->append(pageStr);
+          delete pageStr;
+        }
+      }
+      if (printCommands) printf("filename %s\n",file->getCString());
+      return file;
+    }
+    case actionURI:
+    {
+      LinkURI *ha=(LinkURI *) link->getAction();
+      GooString* file=new GooString(ha->getURI()->getCString());
+      return file;
+    }
+    case actionLaunch:
+    {
+      LinkLaunch *ha=(LinkLaunch *) link->getAction();
+      // Without printHtml this case used to fall through to the default
+      // label after allocating the name; keep the empty result but make it
+      // explicit and allocation-free.
+      if (!printHtml || !ha->getFileName())
+        return new GooString();
+
+      GooString* file=new GooString(ha->getFileName()->getCString());
+      htmlDevPdfNameToHtml(file);
+      if (printCommands) printf("filename %s",file->getCString());
+      return file;
+    }
+    default:
+      return new GooString();
+  }
+}
+
+// Write every collected META variable to <file>, one tag per line.
+void HtmlOutputDev::dumpMetaVars(FILE *file)
+{
+  int idx = 0;
+  while (idx < glMetaVars->getLength())
+  {
+    HtmlMetaVar *meta = (HtmlMetaVar*)glMetaVars->get(idx);
+    GooString *tag = meta->toString();
+    fprintf(file, "%s\n", tag->getCString());
+    delete tag;
+    ++idx;
+  }
+}
+
+// Write the document outline, if the catalog has one.
+//
+// Simple mode and complex/noframes mode append to the main output stream;
+// complex mode with frames writes a separate "<base>-outline.html" file.
+// Returns gFalse when the device is not ok, in XML mode, when there is no
+// outline dictionary, or when the outline file cannot be opened; otherwise
+// returns whether at least one outline entry was written.
+//
+// Changes from the previous version: the outline file name is released
+// when fopen() fails, and the malloc'ed name returned by
+// getFileNameFromPath() is freed.
+GBool HtmlOutputDev::dumpDocOutline(Catalog* catalog)
+{
+  FILE * output;
+  GBool bClose = gFalse;
+
+  if (!ok || xml)
+    return gFalse;
+
+  Object *outlines = catalog->getOutline();
+  if (!outlines->isDict())
+    return gFalse;
+
+  // xml is known to be false from here on
+  if (!complexMode)
+  {
+    output = page;
+  }
+  else if (noframes)
+  {
+    output = page;
+    fputs("<hr>\n", output);
+  }
+  else
+  {
+    GooString *str = basename(Docname);
+    str->append("-outline.html");
+    char *outlineFileName = getFileNameFromPath(str->getCString(),str->getLength());
+    output = outlineFileName ? fopen(outlineFileName, "w") : NULL;
+    free(outlineFileName);
+    delete str;
+    if (output == NULL)
+      return gFalse;
+    bClose = gTrue;
+    fputs("<HTML>\n<HEAD>\n<TITLE>Document Outline</TITLE>\n</HEAD>\n<BODY>\n", output);
+  }
+
+  GBool done = newOutlineLevel(output, outlines, catalog);
+  if (done && !complexMode)
+    fputs("<hr>\n", output);
+
+  if (bClose)
+  {
+    fputs("</BODY>\n</HTML>\n", output);
+    fclose(output);
+  }
+  return done;
+}
+
+// Write one level of the outline tree as an HTML <ul>, recursing into each
+// entry's children.
+//
+//   output  - stream to write to
+//   node    - dictionary whose "First" entry starts the sibling chain
+//   catalog - used to resolve named destinations and page references
+//   level   - nesting depth; at level 1 the "outline" anchor and heading
+//             are written first
+//
+// Returns gTrue if at least one entry was written at this level.  Walking
+// the sibling chain stops at the first entry whose "Title" is not a string:
+// the old test only rejected null titles, so any other non-string value
+// was still passed to getString().
+GBool HtmlOutputDev::newOutlineLevel(FILE *output, Object *node, Catalog* catalog, int level)
+{
+  Object curr, next;
+  GBool atLeastOne = gFalse;
+
+  if (node->dictLookup("First", &curr)->isDict()) {
+    if (level == 1)
+    {
+      fputs("<A name=\"outline\"></a>", output);
+      fputs("<h1>Document Outline</h1>\n", output);
+    }
+    fputs("<ul>",output);
+    do {
+      // get title, give up if not found or not a string
+      Object title;
+      if (!curr.dictLookup("Title", &title)->isString()) {
+        title.free();
+        break;
+      }
+      GooString *titleStr = new GooString(title.getString());
+      title.free();
+
+      // get corresponding link
+      // Note: some code duplicated from HtmlOutputDev::getLinkDest().
+      GooString *linkName = NULL;
+      Object dest;
+      if (!curr.dictLookup("Dest", &dest)->isNull()) {
+        LinkGoTo *link = new LinkGoTo(&dest);
+        LinkDest *linkdest=NULL;
+        if (link->getDest()==NULL)
+          linkdest=catalog->findDest(link->getNamedDest());
+        else
+          linkdest=link->getDest()->copy();
+        delete link;
+        if (linkdest) {
+          int page;
+          if (linkdest->isPageRef()) {
+            Ref pageref=linkdest->getPageRef();
+            page=catalog->findPage(pageref.num,pageref.gen);
+          } else {
+            page=linkdest->getPageNum();
+          }
+          delete linkdest;
+
+          /*            complex        simple
+             frames     file-4.html    files.html#4
+             noframes   file.html#4    file.html#4
+           */
+          linkName=basename(Docname);
+          GooString *str=GooString::fromInt(page);
+          if (noframes) {
+            linkName->append(".html#");
+            linkName->append(str);
+          } else {
+            if( complexMode ) {
+              linkName->append("-");
+              linkName->append(str);
+              linkName->append(".html");
+            } else {
+              linkName->append("s.html#");
+              linkName->append(str);
+            }
+          }
+          delete str;
+        }
+      }
+      dest.free();
+
+      fputs("<li>",output);
+      if (linkName)
+        fprintf(output,"<A href=\"%s\">", linkName->getCString());
+      fputs(titleStr->getCString(),output);
+      if (linkName) {
+        fputs("</A>",output);
+        delete linkName;
+      }
+      fputs("\n",output);
+      delete titleStr;
+      atLeastOne = gTrue;
+
+      // children first, then move on to the next sibling
+      newOutlineLevel(output, &curr, catalog, level+1);
+      curr.dictLookup("Next", &next);
+      curr.free();
+      curr = next;
+    } while(curr.isDict());
+    fputs("</ul>",output);
+  }
+  curr.free();
+
+  return atLeastOne;
+}
+
+// Return a newly malloc'ed copy of the last '/'-separated component of the
+// path <c>.
+//
+//   c      - path; only the first <strlen> characters are examined
+//   strlen - number of characters of <c> to consider
+//
+// The caller owns the result and must free() it.  Returns NULL if the
+// allocation fails.
+//
+// Changes from the previous version: a slash at index 0 now counts as a
+// separator ("/name" gives "name", not "/name"); exactly the examined
+// characters are copied, so a string longer than <strlen> can no longer
+// overrun the buffer; the allocation result is checked.
+char* getFileNameFromPath(char* c, int strlen) {
+  int start = 0;  /* index of the first character after the last '/' */
+  int i;
+  int n;
+  char* res;
+
+  for (i=0;i<strlen;i++) {
+    if (c[i]=='/') {
+      start = i+1;
+    }
+  }
+
+  n = strlen-start;
+  if (n < 0) {
+    n = 0;
+  }
+  res = (char *)malloc((size_t)n+1);
+  if (!res) {
+    return NULL;
+  }
+  memcpy(res,c+start,(size_t)n);
+  res[n] = '\0';
+  return res;
+}
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
new file mode 100644
index 00000000..5196ee23
--- /dev/null
+++ b/utils/HtmlOutputDev.h
@@ -0,0 +1,302 @@
+//========================================================================
+//
+// HtmlOutputDev.h
+//
+// Copyright 1997 Derek B. Noonburg
+//
+// Changed 1999 by G.Ovtcharov
+//========================================================================
+
+#ifndef HTMLOUTPUTDEV_H
+#define HTMLOUTPUTDEV_H
+
+#ifdef __GNUC__
+#pragma interface
+#endif
+
+#include <stdio.h>
+#include "goo/gtypes.h"
+#include "goo/GooList.h"
+#include "GfxFont.h"
+#include "OutputDev.h"
+#include "HtmlLinks.h"
+#include "HtmlFonts.h"
+#include "Link.h"
+#include "Catalog.h"
+#include "UnicodeMap.h"
+
+
+#ifdef WIN32
+# define SLASH '\\'
+#else
+# define SLASH '/'
+#endif
+
+// Round <x> to the nearest int by adding 0.5 and truncating (so it is only
+// a true round-to-nearest for non-negative values).  The argument is
+// parenthesized so that expressions with operators of lower precedence
+// than '+' (e.g. a conditional expression) are rounded as a whole.
+#define xoutRound(x) ((int)((x) + 0.5))
+
+#define DOCTYPE "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"
+#define DOCTYPE_FRAMES "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"\n\"http://www.w3.org/TR/html4/frameset.dtd\">"
+
+class GfxState;
+class GooString;
+//------------------------------------------------------------------------
+// HtmlString
+//------------------------------------------------------------------------
+
+// Writing direction recorded for an HtmlString (see HtmlString::dir).
+enum UnicodeTextDirection {
+ textDirUnknown,
+ textDirLeftRight,
+ textDirRightLeft,
+ textDirTopBottom
+};
+
+
+// One run of text collected from a page: its Unicode characters, the
+// right-hand x coordinate of each character, its bounding box, the index of
+// its font and the HTML text built for it.  HtmlPage (a friend) links these
+// objects into lists and reads and writes the private fields directly.
+class HtmlString {
+public:
+
+ // Constructor.
+ HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts);
+
+ // Destructor.
+ ~HtmlString();
+
+ // Add a character to the string.
+ void addChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ Unicode u);
+ // Link attached to this string, or NULL.
+ HtmlLink* getLink() { return link; }
+ void endString(); // postprocessing
+
+private:
+// changes the text variable
+ HtmlLink *link;
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ int col; // starting column
+ Unicode *text; // the text
+ double *xRight; // right-hand x coord of each char
+ HtmlString *yxNext; // next string in y-major order
+ HtmlString *xyNext; // next string in x-major order
+ int fontpos; // font index passed to HtmlFontAccu; compared against -1 when dumping
+ GooString* htext; // HTML text for this string
+ int len; // length of text and xRight
+ int size; // size of text and xRight arrays
+ UnicodeTextDirection dir; // direction (left to right/right to left)
+
+ friend class HtmlPage;
+
+};
+
+
+//------------------------------------------------------------------------
+// HtmlPage
+//------------------------------------------------------------------------
+
+
+
+// Collects the strings, fonts and links of the page currently being
+// rendered and writes them out as simple HTML, positioned ("complex") HTML
+// or XML.  HtmlOutputDev (a friend) drives it and sets the page size.
+class HtmlPage {
+public:
+
+ // Constructor.
+ HtmlPage(GBool rawOrder, char *imgExtVal);
+
+ // Destructor.
+ ~HtmlPage();
+
+ // Begin a new string.
+ void beginString(GfxState *state, GooString *s);
+
+ // Add a character to the current string.
+ void addChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double ox, double oy,
+ Unicode *u, int uLen); //Guchar c);
+
+ void updateFont(GfxState *state);
+
+ // End the current string, sorting it into the list of strings.
+ void endString();
+
+ // Coalesce strings that look like parts of the same line.
+ void coalesce();
+
+ // NOTE(review): the description below has no matching declaration in
+ // this class; it looks like a leftover from a removed text-search method.
+ // Find a string. If <top> is true, starts looking at top of page;
+ // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
+ // stops looking at bottom of page; otherwise stops looking at
+ // <xMax>,<yMax>. If found, sets the text bounding rectangle and
+ // returns true; otherwise returns false.
+
+
+ // new functions
+ // Add a link to the current page's link list.
+ void AddLink(const HtmlLink& x){
+ links->AddLink(x);
+ }
+
+ // Write the current page to <f> in the active output mode.
+ void dump(FILE *f, int pageNum);
+
+ // Clear the page.
+ void clear();
+
+ void conv();
+private:
+ // Font of <hStr>, looked up by its fontpos in the font accumulator.
+ HtmlFont* getFont(HtmlString *hStr) { return fonts->Get(hStr->fontpos); }
+
+ double fontSize; // current font size
+ GBool rawOrder; // keep strings in content stream order
+
+ HtmlString *curStr; // currently active string
+
+ HtmlString *yxStrings; // strings in y-major order
+ HtmlString *xyStrings; // strings in x-major order
+ HtmlString *yxCur1, *yxCur2; // cursors for yxStrings list
+
+ void setDocName(char* fname);
+ void dumpAsXML(FILE* f,int page);
+ void dumpComplex(FILE* f, int page);
+
+ // marks the position of the fonts that belong to current page (for noframes)
+ int fontsPageMarker;
+ HtmlFontAccu *fonts;
+ HtmlLinks *links;
+
+ GooString *DocName; // document name used to build output file names
+ GooString *imgExt; // extension used in background image references
+ int pageWidth;
+ int pageHeight;
+ static int pgNum;
+ int firstPage; // used to begin the numeration of pages
+
+ friend class HtmlOutputDev;
+};
+
+//------------------------------------------------------------------------
+// HtmlMetaVar
+//------------------------------------------------------------------------
+// A name/content pair written into the HTML header as a <META> tag.
+class HtmlMetaVar {
+public:
+ // Copies both strings.
+ HtmlMetaVar(char *_name, char *_content);
+ ~HtmlMetaVar();
+
+ // Returns a newly allocated <META name="..." content="..."> tag; the
+ // caller deletes it.
+ GooString* toString();
+
+private:
+
+ GooString *name;
+ GooString *content;
+};
+
+//------------------------------------------------------------------------
+// HtmlOutputDev
+//------------------------------------------------------------------------
+
+// Output device that turns rendered pages into HTML or XML files.
+class HtmlOutputDev: public OutputDev {
+public:
+
+ // Open the output files for a document and write their headers.
+ // <fileName> is the output name root to which the mode-specific suffixes
+ // are appended. <title> becomes the document title; <author>,
+ // <keywords>, <subject> and <date> become META variables and may be
+ // NULL. <extension> is passed on to the page object as the image
+ // extension. If <rawOrder> is true, the text is kept in content
+ // stream order. <firstPage> is used for the frameset's initial
+ // contents link; <outline> enables the link to the document outline.
+ HtmlOutputDev(char *fileName, char *title,
+ char *author,
+ char *keywords,
+ char *subject,
+ char *date,
+ char *extension,
+ GBool rawOrder,
+ int firstPage = 1,
+ GBool outline = 0);
+
+ // Destructor.
+ virtual ~HtmlOutputDev();
+
+ // Check if file was successfully created.
+ virtual GBool isOk() { return ok; }
+
+ //---- get info about output device
+
+ // Does this device use upside-down coordinates?
+ // (Upside-down means (0,0) is the top left corner of the page.)
+ virtual GBool upsideDown() { return gTrue; }
+
+ // Does this device use drawChar() or drawString()?
+ virtual GBool useDrawChar() { return gTrue; }
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return gFalse; }
+
+ // Does this device need non-text content?
+ virtual GBool needNonText() { return gFalse; }
+
+ //----- initialization and control
+
+ // Start a page.
+ virtual void startPage(int pageNum, GfxState *state);
+
+ // End a page.
+ virtual void endPage();
+
+ //----- update text state
+ virtual void updateFont(GfxState *state);
+
+ //----- text drawing
+ virtual void beginString(GfxState *state, GooString *s);
+ virtual void endString(GfxState *state);
+ virtual void drawChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double originX, double originY,
+ CharCode code, Unicode *u, int uLen);
+
+ virtual void drawImageMask(GfxState *state, Object *ref,
+ Stream *str,
+ int width, int height, GBool invert,
+ GBool inlineImg);
+ virtual void drawImage(GfxState *state, Object *ref, Stream *str,
+ int width, int height, GfxImageColorMap *colorMap,
+ int *maskColors, GBool inlineImg);
+
+ //new feature
+ virtual int DevType() {return 1234;}
+ virtual void drawLink(Link *link,Catalog *cat);
+
+ // Size of the most recently finished page.
+ int getPageWidth() { return maxPageWidth; }
+ int getPageHeight() { return maxPageHeight; }
+
+ // Write the document outline; returns true if any entry was written.
+ GBool dumpDocOutline(Catalog* catalog);
+
+ /* char* getFileNameFromPath(char* c, int strlen); */
+
+private:
+ // convert encoding into a HTML standard, or encoding->getCString if not
+ // recognized
+ static char* mapEncodingToHtml(GooString* encoding);
+ // Build the href for a link; the caller deletes the returned string.
+ GooString* getLinkDest(Link *link,Catalog *catalog);
+ void dumpMetaVars(FILE *);
+ // Write the top-level frameset file.
+ void doFrame(int firstPage);
+ GBool newOutlineLevel(FILE *output, Object *node, Catalog* catalog, int level = 1);
+
+ FILE *fContentsFrame; // index ("links") frame file, or NULL
+ FILE *page; // html file
+ //FILE *tin; // image log file
+ //GBool write;
+ GBool needClose; // need to close the file?
+ HtmlPage *pages; // text for the current page
+ GBool rawOrder; // keep text in content stream order
+ GBool doOutline; // output document outline
+ GBool ok; // set up ok?
+ GBool dumpJPEG; // copy DCT image streams out as .jpg files
+ int pageNum; // number of the page being processed
+ int maxPageWidth;
+ int maxPageHeight;
+ static int imgNum; // number of the next image on the current page
+ GooString *Docname; // output name root
+ GooString *docTitle;
+ GooList *glMetaVars; // list of HtmlMetaVar
+ friend class HtmlPage;
+};
+
+// Returns a malloc'ed copy of the part of <c> after its last '/', looking
+// at the first <strlen> characters. The caller must free() the result.
+char* getFileNameFromPath(char* c, int strlen);
+
+#endif
diff --git a/utils/ImageOutputDev.cc b/utils/ImageOutputDev.cc
new file mode 100644
index 00000000..9789a748
--- /dev/null
+++ b/utils/ImageOutputDev.cc
@@ -0,0 +1,195 @@
+//========================================================================
+//
+// ImageOutputDev.cc
+//
+// Copyright 1998-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <poppler-config.h>
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <ctype.h>
+#include "goo/gmem.h"
+#include "config.h"
+#include "Error.h"
+#include "GfxState.h"
+#include "Object.h"
+#include "Stream.h"
+#include "DCTStream.h"
+#include "ImageOutputDev.h"
+
+// Set up an image dumper that names its files <fileRootA>-NNN.<ext>.
+// The root is copied, so the caller keeps ownership of fileRootA.
+ImageOutputDev::ImageOutputDev(char *fileRootA, GBool dumpJPEGA)
+  : dumpJPEG(dumpJPEGA), imgNum(0), ok(gTrue) {
+  fileRoot = copyString(fileRootA);
+  // scratch buffer reused for every output name: the root plus 20
+  // bytes of slack for the "-NNN.ext" suffix and the terminator
+  fileName = (char *)gmalloc(strlen(fileRoot) + 20);
+}
+
+// Release the two buffers allocated by the constructor.
+ImageOutputDev::~ImageOutputDev() {
+  gfree(fileRoot);
+  gfree(fileName);
+}
+
+// Writes one image mask to the next numbered output file: a verbatim
+// copy of the raw DCT data (.jpg) when dumpJPEG is set and the mask is
+// a non-inline DCT stream, otherwise a raw PBM (.pbm) holding
+// height * ceil(width / 8) bytes read from the stream.  state, ref and
+// invert are not used.  The image number is consumed even when the
+// output file cannot be opened.
+void ImageOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
+                                   int width, int height, GBool invert,
+                                   GBool inlineImg) {
+  FILE *out;
+  int ch;
+  int nBytes, k;
+
+  // ----- PBM path: everything that is not dumped as JPEG
+  if (!dumpJPEG || str->getKind() != strDCT || inlineImg) {
+
+    // open the image file and write the PBM header
+    sprintf(fileName, "%s-%03d.pbm", fileRoot, imgNum);
+    ++imgNum;
+    out = fopen(fileName, "wb");
+    if (!out) {
+      error(-1, "Couldn't open image file '%s'", fileName);
+      return;
+    }
+    fprintf(out, "P4\n");
+    fprintf(out, "%d %d\n", width, height);
+
+    // copy one packed row after another straight from the stream
+    str->reset();
+    nBytes = height * ((width + 7) / 8);
+    k = 0;
+    while (k < nBytes) {
+      fputc(str->getChar(), out);
+      ++k;
+    }
+
+    str->close();
+    fclose(out);
+    return;
+  }
+
+  // ----- JPEG path: copy the undecoded DCT data verbatim
+  sprintf(fileName, "%s-%03d.jpg", fileRoot, imgNum);
+  ++imgNum;
+  out = fopen(fileName, "wb");
+  if (!out) {
+    error(-1, "Couldn't open image file '%s'", fileName);
+    return;
+  }
+
+  // switch to the stream underneath the DCT decoder
+  str = ((DCTStream *)str)->getRawStream();
+  str->reset();
+
+  for (ch = str->getChar(); ch != EOF; ch = str->getChar()) {
+    fputc(ch, out);
+  }
+
+  str->close();
+  fclose(out);
+}
+
+// Writes one image to the next numbered output file:
+//  - a DCT-encoded, 3-component, non-inline image is copied verbatim to
+//    <fileRoot>-NNN.jpg when dumpJPEG is set;
+//  - a 1-component, 1-bit image is written as raw PBM, with every byte
+//    XORed with 0xff;
+//  - anything else is converted through colorMap to 8-bit RGB and
+//    written as raw PPM.
+// state, ref and maskColors are not used.  If the output file cannot
+// be opened an error is reported and the image is skipped; its number
+// is still consumed.
+void ImageOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
+                               int width, int height,
+                               GfxImageColorMap *colorMap,
+                               int *maskColors, GBool inlineImg) {
+  FILE *f;
+  ImageStream *imgStr;
+  Guchar *p;
+  GfxRGB rgb;
+  int x, y;
+  int c;
+  int size, i;
+
+  // dump JPEG file
+  if (dumpJPEG && str->getKind() == strDCT &&
+      colorMap->getNumPixelComps() == 3 &&
+      !inlineImg) {
+
+    // open the image file
+    sprintf(fileName, "%s-%03d.jpg", fileRoot, imgNum);
+    ++imgNum;
+    if (!(f = fopen(fileName, "wb"))) {
+      error(-1, "Couldn't open image file '%s'", fileName);
+      return;
+    }
+
+    // initialize stream: read the undecoded data below the DCT filter
+    str = ((DCTStream *)str)->getRawStream();
+    str->reset();
+
+    // copy the stream
+    while ((c = str->getChar()) != EOF)
+      fputc(c, f);
+
+    str->close();
+    fclose(f);
+
+  // dump PBM file
+  } else if (colorMap->getNumPixelComps() == 1 &&
+             colorMap->getBits() == 1) {
+
+    // open the image file and write the PBM header
+    sprintf(fileName, "%s-%03d.pbm", fileRoot, imgNum);
+    ++imgNum;
+    if (!(f = fopen(fileName, "wb"))) {
+      error(-1, "Couldn't open image file '%s'", fileName);
+      return;
+    }
+    fprintf(f, "P4\n");
+    fprintf(f, "%d %d\n", width, height);
+
+    // initialize stream
+    str->reset();
+
+    // copy the stream, flipping every bit on the way out
+    size = height * ((width + 7) / 8);
+    for (i = 0; i < size; ++i) {
+      fputc(str->getChar() ^ 0xff, f);
+    }
+
+    str->close();
+    fclose(f);
+
+  // dump PPM file
+  } else {
+
+    // open the image file and write the PPM header
+    sprintf(fileName, "%s-%03d.ppm", fileRoot, imgNum);
+    ++imgNum;
+    if (!(f = fopen(fileName, "wb"))) {
+      error(-1, "Couldn't open image file '%s'", fileName);
+      return;
+    }
+    fprintf(f, "P6\n");
+    fprintf(f, "%d %d\n", width, height);
+    fprintf(f, "255\n");
+
+    // initialize stream
+    imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(),
+                             colorMap->getBits());
+    imgStr->reset();
+
+    // for each line...
+    for (y = 0; y < height; ++y) {
+
+      // write the line, one RGB triple per pixel
+      p = imgStr->getLine();
+      for (x = 0; x < width; ++x) {
+        colorMap->getRGB(p, &rgb);
+        fputc((int)(rgb.r * 255 + 0.5), f);
+        fputc((int)(rgb.g * 255 + 0.5), f);
+        fputc((int)(rgb.b * 255 + 0.5), f);
+        p += colorMap->getNumPixelComps();
+      }
+    }
+
+    // close the source stream here too, as the JPEG and PBM paths do;
+    // previously this path left it open after reading
+    str->close();
+    delete imgStr;
+
+    fclose(f);
+  }
+}
diff --git a/utils/ImageOutputDev.h b/utils/ImageOutputDev.h
new file mode 100644
index 00000000..404e2f8c
--- /dev/null
+++ b/utils/ImageOutputDev.h
@@ -0,0 +1,76 @@
+//========================================================================
+//
+// ImageOutputDev.h
+//
+// Copyright 1998-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#ifndef IMAGEOUTPUTDEV_H
+#define IMAGEOUTPUTDEV_H
+
+#include <poppler-config.h>
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include <stdio.h>
+#include "goo/gtypes.h"
+#include "OutputDev.h"
+
+class GfxState;
+
+//------------------------------------------------------------------------
+// ImageOutputDev
+//------------------------------------------------------------------------
+
+// OutputDev that, instead of rendering, saves each image it is handed
+// (drawImageMask / drawImage) to its own numbered file on disk.
+class ImageOutputDev: public OutputDev {
+public:
+
+ // Create an OutputDev which will write images to files named
+ // <fileRoot>-NNN.<type>. Normally, all images are written as PBM
+ // (.pbm) or PPM (.ppm) files. If <dumpJPEG> is set, JPEG images are
+ // written as JPEG (.jpg) files.
+ ImageOutputDev(char *fileRootA, GBool dumpJPEGA);
+
+ // Destructor.
+ virtual ~ImageOutputDev();
+
+ // Report the setup status. No file is opened at construction time;
+ // the constructor in ImageOutputDev.cc sets this to gTrue.
+ virtual GBool isOk() { return ok; }
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return gFalse; }
+
+ // Does this device need non-text content?
+ virtual GBool needNonText() { return gFalse; }
+
+ //---- get info about output device
+
+ // Does this device use upside-down coordinates?
+ // (Upside-down means (0,0) is the top left corner of the page.)
+ virtual GBool upsideDown() { return gTrue; }
+
+ // Does this device use drawChar() or drawString()?
+ virtual GBool useDrawChar() { return gFalse; }
+
+ //----- image drawing
+
+ // Write an image mask to the next numbered file (.jpg or .pbm).
+ virtual void drawImageMask(GfxState *state, Object *ref, Stream *str,
+ int width, int height, GBool invert,
+ GBool inlineImg);
+ // Write an image to the next numbered file (.jpg, .pbm or .ppm).
+ virtual void drawImage(GfxState *state, Object *ref, Stream *str,
+ int width, int height, GfxImageColorMap *colorMap,
+ int *maskColors, GBool inlineImg);
+
+private:
+
+ char *fileRoot; // root of output file names
+ char *fileName; // buffer for output file names
+ GBool dumpJPEG; // set to dump native JPEG files
+ int imgNum; // current image number
+ GBool ok; // set up ok?
+};
+
+#endif
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 00000000..9ddef40d
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,18 @@
+# Headers are found relative to the top source directory and in the
+# poppler core directory.
+INCLUDES = \
+ -I$(top_srcdir) \
+ -I$(top_srcdir)/poppler
+
+# Every utility links against the in-tree poppler library.
+LDADD = \
+ $(top_builddir)/poppler/libpoppler.la
+
+# NOTE(review): pdftoppm.cc and pdftoppm.1 are added by the same change
+# but are not listed below, so pdftoppm is neither built nor installed
+# from here -- confirm this is intentional.
+bin_PROGRAMS = pdffonts pdfimages pdfinfo pdftops pdftotext pdftohtml
+
+man1_MANS = pdffonts.1 pdfimages.1 pdfinfo.1 pdftops.1 pdftotext.1 pdftohtml.1
+
+# parseargs.c is compiled into every tool.
+pdffonts_SOURCES = pdffonts.cc parseargs.c
+pdfimages_SOURCES = pdfimages.cc ImageOutputDev.cc parseargs.c
+pdfinfo_SOURCES = pdfinfo.cc parseargs.c
+pdftops_SOURCES = pdftops.cc parseargs.c
+pdftotext_SOURCES = pdftotext.cc parseargs.c
+pdftohtml_SOURCES = pdftohtml.cc parseargs.c \
+ HtmlFonts.cc HtmlLinks.cc HtmlOutputDev.cc
diff --git a/utils/parseargs.c b/utils/parseargs.c
new file mode 100644
index 00000000..9f579436
--- /dev/null
+++ b/utils/parseargs.c
@@ -0,0 +1,190 @@
+/*
+ * parseargs.c
+ *
+ * Command line argument parser.
+ *
+ * Copyright 1996-2003 Glyph & Cog, LLC
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include "parseargs.h"
+
+static ArgDesc *findArg(ArgDesc *args, char *arg);
+static GBool grabArg(ArgDesc *arg, int i, int *argc, char *argv[]);
+
+/*
+ * Scan argv[1..] for switches listed in <args>, store their values and
+ * remove them (and their value, if any) from argv, adjusting *argc.
+ * A literal "--" is removed and ends the scan.  Words that match no
+ * switch are left in place.  Returns gFalse if any switch had a missing
+ * or malformed value; scanning continues past such errors.
+ */
+GBool parseArgs(ArgDesc *args, int *argc, char *argv[]) {
+  GBool allOk = gTrue;
+  int pos = 1;
+
+  while (pos < *argc) {
+    ArgDesc *desc;
+    int k;
+
+    if (strcmp(argv[pos], "--") == 0) {
+      /* drop the "--" itself, then stop looking at anything after it */
+      --*argc;
+      for (k = pos; k < *argc; ++k) {
+        argv[k] = argv[k+1];
+      }
+      break;
+    }
+
+    desc = findArg(args, argv[pos]);
+    if (!desc) {
+      /* not one of ours: keep it and move on */
+      ++pos;
+      continue;
+    }
+
+    /* grabArg removes the consumed words, so pos already points at
+       the next candidate */
+    if (!grabArg(desc, pos, argc, argv)) {
+      allOk = gFalse;
+    }
+  }
+  return allOk;
+}
+
+/*
+ * Print a usage summary to stderr: a "Usage:" line built from <program>
+ * and the optional <otherArgs>, followed by one line per entry of
+ * <args> (dummy entries included) giving the switch, its value type,
+ * and its usage text, with the usage texts aligned in a column.
+ */
+void printUsage(char *program, char *otherArgs, ArgDesc *args) {
+  ArgDesc *desc;
+  char *kindLabel;
+  int widest, len, pad;
+
+  /* length of the longest switch name, used for column alignment */
+  widest = 0;
+  for (desc = args; desc->arg; ++desc) {
+    len = strlen(desc->arg);
+    if (len > widest) {
+      widest = len;
+    }
+  }
+
+  fprintf(stderr, "Usage: %s [options]", program);
+  if (otherArgs) {
+    fprintf(stderr, " %s", otherArgs);
+  }
+  fprintf(stderr, "\n");
+
+  for (desc = args; desc->arg; ++desc) {
+    fprintf(stderr, " %s", desc->arg);
+    pad = 9 + widest - strlen(desc->arg);
+    if (desc->kind == argInt || desc->kind == argIntDummy) {
+      kindLabel = " <int>";
+    } else if (desc->kind == argFP || desc->kind == argFPDummy) {
+      kindLabel = " <fp>";
+    } else if (desc->kind == argString || desc->kind == argStringDummy) {
+      kindLabel = " <string>";
+    } else {
+      /* flags (and anything unrecognized) take no value */
+      kindLabel = "";
+    }
+    fprintf(stderr, "%-*s", pad, kindLabel);
+    if (desc->usage) {
+      fprintf(stderr, ": %s", desc->usage);
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+/*
+ * Return the descriptor in <args> whose switch equals <arg> exactly, or
+ * NULL if there is none.  Dummy kinds (argFlagDummy and later) never
+ * match; they exist only for the usage listing.
+ */
+static ArgDesc *findArg(ArgDesc *args, char *arg) {
+  ArgDesc *cur = args;
+
+  while (cur->arg) {
+    if (cur->kind < argFlagDummy && strcmp(cur->arg, arg) == 0) {
+      return cur;
+    }
+    ++cur;
+  }
+  return NULL;
+}
+
+/*
+ * Consume the switch at argv[i] described by <arg>: store its value
+ * through arg->val, then remove the switch (and its value word, when
+ * one was accepted) from argv and shrink *argc accordingly.  Returns
+ * gFalse when a required value is missing or not of the right form; in
+ * that case only the switch itself is removed.
+ */
+static GBool grabArg(ArgDesc *arg, int i, int *argc, char *argv[]) {
+  GBool success = gTrue;
+  int used = 1;                    /* words to remove from argv */
+  int hasValue = i + 1 < *argc;    /* is there a word after the switch? */
+  int k;
+
+  switch (arg->kind) {
+  case argFlag:
+    *(GBool *)arg->val = gTrue;
+    break;
+  case argInt:
+    if (hasValue && isInt(argv[i+1])) {
+      *(int *)arg->val = atoi(argv[i+1]);
+      used = 2;
+    } else {
+      success = gFalse;
+    }
+    break;
+  case argFP:
+    if (hasValue && isFP(argv[i+1])) {
+      *(double *)arg->val = atof(argv[i+1]);
+      used = 2;
+    } else {
+      success = gFalse;
+    }
+    break;
+  case argString:
+    if (hasValue) {
+      /* truncate to the caller's buffer and always terminate */
+      strncpy((char *)arg->val, argv[i+1], arg->size - 1);
+      ((char *)arg->val)[arg->size - 1] = '\0';
+      used = 2;
+    } else {
+      success = gFalse;
+    }
+    break;
+  default:
+    fprintf(stderr, "Internal error in arg table\n");
+    break;
+  }
+
+  /* close the gap left by the consumed words */
+  *argc -= used;
+  for (k = i; k < *argc; ++k) {
+    argv[k] = argv[k + used];
+  }
+  return success;
+}
+
+/*
+ * Return gTrue if <s> is an optionally signed, non-empty run of decimal
+ * digits and nothing else.  An empty string or a lone sign is rejected,
+ * so that a missing value is reported as an error instead of silently
+ * parsing as 0.
+ */
+GBool isInt(char *s) {
+  int digits = 0;
+
+  if (*s == '-' || *s == '+')
+    ++s;
+  /* the cast keeps isdigit() defined for bytes >= 0x80 where char is
+     signed */
+  while (isdigit((unsigned char)*s)) {
+    ++s;
+    ++digits;
+  }
+  if (digits == 0 || *s)
+    return gFalse;
+  return gTrue;
+}
+
+/*
+ * Return gTrue if <s> is a decimal floating point literal: an optional
+ * sign, digits with an optional '.', and an optional exponent ('e' or
+ * 'E', optional sign, at least one digit).  At least one mantissa digit
+ * is required, so "", ".", "-" and "-." are rejected instead of being
+ * accepted and parsed as 0.
+ */
+GBool isFP(char *s) {
+  int mantissaDigits = 0;
+
+  if (*s == '-' || *s == '+')
+    ++s;
+  /* the casts keep isdigit() defined for bytes >= 0x80 where char is
+     signed */
+  while (isdigit((unsigned char)*s)) {
+    ++s;
+    ++mantissaDigits;
+  }
+  if (*s == '.')
+    ++s;
+  while (isdigit((unsigned char)*s)) {
+    ++s;
+    ++mantissaDigits;
+  }
+  if (mantissaDigits == 0)
+    return gFalse;
+  if (*s == 'e' || *s == 'E') {
+    ++s;
+    if (*s == '-' || *s == '+')
+      ++s;
+    if (!isdigit((unsigned char)*s))
+      return gFalse;
+    do {
+      ++s;
+    } while (isdigit((unsigned char)*s));
+  }
+  if (*s)
+    return gFalse;
+  return gTrue;
+}
diff --git a/utils/parseargs.h b/utils/parseargs.h
new file mode 100644
index 00000000..1b1c570e
--- /dev/null
+++ b/utils/parseargs.h
@@ -0,0 +1,71 @@
+/*
+ * parseargs.h
+ *
+ * Command line argument parser.
+ *
+ * Copyright 1996-2003 Glyph & Cog, LLC
+ */
+
+#ifndef PARSEARGS_H
+#define PARSEARGS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "goo/gtypes.h"
+
+/*
+ * Argument kinds.
+ */
+/*
+ * The order of these enumerators matters: findArg() in parseargs.c
+ * treats every kind that compares less than argFlagDummy as a real,
+ * parseable switch and ignores the rest.
+ */
+typedef enum {
+ argFlag, /* flag (present / not-present) */
+ /* [val: GBool *] */
+ argInt, /* integer arg */
+ /* [val: int *] */
+ argFP, /* floating point arg */
+ /* [val: double *] */
+ argString, /* string arg */
+ /* [val: char *] */
+ /* dummy entries -- these show up in the usage listing only; */
+ /* useful for X args, for example */
+ argFlagDummy,
+ argIntDummy,
+ argFPDummy,
+ argStringDummy
+} ArgKind;
+
+/*
+ * Argument descriptor.
+ */
+/*
+ * Tables of these are terminated by an entry whose <arg> is NULL.
+ */
+typedef struct {
+ char *arg; /* the command line switch */
+ ArgKind kind; /* kind of arg */
+ void *val; /* place to store value */
+ int size; /* for argString: size of string */
+ /* (buffer size, terminator included) */
+ char *usage; /* usage string */
+} ArgDesc;
+
+/*
+ * Parse command line. Removes all args which are found in the arg
+ * descriptor list <args>. Stops parsing if "--" is found (and removes
+ * it). Returns gFalse if there was an error.
+ */
+extern GBool parseArgs(ArgDesc *args, int *argc, char *argv[]);
+
+/*
+ * Print usage message, based on arg descriptor list.
+ */
+extern void printUsage(char *program, char *otherArgs, ArgDesc *args);
+
+/*
+ * Check if a string is a valid integer or floating point number.
+ */
+extern GBool isInt(char *s);
+extern GBool isFP(char *s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/utils/pdffonts.1 b/utils/pdffonts.1
new file mode 100644
index 00000000..73246903
--- /dev/null
+++ b/utils/pdffonts.1
@@ -0,0 +1,128 @@
+.\" Copyright 1999-2004 Glyph & Cog, LLC
+.TH pdffonts 1 "22 January 2004"
+.SH NAME
+pdffonts \- Portable Document Format (PDF) font analyzer (version
+3.00)
+.SH SYNOPSIS
+.B pdffonts
+[options]
+.I PDF-file
+.SH DESCRIPTION
+.B Pdffonts
+lists the fonts used in a Portable Document Format (PDF) file along
+with various information for each font.
+.PP
+The following information is listed for each font:
+.TP
+.B name
+the font name, exactly as given in the PDF file (potentially including
+a subset prefix)
+.TP
+.B type
+the font type -- see below for details
+.TP
+.B emb
+"yes" if the font is embedded in the PDF file
+.TP
+.B sub
+"yes" if the font is a subset
+.TP
+.B uni
+"yes" if there is an explicit "ToUnicode" map in the PDF file (the
+absence of a ToUnicode map doesn't necessarily mean that the text
+can't be converted to Unicode)
+.TP
+.B object ID
+the font dictionary object ID (number and generation)
+.PP
+PDF files can contain the following types of fonts:
+.PP
+.RS
+Type 1
+.RE
+.RS
+Type 1C -- aka Compact Font Format (CFF)
+.RE
+.RS
+Type 3
+.RE
+.RS
+TrueType
+.RE
+.RS
+CID Type 0 -- 16-bit font with no specified type
+.RE
+.RS
+CID Type 0C -- 16-bit PostScript CFF font
+.RE
+.RS
+CID TrueType -- 16-bit TrueType font
+.RE
+.SH CONFIGURATION FILE
+Pdffonts reads a configuration file at startup. It first tries to
+find the user's private config file, ~/.xpdfrc. If that doesn't
+exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to analyze.
+.TP
+.BI \-l " number"
+Specifies the last page to analyze.
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
+.BI \-cfg " config-file"
+Read
+.I config-file
+in place of ~/.xpdfrc or the system-wide config file.
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdffonts software and documentation are copyright 1996-2004 Glyph
+& Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftops (1),
+.BR pdftotext (1),
+.BR pdfinfo (1),
+.BR pdftoppm (1),
+.BR pdfimages (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc
new file mode 100644
index 00000000..e4530d22
--- /dev/null
+++ b/utils/pdffonts.cc
@@ -0,0 +1,294 @@
+//========================================================================
+//
+// pdffonts.cc
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <math.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "GlobalParams.h"
+#include "Error.h"
+#include "Object.h"
+#include "Dict.h"
+#include "GfxFont.h"
+#include "Annot.h"
+#include "PDFDoc.h"
+#include "config.h"
+
+// Printable names for font types, indexed by the value returned from
+// GfxFont::getType() (see scanFont).
+// NOTE(review): the order presumably mirrors the GfxFontType enum --
+// confirm against GfxFont.h.
+// The elements are const because they point at string literals.
+static const char *fontTypeNames[] = {
+  "unknown",
+  "Type 1",
+  "Type 1C",
+  "Type 3",
+  "TrueType",
+  "CID Type 0",
+  "CID Type 0C",
+  "CID TrueType"
+};
+
+static void scanFonts(Dict *resDict, PDFDoc *doc);
+static void scanFont(GfxFont *font, PDFDoc *doc);
+
+// Command line option values, filled in by parseArgs() through argDesc.
+static int firstPage = 1;
+// values below 1 (including this default) select the document's last page
+static int lastPage = 0;
+// a leading '\001' byte marks "no password given on the command line"
+static char ownerPassword[33] = "\001";
+static char userPassword[33] = "\001";
+static char cfgFileName[256] = "";
+static GBool printVersion = gFalse;
+static GBool printHelp = gFalse;
+
+// Option table: switch, kind, destination, destination size (string
+// options only), usage text. The {NULL} entry terminates the table.
+static ArgDesc argDesc[] = {
+ {"-f", argInt, &firstPage, 0,
+ "first page to examine"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to examine"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL}
+};
+
+// Fonts already reported, so each font object is listed only once:
+// a growable array (fontsSize slots allocated, fontsLen in use).
+static Ref *fonts;
+static int fontsLen;
+static int fontsSize;
+
+// pdffonts entry point. Parses the options, opens the PDF named by the
+// single positional argument, and prints one table row per distinct
+// font found in the resources of each selected page and in the
+// appearance streams of that page's annotations.
+// Returns 0 on success, 1 if the document does not report isOk(), and
+// 99 otherwise (which includes the usage / version output path).
+int main(int argc, char *argv[]) {
+ PDFDoc *doc;
+ GooString *fileName;
+ GooString *ownerPW, *userPW;
+ GBool ok;
+ Page *page;
+ Dict *resDict;
+ Annots *annots;
+ Object obj1, obj2;
+ int pg, i;
+ int exitCode;
+
+ exitCode = 99;
+
+ // parse args
+ // (exactly one positional argument, the PDF file, must remain)
+ ok = parseArgs(argDesc, &argc, argv);
+ if (!ok || argc != 2 || printVersion || printHelp) {
+ fprintf(stderr, "pdffonts version %s\n", xpdfVersion);
+ fprintf(stderr, "%s\n", xpdfCopyright);
+ if (!printVersion) {
+ printUsage("pdffonts", "<PDF-file>", argDesc);
+ }
+ goto err0;
+ }
+ fileName = new GooString(argv[1]);
+
+ // read config file
+ globalParams = new GlobalParams(cfgFileName);
+
+ // open PDF file
+ // (a password buffer still starting with '\001' was never set by the
+ // user, so no password object is created for it)
+ if (ownerPassword[0] != '\001') {
+ ownerPW = new GooString(ownerPassword);
+ } else {
+ ownerPW = NULL;
+ }
+ if (userPassword[0] != '\001') {
+ userPW = new GooString(userPassword);
+ } else {
+ userPW = NULL;
+ }
+ doc = new PDFDoc(fileName, ownerPW, userPW);
+ if (userPW) {
+ delete userPW;
+ }
+ if (ownerPW) {
+ delete ownerPW;
+ }
+ if (!doc->isOk()) {
+ exitCode = 1;
+ goto err1;
+ }
+
+ // get page range
+ // (out-of-range values are clamped to the document)
+ if (firstPage < 1) {
+ firstPage = 1;
+ }
+ if (lastPage < 1 || lastPage > doc->getNumPages()) {
+ lastPage = doc->getNumPages();
+ }
+
+ // scan the fonts
+ printf("name type emb sub uni object ID\n");
+ printf("------------------------------------ ------------ --- --- --- ---------\n");
+ fonts = NULL;
+ fontsLen = fontsSize = 0;
+ for (pg = firstPage; pg <= lastPage; ++pg) {
+ page = doc->getCatalog()->getPage(pg);
+ // fonts referenced from the page's own resource dictionary
+ if ((resDict = page->getResourceDict())) {
+ scanFonts(resDict, doc);
+ }
+ // fonts referenced from annotation appearance streams
+ annots = new Annots(doc->getXRef(),
+ doc->getCatalog(),
+ page->getAnnots(&obj1));
+ obj1.free();
+ for (i = 0; i < annots->getNumAnnots(); ++i) {
+ if (annots->getAnnot(i)->getAppearance(&obj1)->isStream()) {
+ obj1.streamGetDict()->lookup("Resources", &obj2);
+ if (obj2.isDict()) {
+ scanFonts(obj2.getDict(), doc);
+ }
+ obj2.free();
+ }
+ obj1.free();
+ }
+ delete annots;
+ }
+
+ exitCode = 0;
+
+ // clean up
+ // (err1 is entered once doc and globalParams exist; err0 is entered
+ // before anything has been allocated)
+ gfree(fonts);
+ err1:
+ delete doc;
+ delete globalParams;
+ err0:
+
+ // check for memory leaks
+ Object::memCheck(stderr);
+ gMemReport(stderr);
+
+ return exitCode;
+}
+
+// Report every font reachable from the resource dictionary <resDict>:
+// first the entries of its "Font" dictionary (given directly or through
+// an indirect reference), then, recursively, the "Resources" of every
+// stream listed in its "XObject" dictionary.
+// NOTE(review): nothing here tracks already-visited XObjects, so a
+// resource tree that refers back to itself would presumably recurse
+// without bound -- confirm whether such input needs guarding.
+static void scanFonts(Dict *resDict, PDFDoc *doc) {
+  Object fontObj, fetched, xObjects, xObj, subRes;
+  Ref fontDictRef;
+  GfxFontDict *fontDict;
+  GfxFont *f;
+  int k;
+
+  // ----- fonts named directly in this resource dictionary
+  fontDict = NULL;
+  resDict->lookupNF("Font", &fontObj);
+  if (fontObj.isDict()) {
+    fontDict = new GfxFontDict(doc->getXRef(), NULL, fontObj.getDict());
+  } else if (fontObj.isRef()) {
+    fontObj.fetch(doc->getXRef(), &fetched);
+    if (fetched.isDict()) {
+      fontDictRef = fontObj.getRef();
+      fontDict = new GfxFontDict(doc->getXRef(), &fontDictRef,
+                                 fetched.getDict());
+    }
+    fetched.free();
+  }
+  if (fontDict) {
+    for (k = 0; k < fontDict->getNumFonts(); ++k) {
+      f = fontDict->getFont(k);
+      if (f) {
+        scanFont(f, doc);
+      }
+    }
+    delete fontDict;
+  }
+  fontObj.free();
+
+  // ----- fonts used by XObjects that carry their own resources
+  resDict->lookup("XObject", &xObjects);
+  if (xObjects.isDict()) {
+    for (k = 0; k < xObjects.dictGetLength(); ++k) {
+      xObjects.dictGetVal(k, &xObj);
+      if (xObj.isStream()) {
+        xObj.streamGetDict()->lookup("Resources", &subRes);
+        if (subRes.isDict()) {
+          scanFonts(subRes.getDict(), doc);
+        }
+        subRes.free();
+      }
+      xObj.free();
+    }
+  }
+  xObjects.free();
+}
+
+// Print one table row for <font> unless a font with the same object ID
+// has already been printed, then remember its ID in the global fonts
+// list.  The row shows the original name, type, and whether the font is
+// embedded, looks like a subset, and has a ToUnicode stream.
+static void scanFont(GfxFont *font, PDFDoc *doc) {
+  Ref id, embeddedId;
+  Object fontDictObj, toUni;
+  GooString *fontName;
+  GBool isEmbedded, isSubset, hasToUnicode;
+  int k;
+
+  id = *font->getID();
+
+  // nothing to do for a font that was already listed
+  for (k = 0; k < fontsLen; ++k) {
+    if (fonts[k].num == id.num && fonts[k].gen == id.gen) {
+      return;
+    }
+  }
+
+  fontName = font->getOrigName();
+
+  // Type 3 fonts always count as embedded
+  if (font->getType() != fontType3) {
+    isEmbedded = font->getEmbeddedFontID(&embeddedId);
+  } else {
+    isEmbedded = gTrue;
+  }
+
+  // a ToUnicode entry only counts when it is a stream
+  hasToUnicode = gFalse;
+  if (doc->getXRef()->fetch(id.num, id.gen, &fontDictObj)->isDict()) {
+    hasToUnicode = fontDictObj.dictLookup("ToUnicode", &toUni)->isStream();
+    toUni.free();
+  }
+  fontDictObj.free();
+
+  // subset names are one or more capital letters followed by '+'
+  isSubset = gFalse;
+  if (fontName) {
+    k = 0;
+    while (k < fontName->getLength() &&
+           fontName->getChar(k) >= 'A' && fontName->getChar(k) <= 'Z') {
+      ++k;
+    }
+    isSubset = k > 0 && k < fontName->getLength() &&
+               fontName->getChar(k) == '+';
+  }
+
+  // print the row; IDs with a generation of 100000 or more are shown
+  // as having no object ID
+  printf("%-36s %-12s %-3s %-3s %-3s",
+         fontName ? fontName->getCString() : "[none]",
+         fontTypeNames[font->getType()],
+         isEmbedded ? "yes" : "no",
+         isSubset ? "yes" : "no",
+         hasToUnicode ? "yes" : "no");
+  if (id.gen < 100000) {
+    printf(" %6d %2d\n", id.num, id.gen);
+  } else {
+    printf(" [none]\n");
+  }
+
+  // remember this font, growing the list 32 entries at a time
+  if (fontsLen == fontsSize) {
+    fontsSize += 32;
+    fonts = (Ref *)grealloc(fonts, fontsSize * sizeof(Ref));
+  }
+  fonts[fontsLen++] = id;
+}
diff --git a/utils/pdfimages.1 b/utils/pdfimages.1
new file mode 100644
index 00000000..c580625e
--- /dev/null
+++ b/utils/pdfimages.1
@@ -0,0 +1,96 @@
+.\" Copyright 1998-2004 Glyph & Cog, LLC
+.TH pdfimages 1 "22 January 2004"
+.SH NAME
+pdfimages \- Portable Document Format (PDF) image extractor
+(version 3.00)
+.SH SYNOPSIS
+.B pdfimages
+[options]
+.I PDF-file image-root
+.SH DESCRIPTION
+.B Pdfimages
+saves images from a Portable Document Format (PDF) file as Portable
+Pixmap (PPM), Portable Bitmap (PBM), or JPEG files.
+.PP
+Pdfimages reads the PDF file
+.IR PDF-file ,
+scans one or more pages, and writes one PPM, PBM, or JPEG file for each image,
+.IR image-root - nnn . xxx ,
+where
+.I nnn
+is the image number and
+.I xxx
+is the image type (.ppm, .pbm, .jpg).
+.SH CONFIGURATION FILE
+Pdfimages reads a configuration file at startup. It first tries to
+find the user's private config file, ~/.xpdfrc. If that doesn't
+exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to scan.
+.TP
+.BI \-l " number"
+Specifies the last page to scan.
+.TP
+.B \-j
+Normally, all images are written as PBM (for monochrome images) or PPM
+(for non-monochrome images) files. With this option, images in DCT
+format are saved as JPEG files. All non-DCT images are saved in
+PBM/PPM format as usual.
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
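+.B \-q
+Don't print any messages or errors.
+.RB "[config file: " errQuiet ]
+.TP
+.BI \-cfg " config-file"
+Read
+.I config-file
+in place of ~/.xpdfrc or the system-wide config file.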
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdfimages software and documentation are copyright 1998-2004 Glyph
+& Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftops (1),
+.BR pdftotext (1),
+.BR pdfinfo (1),
+.BR pdffonts (1),
+.BR pdftoppm (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdfimages.cc b/utils/pdfimages.cc
new file mode 100644
index 00000000..be020ec8
--- /dev/null
+++ b/utils/pdfimages.cc
@@ -0,0 +1,159 @@
+//========================================================================
+//
+// pdfimages.cc
+//
+// Copyright 1998-2003 Glyph & Cog, LLC
+//
+// Modified for Debian by Hamish Moffatt, 22 May 2002.
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "ImageOutputDev.h"
+#include "Error.h"
+#include "config.h"
+
+// Command line option values, filled in by parseArgs() through argDesc.
+static int firstPage = 1;
+// values below 1 (including this default) select the document's last page
+static int lastPage = 0;
+static GBool dumpJPEG = gFalse;
+// a leading '\001' byte marks "no password given on the command line"
+static char ownerPassword[33] = "\001";
+static char userPassword[33] = "\001";
+static GBool quiet = gFalse;
+static char cfgFileName[256] = "";
+static GBool printVersion = gFalse;
+static GBool printHelp = gFalse;
+
+// Option table: switch, kind, destination, destination size (string
+// options only), usage text. The {NULL} entry terminates the table.
+static ArgDesc argDesc[] = {
+ {"-f", argInt, &firstPage, 0,
+ "first page to convert"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to convert"},
+ {"-j", argFlag, &dumpJPEG, 0,
+ "write JPEG images as JPEG files"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL}
+};
+
+// pdfimages entry point. Parses the options, opens the PDF named by
+// the first positional argument, and runs the selected page range
+// through an ImageOutputDev that writes files rooted at the second
+// positional argument.
+// Returns 0 on success, 1 if the document does not report isOk(), 3 if
+// ENFORCE_PERMISSIONS is defined and copying is not allowed, and 99
+// otherwise (which includes the usage / version output path).
+int main(int argc, char *argv[]) {
+ PDFDoc *doc;
+ GooString *fileName;
+ char *imgRoot;
+ GooString *ownerPW, *userPW;
+ ImageOutputDev *imgOut;
+ GBool ok;
+ int exitCode;
+
+ exitCode = 99;
+
+ // parse args
+ // (exactly two positional arguments must remain: PDF file, image root)
+ ok = parseArgs(argDesc, &argc, argv);
+ if (!ok || argc != 3 || printVersion || printHelp) {
+ fprintf(stderr, "pdfimages version %s\n", xpdfVersion);
+ fprintf(stderr, "%s\n", xpdfCopyright);
+ if (!printVersion) {
+ printUsage("pdfimages", "<PDF-file> <image-root>", argDesc);
+ }
+ goto err0;
+ }
+ fileName = new GooString(argv[1]);
+ imgRoot = argv[2];
+
+ // read config file
+ globalParams = new GlobalParams(cfgFileName);
+ if (quiet) {
+ globalParams->setErrQuiet(quiet);
+ }
+
+ // open PDF file
+ // (a password buffer still starting with '\001' was never set by the
+ // user, so no password object is created for it)
+ if (ownerPassword[0] != '\001') {
+ ownerPW = new GooString(ownerPassword);
+ } else {
+ ownerPW = NULL;
+ }
+ if (userPassword[0] != '\001') {
+ userPW = new GooString(userPassword);
+ } else {
+ userPW = NULL;
+ }
+ doc = new PDFDoc(fileName, ownerPW, userPW);
+ if (userPW) {
+ delete userPW;
+ }
+ if (ownerPW) {
+ delete ownerPW;
+ }
+ if (!doc->isOk()) {
+ exitCode = 1;
+ goto err1;
+ }
+
+ // check for copy permission
+#ifdef ENFORCE_PERMISSIONS
+ if (!doc->okToCopy()) {
+ error(-1, "Copying of images from this document is not allowed.");
+ exitCode = 3;
+ goto err1;
+ }
+#endif
+
+ // get page range
+ // (out-of-range values are clamped to the document)
+ if (firstPage < 1)
+ firstPage = 1;
+ if (lastPage < 1 || lastPage > doc->getNumPages())
+ lastPage = doc->getNumPages();
+
+ // write image files
+ imgOut = new ImageOutputDev(imgRoot, dumpJPEG);
+ if (imgOut->isOk()) {
+ doc->displayPages(imgOut, firstPage, lastPage, 72, 72, 0,
+ gTrue, gFalse, gFalse);
+ }
+ delete imgOut;
+
+ exitCode = 0;
+
+ // clean up
+ // (err1 is entered once doc and globalParams exist; err0 is entered
+ // before anything has been allocated)
+ err1:
+ delete doc;
+ delete globalParams;
+ err0:
+
+ // check for memory leaks
+ Object::memCheck(stderr);
+ gMemReport(stderr);
+
+ return exitCode;
+}
diff --git a/utils/pdfinfo.1 b/utils/pdfinfo.1
new file mode 100644
index 00000000..334520c8
--- /dev/null
+++ b/utils/pdfinfo.1
@@ -0,0 +1,157 @@
+.\" Copyright 1999-2004 Glyph & Cog, LLC
+.TH pdfinfo 1 "22 January 2004"
+.SH NAME
+pdfinfo \- Portable Document Format (PDF) document information
+extractor (version 3.00)
+.SH SYNOPSIS
+.B pdfinfo
+[options]
+.RI [ PDF-file ]
+.SH DESCRIPTION
+.B Pdfinfo
+prints the contents of the \'Info' dictionary (plus some other useful
+information) from a Portable Document Format (PDF) file.
+.PP
+The \'Info' dictionary contains the following values:
+.PP
+.RS
+title
+.RE
+.RS
+subject
+.RE
+.RS
+keywords
+.RE
+.RS
+author
+.RE
+.RS
+creator
+.RE
+.RS
+producer
+.RE
+.RS
+creation date
+.RE
+.RS
+modification date
+.RE
+.PP
+In addition, the following information is printed:
+.PP
+.RS
+tagged (yes/no)
+.RE
+.RS
+page count
+.RE
+.RS
+encrypted flag (yes/no)
+.RE
+.RS
+print and copy permissions (if encrypted)
+.RE
+.RS
+page size
+.RE
+.RS
+file size
+.RE
+.RS
+linearized (yes/no)
+.RE
+.RS
+PDF version
+.RE
+.RS
+metadata (only if requested)
+.RE
+.SH CONFIGURATION FILE
+Pdfinfo reads a configuration file at startup. It first tries to find
+the user's private config file, ~/.xpdfrc. If that doesn't exist, it
+looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to examine. If multiple pages are requested
+using the "-f" and "-l" options, the size of each requested page (and,
+optionally, the bounding boxes for each requested page) are printed.
+Otherwise, only page one is examined.
+.TP
+.BI \-l " number"
+Specifies the last page to examine.
+.TP
+.B \-box
+Prints the page box bounding boxes: MediaBox, CropBox, BleedBox,
+TrimBox, and ArtBox.
+.TP
+.B \-meta
+Prints document-level metadata. (This is the "Metadata" stream from
+the PDF file's Catalog object.)
+.TP
+.BI \-enc " encoding-name"
+Sets the encoding to use for text output. The
+.I encoding\-name
+must be defined with the unicodeMap command (see
+.BR xpdfrc (5)).
+This defaults to "Latin1" (which is a built-in encoding).
+.RB "[config file: " textEncoding ]
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
+.BI \-cfg " config-file"
+Read
+.I config-file
+in place of ~/.xpdfrc or the system-wide config file.
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdfinfo software and documentation are copyright 1996-2004 Glyph &
+Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftops (1),
+.BR pdftotext (1),
+.BR pdffonts (1),
+.BR pdftoppm (1),
+.BR pdfimages (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
new file mode 100644
index 00000000..3d375354
--- /dev/null
+++ b/utils/pdfinfo.cc
@@ -0,0 +1,376 @@
+//========================================================================
+//
+// pdfinfo.cc
+//
+// Copyright 1998-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "CharTypes.h"
+#include "UnicodeMap.h"
+#include "Error.h"
+#include "config.h"
+
+static void printInfoString(Dict *infoDict, char *key, char *text,
+ UnicodeMap *uMap);
+static void printInfoDate(Dict *infoDict, char *key, char *text);
+static void printBox(char *text, PDFRectangle *box);
+
+static int firstPage = 1; // -f: first page to examine
+static int lastPage = 0; // -l: last page; main() treats 0 as "option not given"
+static GBool printBoxes = gFalse; // -box
+static GBool printMetadata = gFalse; // -meta
+static char textEncName[128] = ""; // -enc: only applied by main() when non-empty
+static char ownerPassword[33] = "\001"; // -opw: a leading '\001' means "not given" (main() tests for it)
+static char userPassword[33] = "\001"; // -upw: same '\001' sentinel as ownerPassword
+static char cfgFileName[256] = ""; // -cfg: handed to the GlobalParams constructor
+static GBool printVersion = gFalse; // -v
+static GBool printHelp = gFalse; // -h, -help, --help, -?
+
+static ArgDesc argDesc[] = { // option table passed to parseArgs() and printUsage() in main()
+ {"-f", argInt, &firstPage, 0,
+ "first page to convert"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to convert"},
+ {"-box", argFlag, &printBoxes, 0,
+ "print the page bounding boxes"},
+ {"-meta", argFlag, &printMetadata, 0,
+ "print the document metadata (XML)"},
+ {"-enc", argString, textEncName, sizeof(textEncName),
+ "output text encoding name"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL} // terminating entry
+};
+
+// Entry point for pdfinfo.
+//
+// Parses the command line, opens the PDF named by argv[1] and prints to
+// stdout: the Info dictionary entries, the tagging flag, the page count,
+// the encryption flags, the page size(s), optionally the page boxes, the
+// file size, the linearization flag, the PDF version and optionally the
+// metadata stream.
+//
+// Returns the process exit code: 0 on success, 1 when the PDF cannot be
+// opened, 99 otherwise (bad usage, -v / -h, or no usable text encoding).
+int main(int argc, char *argv[]) {
+  PDFDoc *doc;
+  GooString *fileName;
+  GooString *ownerPW, *userPW;
+  UnicodeMap *uMap;
+  Page *page;
+  Object info;
+  char buf[256];
+  double w, h, wISO, hISO;
+  FILE *f;
+  GooString *metadata;
+  GBool ok;
+  int exitCode;
+  int pg, i;
+  GBool multiPage;
+
+  exitCode = 99;
+
+  // parse args
+  ok = parseArgs(argDesc, &argc, argv);
+  if (!ok || argc != 2 || printVersion || printHelp) {
+    fprintf(stderr, "pdfinfo version %s\n", xpdfVersion);
+    fprintf(stderr, "%s\n", xpdfCopyright);
+    if (!printVersion) {
+      printUsage("pdfinfo", "<PDF-file>", argDesc);
+    }
+    goto err0;
+  }
+  fileName = new GooString(argv[1]);
+
+  // read config file
+  globalParams = new GlobalParams(cfgFileName);
+  if (textEncName[0]) {
+    globalParams->setTextEncoding(textEncName);
+  }
+
+  // get mapping to output encoding
+  if (!(uMap = globalParams->getTextEncoding())) {
+    error(-1, "Couldn't get text encoding");
+    delete fileName;
+    goto err1;
+  }
+
+  // open PDF file; the '\001' sentinel distinguishes "no password option"
+  // from an explicitly supplied empty password
+  if (ownerPassword[0] != '\001') {
+    ownerPW = new GooString(ownerPassword);
+  } else {
+    ownerPW = NULL;
+  }
+  if (userPassword[0] != '\001') {
+    userPW = new GooString(userPassword);
+  } else {
+    userPW = NULL;
+  }
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  if (userPW) {
+    delete userPW;
+  }
+  if (ownerPW) {
+    delete ownerPW;
+  }
+  if (!doc->isOk()) {
+    exitCode = 1;
+    goto err2;
+  }
+
+  // get page range; without -l only page one is examined
+  if (firstPage < 1) {
+    firstPage = 1;
+  }
+  if (lastPage == 0) {
+    multiPage = gFalse;
+    lastPage = 1;
+  } else {
+    multiPage = gTrue;
+  }
+  if (lastPage < 1 || lastPage > doc->getNumPages()) {
+    lastPage = doc->getNumPages();
+  }
+
+  // print doc info
+  doc->getDocInfo(&info);
+  if (info.isDict()) {
+    printInfoString(info.getDict(), "Title", "Title: ", uMap);
+    printInfoString(info.getDict(), "Subject", "Subject: ", uMap);
+    printInfoString(info.getDict(), "Keywords", "Keywords: ", uMap);
+    printInfoString(info.getDict(), "Author", "Author: ", uMap);
+    printInfoString(info.getDict(), "Creator", "Creator: ", uMap);
+    printInfoString(info.getDict(), "Producer", "Producer: ", uMap);
+    printInfoDate(info.getDict(), "CreationDate", "CreationDate: ");
+    printInfoDate(info.getDict(), "ModDate", "ModDate: ");
+  }
+  info.free();
+
+  // print tagging info
+  printf("Tagged: %s\n",
+         doc->getStructTreeRoot()->isDict() ? "yes" : "no");
+
+  // print page count
+  printf("Pages: %d\n", doc->getNumPages());
+
+  // print encryption info
+  printf("Encrypted: ");
+  if (doc->isEncrypted()) {
+    printf("yes (print:%s copy:%s change:%s addNotes:%s)\n",
+           doc->okToPrint(gTrue) ? "yes" : "no",
+           doc->okToCopy(gTrue) ? "yes" : "no",
+           doc->okToChange(gTrue) ? "yes" : "no",
+           doc->okToAddNotes(gTrue) ? "yes" : "no");
+  } else {
+    printf("no\n");
+  }
+
+  // print page size
+  for (pg = firstPage; pg <= lastPage; ++pg) {
+    w = doc->getPageMediaWidth(pg);
+    h = doc->getPageMediaHeight(pg);
+    if (multiPage) {
+      printf("Page %4d size: %g x %g pts", pg, w, h);
+    } else {
+      printf("Page size: %g x %g pts", w, h);
+    }
+    // 612 x 792 pt is 8.5 x 11 in at 72 pt per inch, in either orientation
+    if ((fabs(w - 612) < 0.1 && fabs(h - 792) < 0.1) ||
+        (fabs(w - 792) < 0.1 && fabs(h - 612) < 0.1)) {
+      printf(" (letter)");
+    } else {
+      // A0 is 2^(1/4) m tall; 7200 / 2.54 converts metres to points.
+      // Each following size takes the previous width as its height.
+      hISO = sqrt(sqrt(2.0)) * 7200 / 2.54;
+      wISO = hISO / sqrt(2.0);
+      for (i = 0; i <= 6; ++i) {
+        if ((fabs(w - wISO) < 1 && fabs(h - hISO) < 1) ||
+            (fabs(w - hISO) < 1 && fabs(h - wISO) < 1)) {
+          printf(" (A%d)", i);
+          break;
+        }
+        hISO = wISO;
+        wISO /= sqrt(2.0);
+      }
+    }
+    printf("\n");
+  }
+
+  // print the boxes
+  if (printBoxes) {
+    if (multiPage) {
+      for (pg = firstPage; pg <= lastPage; ++pg) {
+        page = doc->getCatalog()->getPage(pg);
+        sprintf(buf, "Page %4d MediaBox: ", pg);
+        printBox(buf, page->getMediaBox());
+        sprintf(buf, "Page %4d CropBox: ", pg);
+        printBox(buf, page->getCropBox());
+        sprintf(buf, "Page %4d BleedBox: ", pg);
+        printBox(buf, page->getBleedBox());
+        sprintf(buf, "Page %4d TrimBox: ", pg);
+        printBox(buf, page->getTrimBox());
+        sprintf(buf, "Page %4d ArtBox: ", pg);
+        printBox(buf, page->getArtBox());
+      }
+    } else if (firstPage <= doc->getNumPages()) {
+      // firstPage is only bounded below, so check that it names an
+      // existing page before asking the catalog for it (a "-f" value past
+      // the end, or an empty document, would otherwise be looked up)
+      page = doc->getCatalog()->getPage(firstPage);
+      printBox("MediaBox: ", page->getMediaBox());
+      printBox("CropBox: ", page->getCropBox());
+      printBox("BleedBox: ", page->getBleedBox());
+      printBox("TrimBox: ", page->getTrimBox());
+      printBox("ArtBox: ", page->getArtBox());
+    }
+  }
+
+  // print file size (skipped silently if the file cannot be reopened)
+#ifdef VMS
+  f = fopen(fileName->getCString(), "rb", "ctx=stm");
+#else
+  f = fopen(fileName->getCString(), "rb");
+#endif
+  if (f) {
+#if HAVE_FSEEKO
+    fseeko(f, 0, SEEK_END);
+    printf("File size: %u bytes\n", (Guint)ftello(f));
+#elif HAVE_FSEEK64
+    fseek64(f, 0, SEEK_END);
+    printf("File size: %u bytes\n", (Guint)ftell64(f));
+#else
+    fseek(f, 0, SEEK_END);
+    printf("File size: %d bytes\n", (int)ftell(f));
+#endif
+    fclose(f);
+  }
+
+  // print linearization info
+  printf("Optimized: %s\n", doc->isLinearized() ? "yes" : "no");
+
+  // print PDF version
+  printf("PDF version: %.1f\n", doc->getPDFVersion());
+
+  // print the metadata
+  if (printMetadata && (metadata = doc->readMetadata())) {
+    fputs("Metadata:\n", stdout);
+    fputs(metadata->getCString(), stdout);
+    fputc('\n', stdout);
+    delete metadata;
+  }
+
+  exitCode = 0;
+
+  // clean up
+ err2:
+  uMap->decRefCnt();
+  delete doc;
+ err1:
+  delete globalParams;
+ err0:
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}
+
+// Print one string entry of the Info dictionary as "<text><value>\n".
+//
+// infoDict: dictionary to look the entry up in
+// key:      name of the entry
+// text:     label written before the value
+// uMap:     converts each character to the output encoding
+//
+// Nothing is printed when the entry is missing or is not a string.
+static void printInfoString(Dict *infoDict, char *key, char *text,
+                            UnicodeMap *uMap) {
+  Object entry;
+  GooString *str;
+  GBool wide;
+  Unicode ch;
+  char out[8];
+  int pos, len, nBytes;
+
+  if (!infoDict->lookup(key, &entry)->isString()) {
+    entry.free();
+    return;
+  }
+
+  fputs(text, stdout);
+  str = entry.getString();
+
+  // a leading 0xFE 0xFF pair marks 16-bit characters, high byte first;
+  // otherwise every byte is one character
+  wide = (str->getChar(0) & 0xff) == 0xfe &&
+         (str->getChar(1) & 0xff) == 0xff;
+  pos = wide ? 2 : 0;
+
+  len = str->getLength();
+  while (pos < len) {
+    if (wide) {
+      ch = ((str->getChar(pos) & 0xff) << 8) |
+           (str->getChar(pos + 1) & 0xff);
+      pos += 2;
+    } else {
+      ch = str->getChar(pos) & 0xff;
+      pos += 1;
+    }
+    nBytes = uMap->mapUnicode(ch, out, sizeof(out));
+    fwrite(out, 1, nBytes, stdout);
+  }
+  fputc('\n', stdout);
+
+  entry.free();
+}
+
+// Print one date entry of the Info dictionary as "<text><date>\n".
+//
+// An optional "D:" prefix is skipped. When the value starts with the
+// fourteen digits YYYYMMDDHHMMSS and mktime() accepts them, the date is
+// printed in the locale's "%c" format; in every other case the string
+// (after the optional prefix) is printed unchanged. Nothing is printed
+// when the entry is missing or is not a string.
+static void printInfoDate(Dict *infoDict, char *key, char *text) {
+  Object entry;
+  struct tm when;
+  char formatted[256];
+  char *raw, *shown;
+  int year, mon, day, hour, min, sec;
+
+  if (!infoDict->lookup(key, &entry)->isString()) {
+    entry.free();
+    return;
+  }
+
+  fputs(text, stdout);
+  raw = entry.getString()->getCString();
+  if (raw[0] == 'D' && raw[1] == ':') {
+    raw += 2;
+  }
+
+  // fall back to the raw text unless parsing and formatting both succeed
+  shown = raw;
+  if (sscanf(raw, "%4d%2d%2d%2d%2d%2d",
+             &year, &mon, &day, &hour, &min, &sec) == 6) {
+    when.tm_year = year - 1900;
+    when.tm_mon = mon - 1;
+    when.tm_mday = day;
+    when.tm_hour = hour;
+    when.tm_min = min;
+    when.tm_sec = sec;
+    when.tm_wday = -1;
+    when.tm_yday = -1;
+    when.tm_isdst = -1;
+    // mktime() fills in the tm_wday and tm_yday fields
+    if (mktime(&when) != (time_t)-1 &&
+        strftime(formatted, sizeof(formatted), "%c", &when)) {
+      shown = formatted;
+    }
+  }
+  fputs(shown, stdout);
+  fputc('\n', stdout);
+
+  entry.free();
+}
+
+// Print a rectangle as "<text>x1 y1 x2 y2\n", each coordinate in a field
+// of width 8 with two decimals.
+static void printBox(char *text, PDFRectangle *box) {
+  const double xa = box->x1;
+  const double ya = box->y1;
+  const double xb = box->x2;
+  const double yb = box->y2;
+
+  printf("%s%8.2f %8.2f %8.2f %8.2f\n", text, xa, ya, xb, yb);
+}
diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
new file mode 100644
index 00000000..850aa840
--- /dev/null
+++ b/utils/pdftohtml.1
@@ -0,0 +1,85 @@
+.TH PDFTOHTML 1
+.\" NAME should be all caps, SECTION should be 1-8, maybe w/ subsection
+.\" other parms are allowed: see man(7), man(1)
+.SH NAME
+pdftohtml \- program to convert pdf files into html, xml and png images
+.SH SYNOPSIS
+.B pdftohtml
+.I "[options] <PDF-file> [<html-file> <xml-file>]"
+.SH "DESCRIPTION"
+This manual page documents briefly the
+.BR pdftohtml
+command.
+This manual page was written for the Debian GNU/Linux distribution
+because the original program does not have a manual page.
+.PP
+.B pdftohtml
+is a program that converts pdf documents into html. It generates its output in
+the current working directory.
+.SH OPTIONS
+A summary of options is included below.
+.TP
+.B \-h, \-help
+Show summary of options.
+.TP
+.B \-f <int>
+first page to print
+.TP
+.B \-l <int>
+last page to print
+.TP
+.B \-q
+don't print any messages or errors
+.TP
+.B \-v
+print copyright and version info
+.TP
+.B \-p
+exchange .pdf links with .html
+.TP
+.B \-c
+generate complex output
+.TP
+.B \-i
+ignore images
+.TP
+.B \-noframes
+generate no frames. Not supported in complex output mode.
+.TP
+.B \-stdout
+use standard output
+.TP
+.B \-zoom <fp>
+zoom the pdf document (default 1.5)
+.TP
+.B \-xml
+output for XML post-processing
+.TP
+.B \-enc <string>
+output text encoding name
+.TP
+.B \-opw <string>
+owner password (for encrypted files)
+.TP
+.B \-upw <string>
+user password (for encrypted files)
+.TP
+.B \-hidden
+force hidden text extraction
+.TP
+.B \-dev
+output device name for Ghostscript (png16m, jpeg etc)
+.TP
+.B \-nomerge
+do not merge paragraphs
+.TP
+.B \-nodrm
+override document DRM settings
+
+.SH AUTHOR
+
+Pdftohtml was developed by Gueorgui Ovtcharov and Rainer Dorsch. It is
+based on, and benefits a lot from, Derek Noonburg's xpdf package.
+
+This manual page was written by Søren Boll Overgaard <boll@debian.org>,
+for the Debian GNU/Linux system (but may be used by others).
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
new file mode 100644
index 00000000..99fbc0a3
--- /dev/null
+++ b/utils/pdftohtml.cc
@@ -0,0 +1,429 @@
+//========================================================================
+//
+// pdftohtml.cc
+//
+//
+// Copyright 1999-2000 G. Ovtcharov
+//========================================================================
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <dirent.h>
+#include <poppler-config.h>
+#include <time.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "HtmlOutputDev.h"
+#include "PSOutputDev.h"
+#include "GlobalParams.h"
+#include "Error.h"
+#include "config.h"
+#include "goo/gfile.h"
+
+#ifndef GHOSTSCRIPT
+# define GHOSTSCRIPT "gs"
+#endif // GHOSTSCRIPT: command name main() puts at the start of the image-conversion command line
+
+static int firstPage = 1; // -f
+static int lastPage = 0; // -l: main() replaces values below 1 with the page count
+static GBool rawOrder = gTrue; // overwritten with complexMode in main() before use
+GBool printCommands = gTrue; // cleared by main() when -q is given
+static GBool printHelp = gFalse; // -h, -help
+GBool printHtml = gFalse; // -p
+GBool complexMode=gFalse; // -c; forced on by -xml and off by -stdout in main()
+GBool ignore=gFalse; // -i
+//char extension[5]=".png";
+double scale=1.5; // -zoom; main() clamps it to the range 0.5 .. 3.0
+GBool noframes=gFalse; // -noframes; forced on by -stdout and -xml in main()
+GBool stout=gFalse; // -stdout; cleared by main() when -c is given
+GBool xml=gFalse; // -xml
+GBool errQuiet=gFalse; // -q
+GBool noDrm=gFalse; // -nodrm
+
+GBool showHidden = gFalse; // -hidden
+GBool noMerge = gFalse; // -nomerge; forced on by -xml in main()
+static char ownerPassword[33] = ""; // -opw: empty means "not given"
+static char userPassword[33] = ""; // -upw: empty means "not given"
+static char gsDevice[33] = "png16m"; // -dev: also searched for a known image extension in main()
+static GBool printVersion = gFalse; // -v
+
+static GooString* getInfoString(Dict *infoDict, char *key);
+static GooString* getInfoDate(Dict *infoDict, char *key);
+
+static char textEncName[128] = ""; // -enc: only applied by main() when non-empty
+
+static ArgDesc argDesc[] = { // option table passed to parseArgs() and printUsage() in main()
+ {"-f", argInt, &firstPage, 0,
+ "first page to convert"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to convert"},
+ /*{"-raw", argFlag, &rawOrder, 0,
+ "keep strings in content stream order"},*/
+ {"-q", argFlag, &errQuiet, 0,
+ "don't print any messages or errors"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-p", argFlag, &printHtml, 0,
+ "exchange .pdf links by .html"},
+ {"-c", argFlag, &complexMode, 0,
+ "generate complex document"},
+ {"-i", argFlag, &ignore, 0,
+ "ignore images"},
+ {"-noframes", argFlag, &noframes, 0,
+ "generate no frames"},
+ {"-stdout" ,argFlag, &stout, 0,
+ "use standard output"},
+ {"-zoom", argFP, &scale, 0,
+ "zoom the pdf document (default 1.5)"},
+ {"-xml", argFlag, &xml, 0,
+ "output for XML post-processing"},
+ {"-hidden", argFlag, &showHidden, 0,
+ "output hidden text"},
+ {"-nomerge", argFlag, &noMerge, 0,
+ "do not merge paragraphs"},
+ {"-enc", argString, textEncName, sizeof(textEncName),
+ "output text encoding name"},
+ {"-dev", argString, gsDevice, sizeof(gsDevice),
+ "output device name for Ghostscript (png16m, jpeg etc)"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-nodrm", argFlag, &noDrm, 0,
+ "override document DRM settings"},
+ {NULL} // terminating entry
+};
+
+// Entry point for pdftohtml.
+//
+// Parses the command line, opens the PDF named by argv[1], renders the
+// requested pages through an HtmlOutputDev and, in complex non-XML mode
+// with images enabled, writes a temporary PostScript file and runs
+// Ghostscript on it to produce the page background images.
+//
+// The output base name is argv[2] with a trailing ".html"/".HTML" (or, with
+// -xml, ".xml"/".XML") removed; without argv[2] it is argv[1] with a
+// trailing ".pdf"/".PDF" removed.
+//
+// Returns 0 when the HTML output device was opened and the run reached the
+// end, 1 otherwise. Usage errors and -v / -h exit with status 1.
+int main(int argc, char *argv[]) {
+  PDFDoc *doc = NULL;
+  GooString *fileName = NULL;
+  GooString *docTitle = NULL;
+  GooString *author = NULL, *keywords = NULL, *subject = NULL, *date = NULL;
+  GooString *htmlFileName = NULL;
+  GooString *psFileName = NULL;
+  HtmlOutputDev *htmlOut = NULL;
+  PSOutputDev *psOut = NULL;
+  GBool ok;
+  GBool outOk;
+  int exitCode = 1;
+  char *p;
+  char extension[16] = "png";
+  GooString *ownerPW, *userPW;
+  Object info;
+  char * extsList[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL};
+
+  // parse args
+  ok = parseArgs(argDesc, &argc, argv);
+  if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
+    fprintf(stderr, "pdftohtml version %s http://pdftohtml.sourceforge.net/, based on Xpdf version %s\n", "0.36", xpdfVersion);
+    fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
+    fprintf(stderr, "%s\n\n", xpdfCopyright);
+    if (!printVersion) {
+      printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
+    }
+    exit(1);
+  }
+
+  // init error file
+  //errorInit();
+
+  // read config file
+  globalParams = new GlobalParams("");
+
+  if (errQuiet) {
+    globalParams->setErrQuiet(errQuiet);
+    // it is not clear how printCommands differs from errQuiet, so both
+    // are switched off together
+    printCommands = gFalse;
+  }
+
+  if (textEncName[0]) {
+    globalParams->setTextEncoding(textEncName);
+    if( !globalParams->getTextEncoding() ) {
+      goto error;
+    }
+  }
+
+  // open PDF file
+  if (ownerPassword[0]) {
+    ownerPW = new GooString(ownerPassword);
+  } else {
+    ownerPW = NULL;
+  }
+  if (userPassword[0]) {
+    userPW = new GooString(userPassword);
+  } else {
+    userPW = NULL;
+  }
+
+  fileName = new GooString(argv[1]);
+
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  if (userPW) {
+    delete userPW;
+  }
+  if (ownerPW) {
+    delete ownerPW;
+  }
+  if (!doc->isOk()) {
+    goto error;
+  }
+
+  // check for copy permission
+  if (!doc->okToCopy()) {
+    if (!noDrm) {
+      error(-1, "Copying of text from this document is not allowed.");
+      goto error;
+    }
+    fprintf(stderr, "Document has copy-protection bit set.\n");
+  }
+
+  // construct the output base name
+  if (argc == 3) {
+    GooString* tmp = new GooString(argv[2]);
+    // the suffix to strip depends on the output mode
+    const char *lower = xml ? ".xml" : ".html";
+    const char *upper = xml ? ".XML" : ".HTML";
+    int sufLen = (int)strlen(lower);
+
+    // only look at the tail when the name is long enough to have one;
+    // the window is exactly as long as the suffix it is compared with
+    if (tmp->getLength() >= sufLen) {
+      p = tmp->getCString() + tmp->getLength() - sufLen;
+      if (!strcmp(p, lower) || !strcmp(p, upper)) {
+        htmlFileName = new GooString(tmp->getCString(),
+                                     tmp->getLength() - sufLen);
+      }
+    }
+    if (!htmlFileName) {
+      htmlFileName = new GooString(tmp);
+    }
+    delete tmp;
+  } else {
+    if (fileName->getLength() >= 4) {
+      p = fileName->getCString() + fileName->getLength() - 4;
+      if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
+        htmlFileName = new GooString(fileName->getCString(),
+                                     fileName->getLength() - 4);
+      }
+    }
+    if (!htmlFileName) {
+      htmlFileName = fileName->copy();
+    }
+  }
+
+  if (scale>3.0) scale=3.0;
+  if (scale<0.5) scale=0.5;
+
+  // reconcile mutually dependent options
+  if (complexMode) {
+    //noframes=gFalse;
+    stout=gFalse;
+  }
+
+  if (stout) {
+    noframes=gTrue;
+    complexMode=gFalse;
+  }
+
+  if (xml) {
+    complexMode = gTrue;
+    noframes = gTrue;
+    noMerge = gTrue;
+  }
+
+  // get page range
+  if (firstPage < 1)
+    firstPage = 1;
+  if (lastPage < 1 || lastPage > doc->getNumPages())
+    lastPage = doc->getNumPages();
+
+  // collect the document info; the title falls back to the base name
+  doc->getDocInfo(&info);
+  if (info.isDict()) {
+    docTitle = getInfoString(info.getDict(), "Title");
+    author = getInfoString(info.getDict(), "Author");
+    keywords = getInfoString(info.getDict(), "Keywords");
+    subject = getInfoString(info.getDict(), "Subject");
+    date = getInfoDate(info.getDict(), "ModDate");
+    if( !date )
+      date = getInfoDate(info.getDict(), "CreationDate");
+  }
+  info.free();
+  if( !docTitle ) docTitle = new GooString(htmlFileName);
+
+  // determine the extension of the output background images: the first
+  // known extension that occurs in the Ghostscript device name
+  for (int i = 0; extsList[i]; i++) {
+    if( strstr(gsDevice, extsList[i]) != (char *) NULL ) {
+      strncpy(extension, extsList[i], sizeof(extension));
+      extension[sizeof(extension) - 1] = '\0';
+      break;
+    }
+  }
+
+  rawOrder = complexMode; // TODO: figure out what exactly rawOrder does
+
+  // write text file
+  htmlOut = new HtmlOutputDev(htmlFileName->getCString(),
+                              docTitle->getCString(),
+                              author ? author->getCString() : NULL,
+                              keywords ? keywords->getCString() : NULL,
+                              subject ? subject->getCString() : NULL,
+                              date ? date->getCString() : NULL,
+                              extension,
+                              rawOrder,
+                              firstPage,
+                              doc->getCatalog()->getOutline()->isDict());
+  delete docTitle;
+  if( author ) {
+    delete author;
+  }
+  if( keywords ) {
+    delete keywords;
+  }
+  if( subject ) {
+    delete subject;
+  }
+  if( date ) {
+    delete date;
+  }
+
+  outOk = htmlOut->isOk();
+  if (outOk) {
+    doc->displayPages(htmlOut, firstPage, lastPage, 72, 72, 0,
+                      gTrue, gFalse, gFalse);
+    if (!xml) {
+      htmlOut->dumpDocOutline(doc->getCatalog());
+    }
+  }
+
+  if( complexMode && !xml && !ignore ) {
+    int h=xoutRound(htmlOut->getPageHeight()/scale);
+    int w=xoutRound(htmlOut->getPageWidth()/scale);
+
+    psFileName = new GooString(htmlFileName->getCString());
+    psFileName->append(".ps");
+
+    globalParams->setPSPaperWidth(w);
+    globalParams->setPSPaperHeight(h);
+    // XXX
+    // globalParams->setPSNoText(gTrue);
+    psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+                            doc->getCatalog(), firstPage, lastPage, psModePS);
+    doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
+                      gTrue, gFalse, gFalse);
+    delete psOut;
+
+    // NOTE(review): the command is assembled as one string, and the file
+    // names are only wrapped in double quotes. A name containing a double
+    // quote or other shell metacharacters changes the command; consider
+    // escaping or launching Ghostscript with an argument vector.
+    GooString *gsCmd = new GooString(GHOSTSCRIPT);
+    GooString *tw, *th, *sc;
+    gsCmd->append(" -sDEVICE=");
+    gsCmd->append(gsDevice);
+    gsCmd->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
+    sc = GooString::fromInt(static_cast<int>(72*scale));
+    gsCmd->append(sc);
+    gsCmd->append(" -sOutputFile=");
+    gsCmd->append("\"");
+    gsCmd->append(htmlFileName);
+    gsCmd->append("%03d.");
+    gsCmd->append(extension);
+    gsCmd->append("\" -g");
+    tw = GooString::fromInt(static_cast<int>(scale*w));
+    gsCmd->append(tw);
+    gsCmd->append("x");
+    th = GooString::fromInt(static_cast<int>(scale*h));
+    gsCmd->append(th);
+    gsCmd->append(" -q \"");
+    gsCmd->append(psFileName);
+    gsCmd->append("\"");
+    if( !executeCommand(gsCmd->getCString()) && !errQuiet) {
+      error(-1, "Failed to launch Ghostscript!\n");
+    }
+    // the PostScript file is only an intermediate product
+    unlink(psFileName->getCString());
+    delete tw;
+    delete th;
+    delete sc;
+    delete gsCmd;
+    delete psFileName;
+  }
+
+  delete htmlOut;
+
+  // report success only when the HTML output could actually be written
+  if (outOk) {
+    exitCode = 0;
+  }
+
+  // clean up
+ error:
+  if(doc) delete doc;
+  if(globalParams) delete globalParams;
+
+  if(htmlFileName) delete htmlFileName;
+  HtmlFont::clear();
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}
+
+// Look up a string entry of the Info dictionary.
+// Returns a newly allocated copy that the caller must delete, or NULL when
+// the entry is missing or is not a string.
+static GooString* getInfoString(Dict *infoDict, char *key) {
+  Object entry;
+  GooString *copy;
+
+  copy = infoDict->lookup(key, &entry)->isString()
+           ? new GooString(entry.getString())
+           : (GooString *)NULL;
+  entry.free();
+  return copy;
+}
+
+// Look up a date entry of the Info dictionary.
+//
+// An optional "D:" prefix is skipped. When the value starts with the
+// fourteen digits YYYYMMDDHHMMSS, the result is that date formatted as
+// "YYYY-MM-DDTHH:MM:SS+00:00"; otherwise it is the string after the
+// optional prefix, unchanged.
+//
+// Returns a newly allocated string that the caller must delete, or NULL
+// when the entry is missing or is not a string.
+static GooString* getInfoDate(Dict *infoDict, char *key) {
+  Object entry;
+  GooString *out = NULL;
+  struct tm when;
+  char formatted[256];
+  char *raw, *chosen;
+  int year, mon, day, hour, min, sec;
+
+  if (infoDict->lookup(key, &entry)->isString()) {
+    raw = entry.getString()->getCString();
+    if (raw[0] == 'D' && raw[1] == ':') {
+      raw += 2;
+    }
+
+    // keep the raw text unless parsing and formatting both succeed
+    chosen = raw;
+    if (sscanf(raw, "%4d%2d%2d%2d%2d%2d",
+               &year, &mon, &day, &hour, &min, &sec) == 6) {
+      when.tm_year = year - 1900;
+      when.tm_mon = mon - 1;
+      when.tm_mday = day;
+      when.tm_hour = hour;
+      when.tm_min = min;
+      when.tm_sec = sec;
+      when.tm_wday = -1;
+      when.tm_yday = -1;
+      when.tm_isdst = -1;
+      mktime(&when); // compute the tm_wday and tm_yday fields
+      if (strftime(formatted, sizeof(formatted),
+                   "%Y-%m-%dT%H:%M:%S+00:00", &when)) {
+        chosen = formatted;
+      }
+    }
+    out = new GooString(chosen);
+  }
+  entry.free();
+  return out;
+}
+
diff --git a/utils/pdftoppm.1 b/utils/pdftoppm.1
new file mode 100644
index 00000000..f4d93e3a
--- /dev/null
+++ b/utils/pdftoppm.1
@@ -0,0 +1,113 @@
+.\" Copyright 2004 Glyph & Cog, LLC
+.TH pdftoppm 1 "22 January 2004"
+.SH NAME
+pdftoppm \- Portable Document Format (PDF) to Portable Pixmap (PPM)
+converter (version 3.00)
+.SH SYNOPSIS
+.B pdftoppm
+[options]
+.I PDF-file PPM-root
+.SH DESCRIPTION
+.B Pdftoppm
+converts Portable Document Format (PDF) files to color image files in
+Portable Pixmap (PPM) format, grayscale image files in Portable
+Graymap (PGM) format, or monochrome image files in Portable Bitmap
+(PBM) format.
+.PP
+Pdftoppm reads the PDF file,
+.IR PDF-file ,
+and writes one PPM file for each page,
+.IR PPM-root - nnnnnn .ppm,
+where
+.I nnnnnn
+is the page number.
+.SH CONFIGURATION FILE
+Pdftoppm reads a configuration file at startup. It first tries to
+find the user's private config file, ~/.xpdfrc. If that doesn't
+exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to convert.
+.TP
+.BI \-l " number"
+Specifies the last page to convert.
+.TP
+.BI \-r " number"
+Specifies the resolution, in DPI. The default is 150 DPI.
+.TP
+.B \-mono
+Generate a monochrome PBM file (instead of a color PPM file).
+.TP
+.B \-gray
+Generate a grayscale PGM file (instead of a color PPM file).
+.TP
+.BI \-t1lib " yes | no"
+Enable or disable t1lib (a Type 1 font rasterizer). This defaults to
+"yes".
+.RB "[config file: " enableT1lib ]
+.TP
+.BI \-freetype " yes | no"
+Enable or disable FreeType (a TrueType / Type 1 font rasterizer).
+This defaults to "yes".
+.RB "[config file: " enableFreeType ]
+.TP
+.BI \-aa " yes | no"
+Enable or disable font anti-aliasing. This defaults to "yes".
+.RB "[config file: " antialias ]
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
+.B \-q
+Don't print any messages or errors.
+.RB "[config file: " errQuiet ]
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdftoppm software and documentation are copyright 1996-2004 Glyph
+& Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftops (1),
+.BR pdftotext (1),
+.BR pdfinfo (1),
+.BR pdffonts (1),
+.BR pdfimages (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdftoppm.cc b/utils/pdftoppm.cc
new file mode 100644
index 00000000..ba153b72
--- /dev/null
+++ b/utils/pdftoppm.cc
@@ -0,0 +1,189 @@
+//========================================================================
+//
+// pdftoppm.cc
+//
+// Copyright 2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include "parseargs.h"
+#include "goo/gmem.h"
+#include "goo/GooString.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "PDFDoc.h"
+#include "splash/SplashBitmap.h"
+#include "splash/Splash.h"
+#include "SplashOutputDev.h"
+#include "config.h"
+
+static int firstPage = 1; // -f
+static int lastPage = 0; // -l: main() replaces values below 1 with the page count
+static int resolution = 150; // -r: used as both horizontal and vertical DPI in main()
+static GBool mono = gFalse; // -mono; main() rejects it in combination with -gray
+static GBool gray = gFalse; // -gray
+static char enableT1libStr[16] = ""; // -t1lib: only applied by main() when non-empty
+static char enableFreeTypeStr[16] = ""; // -freetype: only applied by main() when non-empty
+static char antialiasStr[16] = ""; // -aa: only applied by main() when non-empty
+static char ownerPassword[33] = ""; // -opw: empty means "not given"
+static char userPassword[33] = ""; // -upw: empty means "not given"
+static GBool quiet = gFalse; // -q
+static char cfgFileName[256] = ""; // -cfg: handed to the GlobalParams constructor
+static GBool printVersion = gFalse; // -v
+static GBool printHelp = gFalse; // -h, -help, --help, -?
+
+static ArgDesc argDesc[] = { // option table passed to parseArgs() and printUsage() in main()
+ {"-f", argInt, &firstPage, 0,
+ "first page to print"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to print"},
+ {"-r", argInt, &resolution, 0,
+ "resolution, in DPI (default is 150)"},
+ {"-mono", argFlag, &mono, 0,
+ "generate a monochrome PBM file"},
+ {"-gray", argFlag, &gray, 0,
+ "generate a grayscale PGM file"},
+#if HAVE_T1LIB_H
+ {"-t1lib", argString, enableT1libStr, sizeof(enableT1libStr),
+ "enable t1lib font rasterizer: yes, no"},
+#endif
+#if HAVE_FREETYPE_FREETYPE_H | HAVE_FREETYPE_H
+ {"-freetype", argString, enableFreeTypeStr, sizeof(enableFreeTypeStr),
+ "enable FreeType font rasterizer: yes, no"},
+#endif
+ {"-aa", argString, antialiasStr, sizeof(antialiasStr),
+ "enable font anti-aliasing: yes, no"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL} // terminating entry
+};
+
+// Entry point for pdftoppm.
+//
+// Renders pages firstPage..lastPage of the PDF named by argv[1] and writes
+// one file per page, named "<argv[2]>-NNNNNN.<ext>", where <ext> is "pbm"
+// with -mono, "pgm" with -gray and "ppm" otherwise.
+//
+// Returns the process exit code: 0 on success, 1 when the PDF cannot be
+// opened, 99 otherwise (bad usage, -mono together with -gray, -v / -h).
+int main(int argc, char *argv[]) {
+  PDFDoc *doc;
+  GooString *fileName;
+  GooString *ownerPW, *userPW;
+  SplashOutputDev *splashOut;
+  SplashColor paperColor;
+  char ppmFile[512];
+  char *ppmRoot;
+  const char *suffix;
+  GBool ok;
+  int exitCode;
+  int pg;
+
+  exitCode = 99;
+
+  // parse args; -mono and -gray exclude each other
+  ok = parseArgs(argDesc, &argc, argv);
+  if (ok && mono && gray) {
+    ok = gFalse;
+  }
+  if (!ok || argc != 3 || printVersion || printHelp) {
+    fprintf(stderr, "pdftoppm version %s\n", xpdfVersion);
+    fprintf(stderr, "%s\n", xpdfCopyright);
+    if (!printVersion) {
+      printUsage("pdftoppm", "<PDF-file> <PPM-root>", argDesc);
+    }
+    goto err0;
+  }
+  fileName = new GooString(argv[1]);
+  ppmRoot = argv[2];
+
+  // read config file, then apply the command line overrides; a rejected
+  // value is reported but does not stop the run
+  globalParams = new GlobalParams(cfgFileName);
+  globalParams->setupBaseFonts(NULL);
+  if (enableT1libStr[0] && !globalParams->setEnableT1lib(enableT1libStr)) {
+    fprintf(stderr, "Bad '-t1lib' value on command line\n");
+  }
+  if (enableFreeTypeStr[0] &&
+      !globalParams->setEnableFreeType(enableFreeTypeStr)) {
+    fprintf(stderr, "Bad '-freetype' value on command line\n");
+  }
+  if (antialiasStr[0] && !globalParams->setAntialias(antialiasStr)) {
+    fprintf(stderr, "Bad '-aa' value on command line\n");
+  }
+  if (quiet) {
+    globalParams->setErrQuiet(quiet);
+  }
+
+  // open PDF file; an empty password string means none was given
+  ownerPW = ownerPassword[0] ? new GooString(ownerPassword)
+                             : (GooString *)NULL;
+  userPW = userPassword[0] ? new GooString(userPassword)
+                           : (GooString *)NULL;
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  delete userPW;
+  delete ownerPW;
+  if (!doc->isOk()) {
+    exitCode = 1;
+    goto err1;
+  }
+
+  // get page range
+  if (firstPage < 1) {
+    firstPage = 1;
+  }
+  if (lastPage < 1 || lastPage > doc->getNumPages()) {
+    lastPage = doc->getNumPages();
+  }
+
+  // write PPM files on a white page background
+  suffix = mono ? "pbm" : gray ? "pgm" : "ppm";
+  paperColor.rgb8 = splashMakeRGB8(255, 255, 255);
+  splashOut = new SplashOutputDev(mono ? splashModeMono1 :
+                                  gray ? splashModeMono8 :
+                                         splashModeRGB8,
+                                  gFalse, paperColor);
+  splashOut->startDoc(doc->getXRef());
+  pg = firstPage;
+  while (pg <= lastPage) {
+    doc->displayPage(splashOut, pg, resolution, resolution, 0, gTrue, gFalse);
+    // the precision caps the root so the whole name fits into ppmFile
+    sprintf(ppmFile, "%.*s-%06d.%s",
+            (int)sizeof(ppmFile) - 32, ppmRoot, pg, suffix);
+    splashOut->getBitmap()->writePNMFile(ppmFile);
+    ++pg;
+  }
+  delete splashOut;
+
+  exitCode = 0;
+
+  // clean up
+ err1:
+  delete doc;
+  delete globalParams;
+ err0:
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}
diff --git a/utils/pdftops.1 b/utils/pdftops.1
new file mode 100644
index 00000000..04c5c7e1
--- /dev/null
+++ b/utils/pdftops.1
@@ -0,0 +1,224 @@
+.\" Copyright 1996-2004 Glyph & Cog, LLC
+.TH pdftops 1 "22 January 2004"
+.SH NAME
+pdftops \- Portable Document Format (PDF) to PostScript converter
+(version 3.00)
+.SH SYNOPSIS
+.B pdftops
+[options]
+.RI [ PDF-file
+.RI [ PS-file ]]
+.SH DESCRIPTION
+.B Pdftops
+converts Portable Document Format (PDF) files to PostScript so they
+can be printed.
+.PP
+Pdftops reads the PDF file,
+.IR PDF-file ,
+and writes a PostScript file,
+.IR PS-file .
+If
+.I PS-file
+is not specified, pdftops converts
+.I file.pdf
+to
+.I file.ps
+(or
+.I file.eps
+with the -eps option). If
+.I PS-file
+is \'-', the PostScript is sent to stdout.
+.SH CONFIGURATION FILE
+Pdftops reads a configuration file at startup. It first tries to find
+the user's private config file, ~/.xpdfrc. If that doesn't exist, it
+looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to print.
+.TP
+.BI \-l " number"
+Specifies the last page to print.
+.TP
+.B \-level1
+Generate Level 1 PostScript. The resulting PostScript files will be
+significantly larger (if they contain images), but will print on Level
+1 printers. This also converts all images to black and white. No
+more than one of the PostScript level options (-level1, -level1sep,
+-level2, -level2sep, -level3, -level3sep) may be given.
+.RB "[config file: " psLevel ]
+.TP
+.B \-level1sep
+Generate Level 1 separable PostScript. All colors are converted to
+CMYK. Images are written with separate stream data for the four
+components.
+.RB "[config file: " psLevel ]
+.TP
+.B \-level2
+Generate Level 2 PostScript. Level 2 supports color images and image
+compression. This is the default setting.
+.RB "[config file: " psLevel ]
+.TP
+.B \-level2sep
+Generate Level 2 separable PostScript. All colors are converted to
+CMYK. The PostScript separation convention operators are used to
+handle custom (spot) colors.
+.RB "[config file: " psLevel ]
+.TP
+.B \-level3
+Generate Level 3 PostScript. This enables all Level 2 features plus
+CID font embedding.
+.RB "[config file: " psLevel ]
+.TP
+.B \-level3sep
+Generate Level 3 separable PostScript. The separation handling is the
+same as for -level2sep.
+.RB "[config file: " psLevel ]
+.TP
+.B \-eps
+Generate an Encapsulated PostScript (EPS) file. An EPS file contains
+a single image, so if you use this option with a multi-page PDF file,
+you must use -f and -l to specify a single page. No more than one of
+the mode options (-eps, -form) may be given.
+.TP
+.B \-form
+Generate a PostScript form which can be imported by software that
+understands forms. A form contains a single page, so if you use this
+option with a multi-page PDF file, you must use -f and -l to specify a
+single page. The -level1 option cannot be used with -form.
+.TP
+.B \-opi
+Generate OPI comments for all images and forms which have OPI
+information. (This option is only available if pdftops was compiled
+with OPI support.)
+.RB "[config file: " psOPI ]
+.TP
+.B \-noembt1
+By default, any Type 1 fonts which are embedded in the PDF file are
+copied into the PostScript file. This option causes pdftops to
+substitute base fonts instead. Embedded fonts make PostScript files
+larger, but may be necessary for readable output.
+.RB "[config file: " psEmbedType1Fonts ]
+.TP
+.B \-noembtt
+By default, any TrueType fonts which are embedded in the PDF file are
+copied into the PostScript file. This option causes pdftops to
+substitute base fonts instead. Embedded fonts make PostScript files
+larger, but may be necessary for readable output. Also, some
+PostScript interpreters do not have TrueType rasterizers.
+.RB "[config file: " psEmbedTrueTypeFonts ]
+.TP
+.B \-noembcidps
+By default, any CID PostScript fonts which are embedded in the PDF
+file are copied into the PostScript file. This option disables that
+embedding. No attempt is made to substitute for non-embedded CID
+PostScript fonts.
+.RB "[config file: " psEmbedCIDPostScriptFonts ]
+.TP
+.B \-noembcidtt
+By default, any CID TrueType fonts which are embedded in the PDF file
+are copied into the PostScript file. This option disables that
+embedding. No attempt is made to substitute for non-embedded CID
+TrueType fonts.
+.RB "[config file: " psEmbedCIDTrueTypeFonts ]
+.TP
+.BI \-paper " size"
+Set the paper size to one of "letter", "legal", "A4", or "A3". This
+can also be set to "match", which will set the paper size to match the
+size specified in the PDF file.
+.RB "[config file: " psPaperSize ]
+.TP
+.BI \-paperw " size"
+Set the paper width, in points.
+.RB "[config file: " psPaperSize ]
+.TP
+.BI \-paperh " size"
+Set the paper height, in points.
+.RB "[config file: " psPaperSize ]
+.TP
+.B \-nocrop
+By default, output is cropped to the CropBox specified in the PDF
+file. This option disables cropping.
+.RB "[config file: " psCrop ]
+.TP
+.B \-expand
+Expand PDF pages smaller than the paper to fill the paper. By
+default, these pages are not scaled.
+.RB "[config file: " psExpandSmaller ]
+.TP
+.B \-noshrink
+Don't scale PDF pages which are larger than the paper. By default,
+pages larger than the paper are shrunk to fit.
+.RB "[config file: " psShrinkLarger ]
+.TP
+.B \-nocenter
+By default, PDF pages smaller than the paper (after any scaling) are
+centered on the paper. This option causes them to be aligned to the
+lower-left corner of the paper instead.
+.RB "[config file: " psCenter ]
+.TP
+.B \-duplex
+Set the Duplex pagedevice entry in the PostScript file. This tells
+duplex-capable printers to enable duplexing.
+.RB "[config file: " psDuplex ]
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
+.B \-q
+Don't print any messages or errors.
+.RB "[config file: " errQuiet ]
+.TP
+.BI \-cfg " config-file"
+Read
+.I config-file
+in place of ~/.xpdfrc or the system-wide config file.
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdftops software and documentation are copyright 1996-2004 Glyph &
+Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftotext (1),
+.BR pdfinfo (1),
+.BR pdffonts (1),
+.BR pdftoppm (1),
+.BR pdfimages (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdftops.cc b/utils/pdftops.cc
new file mode 100644
index 00000000..308a6e0b
--- /dev/null
+++ b/utils/pdftops.cc
@@ -0,0 +1,336 @@
+//========================================================================
+//
+// pdftops.cc
+//
+// Copyright 1996-2003 Glyph & Cog, LLC
+//
+// Modified for Debian by Hamish Moffatt, 22 May 2002.
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "PSOutputDev.h"
+#include "Error.h"
+#include "config.h"
+
+static int firstPage = 1; // Command-line option storage, filled in through argDesc. -f: first page to print; main() raises values below 1 to 1
+static int lastPage = 0; // -l: last page to print; main() replaces values below 1 (or past the end) with the document's page count
+static GBool level1 = gFalse; // -level1
+static GBool level1Sep = gFalse; // -level1sep
+static GBool level2 = gFalse; // -level2
+static GBool level2Sep = gFalse; // -level2sep
+static GBool level3 = gFalse; // -level3
+static GBool level3Sep = gFalse; // -level3sep
+static GBool doEPS = gFalse; // -eps; main() rejects it together with -form
+static GBool doForm = gFalse; // -form
+#if OPI_SUPPORT
+static GBool doOPI = gFalse; // -opi; only compiled in when OPI_SUPPORT is set
+#endif
+static GBool noEmbedT1Fonts = gFalse; // -noembt1
+static GBool noEmbedTTFonts = gFalse; // -noembtt
+static GBool noEmbedCIDPSFonts = gFalse; // -noembcidps
+static GBool noEmbedCIDTTFonts = gFalse; // -noembcidtt
+static char paperSize[15] = ""; // -paper; an empty string means the option was not given
+static int paperWidth = 0; // -paperw, in points; 0 means not given; only consulted when -paper is absent
+static int paperHeight = 0; // -paperh, in points; 0 means not given; only consulted when -paper is absent
+static GBool noCrop = gFalse; // -nocrop
+static GBool expand = gFalse; // -expand
+static GBool noShrink = gFalse; // -noshrink
+static GBool noCenter = gFalse; // -nocenter
+static GBool duplex = gFalse; // -duplex
+static char ownerPassword[33] = "\001"; // -opw; main() treats a leading '\001' as "no password supplied"
+static char userPassword[33] = "\001"; // -upw; same '\001' sentinel as ownerPassword
+static GBool quiet = gFalse; // -q
+static char cfgFileName[256] = ""; // -cfg; handed to the GlobalParams constructor in main()
+static GBool printVersion = gFalse; // -v
+static GBool printHelp = gFalse; // -h, -help, --help, -?
+
+static ArgDesc argDesc[] = { // Option table handed to parseArgs() and printUsage() in main(); each entry: flag, kind, storage, size (sizeof the buffer for argString, 0 otherwise), usage text
+ {"-f", argInt, &firstPage, 0,
+ "first page to print"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to print"},
+ {"-level1", argFlag, &level1, 0,
+ "generate Level 1 PostScript"},
+ {"-level1sep", argFlag, &level1Sep, 0,
+ "generate Level 1 separable PostScript"},
+ {"-level2", argFlag, &level2, 0,
+ "generate Level 2 PostScript"},
+ {"-level2sep", argFlag, &level2Sep, 0,
+ "generate Level 2 separable PostScript"},
+ {"-level3", argFlag, &level3, 0,
+ "generate Level 3 PostScript"},
+ {"-level3sep", argFlag, &level3Sep, 0,
+ "generate Level 3 separable PostScript"},
+ {"-eps", argFlag, &doEPS, 0,
+ "generate Encapsulated PostScript (EPS)"},
+ {"-form", argFlag, &doForm, 0,
+ "generate a PostScript form"},
+#if OPI_SUPPORT
+ {"-opi", argFlag, &doOPI, 0,
+ "generate OPI comments"},
+#endif
+ {"-noembt1", argFlag, &noEmbedT1Fonts, 0,
+ "don't embed Type 1 fonts"},
+ {"-noembtt", argFlag, &noEmbedTTFonts, 0,
+ "don't embed TrueType fonts"},
+ {"-noembcidps", argFlag, &noEmbedCIDPSFonts, 0,
+ "don't embed CID PostScript fonts"},
+ {"-noembcidtt", argFlag, &noEmbedCIDTTFonts, 0,
+ "don't embed CID TrueType fonts"},
+ {"-paper", argString, paperSize, sizeof(paperSize),
+ "paper size (letter, legal, A4, A3, match)"},
+ {"-paperw", argInt, &paperWidth, 0,
+ "paper width, in points"},
+ {"-paperh", argInt, &paperHeight, 0,
+ "paper height, in points"},
+ {"-nocrop", argFlag, &noCrop, 0,
+ "don't crop pages to CropBox"},
+ {"-expand", argFlag, &expand, 0,
+ "expand pages smaller than the paper size"},
+ {"-noshrink", argFlag, &noShrink, 0,
+ "don't shrink pages larger than the paper size"},
+ {"-nocenter", argFlag, &noCenter, 0,
+ "don't center pages smaller than the paper size"},
+ {"-duplex", argFlag, &duplex, 0,
+ "enable duplex printing"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0, // -h, -help, --help and -? all set the same printHelp flag
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL} // NOTE(review): presumably the end-of-table sentinel expected by parseArgs() -- confirm in parseargs.c
+};
+
+// Convert a PDF file to PostScript (or to EPS / a PostScript form).
+//
+// argv[1] is the input PDF; argv[2], if present, is the output file name.
+// When no output name is given it is derived from the input name: a trailing
+// ".pdf"/".PDF" is dropped and ".ps" (".eps" with -eps) is appended.
+//
+// Returns 0 on success, 1 if the PDF could not be opened, 2 if the
+// PostScript output device could not be set up, 3 if printing is not
+// permitted (only when ENFORCE_PERMISSIONS is defined) and 99 for the other
+// failures that reach the cleanup code.  Usage errors and conflicting
+// options terminate through exit(1) before anything is allocated.
+int main(int argc, char *argv[]) {
+  PDFDoc *doc;
+  GooString *fileName;
+  GooString *psFileName;
+  PSLevel level;
+  PSOutMode mode;
+  GooString *ownerPW, *userPW;
+  PSOutputDev *psOut;
+  GBool ok;
+  char *p;
+  int exitCode;
+
+  exitCode = 99;
+
+  // parse args
+  ok = parseArgs(argDesc, &argc, argv);
+  if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
+    fprintf(stderr, "pdftops version %s\n", xpdfVersion);
+    fprintf(stderr, "%s\n", xpdfCopyright);
+    if (!printVersion) {
+      printUsage("pdftops", "<PDF-file> [<PS-file>]", argDesc);
+    }
+    exit(1);
+  }
+  // at most one of the six level flags may be set
+  if ((level1 ? 1 : 0) +
+      (level1Sep ? 1 : 0) +
+      (level2 ? 1 : 0) +
+      (level2Sep ? 1 : 0) +
+      (level3 ? 1 : 0) +
+      (level3Sep ? 1 : 0) > 1) {
+    fprintf(stderr, "Error: use only one of the 'level' options.\n");
+    exit(1);
+  }
+  if (doEPS && doForm) {
+    fprintf(stderr, "Error: use only one of -eps and -form\n");
+    exit(1);
+  }
+  // -level2 needs no branch of its own: Level 2 is the fallback
+  if (level1) {
+    level = psLevel1;
+  } else if (level1Sep) {
+    level = psLevel1Sep;
+  } else if (level2Sep) {
+    level = psLevel2Sep;
+  } else if (level3) {
+    level = psLevel3;
+  } else if (level3Sep) {
+    level = psLevel3Sep;
+  } else {
+    level = psLevel2;
+  }
+  if (doForm && level < psLevel2) {
+    fprintf(stderr, "Error: forms are only available with Level 2 output.\n");
+    exit(1);
+  }
+  mode = doEPS ? psModeEPS
+       : doForm ? psModeForm
+       : psModePS;
+  fileName = new GooString(argv[1]);
+
+  // read config file
+  globalParams = new GlobalParams(cfgFileName);
+  if (paperSize[0]) {
+    if (!globalParams->setPSPaperSize(paperSize)) {
+      fprintf(stderr, "Invalid paper size\n");
+      // no PDFDoc exists yet, so fileName has to be released here
+      delete fileName;
+      goto err0;
+    }
+  } else {
+    if (paperWidth) {
+      globalParams->setPSPaperWidth(paperWidth);
+    }
+    if (paperHeight) {
+      globalParams->setPSPaperHeight(paperHeight);
+    }
+  }
+  if (noCrop) {
+    globalParams->setPSCrop(gFalse);
+  }
+  if (expand) {
+    globalParams->setPSExpandSmaller(gTrue);
+  }
+  if (noShrink) {
+    globalParams->setPSShrinkLarger(gFalse);
+  }
+  if (noCenter) {
+    globalParams->setPSCenter(gFalse);
+  }
+  if (duplex) {
+    globalParams->setPSDuplex(duplex);
+  }
+  // only override the configured level when a level flag was given
+  if (level1 || level1Sep || level2 || level2Sep || level3 || level3Sep) {
+    globalParams->setPSLevel(level);
+  }
+  if (noEmbedT1Fonts) {
+    globalParams->setPSEmbedType1(!noEmbedT1Fonts);
+  }
+  if (noEmbedTTFonts) {
+    globalParams->setPSEmbedTrueType(!noEmbedTTFonts);
+  }
+  if (noEmbedCIDPSFonts) {
+    globalParams->setPSEmbedCIDPostScript(!noEmbedCIDPSFonts);
+  }
+  if (noEmbedCIDTTFonts) {
+    globalParams->setPSEmbedCIDTrueType(!noEmbedCIDTTFonts);
+  }
+#if OPI_SUPPORT
+  if (doOPI) {
+    globalParams->setPSOPI(doOPI);
+  }
+#endif
+  if (quiet) {
+    globalParams->setErrQuiet(quiet);
+  }
+
+  // open PDF file ('\001' in the first byte means "password not given")
+  if (ownerPassword[0] != '\001') {
+    ownerPW = new GooString(ownerPassword);
+  } else {
+    ownerPW = NULL;
+  }
+  if (userPassword[0] != '\001') {
+    userPW = new GooString(userPassword);
+  } else {
+    userPW = NULL;
+  }
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  if (userPW) {
+    delete userPW;
+  }
+  if (ownerPW) {
+    delete ownerPW;
+  }
+  if (!doc->isOk()) {
+    exitCode = 1;
+    goto err1;
+  }
+
+#ifdef ENFORCE_PERMISSIONS
+  // check for print permission
+  if (!doc->okToPrint()) {
+    error(-1, "Printing this document is not allowed.");
+    exitCode = 3;
+    goto err1;
+  }
+#endif
+
+  // construct PostScript file name
+  if (argc == 3) {
+    psFileName = new GooString(argv[2]);
+  } else {
+    // Only look for a ".pdf" suffix when the name is long enough to hold
+    // one; for shorter names "start + length - 4" would point in front of
+    // the string buffer.
+    int nameLen = fileName->getLength();
+    p = (nameLen >= 4) ? fileName->getCString() + nameLen - 4 : NULL;
+    if (p && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
+      psFileName = new GooString(fileName->getCString(), nameLen - 4);
+    } else {
+      psFileName = fileName->copy();
+    }
+    psFileName->append(doEPS ? ".eps" : ".ps");
+  }
+
+  // get page range
+  if (firstPage < 1) {
+    firstPage = 1;
+  }
+  if (lastPage < 1 || lastPage > doc->getNumPages()) {
+    lastPage = doc->getNumPages();
+  }
+
+  // check for multi-page EPS or form
+  if ((doEPS || doForm) && firstPage != lastPage) {
+    error(-1, "EPS and form files can only contain one page.");
+    goto err2;
+  }
+
+  // write PostScript file
+  psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
+                          doc->getCatalog(), firstPage, lastPage, mode);
+  if (psOut->isOk()) {
+    doc->displayPages(psOut, firstPage, lastPage, 72, 72,
+                      0, globalParams->getPSCrop(), gFalse, gFalse);
+  } else {
+    delete psOut;
+    exitCode = 2;
+    goto err2;
+  }
+  delete psOut;
+
+  exitCode = 0;
+
+  // clean up
+ err2:
+  delete psFileName;
+ err1:
+  delete doc;
+ err0:
+  delete globalParams;
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
new file mode 100644
index 00000000..11a67694
--- /dev/null
+++ b/utils/pdftotext.1
@@ -0,0 +1,135 @@
+.\" Copyright 1997-2004 Glyph & Cog, LLC
+.TH pdftotext 1 "22 January 2004"
+.SH NAME
+pdftotext \- Portable Document Format (PDF) to text converter
+(version 3.00)
+.SH SYNOPSIS
+.B pdftotext
+[options]
+.RI [ PDF-file
+.RI [ text-file ]]
+.SH DESCRIPTION
+.B Pdftotext
+converts Portable Document Format (PDF) files to plain text.
+.PP
+Pdftotext reads the PDF file,
+.IR PDF-file ,
+and writes a text file,
+.IR text-file .
+If
+.I text-file
+is not specified, pdftotext converts
+.I file.pdf
+to
+.IR file.txt .
+If
+.I text-file
+is \'-', the text is sent to stdout.
+.SH CONFIGURATION FILE
+Pdftotext reads a configuration file at startup. It first tries to
+find the user's private config file, ~/.xpdfrc. If that doesn't
+exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
+.BR xpdfrc (5)
+man page for details.
+.SH OPTIONS
+Many of the following options can be set with configuration file
+commands. These are listed in square brackets with the description of
+the corresponding command line option.
+.TP
+.BI \-f " number"
+Specifies the first page to convert.
+.TP
+.BI \-l " number"
+Specifies the last page to convert.
+.TP
+.B \-layout
+Maintain (as best as possible) the original physical layout of the
+text. The default is to \'undo' physical layout (columns,
+hyphenation, etc.) and output the text in reading order.
+.TP
+.B \-raw
+Keep the text in content stream order. This is a hack which often
+"undoes" column formatting, etc. Use of raw mode is no longer
+recommended.
+.TP
+.B \-htmlmeta
+Generate a simple HTML file, including the meta information. This
+simply wraps the text in <pre> and </pre> and prepends the meta
+headers.
+.TP
+.BI \-enc " encoding-name"
+Sets the encoding to use for text output. The
+.I encoding\-name
+must be defined with the unicodeMap command (see
+.BR xpdfrc (5)).
+The encoding name is case-sensitive. This defaults to "Latin1" (which
+is a built-in encoding).
+.RB "[config file: " textEncoding ]
+.TP
+.BI \-eol " unix | dos | mac"
+Sets the end-of-line convention to use for text output.
+.RB "[config file: " textEOL ]
+.TP
+.B \-nopgbrk
+Don't insert page breaks (form feed characters) between pages.
+.RB "[config file: " textPageBreaks ]
+.TP
+.BI \-opw " password"
+Specify the owner password for the PDF file. Providing this will
+bypass all security restrictions.
+.TP
+.BI \-upw " password"
+Specify the user password for the PDF file.
+.TP
+.B \-q
+Don't print any messages or errors.
+.RB "[config file: " errQuiet ]
+.TP
+.BI \-cfg " config-file"
+Read
+.I config-file
+in place of ~/.xpdfrc or the system-wide config file.
+.TP
+.B \-v
+Print copyright and version information.
+.TP
+.B \-h
+Print usage information.
+.RB ( \-help
+and
+.B \-\-help
+are equivalent.)
+.SH BUGS
+Some PDF files contain fonts whose encodings have been mangled beyond
+recognition. There is no way (short of OCR) to extract text from
+these files.
+.SH EXIT CODES
+The Xpdf tools use the following exit codes:
+.TP
+0
+No error.
+.TP
+1
+Error opening a PDF file.
+.TP
+2
+Error opening an output file.
+.TP
+3
+Error related to PDF permissions.
+.TP
+99
+Other error.
+.SH AUTHOR
+The pdftotext software and documentation are copyright 1996-2004 Glyph
+& Cog, LLC.
+.SH "SEE ALSO"
+.BR xpdf (1),
+.BR pdftops (1),
+.BR pdfinfo (1),
+.BR pdffonts (1),
+.BR pdftoppm (1),
+.BR pdfimages (1),
+.BR xpdfrc (5)
+.br
+.B http://www.foolabs.com/xpdf/
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
new file mode 100644
index 00000000..f8dfa80f
--- /dev/null
+++ b/utils/pdftotext.cc
@@ -0,0 +1,337 @@
+//========================================================================
+//
+// pdftotext.cc
+//
+// Copyright 1997-2003 Glyph & Cog, LLC
+//
+// Modified for Debian by Hamish Moffatt, 22 May 2002.
+//
+//========================================================================
+
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "TextOutputDev.h"
+#include "CharTypes.h"
+#include "UnicodeMap.h"
+#include "Error.h"
+#include "config.h"
+
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+ char *text1, char *text2, UnicodeMap *uMap); // writes text1, the string stored under key (mapped through uMap), then text2 -- only if key holds a string
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt); // prints the string stored under key (minus a leading "D:") through the printf-style fmt
+
+static int firstPage = 1; // Command-line option storage, filled in through argDesc. -f: first page to convert; main() raises values below 1 to 1
+static int lastPage = 0; // -l: last page to convert; main() replaces values below 1 (or past the end) with the document's page count
+static GBool physLayout = gFalse; // -layout; passed to the TextOutputDev constructor
+static GBool rawOrder = gFalse; // -raw; passed to the TextOutputDev constructor
+static GBool htmlMeta = gFalse; // -htmlmeta; makes main() write an HTML header/footer and default to a ".html" output name
+static char textEncName[128] = ""; // -enc; an empty string means the option was not given
+static char textEOL[16] = ""; // -eol; an empty string means the option was not given
+static GBool noPageBreaks = gFalse; // -nopgbrk
+static char ownerPassword[33] = "\001"; // -opw; main() treats a leading '\001' as "no password supplied"
+static char userPassword[33] = "\001"; // -upw; same '\001' sentinel as ownerPassword
+static GBool quiet = gFalse; // -q
+static char cfgFileName[256] = ""; // -cfg; handed to the GlobalParams constructor in main()
+static GBool printVersion = gFalse; // -v
+static GBool printHelp = gFalse; // -h, -help, --help, -?
+
+static ArgDesc argDesc[] = { // Option table handed to parseArgs() and printUsage() in main(); each entry: flag, kind, storage, size (sizeof the buffer for argString, 0 otherwise), usage text
+ {"-f", argInt, &firstPage, 0,
+ "first page to convert"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to convert"},
+ {"-layout", argFlag, &physLayout, 0,
+ "maintain original physical layout"},
+ {"-raw", argFlag, &rawOrder, 0,
+ "keep strings in content stream order"},
+ {"-htmlmeta", argFlag, &htmlMeta, 0,
+ "generate a simple HTML file, including the meta information"},
+ {"-enc", argString, textEncName, sizeof(textEncName),
+ "output text encoding name"},
+ {"-eol", argString, textEOL, sizeof(textEOL),
+ "output end-of-line convention (unix, dos, or mac)"},
+ {"-nopgbrk", argFlag, &noPageBreaks, 0,
+ "don't insert page breaks between pages"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0, // -h, -help, --help and -? all set the same printHelp flag
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL} // NOTE(review): presumably the end-of-table sentinel expected by parseArgs() -- confirm in parseargs.c
+};
+
+// Convert a PDF file to plain text, optionally wrapped in a minimal HTML
+// page (-htmlmeta) whose <head> carries the document's Info metadata.
+//
+// argv[1] is the input PDF; argv[2], if present, is the output file name
+// ("-" selects stdout for the HTML header/footer written here).  When no
+// output name is given it is derived from the input name: a trailing
+// ".pdf"/".PDF" is dropped and ".txt" (".html" with -htmlmeta) is appended.
+//
+// Returns 0 on success, 1 if the PDF could not be opened, 2 if the output
+// could not be opened, 3 if copying is not permitted (only when
+// ENFORCE_PERMISSIONS is defined) and 99 otherwise, including usage errors.
+int main(int argc, char *argv[]) {
+  PDFDoc *doc;
+  GooString *fileName;
+  GooString *textFileName;
+  GooString *ownerPW, *userPW;
+  TextOutputDev *textOut;
+  FILE *f;
+  UnicodeMap *uMap;
+  Object info;
+  GBool ok;
+  char *p;
+  int exitCode;
+
+  exitCode = 99;
+
+  // parse args
+  ok = parseArgs(argDesc, &argc, argv);
+  if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
+    fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
+    fprintf(stderr, "%s\n", xpdfCopyright);
+    if (!printVersion) {
+      printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
+    }
+    goto err0;
+  }
+  fileName = new GooString(argv[1]);
+
+  // read config file
+  globalParams = new GlobalParams(cfgFileName);
+  if (textEncName[0]) {
+    globalParams->setTextEncoding(textEncName);
+  }
+  if (textEOL[0]) {
+    if (!globalParams->setTextEOL(textEOL)) {
+      fprintf(stderr, "Bad '-eol' value on command line\n");
+    }
+  }
+  if (noPageBreaks) {
+    globalParams->setTextPageBreaks(gFalse);
+  }
+  if (quiet) {
+    globalParams->setErrQuiet(quiet);
+  }
+
+  // get mapping to output encoding
+  if (!(uMap = globalParams->getTextEncoding())) {
+    error(-1, "Couldn't get text encoding");
+    // no PDFDoc exists yet, so fileName has to be released here
+    delete fileName;
+    goto err1;
+  }
+
+  // open PDF file ('\001' in the first byte means "password not given")
+  if (ownerPassword[0] != '\001') {
+    ownerPW = new GooString(ownerPassword);
+  } else {
+    ownerPW = NULL;
+  }
+  if (userPassword[0] != '\001') {
+    userPW = new GooString(userPassword);
+  } else {
+    userPW = NULL;
+  }
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  if (userPW) {
+    delete userPW;
+  }
+  if (ownerPW) {
+    delete ownerPW;
+  }
+  if (!doc->isOk()) {
+    exitCode = 1;
+    goto err2;
+  }
+
+#ifdef ENFORCE_PERMISSIONS
+  // check for copy permission
+  if (!doc->okToCopy()) {
+    error(-1, "Copying of text from this document is not allowed.");
+    exitCode = 3;
+    goto err2;
+  }
+#endif
+
+  // construct text file name
+  if (argc == 3) {
+    textFileName = new GooString(argv[2]);
+  } else {
+    // Only look for a ".pdf" suffix when the name is long enough to hold
+    // one; for shorter names "start + length - 4" would point in front of
+    // the string buffer.
+    int nameLen = fileName->getLength();
+    p = (nameLen >= 4) ? fileName->getCString() + nameLen - 4 : NULL;
+    if (p && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
+      textFileName = new GooString(fileName->getCString(), nameLen - 4);
+    } else {
+      textFileName = fileName->copy();
+    }
+    textFileName->append(htmlMeta ? ".html" : ".txt");
+  }
+
+  // get page range
+  if (firstPage < 1) {
+    firstPage = 1;
+  }
+  if (lastPage < 1 || lastPage > doc->getNumPages()) {
+    lastPage = doc->getNumPages();
+  }
+
+  // write HTML header
+  if (htmlMeta) {
+    if (!textFileName->cmp("-")) {
+      f = stdout;
+    } else {
+      if (!(f = fopen(textFileName->getCString(), "wb"))) {
+        error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+        exitCode = 2;
+        goto err3;
+      }
+    }
+    fputs("<html>\n", f);
+    fputs("<head>\n", f);
+    doc->getDocInfo(&info);
+    if (info.isDict()) {
+      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
+                      uMap);
+      printInfoString(f, info.getDict(), "Subject",
+                      "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Keywords",
+                      "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Author",
+                      "<meta name=\"Author\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Creator",
+                      "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+      printInfoString(f, info.getDict(), "Producer",
+                      "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+      // printInfoDate() feeds the date text to fprintf() as the single
+      // argument of this format, so the format needs a %s to emit it.
+      printInfoDate(f, info.getDict(), "CreationDate",
+                    "<meta name=\"CreationDate\" content=\"%s\">\n");
+      // The Info dictionary key for the modification date is "ModDate".
+      printInfoDate(f, info.getDict(), "ModDate",
+                    "<meta name=\"ModDate\" content=\"%s\">\n");
+    }
+    info.free();
+    fputs("</head>\n", f);
+    fputs("<body>\n", f);
+    fputs("<pre>\n", f);
+    if (f != stdout) {
+      fclose(f);
+    }
+  }
+
+  // write text file (htmlMeta is passed as the constructor's last argument;
+  // the header written above is already in the file at this point)
+  textOut = new TextOutputDev(textFileName->getCString(),
+                              physLayout, rawOrder, htmlMeta);
+  if (textOut->isOk()) {
+    doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0,
+                      gTrue, gFalse, gFalse);
+  } else {
+    delete textOut;
+    exitCode = 2;
+    goto err3;
+  }
+  delete textOut;
+
+  // write end of HTML file
+  if (htmlMeta) {
+    if (!textFileName->cmp("-")) {
+      f = stdout;
+    } else {
+      if (!(f = fopen(textFileName->getCString(), "ab"))) {
+        error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+        exitCode = 2;
+        goto err3;
+      }
+    }
+    fputs("</pre>\n", f);
+    fputs("</body>\n", f);
+    fputs("</html>\n", f);
+    if (f != stdout) {
+      fclose(f);
+    }
+  }
+
+  exitCode = 0;
+
+  // clean up
+ err3:
+  delete textFileName;
+ err2:
+  delete doc;
+  uMap->decRefCnt();
+ err1:
+  delete globalParams;
+ err0:
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}
+
+// Write one Info-dictionary string into the HTML header.
+//
+// If infoDict holds a string under <key>, writes <text1>, then the string's
+// characters, then <text2> to <f>; otherwise writes nothing.  A string that
+// starts with the bytes 0xFE 0xFF is read as 16-bit big-endian code units,
+// any other string one byte per character.  Each character is converted
+// with uMap->mapUnicode() before being written, except for the HTML
+// metacharacters & < > ", which are written as character references so that
+// metadata cannot break out of the surrounding element or attribute value.
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+                            char *text1, char *text2, UnicodeMap *uMap) {
+  Object obj;
+  GooString *s1;
+  GBool isUnicode;
+  Unicode u;
+  char buf[8];
+  int i, n, len;
+
+  if (infoDict->lookup(key, &obj)->isString()) {
+    fputs(text1, f);
+    s1 = obj.getString();
+    len = s1->getLength();
+    if (len >= 2 &&
+        (s1->getChar(0) & 0xff) == 0xfe &&
+        (s1->getChar(1) & 0xff) == 0xff) {
+      // byte-order mark: 16-bit big-endian text follows
+      isUnicode = gTrue;
+      i = 2;
+    } else {
+      isUnicode = gFalse;
+      i = 0;
+    }
+    while (i < len) {
+      if (isUnicode) {
+        // a dangling final byte is treated as the high byte of a unit whose
+        // low byte is 0, without indexing past the end of the string
+        u = ((s1->getChar(i) & 0xff) << 8) |
+            ((i + 1 < len) ? (s1->getChar(i+1) & 0xff) : 0);
+        i += 2;
+      } else {
+        u = s1->getChar(i) & 0xff;
+        ++i;
+      }
+      if (u == '&') {
+        fputs("&amp;", f);
+      } else if (u == '<') {
+        fputs("&lt;", f);
+      } else if (u == '>') {
+        fputs("&gt;", f);
+      } else if (u == '"') {
+        fputs("&quot;", f);
+      } else {
+        n = uMap->mapUnicode(u, buf, sizeof(buf));
+        fwrite(buf, 1, n, f);
+      }
+    }
+    fputs(text2, f);
+  }
+  obj.free();
+}
+
+// Print the Info-dictionary entry <key> as a date.
+//
+// If infoDict holds a string under <key>, an optional leading "D:" is
+// skipped and the remaining text is handed to fprintf() as the single string
+// argument of the printf-style format <fmt>, writing to <f>.  Nothing is
+// written when the entry is missing or is not a string.
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
+  Object dateObj;
+
+  if (infoDict->lookup(key, &dateObj)->isString()) {
+    char *dateText = dateObj.getString()->getCString();
+    GBool hasPrefix = dateText[0] == 'D' && dateText[1] == ':';
+
+    fprintf(f, fmt, hasPrefix ? dateText + 2 : dateText);
+  }
+  dateObj.free();
+}