summaryrefslogtreecommitdiff
path: root/utils/HtmlOutputDev.h
blob: e5823d27d9f5dee1845be3172e6ac7104bda3a4b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
//========================================================================
//
// HtmlOutputDev.h
//
// Copyright 1997 Derek B. Noonburg
//
// Changed 1999 by G.Ovtcharov
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2006, 2007, 2009, 2012, 2018, 2019 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2008, 2009 Warren Toomey <wkt@tuhs.org>
// Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc@gnome.org>
// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2011 Joshua Richardson <jric@chegg.com>
// Copyright (C) 2011 Stephen Reichling <sreichling@chegg.com>
// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
// Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#ifndef HTMLOUTPUTDEV_H
#define HTMLOUTPUTDEV_H

#include <cstdio>
#include "goo/gbasename.h"
#include "GfxFont.h"
#include "OutputDev.h"
#include "HtmlLinks.h"
#include "HtmlFonts.h"
#include "Link.h"
#include "Catalog.h"
#include "UnicodeMap.h"

#define xoutRound(x) ((int)(x + 0.5))

#define DOCTYPE "<!DOCTYPE html>"

class GfxState;
class GooString;
class HtmlImage;
class PDFDoc;
class OutlineItem;
//------------------------------------------------------------------------
// HtmlString
//------------------------------------------------------------------------

enum UnicodeTextDirection {
  textDirUnknown,
  textDirLeftRight,
  textDirRightLeft,
  textDirTopBottom
};

class HtmlString {
public:

  // Constructor.
  HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts);

  // Destructor.
  ~HtmlString();

  HtmlString(const HtmlString &) = delete;
  HtmlString& operator=(const HtmlString &) = delete;

  // Add a character to the string.
  void addChar(GfxState *state, double x, double y,
	       double dx, double dy,
	       Unicode u); 
  HtmlLink* getLink() { return link; }
  const HtmlFont &getFont() const { return *fonts->Get(fontpos); }
  void endString(); // postprocessing

private:
// aender die text variable
  HtmlLink *link;
  double xMin, xMax;		// bounding box x coordinates
  double yMin, yMax;		// bounding box y coordinates
  int col;			// starting column
  Unicode *text;		// the text
  double *xRight;		// right-hand x coord of each char
  HtmlString *yxNext;		// next string in y-major order
  HtmlString *xyNext;		// next string in x-major order
  int fontpos;
  GooString* htext;
  int len;			// length of text and xRight
  int size;			// size of text and xRight arrays
  UnicodeTextDirection dir;	// direction (left to right/right to left)
  HtmlFontAccu *fonts;
  
  friend class HtmlPage;

};


//------------------------------------------------------------------------
// HtmlPage
//------------------------------------------------------------------------



class HtmlPage {
public:

  // Constructor.
  HtmlPage(bool rawOrder);

  // Destructor.
  ~HtmlPage();

  HtmlPage(const HtmlPage &) = delete;
  HtmlPage& operator=(const HtmlPage &) = delete;

  // Begin a new string.
  void beginString(GfxState *state, const GooString *s);

  // Add a character to the current string.
  void addChar(GfxState *state, double x, double y,
	       double dx, double dy, 
		double ox, double oy, 
		const Unicode *u, int uLen); //unsigned char c);

  void updateFont(GfxState *state);

  // End the current string, sorting it into the list of strings.
  void endString();

  // Coalesce strings that look like parts of the same line.
  void coalesce();

  // Find a string.  If <top> is true, starts looking at top of page;
  // otherwise starts looking at <xMin>,<yMin>.  If <bottom> is true,
  // stops looking at bottom of page; otherwise stops looking at
  // <xMax>,<yMax>.  If found, sets the text bounding rectangle and
  // returns true; otherwise returns false.
  

  // new functions
  void AddLink(const HtmlLink& x){
    links->AddLink(x);
  }

  // add an image to the current page
  void addImage(GooString *fname, GfxState *state);

  // number of images on the current page
  int  getNumImages() { return imgList->size(); }

  void dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages);

  // Clear the page.
  void clear();
  
  void conv();
private:
  HtmlFont* getFont(HtmlString *hStr) { return fonts->Get(hStr->fontpos); }

  double fontSize;		// current font size
  bool rawOrder;		// keep strings in content stream order

  HtmlString *curStr;		// currently active string

  HtmlString *yxStrings;	// strings in y-major order
  HtmlString *xyStrings;	// strings in x-major order
  HtmlString *yxCur1, *yxCur2;	// cursors for yxStrings list
  
  void setDocName(const char* fname);
  void dumpAsXML(FILE* f,int page);
  void dumpComplex(FILE* f, int page, const std::vector<std::string>& backgroundImages);
  int dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page);

  // marks the position of the fonts that belong to current page (for noframes)
  int fontsPageMarker; 
  HtmlFontAccu *fonts;
  HtmlLinks *links; 
  std::vector<HtmlImage*> *imgList;
  
  GooString *DocName;
  int pageWidth;
  int pageHeight;
  int firstPage;                // used to begin the numeration of pages

  friend class HtmlOutputDev;
};

//------------------------------------------------------------------------
// HtmlMetaVar
//------------------------------------------------------------------------
class HtmlMetaVar {
public:
    HtmlMetaVar(const char *_name, const char *_content);
    ~HtmlMetaVar();    
    
    HtmlMetaVar(const HtmlMetaVar &) = delete;
    HtmlMetaVar& operator=(const HtmlMetaVar &) = delete;

    GooString* toString() const;

private:

    GooString *name;
    GooString *content;
};

//------------------------------------------------------------------------
// HtmlOutputDev
//------------------------------------------------------------------------

class HtmlOutputDev: public OutputDev {
public:

  // Open a text output file.  If <fileName> is nullptr, no file is written
  // (this is useful, e.g., for searching text).  If <useASCII7> is true,
  // text is converted to 7-bit ASCII; otherwise, text is converted to
  // 8-bit ISO Latin-1.  <useASCII7> should also be set for Japanese
  // (EUC-JP) text.  If <rawOrder> is true, the text is kept in content
  // stream order.
  HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title,
	  const char *author,
	  const char *keywords,
	  const char *subject,
	  const char *date,
	  bool rawOrder,
	  int firstPage = 1,
	  bool outline = false);

  // Destructor.
  ~HtmlOutputDev() override;

  // Check if file was successfully created.
  virtual bool isOk() { return ok; }

  //---- get info about output device

  // Does this device use upside-down coordinates?
  // (Upside-down means (0,0) is the top left corner of the page.)
  bool upsideDown() override { return true; }

  // Does this device use drawChar() or drawString()?
  bool useDrawChar() override { return true; }

  // Does this device use beginType3Char/endType3Char?  Otherwise,
  // text in Type 3 fonts will be drawn with drawChar/drawString.
  bool interpretType3Chars() override { return false; }

  // Does this device need non-text content?
  bool needNonText() override { return true; }

  //----- initialization and control

  bool checkPageSlice(Page *p, double hDPI, double vDPI,
                       int rotate, bool useMediaBox, bool crop,
                       int sliceX, int sliceY, int sliceW, int sliceH,
                       bool printing,
                       bool (* abortCheckCbk)(void *data) = nullptr,
                       void * abortCheckCbkData = nullptr,
                       bool (*annotDisplayDecideCbk)(Annot *annot, void *user_data) = nullptr,
                       void *annotDisplayDecideCbkData = nullptr) override
  {
   docPage = p;
   return true;
  }


  // Start a page.
  void startPage(int pageNum, GfxState *state, XRef *xref) override;

  // End a page.
  void endPage() override;

  // add a background image to the list of background images,
  // as this seems to be done outside other processing. takes ownership of img.
  void addBackgroundImage(const std::string& img);

  //----- update text state
  void updateFont(GfxState *state) override;

  //----- text drawing
  void beginString(GfxState *state, const GooString *s) override;
  void endString(GfxState *state) override;
  void drawChar(GfxState *state, double x, double y,
		double dx, double dy,
		double originX, double originY,
		CharCode code, int nBytes, const Unicode *u, int uLen) override;
  
  void drawImageMask(GfxState *state, Object *ref,
		     Stream *str,
		     int width, int height, bool invert,
		     bool interpolate, bool inlineImg) override;
  void drawImage(GfxState *state, Object *ref, Stream *str,
		 int width, int height, GfxImageColorMap *colorMap,
		 bool interpolate, const int *maskColors, bool inlineImg) override;

  //new feature    
  virtual int DevType() {return 1234;}

  int getPageWidth() { return maxPageWidth; }
  int getPageHeight() { return maxPageHeight; }

  bool dumpDocOutline(PDFDoc* doc);

private:
  // convert encoding into a HTML standard, or encoding->c_str if not
  // recognized.
  static std::string mapEncodingToHtml(const std::string &encoding);
  void doProcessLink(AnnotLink *link);
  GooString* getLinkDest(AnnotLink *link);
  void dumpMetaVars(FILE *);
  void doFrame(int firstPage);
  bool newHtmlOutlineLevel(FILE *output, const std::vector<OutlineItem*> *outlines, int level = 1);
  void newXmlOutlineLevel(FILE *output, const std::vector<OutlineItem*> *outlines);
  int getOutlinePageNum(OutlineItem *item);
  void drawJpegImage(GfxState *state, Stream *str);
  void drawPngImage(GfxState *state, Stream *str, int width, int height,
                    GfxImageColorMap *colorMap, bool isMask = false);
  GooString *createImageFileName(const char *ext);

  FILE *fContentsFrame;
  FILE *page;                   // html file
  //FILE *tin;                    // image log file
  //bool write;
  bool needClose;		// need to close the file?
  HtmlPage *pages;		// text for the current page
  bool rawOrder;		// keep text in content stream order
  bool doOutline;		// output document outline
  bool ok;			// set up ok?
  bool dumpJPEG;
  int pageNum;
  int maxPageWidth;
  int maxPageHeight;
  GooString *Docname;
  GooString *docTitle;
  std::vector<HtmlMetaVar*> *glMetaVars;
  Catalog *catalog;
  Page *docPage;
  std::vector<std::string> backgroundImages;
  friend class HtmlPage;
};

#endif