summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Koji Otani <sho@bbr.jp> 2008-06-03 21:07:15 +0200
committer Albert Astals Cid <aacid@kde.org> 2008-06-03 21:08:10 +0200
commit 76b9ea1d5b4f9c140812b491a9ac80a822ad49fb (patch)
tree 66341e7b3592bec2aab1dd04a524f9d4117b39fc
parent 45a82d46434d88581390a3d9091efd3464d9c400 (diff)
Support for surrogate pairs encoding characters outside the BMP
-rw-r--r-- poppler/TextOutputDev.cc 19
-rw-r--r-- poppler/UTF8.h 14
2 files changed, 32 insertions, 1 deletion
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 75a0ac01..97f4f3fe 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2075,7 +2075,24 @@ void TextPage::addChar(GfxState *state, double x, double y,
w1 /= uLen;
h1 /= uLen;
for (i = 0; i < uLen; ++i) {
- curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
+ if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
+ if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
+ /* next code is a low surrogate */
+ Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
+ i++;
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu);
+ } else {
+ /* missing low surrogate
+ replace it with REPLACEMENT CHARACTER (U+FFFD) */
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
+ }
+ } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
+ /* invalid low surrogate
+ replace it with REPLACEMENT CHARACTER (U+FFFD) */
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
+ } else {
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
+ }
}
}
if (curWord) {
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
index 8536dbf9..11fb864e 100644
--- a/poppler/UTF8.h
+++ b/poppler/UTF8.h
@@ -50,6 +50,20 @@ static int mapUCS2(Unicode u, char *buf, int bufSize) {
buf[0] = (char)((u >> 8) & 0xff);
buf[1] = (char)(u & 0xff);
return 2;
+ } else if (u < 0x110000) {
+ Unicode uu;
+
+ /* using surrogate pair */
+ if (bufSize < 4) {
+ return 0;
+ }
+ uu = ((u - 0x10000) >> 10) + 0xd800;
+ buf[0] = (char)((uu >> 8) & 0xff);
+ buf[1] = (char)(uu & 0xff);
+ uu = (u & 0x3ff)+0xdc00;
+ buf[2] = (char)((uu >> 8) & 0xff);
+ buf[3] = (char)(uu & 0xff);
+ return 4;
} else {
return 0;
}