diff options
author | Koji Otani <sho@bbr.jp> | 2008-06-03 21:07:15 +0200 |
---|---|---|
committer | Albert Astals Cid <aacid@kde.org> | 2008-06-03 21:08:10 +0200 |
commit | 76b9ea1d5b4f9c140812b491a9ac80a822ad49fb (patch) | |
tree | 66341e7b3592bec2aab1dd04a524f9d4117b39fc | |
parent | 45a82d46434d88581390a3d9091efd3464d9c400 (diff) |
Support surrogate pairs for characters outside the BMP (Basic Multilingual Plane)
-rw-r--r-- | poppler/TextOutputDev.cc | 19 | ||||
-rw-r--r-- | poppler/UTF8.h | 14 |
2 files changed, 32 insertions, 1 deletions
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 75a0ac01..97f4f3fe 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2075,7 +2075,24 @@ void TextPage::addChar(GfxState *state, double x, double y, w1 /= uLen; h1 /= uLen; for (i = 0; i < uLen; ++i) { - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]); + if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */ + if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) { + /* next code is a low surrogate */ + Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000; + i++; + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu); + } else { + /* missing low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd); + } + } else if (u[i] >= 0xdc00 && u[i] < 0xe000) { + /* invalid low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd); + } else { + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]); + } } } if (curWord) { diff --git a/poppler/UTF8.h b/poppler/UTF8.h index 8536dbf9..11fb864e 100644 --- a/poppler/UTF8.h +++ b/poppler/UTF8.h @@ -50,6 +50,20 @@ static int mapUCS2(Unicode u, char *buf, int bufSize) { buf[0] = (char)((u >> 8) & 0xff); buf[1] = (char)(u & 0xff); return 2; + } else if (u < 0x110000) { + Unicode uu; + + /* using surrogate pair */ + if (bufSize < 4) { + return 0; + } + uu = ((u - 0x10000) >> 10) + 0xd800; + buf[0] = (char)((uu >> 8) & 0xff); + buf[1] = (char)(uu & 0xff); + uu = (u & 0x3ff)+0xdc00; + buf[2] = (char)((uu >> 8) & 0xff); + buf[3] = (char)(uu & 0xff); + return 4; } else { return 0; } |