//======================================================================== // // UTF.cc // // Copyright 2001-2003 Glyph & Cog, LLC // //======================================================================== //======================================================================== // // Modified under the Poppler project - http://poppler.freedesktop.org // // All changes made under the Poppler project to this file are licensed // under GPL version 2 or later // // Copyright (C) 2008 Koji Otani // Copyright (C) 2012, 2017 Adrian Johnson // Copyright (C) 2012 Hib Eris // Copyright (C) 2016, 2018, 2019 Albert Astals Cid // Copyright (C) 2016 Jason Crain // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, . Work sponsored by the LiMux project of the city of Munich // Copyright (C) 2018 Nelson Benítez León // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git // //======================================================================== #include "goo/gmem.h" #include "PDFDocEncoding.h" #include "GlobalParams.h" #include "UnicodeMap.h" #include "UTF.h" #include "UnicodeMapFuncs.h" #include bool UnicodeIsValid(Unicode ucs4) { return (ucs4 < 0x110000) && ((ucs4 & 0xfffff800) != 0xd800) && (ucs4 < 0xfdd0 || ucs4 > 0xfdef) && ((ucs4 & 0xfffe) != 0xfffe); } int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4_out) { int i, n, len; Unicode *u; // count characters len = 0; for (i = 0; i < utf16Len; i++) { if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) { i++; /* surrogate pair */ } len++; } if (ucs4_out == nullptr) return len; u = (Unicode*)gmallocn(len, sizeof(Unicode)); n = 0; // convert string for (i = 0; i < utf16Len; i++) { if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */ if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) { /* next code is a low surrogate */ u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000; ++i; } else { /* missing low surrogate replace it with REPLACEMENT CHARACTER (U+FFFD) */ u[n] = 0xfffd; } } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) { /* invalid low surrogate replace it with REPLACEMENT CHARACTER (U+FFFD) */ u[n] = 0xfffd; } else { u[n] = utf16[i]; } if (!UnicodeIsValid(u[n])) { u[n] = 0xfffd; } n++; } *ucs4_out = u; return len; } int TextStringToUCS4(const GooString *textStr, Unicode **ucs4) { int i, len; const char *s; Unicode *u; len = textStr->getLength(); s = textStr->c_str(); if (len == 0) { *ucs4 = nullptr; return 0; } if (textStr->hasUnicodeMarker()) { Unicode *utf16; len = len/2 - 1; if (len > 0) { utf16 = new Unicode[len]; for (i = 0 ; i < len; i++) { utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); } len = UTF16toUCS4(utf16, len, &u); delete[] utf16; } else { u = nullptr; } } else { u = (Unicode*)gmallocn(len, sizeof(Unicode)); for (i = 0 ; i < len; i++) { u[i] = pdfDocEncoding[s[i] & 0xff]; } } *ucs4 = u; return len; } bool UnicodeIsWhitespace(Unicode ucs4) { static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 }; Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); Unicode const *i = std::lower_bound(spaces, end, ucs4); return (i != end && *i == ucs4); } // // decodeUtf8() and decodeUtf8Table are: // // Copyright (c) 2008-2009 Bjoern Hoehrmann // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without // restriction, including without limitation the rights to use, copy, // modify, merge, publish, distribute, sublicense, and/or sell copies // of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be // included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. // static const uint32_t UTF8_ACCEPT = 0; static const uint32_t UTF8_REJECT = 12; static const uint32_t UCS4_MAX = 0x10FFFF; static const Unicode REPLACEMENT_CHAR = 0xFFFD; static const uint8_t decodeUtf8Table[] = { // The first part of the table maps bytes to character classes // to reduce the size of the transition table and create bitmasks. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff // The second part is a transition table that maps a combination // of a state of the automaton and a character class to a state. 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12, }; // Decode utf8 state machine for fast UTF-8 decoding. Initialise state // to 0 and call decodeUtf8() for each byte of UTF-8. Return value // (and state) is UTF8_ACCEPT when it has found a valid codepoint // (codepoint returned in codep), UTF8_REJECT when the byte is not // allowed to occur at its position, and some other positive value if // more bytes have to be read. Reset state to 0 to recover from // errors. inline uint32_t decodeUtf8(uint32_t* state, uint32_t* codep, char byte) { uint32_t b = (unsigned char)byte; uint32_t type = decodeUtf8Table[b]; *codep = (*state != UTF8_ACCEPT) ? (b & 0x3fu) | (*codep << 6) : (0xff >> type) & (b); *state = decodeUtf8Table[256 + *state + type]; return *state; } // Count number of UTF-16 code units required to convert a UTF-8 string // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with // REPLACEMENT_CHAR. int utf8CountUtf16CodeUnits(const char *utf8) { uint32_t codepoint; uint32_t state = 0; int count = 0; while (*utf8) { decodeUtf8(&state, &codepoint, *utf8); if (state == UTF8_ACCEPT) { if (codepoint < 0x10000) count++; else if (codepoint <= UCS4_MAX) count += 2; else count++; // replace with REPLACEMENT_CHAR } else if (state == UTF8_REJECT) { count++; // replace with REPLACEMENT_CHAR state = 0; } utf8++; } if (state != UTF8_ACCEPT && state != UTF8_REJECT) count++; // replace with REPLACEMENT_CHAR return count; } // Convert UTF-8 to UTF-16 // utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num // bytes to convert // utf16 - output buffer to write UTF-16 to. Output will always be null terminated. // maxUtf16 - maximum size of output buffer including space for null. // maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when // either this count is reached or a null is encountered. // Returns number of UTF-16 code units written (excluding NULL). int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8) { uint16_t *p = utf16; uint32_t codepoint; uint32_t state = 0; int nIn = 0; int nOut = 0; while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) { decodeUtf8(&state, &codepoint, *utf8); if (state == UTF8_ACCEPT) { if (codepoint < 0x10000) { *p++ = (uint16_t)codepoint; nOut++; } else if (codepoint <= UCS4_MAX) { *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10)); *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF)); nOut += 2; } else { *p++ = REPLACEMENT_CHAR; nOut++; state = 0; } } else if (state == UTF8_REJECT) { *p++ = REPLACEMENT_CHAR; // invalid byte for this position nOut++; } utf8++; nIn++; } // replace any trailing bytes too short for a valid UTF-8 with a replacement char if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) { *p++ = REPLACEMENT_CHAR; nOut++; } if (nOut > maxUtf16 - 1) nOut = maxUtf16 - 1; utf16[nOut] = 0; return nOut; } // Allocate utf16 string and convert utf8 into it. uint16_t *utf8ToUtf16(const char *utf8, int *len) { int n = utf8CountUtf16CodeUnits(utf8); if (len) *len = n; uint16_t *utf16 = (uint16_t*)gmallocn(n + 1, sizeof(uint16_t)); utf8ToUtf16(utf8, utf16); return utf16; } static const uint32_t UTF16_ACCEPT = 0; static const uint32_t UTF16_REJECT = -1; // Initialise state to 0. Returns UTF16_ACCEPT when a valid code point // has been found, UTF16_REJECT when invalid code unit for this state, // some other valid if another code unit needs to be read. inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeUnit) { if (*state == 0) { if (codeUnit >= 0xd800 && codeUnit < 0xdc00) { /* surrogate pair */ *state = codeUnit; return *state; } else if (codeUnit >= 0xdc00 && codeUnit < 0xe000) { /* invalid low surrogate */ return UTF16_REJECT; } else { *codePoint = codeUnit; return UTF16_ACCEPT; } } else { if (codeUnit >= 0xdc00 && codeUnit < 0xe000) { *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000; *state = 0; return UTF16_ACCEPT; } else { /* invalid high surrogate */ return UTF16_REJECT; } } } // Count number of UTF-8 bytes required to convert a UTF-16 string to // UTF-8 (excluding terminating NULL). int utf16CountUtf8Bytes(const uint16_t *utf16) { uint32_t codepoint = 0; uint32_t state = 0; int count = 0; while (*utf16) { decodeUtf16(&state, &codepoint, *utf16); if (state == UTF16_ACCEPT) { if (codepoint < 0x80) count++; else if (codepoint < 0x800) count += 2; else if (codepoint < 0x10000) count += 3; else if (codepoint <= UCS4_MAX) count += 4; else count += 3; // replace with REPLACEMENT_CHAR } else if (state == UTF16_REJECT) { count += 3; // replace with REPLACEMENT_CHAR state = 0; } utf16++; } if (state != UTF8_ACCEPT && state != UTF8_REJECT) count++; // replace with REPLACEMENT_CHAR return count; } // Convert UTF-16 to UTF-8 // utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num // code units to convert // utf8 - output buffer to write UTF-8 to. Output will always be null terminated. // maxUtf8 - maximum size of output buffer including space for null. // maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when // either this count is reached or a null is encountered. // Returns number of UTF-8 bytes written (excluding NULL). int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) { uint32_t codepoint = 0; uint32_t state = 0; int nIn = 0; int nOut = 0; char *p = utf8; while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) { decodeUtf16(&state, &codepoint, *utf16); if (state == UTF16_ACCEPT || state == UTF16_REJECT) { if (state == UTF16_REJECT || codepoint > UCS4_MAX) { codepoint = REPLACEMENT_CHAR; state = 0; } int bufSize = maxUtf8 - nOut; int count = mapUTF8(codepoint, p, bufSize); p += count; nOut += count; } utf16++; nIn++; } // replace any trailing bytes too short for a valid UTF-8 with a replacement char if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) { int bufSize = maxUtf8 - nOut; int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize); p += count; nOut += count; nOut++; } if (nOut > maxUtf8 - 1) nOut = maxUtf8 - 1; utf8[nOut] = 0; return nOut; } // Allocate utf8 string and convert utf16 into it. char *utf16ToUtf8(const uint16_t *utf16, int *len) { int n = utf16CountUtf8Bytes(utf16); if (len) *len = n; char *utf8 = (char*)gmalloc(n + 1); utf16ToUtf8(utf16, utf8); return utf8; } struct Ascii7Map { UnicodeMap *d; Ascii7Map() { GooString enc("ASCII7"); d = globalParams->getUnicodeMap(&enc); } }; void unicodeToAscii7(Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices) { static Ascii7Map uMap; int *idx = nullptr; if (!len) { *ucs4_out = nullptr; *out_len = 0; return; } if (indices) { if (!in_idx) indices = nullptr; else idx = (int *) gmallocn(len * 2 + 1, sizeof(int)); } GooString gstr; char buf[8]; // 8 is enough for mapping an unicode char to a string int i, n, k; for (i = k = 0; i < len; ++i) { n = uMap.d->mapUnicode(in[i], buf, sizeof(buf)); if (!n) { // the Unicode char could not be converted to ascii7 counterpart // so just fill with a non-printable ascii char buf[0] = 31; n = 1; } gstr.append(buf, n); if (indices) { for (; n > 0; n--) idx[k++] = in_idx[i]; } } *out_len = TextStringToUCS4(&gstr, ucs4_out); if (indices) { idx[k] = in_idx[len]; *indices = idx; } }