diff options
author | Sune Vuorela <sune@vuorela.dk> | 2025-01-16 11:35:50 +0100 |
---|---|---|
committer | Albert Astals Cid <aacid@kde.org> | 2025-01-19 15:20:48 +0000 |
commit | 7d9b708cb7dc735e332570a67185a295174a2d78 (patch) | |
tree | bfe85cbae205bb4fb13e0a0548b8298d971a9570 | |
parent | 0d7c1e697358fd736bfb6051afaa9cb20691b8ae (diff) |
Simplify to utf16 conversions
-rw-r--r-- | poppler/PDFDoc.cc | 4 | ||||
-rw-r--r-- | poppler/UTF.cc | 71 | ||||
-rw-r--r-- | poppler/UTF.h | 15 | ||||
-rw-r--r-- | qt5/tests/check_utf_conversion.cpp | 39 | ||||
-rw-r--r-- | qt6/tests/check_utf_conversion.cpp | 39 | ||||
-rw-r--r-- | utils/Win32Console.cc | 6 |
6 files changed, 46 insertions, 128 deletions
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc index 6f94ac2b..5e2de9c9 100644 --- a/poppler/PDFDoc.cc +++ b/poppler/PDFDoc.cc @@ -147,9 +147,9 @@ PDFDoc::PDFDoc(std::unique_ptr<GooString> &&fileNameA, const std::optional<GooSt } fileNameU[n] = L'\0'; - wchar_t *wFileName = (wchar_t *)utf8ToUtf16(fileName->c_str()); + std::u16string u16fileName = utf8ToUtf16(fileName->toStr()); + wchar_t *wFileName = (wchar_t *)u16fileName.data(); file = GooFile::open(wFileName); - gfree(wFileName); #else file = GooFile::open(fileName->toStr()); #endif diff --git a/poppler/UTF.cc b/poppler/UTF.cc index 38fb0a28..1180baf8 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -260,14 +260,14 @@ int utf8ToUCS4(const char *utf8, Unicode **ucs4_out) // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with // REPLACEMENT_CHAR. -int utf8CountUtf16CodeUnits(const char *utf8) +int utf8CountUtf16CodeUnits(std::string_view utf8) { uint32_t codepoint; uint32_t state = 0; int count = 0; - while (*utf8) { - decodeUtf8(&state, &codepoint, *utf8); + for (auto c : utf8) { + decodeUtf8(&state, &codepoint, c); if (state == UTF8_ACCEPT) { if (codepoint < 0x10000) { count++; @@ -280,7 +280,6 @@ int utf8CountUtf16CodeUnits(const char *utf8) count++; // replace with REPLACEMENT_CHAR state = 0; } - utf8++; } if (state != UTF8_ACCEPT && state != UTF8_REJECT) { count++; // replace with REPLACEMENT_CHAR @@ -289,78 +288,52 @@ int utf8CountUtf16CodeUnits(const char *utf8) return count; } -int utf8ToUtf16(const char *utf8, int maxUtf8, uint16_t *utf16, int maxUtf16) +std::u16string utf8ToUtf16(std::string_view utf8) { - uint16_t *p = utf16; uint32_t codepoint; uint32_t state = 0; - int nIn = 0; - int nOut = 0; - while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) { - decodeUtf8(&state, &codepoint, *utf8); + if (isUtf8WithBom(utf8)) { + utf8 = utf8.substr(3); + } + std::u16string utf16; + for (auto c : utf8) { + decodeUtf8(&state, &codepoint, c); if (state == UTF8_ACCEPT) { if (codepoint < 0x10000) { - *p++ = (uint16_t)codepoint; - nOut++; + utf16.push_back((uint16_t)codepoint); } else if (codepoint <= UCS4_MAX) { - *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10)); - *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF)); - nOut += 2; + utf16.push_back((uint16_t)(0xD7C0 + (codepoint >> 10))); + utf16.push_back((uint16_t)(0xDC00 + (codepoint & 0x3FF))); } else { - *p++ = REPLACEMENT_CHAR; - nOut++; + utf16.push_back(REPLACEMENT_CHAR); state = 0; } } else if (state == UTF8_REJECT) { - *p++ = REPLACEMENT_CHAR; // invalid byte for this position - nOut++; + utf16.push_back(REPLACEMENT_CHAR); // invalid byte for this position } - utf8++; - nIn++; } // replace any trailing bytes too short for a valid UTF-8 with a replacement char - if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) { - *p++ = REPLACEMENT_CHAR; - nOut++; - } - if (nOut > maxUtf16 - 1) { - nOut = maxUtf16 - 1; - } - utf16[nOut] = 0; - return nOut; -} - -// Allocate utf16 string and convert utf8 into it. -uint16_t *utf8ToUtf16(const char *utf8, int *len) -{ - if (isUtf8WithBom(utf8)) { - utf8 += 3; - } - int n = utf8CountUtf16CodeUnits(utf8); - if (len) { - *len = n; + if (state != UTF8_ACCEPT && state != UTF8_REJECT) { + utf16.push_back(REPLACEMENT_CHAR); } - uint16_t *utf16 = (uint16_t *)gmallocn(n + 1, sizeof(uint16_t)); - utf8ToUtf16(utf8, INT_MAX, utf16, n + 1); return utf16; } -std::string utf8ToUtf16WithBom(const std::string &utf8) +std::string utf8ToUtf16WithBom(std::string_view utf8) { if (utf8.empty()) { return {}; } - int tmp_length; // Number of UTF-16 symbols. - char *tmp_str = (char *)utf8ToUtf16(utf8.c_str(), &tmp_length); + std::u16string utf16 = utf8ToUtf16(utf8); + char *tmp_str = (char *)utf16.data(); #ifndef WORDS_BIGENDIAN - for (int i = 0; i < tmp_length; i++) { + for (size_t i = 0; i < utf16.size(); i++) { std::swap(tmp_str[i * 2], tmp_str[i * 2 + 1]); } #endif std::string result(unicodeByteOrderMark); - result.append(tmp_str, tmp_length * 2); - gfree(tmp_str); + result.append(tmp_str, utf16.size() * 2); return result; } diff --git a/poppler/UTF.h b/poppler/UTF.h index 4486560f..db764463 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -93,20 +93,13 @@ int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out); // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with // REPLACEMENT_CHAR. -int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8); +int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(std::string_view utf8); // Convert UTF-8 to UTF-16 // utf8 - UTF-8 string to convert. If not null terminated, ensure // maxUtf8 is set the the exact number of bytes to convert. -// maxUtf8 - Maximum number of UTF-8 bytes to convert. Conversion stops when -// either this count is reached or a null is encountered. -// utf16 - Output buffer to write UTF-16 to. Output will always be null terminated. -// maxUtf16 - Maximum size of output buffer including space for null. -// Returns number of UTF-16 code units written (excluding NULL). -int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, int maxUtf8, uint16_t *utf16, int maxUtf16); - -// Allocate utf16 string and convert utf8 into it. -uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr); +// Returns utf16 string +std::u16string POPPLER_PRIVATE_EXPORT utf8ToUtf16(std::string_view utf8); inline bool isUtf8WithBom(std::string_view str) { @@ -123,7 +116,7 @@ inline bool isUtf8WithBom(std::string_view str) // The caller owns the returned pointer. // utf8 - UTF-8 string to convert. An empty string is acceptable. // Returns a big endian UTF-16 string with BOM or an empty string without BOM. -std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8); +std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(std::string_view utf8); // Count number of UTF-8 bytes required to convert a UTF-16 string to // UTF-8 (excluding terminating NULL). diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp index 05ef519e..99efc489 100644 --- a/qt5/tests/check_utf_conversion.cpp +++ b/qt5/tests/check_utf_conversion.cpp @@ -22,21 +22,6 @@ private slots: void testUnicodeLittleEndian(); }; -static bool compare(const char *a, const char *b) -{ - return strcmp(a, b) == 0; -} - -static bool compare(const uint16_t *a, const uint16_t *b) -{ - while (*a && *b) { - if (*a++ != *b++) { - return false; - } - } - return *a == *b; -} - static bool compare(const Unicode *a, const char *b, int len) { for (int i = 0; i < len; i++) { @@ -80,43 +65,35 @@ void TestUTFConversion::testUTF_data() void TestUTFConversion::testUTF() { std::string utf8String; - uint16_t utf16Buf[1000]; - uint16_t *utf16String; int len; QFETCH(QString, s); - QByteArray str = s.toUtf8(); + const std::string str = s.toStdString(); // UTF-8 to UTF-16 len = utf8CountUtf16CodeUnits(str); QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points - Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger - len = utf8ToUtf16(str, INT_MAX, utf16Buf, sizeof(utf16Buf)); - QVERIFY(compare(utf16Buf, s.utf16())); + std::u16string utf16String = utf8ToUtf16(str); + QCOMPARE(utf16String, s.toStdU16String()); QCOMPARE(len, s.size()); - utf16String = utf8ToUtf16(str); - QVERIFY(compare(utf16String, s.utf16())); - free(utf16String); - - std::string sUtf8(str); - std::string gsUtf16_a(utf8ToUtf16WithBom(sUtf8)); + std::string gsUtf16_a(utf8ToUtf16WithBom(str)); std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s)); QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0); // UTF-16 to UTF-8 len = utf16CountUtf8Bytes(s.utf16()); - QCOMPARE(len, (int)strlen(str)); + QCOMPARE(len, str.size()); utf8String = utf16ToUtf8(s.utf16(), INT_MAX); - QVERIFY(compare(utf8String.c_str(), str)); - QCOMPARE(len, (int)strlen(str)); + QCOMPARE(utf8String, str); + QCOMPARE(len, str.size()); utf8String = utf16ToUtf8(s.utf16()); - QVERIFY(compare(utf8String.c_str(), str)); + QCOMPARE(utf8String, str); } void TestUTFConversion::testUnicodeToAscii7() diff --git a/qt6/tests/check_utf_conversion.cpp b/qt6/tests/check_utf_conversion.cpp index 53fe4bec..831f002e 100644 --- a/qt6/tests/check_utf_conversion.cpp +++ b/qt6/tests/check_utf_conversion.cpp @@ -21,21 +21,6 @@ private slots: void testUnicodeLittleEndian(); }; -static bool compare(const char *a, const char *b) -{ - return strcmp(a, b) == 0; -} - -static bool compare(const uint16_t *a, const uint16_t *b) -{ - while (*a && *b) { - if (*a++ != *b++) { - return false; - } - } - return *a == *b; -} - static bool compare(const Unicode *a, const char *b, int len) { for (int i = 0; i < len; i++) { @@ -78,43 +63,35 @@ void TestUTFConversion::testUTF_data() void TestUTFConversion::testUTF() { std::string utf8String; - uint16_t utf16Buf[1000]; - uint16_t *utf16String; int len; QFETCH(QString, s); - QByteArray str = s.toUtf8().constData(); + const std::string str = s.toStdString(); // UTF-8 to UTF-16 len = utf8CountUtf16CodeUnits(str); QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points - Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger - len = utf8ToUtf16(str, INT_MAX, utf16Buf, sizeof(utf16Buf)); - QVERIFY(compare(utf16Buf, s.utf16())); + std::u16string utf16String = utf8ToUtf16(str); + QCOMPARE(utf16String, s.toStdU16String()); QCOMPARE(len, s.size()); - utf16String = utf8ToUtf16(str); - QVERIFY(compare(utf16String, s.utf16())); - free(utf16String); - - std::string sUtf8(str); - std::string gsUtf16_a(utf8ToUtf16WithBom(sUtf8)); + std::string gsUtf16_a(utf8ToUtf16WithBom(str)); std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s)); QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0); // UTF-16 to UTF-8 len = utf16CountUtf8Bytes(s.utf16()); - QCOMPARE(len, (int)strlen(str)); + QCOMPARE(len, str.size()); utf8String = utf16ToUtf8(s.utf16(), INT_MAX); - QVERIFY(compare(utf8String.c_str(), str)); - QCOMPARE(len, (int)strlen(str)); + QCOMPARE(utf8String, str); + QCOMPARE(len, str.size()); utf8String = utf16ToUtf8(s.utf16()); - QVERIFY(compare(utf8String.c_str(), str)); + QCOMPARE(utf8String, str); } void TestUTFConversion::testUnicodeToAscii7() diff --git a/utils/Win32Console.cc b/utils/Win32Console.cc index 181d3bb7..950b1b56 100644 --- a/utils/Win32Console.cc +++ b/utils/Win32Console.cc @@ -25,7 +25,6 @@ static const int BUF_SIZE = 4096; static int bufLen = 0; static char buf[BUF_SIZE]; -static wchar_t wbuf[BUF_SIZE]; static bool stdoutIsConsole = true; static bool stderrIsConsole = true; static HANDLE consoleHandle = nullptr; @@ -49,8 +48,8 @@ static void flush(bool all = false) } if (nchars > 0) { - DWORD wlen = utf8ToUtf16(buf, nchars, (uint16_t *)wbuf, BUF_SIZE); - WriteConsoleW(consoleHandle, wbuf, wlen, &wlen, nullptr); + std::u16string u16string = utf8ToUtf16(std::string_view { buf, nchars }); + WriteConsoleW(consoleHandle, u16string.data(), u16string.size(), nullptr, nullptr); if (nchars < bufLen) { memmove(buf, buf + nchars, bufLen - nchars); bufLen -= nchars; @@ -133,7 +132,6 @@ Win32Console::Win32Console(int *argc, char **argv[]) bufLen = 0; buf[0] = 0; - wbuf[0] = 0; // check if stdout or stderr redirected // GetFileType() returns CHAR for console and special devices COMx, PRN, CON, NUL etc |