summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSune Vuorela <sune@vuorela.dk>2025-01-16 11:35:50 +0100
committerAlbert Astals Cid <aacid@kde.org>2025-01-19 15:20:48 +0000
commit7d9b708cb7dc735e332570a67185a295174a2d78 (patch)
treebfe85cbae205bb4fb13e0a0548b8298d971a9570
parent0d7c1e697358fd736bfb6051afaa9cb20691b8ae (diff)
Simplify to utf16 conversions
-rw-r--r--poppler/PDFDoc.cc4
-rw-r--r--poppler/UTF.cc71
-rw-r--r--poppler/UTF.h15
-rw-r--r--qt5/tests/check_utf_conversion.cpp39
-rw-r--r--qt6/tests/check_utf_conversion.cpp39
-rw-r--r--utils/Win32Console.cc6
6 files changed, 46 insertions, 128 deletions
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 6f94ac2b..5e2de9c9 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -147,9 +147,9 @@ PDFDoc::PDFDoc(std::unique_ptr<GooString> &&fileNameA, const std::optional<GooSt
}
fileNameU[n] = L'\0';
- wchar_t *wFileName = (wchar_t *)utf8ToUtf16(fileName->c_str());
+ std::u16string u16fileName = utf8ToUtf16(fileName->toStr());
+ wchar_t *wFileName = (wchar_t *)u16fileName.data();
file = GooFile::open(wFileName);
- gfree(wFileName);
#else
file = GooFile::open(fileName->toStr());
#endif
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 38fb0a28..1180baf8 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -260,14 +260,14 @@ int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
// (excluding terminating NULL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
// REPLACEMENT_CHAR.
-int utf8CountUtf16CodeUnits(const char *utf8)
+int utf8CountUtf16CodeUnits(std::string_view utf8)
{
uint32_t codepoint;
uint32_t state = 0;
int count = 0;
- while (*utf8) {
- decodeUtf8(&state, &codepoint, *utf8);
+ for (auto c : utf8) {
+ decodeUtf8(&state, &codepoint, c);
if (state == UTF8_ACCEPT) {
if (codepoint < 0x10000) {
count++;
@@ -280,7 +280,6 @@ int utf8CountUtf16CodeUnits(const char *utf8)
count++; // replace with REPLACEMENT_CHAR
state = 0;
}
- utf8++;
}
if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
count++; // replace with REPLACEMENT_CHAR
@@ -289,78 +288,52 @@ int utf8CountUtf16CodeUnits(const char *utf8)
return count;
}
-int utf8ToUtf16(const char *utf8, int maxUtf8, uint16_t *utf16, int maxUtf16)
+std::u16string utf8ToUtf16(std::string_view utf8)
{
- uint16_t *p = utf16;
uint32_t codepoint;
uint32_t state = 0;
- int nIn = 0;
- int nOut = 0;
- while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) {
- decodeUtf8(&state, &codepoint, *utf8);
+ if (isUtf8WithBom(utf8)) {
+ utf8 = utf8.substr(3);
+ }
+ std::u16string utf16;
+ for (auto c : utf8) {
+ decodeUtf8(&state, &codepoint, c);
if (state == UTF8_ACCEPT) {
if (codepoint < 0x10000) {
- *p++ = (uint16_t)codepoint;
- nOut++;
+ utf16.push_back((uint16_t)codepoint);
} else if (codepoint <= UCS4_MAX) {
- *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
- *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
- nOut += 2;
+ utf16.push_back((uint16_t)(0xD7C0 + (codepoint >> 10)));
+ utf16.push_back((uint16_t)(0xDC00 + (codepoint & 0x3FF)));
} else {
- *p++ = REPLACEMENT_CHAR;
- nOut++;
+ utf16.push_back(REPLACEMENT_CHAR);
state = 0;
}
} else if (state == UTF8_REJECT) {
- *p++ = REPLACEMENT_CHAR; // invalid byte for this position
- nOut++;
+ utf16.push_back(REPLACEMENT_CHAR); // invalid byte for this position
}
- utf8++;
- nIn++;
}
// replace any trailing bytes too short for a valid UTF-8 with a replacement char
- if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) {
- *p++ = REPLACEMENT_CHAR;
- nOut++;
- }
- if (nOut > maxUtf16 - 1) {
- nOut = maxUtf16 - 1;
- }
- utf16[nOut] = 0;
- return nOut;
-}
-
-// Allocate utf16 string and convert utf8 into it.
-uint16_t *utf8ToUtf16(const char *utf8, int *len)
-{
- if (isUtf8WithBom(utf8)) {
- utf8 += 3;
- }
- int n = utf8CountUtf16CodeUnits(utf8);
- if (len) {
- *len = n;
+ if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
+ utf16.push_back(REPLACEMENT_CHAR);
}
- uint16_t *utf16 = (uint16_t *)gmallocn(n + 1, sizeof(uint16_t));
- utf8ToUtf16(utf8, INT_MAX, utf16, n + 1);
return utf16;
}
-std::string utf8ToUtf16WithBom(const std::string &utf8)
+std::string utf8ToUtf16WithBom(std::string_view utf8)
{
if (utf8.empty()) {
return {};
}
- int tmp_length; // Number of UTF-16 symbols.
- char *tmp_str = (char *)utf8ToUtf16(utf8.c_str(), &tmp_length);
+ std::u16string utf16 = utf8ToUtf16(utf8);
+ char *tmp_str = (char *)utf16.data();
#ifndef WORDS_BIGENDIAN
- for (int i = 0; i < tmp_length; i++) {
+ for (size_t i = 0; i < utf16.size(); i++) {
std::swap(tmp_str[i * 2], tmp_str[i * 2 + 1]);
}
#endif
std::string result(unicodeByteOrderMark);
- result.append(tmp_str, tmp_length * 2);
- gfree(tmp_str);
+ result.append(tmp_str, utf16.size() * 2);
return result;
}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 4486560f..db764463 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -93,20 +93,13 @@ int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out);
// (excluding terminating NULL). Each invalid byte is counted as a
// code point since the UTF-8 conversion functions will replace it with
// REPLACEMENT_CHAR.
-int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8);
+int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(std::string_view utf8);
// Convert UTF-8 to UTF-16
// utf8 - UTF-8 string to convert. If not null terminated, ensure
// maxUtf8 is set to the exact number of bytes to convert.
-// maxUtf8 - Maximum number of UTF-8 bytes to convert. Conversion stops when
-// either this count is reached or a null is encountered.
-// utf16 - Output buffer to write UTF-16 to. Output will always be null terminated.
-// maxUtf16 - Maximum size of output buffer including space for null.
-// Returns number of UTF-16 code units written (excluding NULL).
-int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, int maxUtf8, uint16_t *utf16, int maxUtf16);
-
-// Allocate utf16 string and convert utf8 into it.
-uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
+// Returns utf16 string
+std::u16string POPPLER_PRIVATE_EXPORT utf8ToUtf16(std::string_view utf8);
inline bool isUtf8WithBom(std::string_view str)
{
@@ -123,7 +116,7 @@ inline bool isUtf8WithBom(std::string_view str)
// The caller owns the returned pointer.
// utf8 - UTF-8 string to convert. An empty string is acceptable.
// Returns a big endian UTF-16 string with BOM or an empty string without BOM.
-std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8);
+std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(std::string_view utf8);
// Count number of UTF-8 bytes required to convert a UTF-16 string to
// UTF-8 (excluding terminating NULL).
diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
index 05ef519e..99efc489 100644
--- a/qt5/tests/check_utf_conversion.cpp
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -22,21 +22,6 @@ private slots:
void testUnicodeLittleEndian();
};
-static bool compare(const char *a, const char *b)
-{
- return strcmp(a, b) == 0;
-}
-
-static bool compare(const uint16_t *a, const uint16_t *b)
-{
- while (*a && *b) {
- if (*a++ != *b++) {
- return false;
- }
- }
- return *a == *b;
-}
-
static bool compare(const Unicode *a, const char *b, int len)
{
for (int i = 0; i < len; i++) {
@@ -80,43 +65,35 @@ void TestUTFConversion::testUTF_data()
void TestUTFConversion::testUTF()
{
std::string utf8String;
- uint16_t utf16Buf[1000];
- uint16_t *utf16String;
int len;
QFETCH(QString, s);
- QByteArray str = s.toUtf8();
+ const std::string str = s.toStdString();
// UTF-8 to UTF-16
len = utf8CountUtf16CodeUnits(str);
QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points
- Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger
- len = utf8ToUtf16(str, INT_MAX, utf16Buf, sizeof(utf16Buf));
- QVERIFY(compare(utf16Buf, s.utf16()));
+ std::u16string utf16String = utf8ToUtf16(str);
+ QCOMPARE(utf16String, s.toStdU16String());
QCOMPARE(len, s.size());
- utf16String = utf8ToUtf16(str);
- QVERIFY(compare(utf16String, s.utf16()));
- free(utf16String);
-
- std::string sUtf8(str);
- std::string gsUtf16_a(utf8ToUtf16WithBom(sUtf8));
+ std::string gsUtf16_a(utf8ToUtf16WithBom(str));
std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s));
QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0);
// UTF-16 to UTF-8
len = utf16CountUtf8Bytes(s.utf16());
- QCOMPARE(len, (int)strlen(str));
+ QCOMPARE(len, str.size());
utf8String = utf16ToUtf8(s.utf16(), INT_MAX);
- QVERIFY(compare(utf8String.c_str(), str));
- QCOMPARE(len, (int)strlen(str));
+ QCOMPARE(utf8String, str);
+ QCOMPARE(len, str.size());
utf8String = utf16ToUtf8(s.utf16());
- QVERIFY(compare(utf8String.c_str(), str));
+ QCOMPARE(utf8String, str);
}
void TestUTFConversion::testUnicodeToAscii7()
diff --git a/qt6/tests/check_utf_conversion.cpp b/qt6/tests/check_utf_conversion.cpp
index 53fe4bec..831f002e 100644
--- a/qt6/tests/check_utf_conversion.cpp
+++ b/qt6/tests/check_utf_conversion.cpp
@@ -21,21 +21,6 @@ private slots:
void testUnicodeLittleEndian();
};
-static bool compare(const char *a, const char *b)
-{
- return strcmp(a, b) == 0;
-}
-
-static bool compare(const uint16_t *a, const uint16_t *b)
-{
- while (*a && *b) {
- if (*a++ != *b++) {
- return false;
- }
- }
- return *a == *b;
-}
-
static bool compare(const Unicode *a, const char *b, int len)
{
for (int i = 0; i < len; i++) {
@@ -78,43 +63,35 @@ void TestUTFConversion::testUTF_data()
void TestUTFConversion::testUTF()
{
std::string utf8String;
- uint16_t utf16Buf[1000];
- uint16_t *utf16String;
int len;
QFETCH(QString, s);
- QByteArray str = s.toUtf8().constData();
+ const std::string str = s.toStdString();
// UTF-8 to UTF-16
len = utf8CountUtf16CodeUnits(str);
QCOMPARE(len, s.size()); // QString size() returns number of code units, not code points
- Q_ASSERT(len < (int)sizeof(utf16Buf)); // if this fails, make utf16Buf larger
- len = utf8ToUtf16(str, INT_MAX, utf16Buf, sizeof(utf16Buf));
- QVERIFY(compare(utf16Buf, s.utf16()));
+ std::u16string utf16String = utf8ToUtf16(str);
+ QCOMPARE(utf16String, s.toStdU16String());
QCOMPARE(len, s.size());
- utf16String = utf8ToUtf16(str);
- QVERIFY(compare(utf16String, s.utf16()));
- free(utf16String);
-
- std::string sUtf8(str);
- std::string gsUtf16_a(utf8ToUtf16WithBom(sUtf8));
+ std::string gsUtf16_a(utf8ToUtf16WithBom(str));
std::unique_ptr<GooString> gsUtf16_b(Poppler::QStringToUnicodeGooString(s));
QCOMPARE(gsUtf16_b->cmp(gsUtf16_a), 0);
// UTF-16 to UTF-8
len = utf16CountUtf8Bytes(s.utf16());
- QCOMPARE(len, (int)strlen(str));
+ QCOMPARE(len, str.size());
utf8String = utf16ToUtf8(s.utf16(), INT_MAX);
- QVERIFY(compare(utf8String.c_str(), str));
- QCOMPARE(len, (int)strlen(str));
+ QCOMPARE(utf8String, str);
+ QCOMPARE(len, str.size());
utf8String = utf16ToUtf8(s.utf16());
- QVERIFY(compare(utf8String.c_str(), str));
+ QCOMPARE(utf8String, str);
}
void TestUTFConversion::testUnicodeToAscii7()
diff --git a/utils/Win32Console.cc b/utils/Win32Console.cc
index 181d3bb7..950b1b56 100644
--- a/utils/Win32Console.cc
+++ b/utils/Win32Console.cc
@@ -25,7 +25,6 @@
static const int BUF_SIZE = 4096;
static int bufLen = 0;
static char buf[BUF_SIZE];
-static wchar_t wbuf[BUF_SIZE];
static bool stdoutIsConsole = true;
static bool stderrIsConsole = true;
static HANDLE consoleHandle = nullptr;
@@ -49,8 +48,8 @@ static void flush(bool all = false)
}
if (nchars > 0) {
- DWORD wlen = utf8ToUtf16(buf, nchars, (uint16_t *)wbuf, BUF_SIZE);
- WriteConsoleW(consoleHandle, wbuf, wlen, &wlen, nullptr);
+ std::u16string u16string = utf8ToUtf16(std::string_view { buf, nchars });
+ WriteConsoleW(consoleHandle, u16string.data(), u16string.size(), nullptr, nullptr);
if (nchars < bufLen) {
memmove(buf, buf + nchars, bufLen - nchars);
bufLen -= nchars;
@@ -133,7 +132,6 @@ Win32Console::Win32Console(int *argc, char **argv[])
bufLen = 0;
buf[0] = 0;
- wbuf[0] = 0;
// check if stdout or stderr redirected
// GetFileType() returns CHAR for console and special devices COMx, PRN, CON, NUL etc