diff options
author | Vasily Melenchuk <vasily.melenchuk@cib.de> | 2022-04-05 19:13:05 +0300 |
---|---|---|
committer | Miklos Vajna <vmiklos@collabora.com> | 2022-04-07 14:29:04 +0200 |
commit | 965313b9efc761c70aacf6e3ebee60ffa2b1d5dd (patch) | |
tree | b9b2bb0d66303485b1db0aa4a2f5aa8be8b597fe /writerfilter/source/rtftok/rtfdocumentimpl.cxx | |
parent | 751c6e25a3998845325c9b107163fc23a85b3367 (diff) |
tdf#95706: RTF import: Use fontname suffixes to detect encoding
Font names like "Arial CE" or "Times New Roman Cyr" do not denote special
fonts. They are the classic Arial, Times New Roman, etc., and their
suffixes can be used to detect the encoding used for the RTF text.
Most interestingly, in MS Word these suffixes take priority over the explicit settings:
{\f34\cpg1253\fcharset161 Arial Baltic;} will be read as cp1257
and not cp1253.
This looks like a compatibility issue that dates back to the dark ages.
Change-Id: Ife8e781d5d04c3f6a8c11fcf604357c74bf33055
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/132584
Tested-by: Jenkins
Reviewed-by: Miklos Vajna <vmiklos@collabora.com>
Diffstat (limited to 'writerfilter/source/rtftok/rtfdocumentimpl.cxx')
-rw-r--r-- | writerfilter/source/rtftok/rtfdocumentimpl.cxx | 39 |
1 files changed, 37 insertions, 2 deletions
diff --git a/writerfilter/source/rtftok/rtfdocumentimpl.cxx b/writerfilter/source/rtftok/rtfdocumentimpl.cxx index da53e2df1f24..5a19ccebb20a 100644 --- a/writerfilter/source/rtftok/rtfdocumentimpl.cxx +++ b/writerfilter/source/rtftok/rtfdocumentimpl.cxx @@ -50,6 +50,7 @@ #include "rtfskipdestination.hxx" #include "rtftokenizer.hxx" #include "rtflookahead.hxx" +#include "rtfcharsets.hxx" using namespace com::sun::star; @@ -1370,14 +1371,48 @@ void RTFDocumentImpl::text(OUString& rString) case Destination::FONTTABLE: case Destination::FONTENTRY: { - m_aFontNames[m_nCurrentFontIndex] = aName; + // Old documents can contain no encoding information in fontinfo, + // but there can be font name suffixes: Arial CE is not a special + // font, it is ordinal Arial, but with used cp 1250 encoding. + // Moreover these suffixes have priority over \cpgN and \fcharsetN + // in MS Word. + OUString aFontSuffix; + OUString aNameNoSuffix(aName); + sal_Int32 nLastSpace = aName.lastIndexOf(' '); + if (nLastSpace >= 0) + { + aFontSuffix = aName.copy(nLastSpace + 1); + aNameNoSuffix = aName.copy(0, nLastSpace); + sal_Int32 nEncoding = RTL_TEXTENCODING_DONTKNOW; + for (int i = 0; + aRTFFontNameSuffixes[i].codepage != RTL_TEXTENCODING_DONTKNOW; i++) + { + if (aFontSuffix.equalsAscii(aRTFFontNameSuffixes[i].suffix)) + { + nEncoding = aRTFFontNameSuffixes[i].codepage; + break; + } + } + if (nEncoding > RTL_TEXTENCODING_DONTKNOW) + { + m_nCurrentEncoding = nEncoding; + m_aStates.top().setCurrentEncoding(m_nCurrentEncoding); + } + else + { + // Unknown suffix: looks like it is just a part of font name, restore it + aNameNoSuffix = aName; + } + } + + m_aFontNames[m_nCurrentFontIndex] = aNameNoSuffix; if (m_nCurrentEncoding >= 0) { m_aFontEncodings[m_nCurrentFontIndex] = m_nCurrentEncoding; m_nCurrentEncoding = -1; } m_aStates.top().getTableAttributes().set(NS_ooxml::LN_CT_Font_name, - new RTFValue(aName)); + new RTFValue(aNameNoSuffix)); writerfilter::Reference<Properties>::Pointer_t const 
pProp( new RTFReferenceProperties(m_aStates.top().getTableAttributes(), |