summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Kaganski <mike.kaganski@collabora.com>2017-04-04 22:37:45 +0300
committerMike Kaganski <mike.kaganski@collabora.com>2017-04-05 13:23:49 +0000
commite07d7f4f25be00e479f4df4160c89be4e001eb28 (patch)
treed2f6e2e39a2683589f096329339fcf6a9d633b31
parent5b8c4fa704408f1b26949ad4500e6678e56fc2cf (diff)
tdf#106955: Detect XML by MediaType
According to Extensible Markup Language (XML) 1.0 (see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd), all parts of the XML prolog (including the XML declaration) are optional, so an XML stream without <?xml ... ?> is well-formed (though not valid). XMLFilterDetect uses only the XML declaration to decide whether the file is to be processed further, which causes problems with documents that omit the declaration. This commit checks whether the document has its MediaType set to one of the known XML media types when the check for the XML declaration has failed. Change-Id: I31627c0e3a39bee241f609650280ebac3f1cede8 Reviewed-on: https://gerrit.libreoffice.org/36101 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com> (cherry picked from commit 156f778593ca9c57845076a88c6b544a63e12e7a) Reviewed-on: https://gerrit.libreoffice.org/36134 Tested-by: Mike Kaganski <mike.kaganski@collabora.com>
-rw-r--r--filter/Library_xmlfd.mk1
-rw-r--r--filter/source/xmlfilterdetect/filterdetect.cxx47
2 files changed, 45 insertions, 3 deletions
diff --git a/filter/Library_xmlfd.mk b/filter/Library_xmlfd.mk
index 54fdd0aa01e9..601a7678312b 100644
--- a/filter/Library_xmlfd.mk
+++ b/filter/Library_xmlfd.mk
@@ -30,6 +30,7 @@ $(eval $(call gb_Library_use_libraries,xmlfd,\
cppuhelper \
cppu \
sal \
+ svl \
utl \
tl \
$(gb_UWINAPI) \
diff --git a/filter/source/xmlfilterdetect/filterdetect.cxx b/filter/source/xmlfilterdetect/filterdetect.cxx
index 8cce08fc2d76..cd49649e3de6 100644
--- a/filter/source/xmlfilterdetect/filterdetect.cxx
+++ b/filter/source/xmlfilterdetect/filterdetect.cxx
@@ -27,6 +27,7 @@
#include <cppuhelper/supportsservice.hxx>
#include <ucbhelper/content.hxx>
#include <unotools/ucbstreamhelper.hxx>
+#include <svl/inettype.hxx>
#include <memory>
using namespace com::sun::star::container;
@@ -49,6 +50,25 @@ OUString supportedByType( const OUString& clipBoardFormat, const OUString& resu
return sTypeName;
}
+/// Tells whether @p mediaType names an XML media type of the "application"
+/// top-level type: either application/xml or any application/XXXX+xml.
+/// An empty string, or any other type/subtype combination (text/xml
+/// included), yields false.
+bool IsMediaTypeXML( const OUString& mediaType )
+{
+    // Nothing to inspect: an empty media type cannot identify XML.
+    if (mediaType.isEmpty())
+        return false;
+
+    // Split the media type string into its top-level type and subtype.
+    OUString aTopLevel, aSub;
+    INetContentTypes::parse(mediaType, aTopLevel, aSub);
+
+    // Only "application" is accepted here; text/xml is deliberately not detected.
+    if (!aTopLevel.equalsIgnoreAsciiCase("application"))
+        return false;
+
+    // RFC 3023 application/xml, or a registered application/XXXX+xml type.
+    return aSub.equalsIgnoreAsciiCase("xml")
+        || aSub.endsWithIgnoreAsciiCase("+xml");
+}
+
}
OUString SAL_CALL FilterDetect::detect( css::uno::Sequence< css::beans::PropertyValue >& aArguments ) throw( css::uno::RuntimeException, std::exception )
@@ -125,9 +145,30 @@ OUString SAL_CALL FilterDetect::detect( css::uno::Sequence< css::beans::Property
resultString = read_uInt16s_ToOUString( *pInStream, nSize );
if ( !resultString.startsWith( "<?xml" ) )
- // This is not an XML stream. It makes no sense to try to detect
- // a non-XML file type here.
- return OUString();
+ {
+ // Check the content type; XML declaration is optional in XML files according to XML 1.0 ch.2.8
+ // (see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-prolog-dtd)
+ OUString sMediaType;
+ try
+ {
+ ::ucbhelper::Content aContent(
+ sUrl, Reference< css::ucb::XCommandEnvironment >(),
+ mxCtx);
+ aContent.getPropertyValue("MediaType") >>= sMediaType;
+ if (sMediaType.isEmpty())
+ {
+ aContent.getPropertyValue("Content-Type") >>= sMediaType;
+ }
+ }
+ catch (...) {}
+
+ if (!IsMediaTypeXML(sMediaType))
+ {
+ // This is not an XML stream. It makes no sense to try to detect
+ // a non-XML file type here.
+ return OUString();
+ }
+ }
// test typedetect code
Reference <XNameAccess> xTypeCont(mxCtx->getServiceManager()->createInstanceWithContext("com.sun.star.document.TypeDetection", mxCtx), UNO_QUERY);