diff options
author | Miklos Vajna <vmiklos@collabora.co.uk> | 2017-12-13 09:49:41 +0100 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2018-01-27 21:03:41 +0100 |
commit | 6aa65f7664fe0dbe8c9d4ba7f320ef216e928780 (patch) | |
tree | 37309a725d654203608903bcf07c21269d6551c5 | |
parent | f937a432c2351852e8b237c6e11dd9e43a2b28c9 (diff) |
tdf#114428 filter: recognize XHTML with XML declaration as HTML
The problem was the additional
<?xml version="1.0" encoding="utf-8"?>
XML declaration before the usual
<!DOCTYPE html ...
line; the HTML detection now skips such a declaration.
Change-Id: I294aae5504b40b42f76da00fef645d0d89009da9
Reviewed-on: https://gerrit.libreoffice.org/46324
Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
Tested-by: Jenkins <ci@libreoffice.org>
(cherry picked from commit 4af729f31c64c09c76ea8bcfa5067092571b92de)
Reviewed-on: https://gerrit.libreoffice.org/47587
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
Tested-by: Caolán McNamara <caolanm@redhat.com>
-rw-r--r-- | filter/CppunitTest_filter_textfilterdetect.mk | 46 | ||||
-rw-r--r-- | filter/Module_filter.mk | 1 | ||||
-rw-r--r-- | filter/qa/unit/data/tdf114428.xhtml | 9 | ||||
-rw-r--r-- | filter/qa/unit/textfilterdetect.cxx | 63 | ||||
-rw-r--r-- | filter/source/textfilterdetect/filterdetect.cxx | 24 |
5 files changed, 140 insertions, 3 deletions
diff --git a/filter/CppunitTest_filter_textfilterdetect.mk b/filter/CppunitTest_filter_textfilterdetect.mk new file mode 100644 index 000000000000..dfcaee9ce16a --- /dev/null +++ b/filter/CppunitTest_filter_textfilterdetect.mk @@ -0,0 +1,46 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_CppunitTest_CppunitTest,filter_textfilterdetect)) + +$(eval $(call gb_CppunitTest_use_api,filter_textfilterdetect,\ + offapi \ + udkapi \ +)) + +$(eval $(call gb_CppunitTest_use_libraries,filter_textfilterdetect, \ + comphelper \ + cppu \ + cppuhelper \ + sal \ + test \ + textfd \ + tl \ + unotest \ + utl \ +)) + +$(eval $(call gb_CppunitTest_add_exception_objects,filter_textfilterdetect, \ + filter/qa/unit/textfilterdetect \ +)) + +$(eval $(call gb_CppunitTest_use_ure,filter_textfilterdetect)) + +$(eval $(call gb_CppunitTest_use_vcl,filter_textfilterdetect)) + +$(eval $(call gb_CppunitTest_use_components,filter_textfilterdetect,\ + configmgr/source/configmgr \ + filter/source/textfilterdetect/textfd \ + ucb/source/core/ucb1 \ + ucb/source/ucp/file/ucpfile1 \ +)) + +$(eval $(call gb_CppunitTest_use_configuration,filter_textfilterdetect)) + +# vim: set noet sw=4 ts=4: diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk index a7834b6dc631..08aa0f2d395b 100644 --- a/filter/Module_filter.mk +++ b/filter/Module_filter.mk @@ -57,6 +57,7 @@ $(eval $(call gb_Module_add_check_targets,filter,\ CppunitTest_filter_xslt \ CppunitTest_filter_priority \ CppunitTest_filter_msfilter \ + CppunitTest_filter_textfilterdetect \ )) ifneq ($(DISABLE_CVE_TESTS),TRUE) diff --git a/filter/qa/unit/data/tdf114428.xhtml b/filter/qa/unit/data/tdf114428.xhtml new file mode 100644 index 
000000000000..f08f0fa4a028 --- /dev/null +++ b/filter/qa/unit/data/tdf114428.xhtml @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <title>Title of document</title> + </head> + <body>hello world</body> +</html> diff --git a/filter/qa/unit/textfilterdetect.cxx b/filter/qa/unit/textfilterdetect.cxx new file mode 100644 index 000000000000..272ba85b330b --- /dev/null +++ b/filter/qa/unit/textfilterdetect.cxx @@ -0,0 +1,63 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <com/sun/star/document/XExtendedFilterDetection.hpp> +#include <com/sun/star/io/XInputStream.hpp> + +#include <comphelper/processfactory.hxx> +#include <comphelper/propertyvalue.hxx> +#include <test/bootstrapfixture.hxx> +#include <unotools/mediadescriptor.hxx> +#include <unotools/streamwrap.hxx> + +using namespace com::sun::star; + +namespace +{ +/// Test class for PlainTextFilterDetect. 
+class TextFilterDetectTest : public test::BootstrapFixture +{ +public: + void testTdf114428(); + + CPPUNIT_TEST_SUITE(TextFilterDetectTest); + CPPUNIT_TEST(testTdf114428); + CPPUNIT_TEST_SUITE_END(); +}; + +char const DATA_DIRECTORY[] = "/filter/qa/unit/data/"; + +void TextFilterDetectTest::testTdf114428() +{ + uno::Reference<uno::XComponentContext> xComponentContext + = comphelper::getComponentContext(getMultiServiceFactory()); + uno::Reference<document::XExtendedFilterDetection> xDetect( + getMultiServiceFactory()->createInstance("com.sun.star.comp.filters.PlainTextFilterDetect"), + uno::UNO_QUERY); + OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf114428.xhtml"; + SvFileStream aStream(aURL, StreamMode::READ); + uno::Reference<io::XInputStream> xStream(new utl::OStreamWrapper(aStream)); + uno::Sequence<beans::PropertyValue> aDescriptor + = { comphelper::makePropertyValue("DocumentService", + OUString("com.sun.star.text.TextDocument")), + comphelper::makePropertyValue("InputStream", xStream), + comphelper::makePropertyValue("TypeName", OUString("generic_HTML")) }; + xDetect->detect(aDescriptor); + utl::MediaDescriptor aMediaDesc(aDescriptor); + OUString aFilterName = aMediaDesc.getUnpackedValueOrDefault("FilterName", OUString()); + // This was empty, XML declaration caused HTML detect to not handle XHTML. + CPPUNIT_ASSERT_EQUAL(OUString("HTML (StarWriter)"), aFilterName); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(TextFilterDetectTest); +} + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx index ee93d28ddbed..3228ca53f62f 100644 --- a/filter/source/textfilterdetect/filterdetect.cxx +++ b/filter/source/textfilterdetect/filterdetect.cxx @@ -58,6 +58,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) // Now check whether the stream begins with a known HTML tag. 
enum DetectPhase { BeforeTag, TagOpened, InTagName }; DetectPhase dp = BeforeTag; + /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration. + enum DeclarationPhase + { + BeforeDeclaration, + DeclarationOpened + }; + DeclarationPhase eDeclaration = BeforeDeclaration; const char* pHeader = sHeader.getStr(); const int nLength = sHeader.getLength(); @@ -66,7 +73,8 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) for ( i = 0; i < nLength; ++i, ++pHeader ) { char c = *pHeader; - if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' ) + if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f') + && eDeclaration == BeforeDeclaration) { if ( dp == TagOpened ) return false; // Invalid: Should start with a tag name @@ -84,6 +92,11 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) { if ( dp == InTagName ) break; // End of tag name reached + else if (eDeclaration == DeclarationOpened) + { + dp = BeforeTag; + eDeclaration = BeforeDeclaration; + } else return false; // Invalid: Empty tag or before '<' } @@ -100,8 +113,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream ) return false; // Invalid: Should start with a tag else if ( dp == TagOpened ) { - nStartOfTagIndex = i; - dp = InTagName; + if (c == '?' && eDeclaration == BeforeDeclaration) + eDeclaration = DeclarationOpened; + else if (eDeclaration == BeforeDeclaration) + { + nStartOfTagIndex = i; + dp = InTagName; + } } } } |