summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMiklos Vajna <vmiklos@collabora.co.uk>2017-12-13 09:49:41 +0100
committerMiklos Vajna <vmiklos@collabora.co.uk>2017-12-13 14:44:15 +0100
commit4af729f31c64c09c76ea8bcfa5067092571b92de (patch)
tree555e540d0d7564218fc9f31486a59e8119e3cd50
parent1d6b85f85925c3b4d2d2bb8eaf237b10bb8f7d60 (diff)
tdf#114428 filter: recognize XHTML with XML declaration as HTML
The problem was the additional <?xml version="1.0" encoding="utf-8"?> XML declaration before the usual <!DOCTYPE html ... line; HTML detection now simply skips over that declaration. Change-Id: I294aae5504b40b42f76da00fef645d0d89009da9 Reviewed-on: https://gerrit.libreoffice.org/46324 Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk> Tested-by: Jenkins <ci@libreoffice.org>
-rw-r--r--filter/CppunitTest_filter_textfilterdetect.mk46
-rw-r--r--filter/Module_filter.mk1
-rw-r--r--filter/qa/unit/data/tdf114428.xhtml9
-rw-r--r--filter/qa/unit/textfilterdetect.cxx63
-rw-r--r--filter/source/textfilterdetect/filterdetect.cxx24
5 files changed, 140 insertions, 3 deletions
diff --git a/filter/CppunitTest_filter_textfilterdetect.mk b/filter/CppunitTest_filter_textfilterdetect.mk
new file mode 100644
index 000000000000..dfcaee9ce16a
--- /dev/null
+++ b/filter/CppunitTest_filter_textfilterdetect.mk
@@ -0,0 +1,46 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_CppunitTest_CppunitTest,filter_textfilterdetect))
+
+$(eval $(call gb_CppunitTest_use_api,filter_textfilterdetect,\
+ offapi \
+ udkapi \
+))
+
+$(eval $(call gb_CppunitTest_use_libraries,filter_textfilterdetect, \
+ comphelper \
+ cppu \
+ cppuhelper \
+ sal \
+ test \
+ textfd \
+ tl \
+ unotest \
+ utl \
+))
+
+$(eval $(call gb_CppunitTest_add_exception_objects,filter_textfilterdetect, \
+ filter/qa/unit/textfilterdetect \
+))
+
+$(eval $(call gb_CppunitTest_use_ure,filter_textfilterdetect))
+
+$(eval $(call gb_CppunitTest_use_vcl,filter_textfilterdetect))
+
+$(eval $(call gb_CppunitTest_use_components,filter_textfilterdetect,\
+ configmgr/source/configmgr \
+ filter/source/textfilterdetect/textfd \
+ ucb/source/core/ucb1 \
+ ucb/source/ucp/file/ucpfile1 \
+))
+
+$(eval $(call gb_CppunitTest_use_configuration,filter_textfilterdetect))
+
+# vim: set noet sw=4 ts=4:
diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk
index a7834b6dc631..08aa0f2d395b 100644
--- a/filter/Module_filter.mk
+++ b/filter/Module_filter.mk
@@ -57,6 +57,7 @@ $(eval $(call gb_Module_add_check_targets,filter,\
CppunitTest_filter_xslt \
CppunitTest_filter_priority \
CppunitTest_filter_msfilter \
+ CppunitTest_filter_textfilterdetect \
))
ifneq ($(DISABLE_CVE_TESTS),TRUE)
diff --git a/filter/qa/unit/data/tdf114428.xhtml b/filter/qa/unit/data/tdf114428.xhtml
new file mode 100644
index 000000000000..f08f0fa4a028
--- /dev/null
+++ b/filter/qa/unit/data/tdf114428.xhtml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>Title of document</title>
+ </head>
+ <body>hello world</body>
+</html>
diff --git a/filter/qa/unit/textfilterdetect.cxx b/filter/qa/unit/textfilterdetect.cxx
new file mode 100644
index 000000000000..272ba85b330b
--- /dev/null
+++ b/filter/qa/unit/textfilterdetect.cxx
@@ -0,0 +1,63 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <com/sun/star/document/XExtendedFilterDetection.hpp>
+#include <com/sun/star/io/XInputStream.hpp>
+
+#include <comphelper/processfactory.hxx>
+#include <comphelper/propertyvalue.hxx>
+#include <test/bootstrapfixture.hxx>
+#include <unotools/mediadescriptor.hxx>
+#include <unotools/streamwrap.hxx>
+
+using namespace com::sun::star;
+
+namespace
+{
+/// CppUnit fixture exercising the PlainTextFilterDetect UNO service; currently holds one regression test (tdf#114428).
+class TextFilterDetectTest : public test::BootstrapFixture
+{
+public:
+ void testTdf114428(); // XHTML preceded by an XML declaration must be detected as HTML
+
+ CPPUNIT_TEST_SUITE(TextFilterDetectTest);
+ CPPUNIT_TEST(testTdf114428);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+char const DATA_DIRECTORY[] = "/filter/qa/unit/data/"; // passed to getURLFromSrc() to locate the test documents
+
+void TextFilterDetectTest::testTdf114428() // regression test: run detection on tdf114428.xhtml and check the chosen filter
+{
+ uno::Reference<uno::XComponentContext> xComponentContext
+ = comphelper::getComponentContext(getMultiServiceFactory()); // NOTE(review): not referenced again in this test
+ uno::Reference<document::XExtendedFilterDetection> xDetect(
+ getMultiServiceFactory()->createInstance("com.sun.star.comp.filters.PlainTextFilterDetect"),
+ uno::UNO_QUERY);
+ OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf114428.xhtml";
+ SvFileStream aStream(aURL, StreamMode::READ);
+ uno::Reference<io::XInputStream> xStream(new utl::OStreamWrapper(aStream)); // wrapped so it can be handed over as "InputStream" below
+ uno::Sequence<beans::PropertyValue> aDescriptor
+ = { comphelper::makePropertyValue("DocumentService",
+ OUString("com.sun.star.text.TextDocument")),
+ comphelper::makePropertyValue("InputStream", xStream),
+ comphelper::makePropertyValue("TypeName", OUString("generic_HTML")) };
+ xDetect->detect(aDescriptor); // the test relies on detect() writing "FilterName" into aDescriptor
+ utl::MediaDescriptor aMediaDesc(aDescriptor);
+ OUString aFilterName = aMediaDesc.getUnpackedValueOrDefault("FilterName", OUString());
+ // Without the fix the filter name stayed empty: the leading XML declaration made HTML detection reject the XHTML file.
+ CPPUNIT_ASSERT_EQUAL(OUString("HTML (StarWriter)"), aFilterName);
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TextFilterDetectTest);
+}
+
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/textfilterdetect/filterdetect.cxx b/filter/source/textfilterdetect/filterdetect.cxx
index d2f8fb0bd54c..2e3e08028683 100644
--- a/filter/source/textfilterdetect/filterdetect.cxx
+++ b/filter/source/textfilterdetect/filterdetect.cxx
@@ -58,6 +58,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
// Now check whether the stream begins with a known HTML tag.
enum DetectPhase { BeforeTag, TagOpened, InTagName };
DetectPhase dp = BeforeTag;
+ /// Tracks a "<?...>" declaration: '?' right after '<' moves BeforeDeclaration to DeclarationOpened, the next '>' moves back to BeforeDeclaration.
+ enum DeclarationPhase
+ {
+ BeforeDeclaration,
+ DeclarationOpened
+ };
+ DeclarationPhase eDeclaration = BeforeDeclaration;
const char* pHeader = sHeader.getStr();
const int nLength = sHeader.getLength();
@@ -66,7 +73,8 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
for ( i = 0; i < nLength; ++i, ++pHeader )
{
char c = *pHeader;
- if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
+ if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
+ && eDeclaration == BeforeDeclaration)
{
if ( dp == TagOpened )
return false; // Invalid: Should start with a tag name
@@ -84,6 +92,11 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
{
if ( dp == InTagName )
break; // End of tag name reached
+ else if (eDeclaration == DeclarationOpened)
+ {
+ dp = BeforeTag;
+ eDeclaration = BeforeDeclaration;
+ }
else
return false; // Invalid: Empty tag or before '<'
}
@@ -100,8 +113,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
return false; // Invalid: Should start with a tag
else if ( dp == TagOpened )
{
- nStartOfTagIndex = i;
- dp = InTagName;
+ if (c == '?' && eDeclaration == BeforeDeclaration)
+ eDeclaration = DeclarationOpened;
+ else if (eDeclaration == BeforeDeclaration)
+ {
+ nStartOfTagIndex = i;
+ dp = InTagName;
+ }
}
}
}