summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Maxim Monastirsky <momonasmon@gmail.com> 2014-01-20 10:17:05 +0200
committer Kohei Yoshida <libreoffice@kohei.us> 2014-01-23 14:49:22 +0000
commit cc2893834d8ac699dbb38b152f21f17f3debb06b (patch)
tree f6872c1bb50bff0ada758ba68f2a6867f6b9c053
parent 6063555744ed89d8a757b667cddcdd4357839466 (diff)
related: fdo#73682 Introduce HTML detection service
Change-Id: I66bb579019ce8411b821c623955a454fd81cf811 Reviewed-on: https://gerrit.libreoffice.org/7600 Reviewed-by: Kohei Yoshida <libreoffice@kohei.us> Tested-by: Kohei Yoshida <libreoffice@kohei.us>
-rw-r--r-- Repository.mk 1
-rw-r--r-- filter/Library_htmlfd.mk 36
-rw-r--r-- filter/Module_filter.mk 1
-rw-r--r-- filter/source/config/fragments/types/generic_HTML.xcu 2
-rw-r--r-- filter/source/htmlfilterdetect/fdcomp.cxx 36
-rw-r--r-- filter/source/htmlfilterdetect/filterdetect.cxx 232
-rw-r--r-- filter/source/htmlfilterdetect/filterdetect.hxx 64
-rw-r--r-- filter/source/htmlfilterdetect/htmlfd.component 15
-rwxr-xr-x postprocess/Rdb_services.mk 1
-rw-r--r-- solenv/gbuild/extensions/pre_MergedLibsList.mk 1
10 files changed, 388 insertions, 1 deletions
diff --git a/Repository.mk b/Repository.mk
index 6c4d488a0d64..7066001679bc 100644
--- a/Repository.mk
+++ b/Repository.mk
@@ -270,6 +270,7 @@ $(eval $(call gb_Helper_register_libraries_for_install,OOOLIBS,ooo, \
$(if $(ENABLE_DIRECTX),gdipluscanvas) \
guesslang \
$(if $(filter DESKTOP,$(BUILD_TYPE)),helplinker) \
+ htmlfd \
i18npool \
i18nsearch \
hyphen \
diff --git a/filter/Library_htmlfd.mk b/filter/Library_htmlfd.mk
new file mode 100644
index 000000000000..a147509e899e
--- /dev/null
+++ b/filter/Library_htmlfd.mk
@@ -0,0 +1,36 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#*************************************************************************
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+#*************************************************************************
+
+$(eval $(call gb_Library_Library,htmlfd))
+
+$(eval $(call gb_Library_set_componentfile,htmlfd,filter/source/htmlfilterdetect/htmlfd))
+
+# boost_headers must be registered for htmlfd itself (not xmlfd):
+# filterdetect.cxx includes <boost/scoped_ptr.hpp>.
+$(eval $(call gb_Library_use_external,htmlfd,boost_headers))
+
+$(eval $(call gb_Library_use_sdk_api,htmlfd))
+
+$(eval $(call gb_Library_use_libraries,htmlfd,\
+ ucbhelper \
+ cppuhelper \
+ cppu \
+ sal \
+ tl \
+ utl \
+ svt \
+ $(gb_UWINAPI) \
+))
+
+$(eval $(call gb_Library_add_exception_objects,htmlfd,\
+ filter/source/htmlfilterdetect/fdcomp \
+ filter/source/htmlfilterdetect/filterdetect \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/filter/Module_filter.mk b/filter/Module_filter.mk
index 403184a93feb..58307b42a7e9 100644
--- a/filter/Module_filter.mk
+++ b/filter/Module_filter.mk
@@ -34,6 +34,7 @@ $(eval $(call gb_Module_add_targets,filter,\
Library_exp) \
Library_filterconfig \
Library_flash \
+ Library_htmlfd \
Library_icd \
Library_icg \
Library_idx \
diff --git a/filter/source/config/fragments/types/generic_HTML.xcu b/filter/source/config/fragments/types/generic_HTML.xcu
index ede6d2b8fefb..58ffedc85f1e 100644
--- a/filter/source/config/fragments/types/generic_HTML.xcu
+++ b/filter/source/config/fragments/types/generic_HTML.xcu
@@ -16,7 +16,7 @@
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
-->
<node oor:name="generic_HTML" oor:op="replace" >
- <prop oor:name="DetectService"><value>com.sun.star.text.FormatDetector</value></prop>
+ <prop oor:name="DetectService"><value>com.sun.star.comp.filters.HtmlFilterDetect</value></prop>
<prop oor:name="URLPattern"><value>private:factory/swriter/web*</value></prop>
<prop oor:name="Extensions"><value>html htm</value></prop>
<prop oor:name="MediaType"><value>text/html</value></prop>
diff --git a/filter/source/htmlfilterdetect/fdcomp.cxx b/filter/source/htmlfilterdetect/fdcomp.cxx
new file mode 100644
index 000000000000..40360e923c33
--- /dev/null
+++ b/filter/source/htmlfilterdetect/fdcomp.cxx
@@ -0,0 +1,36 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <sal/config.h>
+
+#include <cppuhelper/factory.hxx>
+#include <cppuhelper/implementationentry.hxx>
+#include <sal/types.h>
+
+#include "filterdetect.hxx"
+
+namespace {
+
+// Component entry table passed to cppu::component_getFactoryHelper() by
+// htmlfd_component_getFactory(). It holds one entry for HtmlFilterDetect,
+// followed by an all-zero row (presumably the end-of-table marker the
+// helper expects -- confirm against cppuhelper).
+static cppu::ImplementationEntry const services[] = {
+ { &HtmlFilterDetect_createInstance, &HtmlFilterDetect_getImplementationName,
+ &HtmlFilterDetect_getSupportedServiceNames,
+ &cppu::createSingleComponentFactory, 0, 0 },
+ { 0, 0, 0, 0, 0, 0 }
+};
+
+}
+
+// Exported C entry point of the htmlfd library. The "htmlfd" prefix matches
+// the prefix attribute declared in htmlfd.component. The call is forwarded
+// unchanged to cppu::component_getFactoryHelper() together with the
+// services table defined in this file.
+extern "C" SAL_DLLPUBLIC_EXPORT void * SAL_CALL htmlfd_component_getFactory(
+ char const * pImplName, void * pServiceManager, void * pRegistryKey)
+{
+ return cppu::component_getFactoryHelper(
+ pImplName, pServiceManager, pRegistryKey, services);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/filterdetect.cxx b/filter/source/htmlfilterdetect/filterdetect.cxx
new file mode 100644
index 000000000000..140912d37379
--- /dev/null
+++ b/filter/source/htmlfilterdetect/filterdetect.cxx
@@ -0,0 +1,232 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "filterdetect.hxx"
+
+#include <svtools/htmltokn.h>
+#include <tools/urlobj.hxx>
+#include <ucbhelper/content.hxx>
+#include <unotools/mediadescriptor.hxx>
+#include <unotools/ucbstreamhelper.hxx>
+
+#include <com/sun/star/io/XInputStream.hpp>
+#include <cppuhelper/supportsservice.hxx>
+
+#include <boost/scoped_ptr.hpp>
+
+using com::sun::star::io::XInputStream;
+using com::sun::star::uno::Sequence;
+using com::sun::star::uno::Reference;
+using com::sun::star::uno::Any;
+using com::sun::star::uno::XComponentContext;
+using com::sun::star::uno::XInterface;
+using com::sun::star::uno::Exception;
+using com::sun::star::uno::RuntimeException;
+using com::sun::star::ucb::XCommandEnvironment;
+
+using namespace com::sun::star;
+using namespace com::sun::star::beans;
+
+namespace {
+
+// Scanner state used by isHTMLStream() while it looks for the first tag.
+enum DetectPhase {
+ BeforeTag, // no '<' has been seen yet
+ TagOpened, // '<' has been seen, the tag name has not started
+ InTagName // at least one character of the tag name has been read
+};
+
+/**
+ * Decide whether the beginning of a stream looks like HTML.
+ *
+ * The header is accepted if it starts with "<!" as its very first two
+ * characters, or if - after optional leading whitespace - it starts with
+ * a tag whose name is known to GetHTMLToken(). detect() passes the header
+ * already converted to lower case.
+ *
+ * @param aStreamHeader  leading part of the stream to inspect
+ * @return true if the header is accepted as HTML, false otherwise
+ */
+bool isHTMLStream(const OString& aStreamHeader)
+{
+    const char* pHeader = aStreamHeader.getStr();
+    const int nLength = aStreamHeader.getLength();
+    int nStartOfTagIndex = 0;
+    int i = 0;
+
+    DetectPhase dp = BeforeTag;
+
+    for ( i = 0; i < nLength; ++i, ++pHeader )
+    {
+        char c = *pHeader;
+        // '\r' and '\f' are whitespace too: without them a file with CRLF
+        // line ends ("<html\r\n") would produce the tag name "html\r", and
+        // a leading "\r\n" would be rejected as text before the first tag.
+        if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
+        {
+            if ( dp == TagOpened )
+                return false; // Invalid: Should start with a tag name
+            else if ( dp == InTagName )
+                break; // End of tag name reached
+        }
+        else if ( c == '<' )
+        {
+            if ( dp == BeforeTag )
+                dp = TagOpened;
+            else
+                return false; // Invalid: Nested '<'
+        }
+        else if ( c == '>' )
+        {
+            if ( dp == InTagName )
+                break; // End of tag name reached
+            else
+                return false; // Invalid: Empty tag or before '<'
+        }
+        else if ( c == '!' )
+        {
+            if ( i == 1 && dp == TagOpened )
+                return true; // "<!" at the very beginning of the file
+            else
+                return false; // Invalid: '!' before '<' or inside tag name
+        }
+        else
+        {
+            if ( dp == BeforeTag )
+                return false; // Invalid: Should start with a tag
+            else if ( dp == TagOpened )
+            {
+                nStartOfTagIndex = i;
+                dp = InTagName;
+            }
+        }
+    }
+
+    // No tag name was found at all (empty or whitespace-only header, or a
+    // header that ends right after '<'): there is nothing to look up.
+    if ( dp != InTagName )
+        return false;
+
+    // The string following '<' has to be a known HTML token.
+    return GetHTMLToken( OStringToOUString( aStreamHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex ),
+                                            RTL_TEXTENCODING_ASCII_US ) ) != 0;
+}
+
+}
+
+// XExtendedFilterDetection
+//
+// Reads the beginning of the document and returns "generic_HTML" if
+// isHTMLStream() accepts it; otherwise returns an empty string.
+//
+// lDescriptor is read for the URL, InputStream and DocumentService
+// properties. It is modified in one case only: the content is HTML, no
+// DocumentService was supplied and the URL extension is "xls". Then
+// DocumentService is set to "com.sun.star.sheet.SpreadsheetDocument",
+// appending the property if it was not present.
+//
+// Exceptions derived from com::sun::star::uno::Exception raised while
+// opening or reading the stream are caught here and reported as failure.
+OUString SAL_CALL HtmlFilterDetect::detect(Sequence<PropertyValue>& lDescriptor)
+ throw (RuntimeException)
+{
+ OUString sUrl;
+ OUString sDocService;
+ OString resultString;
+ Reference<XInputStream> xInStream;
+
+ const PropertyValue *pValue = lDescriptor.getConstArray();
+ sal_Int32 nLength = lDescriptor.getLength();
+ // Index of the DocumentService property; stays at nLength if it is absent.
+ sal_Int32 location = nLength;
+
+ for ( sal_Int32 i = 0; i < nLength; ++i )
+ {
+ if ( pValue[i].Name == utl::MediaDescriptor::PROP_URL() )
+ pValue[i].Value >>= sUrl;
+ else if ( pValue[i].Name == utl::MediaDescriptor::PROP_INPUTSTREAM() )
+ pValue[i].Value >>= xInStream;
+ else if ( pValue[i].Name == utl::MediaDescriptor::PROP_DOCUMENTSERVICE() )
+ {
+ location = i;
+ pValue[i].Value >>= sDocService;
+ }
+ }
+
+ try
+ {
+ // No stream in the descriptor: open one from the URL instead.
+ if ( !xInStream.is() )
+ {
+ ucbhelper::Content aContent( sUrl, Reference<XCommandEnvironment>(), mxCtx );
+ xInStream = aContent.openStream();
+ if ( !xInStream.is() )
+ return OUString();
+ }
+
+ boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
+ if ( !pInStream || pInStream->GetError() )
+ return OUString();
+
+ // NOTE(review): the position after StartReadingUnicodeText() is
+ // presumably the length of a skipped byte-order mark -- confirm.
+ pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+ sal_Size nUniPos = pInStream->Tell();
+
+ // Number of units requested from the stream for the header check.
+ const sal_uInt16 nSize = 4096;
+
+ if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
+ resultString = read_uInt8s_ToOString( *pInStream, nSize );
+ else // UTF-16
+ resultString = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
+
+ // The header is lower-cased before it is handed to isHTMLStream().
+ if ( isHTMLStream( resultString.toAsciiLowerCase() ) )
+ {
+ // Some Apps/Web services use ".xls" extension to indicate that
+ // the given file should be opened by a spreadsheet software
+ if ( sDocService.isEmpty() )
+ {
+ INetURLObject aParser( sUrl );
+ OUString aExt = aParser.getExtension( INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET );
+ aExt = aExt.toAsciiLowerCase();
+
+ if ( aExt == "xls" )
+ {
+ // DocumentService was not in the descriptor: append a new entry.
+ if ( location == lDescriptor.getLength() )
+ {
+ lDescriptor.realloc( location + 1 );
+ lDescriptor[location].Name = utl::MediaDescriptor::PROP_DOCUMENTSERVICE();
+ }
+ lDescriptor[location].Value <<= OUString( "com.sun.star.sheet.SpreadsheetDocument" );
+ }
+ }
+ return OUString( "generic_HTML" );
+ }
+ }
+ catch (const Exception &)
+ {
+ // The exception is not propagated; execution falls through to the
+ // failure return below.
+ OSL_FAIL( "An Exception occurred while opening File stream" );
+ }
+
+ return OUString(); // Failed
+}
+
+// XInitialization
+
+// No-op: the arguments are ignored and no state is changed.
+void SAL_CALL HtmlFilterDetect::initialize(const Sequence<Any>& /*aArguments*/)
+ throw (Exception, RuntimeException)
+{
+}
+
+// Returns the implementation name of this component. The same name is
+// declared in htmlfd.component and used as DetectService in generic_HTML.xcu.
+OUString HtmlFilterDetect_getImplementationName()
+{
+ return OUString( "com.sun.star.comp.filters.HtmlFilterDetect" );
+}
+
+// Returns the two service names reported by this component: the
+// ExtendedTypeDetection service and the component's own implementation name.
+Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames()
+{
+ Sequence<OUString> aRet(2);
+ OUString* pArray = aRet.getArray();
+ pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
+ pArray[1] = "com.sun.star.comp.filters.HtmlFilterDetect";
+ return aRet;
+}
+
+// Factory function referenced by the component entry table in fdcomp.cxx:
+// creates a new HtmlFilterDetect that keeps the given component context.
+Reference<XInterface> HtmlFilterDetect_createInstance(const Reference<XComponentContext>& rCtx)
+{
+    // Upcast to OWeakObject as before, but with a named cast, so that only
+    // a valid base-class conversion can compile here.
+    return static_cast<cppu::OWeakObject*>( new HtmlFilterDetect( rCtx ) );
+}
+
+// XServiceInfo
+
+// Forwards to the free function HtmlFilterDetect_getImplementationName().
+OUString SAL_CALL HtmlFilterDetect::getImplementationName()
+ throw (RuntimeException)
+{
+ return HtmlFilterDetect_getImplementationName();
+}
+
+// Delegates the check to cppu::supportsService(), passing this object and
+// the requested service name.
+sal_Bool SAL_CALL HtmlFilterDetect::supportsService(const OUString& rServiceName)
+ throw (RuntimeException)
+{
+ return cppu::supportsService( this, rServiceName );
+}
+
+// Forwards to the free function HtmlFilterDetect_getSupportedServiceNames().
+Sequence<OUString> SAL_CALL HtmlFilterDetect::getSupportedServiceNames()
+ throw (RuntimeException)
+{
+ return HtmlFilterDetect_getSupportedServiceNames();
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/filterdetect.hxx b/filter/source/htmlfilterdetect/filterdetect.hxx
new file mode 100644
index 000000000000..631d4d3715e5
--- /dev/null
+++ b/filter/source/htmlfilterdetect/filterdetect.hxx
@@ -0,0 +1,64 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
+#define INCLUDED_FILTER_SOURCE_HTMLFILTERDETECT_FILTERDETECT_HXX
+
+#include <com/sun/star/document/XExtendedFilterDetection.hpp>
+#include <com/sun/star/lang/XInitialization.hpp>
+#include <com/sun/star/lang/XServiceInfo.hpp>
+#include <com/sun/star/uno/XComponentContext.hpp>
+
+#include <cppuhelper/implbase3.hxx>
+
+// UNO component that implements XExtendedFilterDetection, XInitialization
+// and XServiceInfo. Its detect() method reports "generic_HTML" for
+// documents whose beginning looks like HTML.
+class HtmlFilterDetect : public cppu::WeakImplHelper3<
+ com::sun::star::document::XExtendedFilterDetection,
+ com::sun::star::lang::XInitialization,
+ com::sun::star::lang::XServiceInfo>
+{
+ // Component context given at construction; detect() passes it to
+ // ucbhelper::Content when it has to open the document from its URL.
+ com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext> mxCtx;
+
+public:
+
+ // Stores the component context; does nothing else.
+ HtmlFilterDetect(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& xCtx) :
+ mxCtx(xCtx) {}
+ virtual ~HtmlFilterDetect() {}
+
+ // XExtendedFilterDetection
+
+ // Returns "generic_HTML" if the document is recognised as HTML, an empty
+ // string otherwise. May set the DocumentService property in lDescriptor.
+ virtual OUString SAL_CALL detect(com::sun::star::uno::Sequence<com::sun::star::beans::PropertyValue>& lDescriptor)
+ throw (com::sun::star::uno::RuntimeException);
+
+ // XInitialization
+
+ // Implemented as a no-op; the arguments are ignored.
+ virtual void SAL_CALL initialize(const ::com::sun::star::uno::Sequence<com::sun::star::uno::Any>& aArguments)
+ throw (com::sun::star::uno::Exception, com::sun::star::uno::RuntimeException);
+
+ // XServiceInfo
+
+ virtual OUString SAL_CALL getImplementationName()
+ throw (com::sun::star::uno::RuntimeException);
+
+ virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName)
+ throw (com::sun::star::uno::RuntimeException);
+
+ virtual com::sun::star::uno::Sequence<OUString> SAL_CALL getSupportedServiceNames()
+ throw (com::sun::star::uno::RuntimeException);
+};
+
+// Free functions referenced by the component entry table in fdcomp.cxx.
+
+// Implementation name of the component.
+OUString HtmlFilterDetect_getImplementationName();
+
+// Service names supported by the component.
+com::sun::star::uno::Sequence<OUString> HtmlFilterDetect_getSupportedServiceNames();
+
+// Creates a new HtmlFilterDetect instance for the given component context.
+com::sun::star::uno::Reference<com::sun::star::uno::XInterface>
+HtmlFilterDetect_createInstance(const com::sun::star::uno::Reference<com::sun::star::uno::XComponentContext>& rCtx);
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/filter/source/htmlfilterdetect/htmlfd.component b/filter/source/htmlfilterdetect/htmlfd.component
new file mode 100644
index 000000000000..32c41b8bef26
--- /dev/null
+++ b/filter/source/htmlfilterdetect/htmlfd.component
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ -->
+
+<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
+ prefix="htmlfd" xmlns="http://openoffice.org/2010/uno-components">
+ <implementation name="com.sun.star.comp.filters.HtmlFilterDetect">
+ <service name="com.sun.star.document.ExtendedTypeDetection"/>
+ </implementation>
+</component>
diff --git a/postprocess/Rdb_services.mk b/postprocess/Rdb_services.mk
index cd8e3c92bae4..b0c8a10d29af 100755
--- a/postprocess/Rdb_services.mk
+++ b/postprocess/Rdb_services.mk
@@ -29,6 +29,7 @@ $(eval $(call gb_Rdb_add_components,services,\
filter/source/config/cache/filterconfig1 \
filter/source/flash/flash \
filter/source/graphic/graphicfilter \
+ filter/source/htmlfilterdetect/htmlfd \
filter/source/msfilter/msfilter \
filter/source/odfflatxml/odfflatxml \
filter/source/pdf/pdffilter \
diff --git a/solenv/gbuild/extensions/pre_MergedLibsList.mk b/solenv/gbuild/extensions/pre_MergedLibsList.mk
index 9cc207915e11..ba7ad86aeaff 100644
--- a/solenv/gbuild/extensions/pre_MergedLibsList.mk
+++ b/solenv/gbuild/extensions/pre_MergedLibsList.mk
@@ -46,6 +46,7 @@ gb_EXTRAMERGEDLIBS := \
graphicfilter \
guesslang \
$(if $(ENABLE_JAVA),hsqldb) \
+ htmlfd \
hyphen \
icd \
icg \