diff options
author | Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk> | 2021-05-31 22:55:02 +0900 |
---|---|---|
committer | Tomaž Vajngerl <quikee@gmail.com> | 2021-06-03 23:05:45 +0200 |
commit | ed984f8e68996bcbdcf590814c41479ba76907f2 (patch) | |
tree | 61daaf9803f65a437d91a0ac0a9a9bfc0c8ff566 /sw/source | |
parent | abcfae0b64e6733b1de469414fe3c73f94f9a39a (diff) |
indexing: start of Indexing Export impl. based on (X)HTML export
This adds an indexing output/export to the HTML code, which will output
an XML document that will be used for indexing of the document. It
is based on HTML to reuse the traversal through the doc. model.
It is enabled by setting the "IndexingOutput" export parameter.
This commit only adds the groundwork; the output is still
more or less the same as HTML, and the follow-up commits will
add more indexing-specific changes. The only change is to use
"indexing" as the top-level element, and the document is valid
XML (it has the XML header).
Also add a basic test that the indexing output/export works and
can be parsed as XML with "indexing" as the top-level element.
Change-Id: I153b1a70da7cbcf0d33b8610d962e6b7ae23ad23
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/116631
Tested-by: Jenkins
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
Diffstat (limited to 'sw/source')
-rw-r--r-- | sw/source/filter/html/wrthtml.cxx | 31 | ||||
-rw-r--r-- | sw/source/filter/html/wrthtml.hxx | 5 |
2 files changed, 33 insertions, 3 deletions
diff --git a/sw/source/filter/html/wrthtml.cxx b/sw/source/filter/html/wrthtml.cxx index 63857cf65c73..dfd03a5360f2 100644 --- a/sw/source/filter/html/wrthtml.cxx +++ b/sw/source/filter/html/wrthtml.cxx @@ -79,6 +79,7 @@ #include <xmloff/odffields.hxx> #include <tools/urlobj.hxx> #include <osl/file.hxx> +#include <tools/stream.hxx> #include <comphelper/scopeguard.hxx> #include <unotools/tempfile.hxx> #include <comphelper/sequenceashashmap.hxx> @@ -150,6 +151,7 @@ SwHTMLWriter::SwHTMLWriter( const OUString& rBaseURL, const OUString& rFilterOpt , mbSkipImages(false) , mbSkipHeaderFooter(false) , mbEmbedImages(false) + , mbIndexingOutput(false) , m_bCfgPrintLayout( false ) , m_bParaDotLeaders( false ) { @@ -222,6 +224,13 @@ void SwHTMLWriter::SetupFilterOptions(const OUString& rFilterOptions) { mbEmbedImages = true; } + else if (rFilterOptions == "IndexingOutput") + { + mbIndexingOutput = true; + mbSkipHeaderFooter = true; + mbSkipImages = true; + mbXHTML = true; + } const uno::Sequence<OUString> aOptionSeq = comphelper::string::convertCommaSeparated(rFilterOptions); static const OUStringLiteral aXhtmlNsKey(u"xhtmlns="); @@ -265,6 +274,8 @@ ErrCode SwHTMLWriter::WriteStream() } comphelper::ScopeGuard g([this, pOldPasteStream] { this->SetStream(pOldPasteStream); }); + HtmlWriter aHtmlWriter(Strm(), GetNamespace()); + SvxHtmlOptions& rHtmlOptions = SvxHtmlOptions::Get(); // font heights 1-7 @@ -449,7 +460,7 @@ ErrCode SwHTMLWriter::WriteStream() CollectLinkTargets(); sal_uInt16 nHeaderAttrs = 0; - m_pCurrPageDesc = MakeHeader( nHeaderAttrs ); + m_pCurrPageDesc = MakeHeader(aHtmlWriter, nHeaderAttrs); m_bLFPossible = true; @@ -501,8 +512,14 @@ ErrCode SwHTMLWriter::WriteStream() HTMLOutFuncs::Out_AsciiTag( Strm(), OString(GetNamespace() + OOO_STRING_SVTOOLS_HTML_html), false ); } else if (mbReqIF) + { // ReqIF: end xhtml.BlkStruct.class. 
HTMLOutFuncs::Out_AsciiTag(Strm(), OString(GetNamespace() + OOO_STRING_SVTOOLS_HTML_division), false); + } + else if (mbIndexingOutput) + { + aHtmlWriter.end("indexing"); + } // delete the table with floating frames OSL_ENSURE( !m_pHTMLPosFlyFrames, "Were not all frames output?" ); @@ -983,7 +1000,7 @@ sal_uInt16 SwHTMLWriter::OutHeaderAttrs() return nAttrs; } -const SwPageDesc *SwHTMLWriter::MakeHeader( sal_uInt16 &rHeaderAttrs ) +const SwPageDesc* SwHTMLWriter::MakeHeader(HtmlWriter & rHtmlWriter, sal_uInt16 &rHeaderAttrs ) { OStringBuffer sOut; if (!mbSkipHeaderFooter) @@ -999,6 +1016,7 @@ const SwPageDesc *SwHTMLWriter::MakeHeader( sal_uInt16 &rHeaderAttrs ) HTMLOutFuncs::Out_AsciiTag( Strm(), OString(GetNamespace() + OOO_STRING_SVTOOLS_HTML_html) ); OutNewLine(); + HTMLOutFuncs::Out_AsciiTag( Strm(), OString(GetNamespace() + OOO_STRING_SVTOOLS_HTML_head) ); IncIndentLevel(); // indent content of <HEAD> @@ -1025,6 +1043,14 @@ const SwPageDesc *SwHTMLWriter::MakeHeader( sal_uInt16 &rHeaderAttrs ) OutFootEndNoteInfo(); } + else if (mbIndexingOutput) + { + Strm().WriteCharPtr("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + Strm().WriteCharPtr(SAL_NEWLINE_STRING); + rHtmlWriter.start("indexing"); + rHtmlWriter.characters(""); + Strm().WriteCharPtr(SAL_NEWLINE_STRING); + } const SwPageDesc *pPageDesc = nullptr; @@ -1068,6 +1094,7 @@ const SwPageDesc *SwHTMLWriter::MakeHeader( sal_uInt16 &rHeaderAttrs ) DecIndentLevel(); // indent content of <HEAD> OutNewLine(); + HTMLOutFuncs::Out_AsciiTag( Strm(), OString(GetNamespace() + OOO_STRING_SVTOOLS_HTML_head), false ); // the body won't be indented, because then everything would be indented! 
diff --git a/sw/source/filter/html/wrthtml.hxx b/sw/source/filter/html/wrthtml.hxx index 8d6d1751e5a3..cc84b39f1e65 100644 --- a/sw/source/filter/html/wrthtml.hxx +++ b/sw/source/filter/html/wrthtml.hxx @@ -33,6 +33,7 @@ #include <o3tl/typed_flags_set.hxx> #include <rtl/ref.hxx> #include <svtools/htmlout.hxx> +#include <svtools/HtmlWriter.hxx> #include <tools/fldunit.hxx> #include <shellio.hxx> @@ -264,7 +265,7 @@ class SW_DLLPUBLIC SwHTMLWriter : public Writer FieldUnit m_eCSS1Unit; sal_uInt16 OutHeaderAttrs(); - const SwPageDesc *MakeHeader( sal_uInt16& rHeaderAtrs ); + const SwPageDesc* MakeHeader(HtmlWriter & rXmlWriter, sal_uInt16& rHeaderAtrs); void GetControls(); void AddLinkTarget( const OUString& rURL ); @@ -397,6 +398,8 @@ public: OString maNamespace; /// If the ReqIF subset of XHTML should be written. bool mbReqIF = false; + /// Indexing output. + bool mbIndexingOutput : 1; #define sCSS2_P_CLASS_leaders "leaders" bool m_bCfgPrintLayout : 1; // PrintLayout option for TOC dot leaders |