/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include "HtmlFmtFlt.hxx"

#include <rtl/string.h>
#include <osl/diagnose.h>

#include <cassert>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

using namespace com::sun::star::uno;

// converts the openoffice text/html clipboard format to the HTML Format
// well known under MS Windows
// the MS HTML Format has a header before the real html data

// Version:1.0      Version number of the clipboard.
//                  Starting is 0.9
// StartHTML:       Byte count from the beginning of the clipboard to the start
//                  of the context, or -1 if no context
// EndHTML:         Byte count from the beginning of the clipboard to the end
//                  of the context, or -1 if no context
// StartFragment:   Byte count from the beginning of the clipboard to the
//                  start of the fragment
// EndFragment:     Byte count from the beginning of the clipboard to the
//                  end of the fragment
// StartSelection:  Byte count from the beginning of the clipboard to the
//                  start of the selection
// EndSelection:    Byte count from the beginning of the clipboard to the
//                  end of the selection
// StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)

namespace
{
/** Builds the textual header that precedes the HTML data in the MS Windows
    "HTML Format" clipboard flavor.

    Every offset is written as a decimal number zero-padded to at least 10
    digits and every line is terminated with CR LF. As long as each offset
    fits into 10 digits the header length is independent of the values, so
    callers can build a dummy header first to learn its size.

    @param startHtml      value written after "StartHTML:"
    @param endHtml        value written after "EndHTML:"
    @param startFragment  value written after "StartFragment:"
    @param endFragment    value written after "EndFragment:"
    @return the complete header text
*/
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream htmlHeader;

    // Fill character and number base are sticky stream state; only the field
    // width is reset after each insertion and therefore set per value.
    htmlHeader << std::setfill('0') << std::dec;

    const auto appendOffset = [&htmlHeader](const char* key, size_t offset)
    {
        htmlHeader << key << std::setw(10) << offset << '\r' << '\n';
    };

    htmlHeader << "Version:1.0" << '\r' << '\n';
    appendOffset("StartHTML:", startHtml);
    appendOffset("EndHTML:", endHtml);
    appendOffset("StartFragment:", startFragment);
    appendOffset("EndFragment:", endFragment);

    return htmlHeader.str();
}

}

// the office always writes the start and end html tag in upper cases and
// without spaces both tags don't allow parameters
const std::string TAG_HTML = std::string("<HTML>");
const std::string TAG_END_HTML = std::string("</HTML>");

// The body tag may have parameters so we need to search for the
// closing '>' manually e.g.
#92840# const std::string TAG_BODY = std::string(" TextHtmlToHTMLFormat(Sequence const & aTextHtml) { OSL_ASSERT(aTextHtml.getLength() > 0); if (aTextHtml.getLength() <= 0) return Sequence(); // fill the buffer with dummy values to calc the exact length std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0); size_t lHtmlFormatHeader = dummyHtmlHeader.length(); std::string textHtml( reinterpret_cast(aTextHtml.getConstArray()), reinterpret_cast(aTextHtml.getConstArray()) + aTextHtml.getLength()); std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '' Word 2000 does also so std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind ? // The body tag may have parameters so we need to search for the // closing '>' manually e.g. #92840# std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1; std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader; std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment); htmlFormat += textHtml; Sequence byteSequence(htmlFormat.length() + 1); // space the trailing '\0' memset(byteSequence.getArray(), 0, byteSequence.getLength()); memcpy( static_cast(byteSequence.getArray()), static_cast(htmlFormat.c_str()), htmlFormat.length()); return byteSequence; } const char* const HtmlStartTag = " HTMLFormatToTextHtml(const Sequence& aHTMLFormat) { assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided"); Sequence& nonconstHTMLFormatRef = const_cast< Sequence& >(aHTMLFormat); sal_Char* dataStart = reinterpret_cast(nonconstHTMLFormatRef.getArray()); sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1; const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag); assert(htmlStartTag && "Seems to be no HTML at all"); // It doesn't seem to be HTML? 
Well then simply return what has been // provided in non-debug builds if (htmlStartTag == nullptr) { return aHTMLFormat; } sal_Int32 len = dataEnd - htmlStartTag; Sequence plainHtmlData(len); memcpy(static_cast(plainHtmlData.getArray()), htmlStartTag, len); return plainHtmlData; } /* A simple format detection. We are just comparing the first few bytes of the provided byte sequence to see whether or not it is the MS Office Html format. If it shows that this is not reliable enough we can improve this */ const char HtmlFormatStart[] = "Version:"; int const HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1); bool isHTMLFormat(const Sequence& aHtmlSequence) { if (aHtmlSequence.getLength() < HtmlFormatStartLen) return false; return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart, HtmlFormatStartLen, reinterpret_cast(aHtmlSequence.getConstArray()), HtmlFormatStartLen) == 0; } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */