vcl/aqua/source/dtrans/HtmlFmtFlt.cxx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#include "HtmlFmtFlt.hxx"

#include <rtl/string.h>

#include <string>
#include <sstream>
#include <vector>
#include <iomanip>

#include <boost/assert.hpp>

using namespace com::sun::star::uno;

//------------------------------------------------------------------------------
// converts the openoffice text/html clipboard format to the HTML Format
// well known under MS Windows
// the MS HTML Format has a header before the real html data
//
// Version:1.0      Version number of the clipboard. Starting version is 0.9
// StartHTML:       Byte count from the beginning of the clipboard to the start
//                  of the context, or -1 if no context
// EndHTML:         Byte count from the beginning of the clipboard to the end
//                  of the context, or -1 if no context
// StartFragment:   Byte count from the beginning of the clipboard to the
//                  start of the fragment
// EndFragment:     Byte count from the beginning of the clipboard to the
//                  end of the fragment
// StartSelection:  Byte count from the beginning of the clipboard to the
//                  start of the selection
// EndSelection:    Byte count from the beginning of the clipboard to the
//                  end of the selection
//
// StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
//------------------------------------------------------------------------------

namespace // private
{
// Builds the textual header of the MS "HTML Format" clipboard flavor.
// The header consists of the version line followed by one line per offset;
// each offset is printed in decimal, zero-padded to a width of 10 characters,
// and every line is terminated by CR LF. Because of the fixed field width the
// header length does not depend on the offset values (as long as they fit
// into 10 digits).
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    struct HeaderField
    {
        const char* label;
        size_t      offset;
    };

    const HeaderField fields[] =
    {
        { "StartHTML:",     startHtml     },
        { "EndHTML:",       endHtml       },
        { "StartFragment:", startFragment },
        { "EndFragment:",   endFragment   }
    };
    const size_t fieldCount = sizeof(fields) / sizeof(fields[0]);

    std::ostringstream header;
    header << "Version:1.0" << "\r\n";

    // fill character and number base are sticky, the field width is not
    header << std::setfill('0') << std::dec;
    for (size_t i = 0; i < fieldCount; ++i)
        header << fields[i].label << std::setw(10) << fields[i].offset << "\r\n";

    return header.str();
}

} // namespace private


// The office always writes the opening and the closing html tag in upper
// case and without spaces; neither of the two tags takes parameters.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>, #92840#), so only
// the tag prefix is kept here and the closing '>' has to be searched for
// separately.
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");

/** Converts text/html clipboard data into the MS Windows "HTML Format".

    The result consists of the HTML Format header (built by
    GetHtmlFormatHeader), followed by the unchanged text/html data and a
    terminating '\0'. All offsets in the header are counted from the
    beginning of the result, i.e. they include the header length.

    The context (StartHTML/EndHTML) is derived from the <HTML> and </HTML>
    tags, the fragment (StartFragment/EndFragment) from the <BODY ...> and
    </BODY tags. If a tag cannot be found the corresponding offset falls back
    to a sane value instead of a wrapped-around "not found" position:
    the context falls back to the complete data, the fragment falls back to
    the context.

    @param aTextHtml
    the text/html data; expected to be non-empty

    @return
    the data in HTML Format including a trailing '\0', or an empty sequence
    if aTextHtml is empty
*/
Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
{
    OSL_ASSERT(aTextHtml.getLength() > 0);

    if (!(aTextHtml.getLength() > 0))
        return Sequence<sal_Int8>();

    // all offsets are written with a fixed width, so a header made of dummy
    // values already has the exact length of the final header
    const size_t lHtmlFormatHeader = GetHtmlFormatHeader(0, 0, 0, 0).length();

    const sal_Char* pTextHtml = reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray());
    const std::string textHtml(pTextHtml, pTextHtml + aTextHtml.getLength());

    const std::string::size_type posHtml    = textHtml.find(TAG_HTML);
    const std::string::size_type posEndHtml = textHtml.find(TAG_END_HTML);

    // The body tag may have parameters so we need to search for the
    // closing '>' manually e.g. <BODY param> #92840#
    const std::string::size_type posBody      = textHtml.find(TAG_BODY);
    const std::string::size_type posBodyClose =
        (posBody != std::string::npos) ? textHtml.find('>', posBody) : std::string::npos;
    const std::string::size_type posEndBody   = textHtml.find(TAG_END_BODY);

    // context: defaults to the complete data if a tag is missing
    std::string::size_type nStartHtml = lHtmlFormatHeader;
    if (posHtml != std::string::npos)
        nStartHtml = posHtml + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so

    std::string::size_type nEndHtml = lHtmlFormatHeader + textHtml.length();
    if (posEndHtml != std::string::npos)
        nEndHtml = posEndHtml + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?

    // fragment: everything between '<BODY ...>' and '</BODY', defaults to
    // the context if a tag is missing
    std::string::size_type nStartFragment = nStartHtml;
    if (posBodyClose != std::string::npos)
        nStartFragment = posBodyClose + lHtmlFormatHeader + 1;

    std::string::size_type nEndFragment = nEndHtml;
    if (posEndBody != std::string::npos)
        nEndFragment = posEndBody + lHtmlFormatHeader;

    std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
    htmlFormat += textHtml;

    Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
    rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());

    rtl_copyMemory(
        static_cast<void*>(byteSequence.getArray()),
        static_cast<const void*>(htmlFormat.c_str()),
        htmlFormat.length());

    return byteSequence;
}

const char* HtmlStartTag = "<html";

/* Strips the MS "HTML Format" header from the given data and returns the
   plain html, i.e. everything from the first "<html" (ASCII case is ignored)
   up to the end of the data. A single trailing '\0' is not copied.

   The search for the start tag is bounded by the length of the sequence, so
   the data does not need to be '\0' terminated. The data is only read; the
   passed sequence is not modified.

   If no "<html" can be found the data is returned unchanged (debug builds
   assert in this case).
*/
Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
{
  BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");

  const sal_Int32 dataLen = aHTMLFormat.getLength();
  const sal_Int32 tagLen = rtl_str_getLength(HtmlStartTag);
  const sal_Char* dataStart = reinterpret_cast<const sal_Char*>(aHTMLFormat.getConstArray());

  // Bounded, case insensitive search for the start tag. For an empty
  // sequence the loop body is never entered, so dataStart is not touched.
  const sal_Char* htmlStartTag = NULL;
  for (sal_Int32 pos = 0; pos + tagLen <= dataLen; ++pos)
    {
    if (rtl_str_compareIgnoreAsciiCase_WithLength(dataStart + pos, tagLen, HtmlStartTag, tagLen) == 0)
      {
      htmlStartTag = dataStart + pos;
      break;
      }
    }

  BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");

  // It doesn't seem to be HTML? Well then simply return what has been
  // provided in non-debug builds
  if (htmlStartTag == NULL)
    {
    return aHTMLFormat;
    }

  // A match implies dataLen >= tagLen > 0, so looking at the last byte is
  // safe. Only a real terminator is dropped, never payload.
  const sal_Char* dataEnd = dataStart + dataLen;
  if (*(dataEnd - 1) == '\0')
    {
    --dataEnd;
    }

  const sal_Int32 len = static_cast<sal_Int32>(dataEnd - htmlStartTag);
  Sequence<sal_Int8> plainHtmlData(len);

  rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);

  return plainHtmlData;
}

/* A simple format detection. We are just comparing the first few bytes
   of the provided byte sequence to see whether or not it is the MS
   Office Html format. If it shows that this is not reliable enough we
   can improve this
*/
const char HtmlFormatStart[] = "Version:";
int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);

bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
{
  if (aHtmlSequence.getLength() < HtmlFormatStartLen)
    return false;

  return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
                                                   HtmlFormatStartLen,
                                                   reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
                                                   HtmlFormatStartLen) == 0;
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */