summaryrefslogtreecommitdiff
path: root/xmlreader/source/xmlreader.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'xmlreader/source/xmlreader.cxx')
-rw-r--r--xmlreader/source/xmlreader.cxx1054
1 files changed, 1054 insertions, 0 deletions
diff --git a/xmlreader/source/xmlreader.cxx b/xmlreader/source/xmlreader.cxx
new file mode 100644
index 000000000000..27350a8f0947
--- /dev/null
+++ b/xmlreader/source/xmlreader.cxx
@@ -0,0 +1,1054 @@
+/*************************************************************************
+*
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* Copyright 2000, 2010 Oracle and/or its affiliates.
+*
+* OpenOffice.org - a multi-platform office productivity suite
+*
+* This file is part of OpenOffice.org.
+*
+* OpenOffice.org is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License version 3
+* only, as published by the Free Software Foundation.
+*
+* OpenOffice.org is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Lesser General Public License version 3 for more details
+* (a copy is included in the LICENSE file that accompanied this code).
+*
+* You should have received a copy of the GNU Lesser General Public License
+* version 3 along with OpenOffice.org. If not, see
+* <http://www.openoffice.org/license.html>
+* for a copy of the LGPLv3 License.
+*
+************************************************************************/
+
+#include "precompiled_xmlreader.hxx"
+#include "sal/config.h"
+
+#include <climits>
+#include <cstddef>
+
+#include "com/sun/star/container/NoSuchElementException.hpp"
+#include "com/sun/star/uno/Reference.hxx"
+#include "com/sun/star/uno/RuntimeException.hpp"
+#include "com/sun/star/uno/XInterface.hpp"
+#include "osl/diagnose.h"
+#include "osl/file.h"
+#include "rtl/string.h"
+#include "rtl/ustring.h"
+#include "rtl/ustring.hxx"
+#include "sal/types.h"
+#include "xmlreader/pad.hxx"
+#include "xmlreader/span.hxx"
+#include "xmlreader/xmlreader.hxx"
+
+namespace xmlreader {
+
+namespace {
+
+namespace css = com::sun::star;
+
+bool isSpace(char c) {
+ switch (c) {
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ case ' ':
+ return true;
+ default:
+ return false;
+ }
+}
+
+}
+
+XmlReader::XmlReader(rtl::OUString const & fileUrl)
+ SAL_THROW((
+ css::container::NoSuchElementException, css::uno::RuntimeException)):
+ fileUrl_(fileUrl)
+{
+ switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
+ {
+ case osl_File_E_None:
+ break;
+ case osl_File_E_NOENT:
+ throw css::container::NoSuchElementException(
+ fileUrl_, css::uno::Reference< css::uno::XInterface >());
+ default:
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
+ if (e == osl_File_E_None) {
+ e = osl_mapFile(
+ fileHandle_, &fileAddress_, fileSize_, 0,
+ osl_File_MapFlag_WillNeed);
+ }
+ if (e != osl_File_E_None) {
+ e = osl_closeFile(fileHandle_);
+ if (e != osl_File_E_None) {
+ OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
+ }
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ namespaceIris_.push_back(
+ Span(
+ RTL_CONSTASCII_STRINGPARAM(
+ "http://www.w3.org/XML/1998/namespace")));
+ namespaces_.push_back(
+ NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
+ pos_ = static_cast< char * >(fileAddress_);
+ end_ = pos_ + fileSize_;
+ state_ = STATE_CONTENT;
+}
+
+XmlReader::~XmlReader() {
+ oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
+ if (e != osl_File_E_None) {
+ OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
+ }
+ e = osl_closeFile(fileHandle_);
+ if (e != osl_File_E_None) {
+ OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
+ }
+}
+
+int XmlReader::registerNamespaceIri(Span const & iri) {
+ int id = toNamespaceId(namespaceIris_.size());
+ namespaceIris_.push_back(iri);
+ if (iri.equals(
+ Span(
+ RTL_CONSTASCII_STRINGPARAM(
+ "http://www.w3.org/2001/XMLSchema-instance"))))
+ {
+ // Old user layer .xcu files used the xsi namespace prefix without
+ // declaring a corresponding namespace binding, see issue 77174; reading
+ // those files during migration would fail without this hack that can be
+ // removed once migration is no longer relevant (see
+ // configmgr::Components::parseModificationLayer):
+ namespaces_.push_back(
+ NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
+ }
+ return id;
+}
+
+XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
+{
+ switch (state_) {
+ case STATE_CONTENT:
+ switch (reportText) {
+ case TEXT_NONE:
+ return handleSkippedText(data, nsId);
+ case TEXT_RAW:
+ return handleRawText(data);
+ case TEXT_NORMALIZED:
+ return handleNormalizedText(data);
+ }
+ case STATE_START_TAG:
+ return handleStartTag(nsId, data);
+ case STATE_END_TAG:
+ return handleEndTag();
+ case STATE_EMPTY_ELEMENT_TAG:
+ handleElementEnd();
+ return RESULT_END;
+ default: // STATE_DONE
+ return RESULT_DONE;
+ }
+}
+
+bool XmlReader::nextAttribute(int * nsId, Span * localName) {
+ OSL_ASSERT(nsId != 0 && localName != 0);
+ if (firstAttribute_) {
+ currentAttribute_ = attributes_.begin();
+ firstAttribute_ = false;
+ } else {
+ ++currentAttribute_;
+ }
+ if (currentAttribute_ == attributes_.end()) {
+ return false;
+ }
+ if (currentAttribute_->nameColon == 0) {
+ *nsId = NAMESPACE_NONE;
+ *localName = Span(
+ currentAttribute_->nameBegin,
+ currentAttribute_->nameEnd - currentAttribute_->nameBegin);
+ } else {
+ *nsId = getNamespaceId(
+ Span(
+ currentAttribute_->nameBegin,
+ currentAttribute_->nameColon - currentAttribute_->nameBegin));
+ *localName = Span(
+ currentAttribute_->nameColon + 1,
+ currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
+ }
+ return true;
+}
+
+Span XmlReader::getAttributeValue(bool fullyNormalize) {
+ return handleAttributeValue(
+ currentAttribute_->valueBegin, currentAttribute_->valueEnd,
+ fullyNormalize);
+}
+
+int XmlReader::getNamespaceId(Span const & prefix) const {
+ for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
+ i != namespaces_.rend(); ++i)
+ {
+ if (prefix.equals(i->prefix)) {
+ return i->nsId;
+ }
+ }
+ return NAMESPACE_UNKNOWN;
+}
+
+rtl::OUString XmlReader::getUrl() const {
+ return fileUrl_;
+}
+
+void XmlReader::normalizeLineEnds(Span const & text) {
+ char const * p = text.begin;
+ sal_Int32 n = text.length;
+ for (;;) {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
+ if (i < 0) {
+ break;
+ }
+ pad_.add(p, i);
+ p += i + 1;
+ n -= i + 1;
+ if (n == 0 || *p != '\x0A') {
+ pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
+ }
+ }
+ pad_.add(p, n);
+}
+
+void XmlReader::skipSpace() {
+ while (isSpace(peek())) {
+ ++pos_;
+ }
+}
+
+bool XmlReader::skipComment() {
+ if (rtl_str_shortenedCompare_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
+ RTL_CONSTASCII_LENGTH("--")) !=
+ 0)
+ {
+ return false;
+ }
+ pos_ += RTL_CONSTASCII_LENGTH("--");
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within comment) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("--");
+ if (read() != '>') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "illegal \"--\" within comment in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ return true;
+}
+
+void XmlReader::skipProcessingInstruction() {
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("?>");
+}
+
+void XmlReader::skipDocumentTypeDeclaration() {
+ // Neither is it checked that the doctypedecl is at the correct position in
+ // the document, nor that it is well-formed:
+ for (;;) {
+ char c = read();
+ switch (c) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within DTD) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ case '"':
+ case '\'':
+ {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(
+ pos_, end_ - pos_, c);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within DTD) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + 1;
+ }
+ break;
+ case '>':
+ return;
+ case '[':
+ for (;;) {
+ c = read();
+ switch (c) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within DTD) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ case '"':
+ case '\'':
+ {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(
+ pos_, end_ - pos_, c);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within DTD) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + 1;
+ }
+ break;
+ case '<':
+ switch (read()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within DTD) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ case '!':
+ skipComment();
+ break;
+ case '?':
+ skipProcessingInstruction();
+ break;
+ default:
+ break;
+ }
+ break;
+ case ']':
+ skipSpace();
+ if (read() != '>') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "missing \">\" of DTD in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ return;
+ default:
+ break;
+ }
+ }
+ default:
+ break;
+ }
+ }
+}
+
+Span XmlReader::scanCdataSection() {
+ if (rtl_str_shortenedCompare_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
+ RTL_CONSTASCII_LENGTH("[CDATA[")) !=
+ 0)
+ {
+ return Span();
+ }
+ pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
+ char const * begin = pos_;
+ sal_Int32 i = rtl_str_indexOfStr_WithLength(
+ pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "premature end (within CDATA section) of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
+ return Span(begin, i);
+}
+
+bool XmlReader::scanName(char const ** nameColon) {
+ OSL_ASSERT(nameColon != 0 && *nameColon == 0);
+ for (char const * begin = pos_;; ++pos_) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ case ' ':
+ case '/':
+ case '=':
+ case '>':
+ return pos_ != begin;
+ case ':':
+ *nameColon = pos_;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
+ OSL_ASSERT(begin != 0 && begin <= end);
+ Span iri(handleAttributeValue(begin, end, false));
+ for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
+ if (namespaceIris_[i].equals(iri)) {
+ return toNamespaceId(i);
+ }
+ }
+ return XmlReader::NAMESPACE_UNKNOWN;
+}
+
+char const * XmlReader::handleReference(char const * position, char const * end)
+{
+ OSL_ASSERT(position != 0 && *position == '&' && position < end);
+ ++position;
+ if (*position == '#') {
+ ++position;
+ sal_Int32 val = 0;
+ char const * p;
+ if (*position == 'x') {
+ ++position;
+ p = position;
+ for (;; ++position) {
+ char c = *position;
+ if (c >= '0' && c <= '9') {
+ val = 16 * val + (c - '0');
+ } else if (c >= 'A' && c <= 'F') {
+ val = 16 * val + (c - 'A') + 10;
+ } else if (c >= 'a' && c <= 'f') {
+ val = 16 * val + (c - 'a') + 10;
+ } else {
+ break;
+ }
+ if (val > 0x10FFFF) { // avoid overflow
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "'&#x...' too large in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ }
+ } else {
+ p = position;
+ for (;; ++position) {
+ char c = *position;
+ if (c >= '0' && c <= '9') {
+ val = 10 * val + (c - '0');
+ } else {
+ break;
+ }
+ if (val > 0x10FFFF) { // avoid overflow
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "'&#...' too large in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ }
+ }
+ if (position == p || *position++ != ';') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
+ if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
+ (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
+ {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "character reference denoting invalid character in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char buf[4];
+ sal_Int32 len;
+ if (val < 0x80) {
+ buf[0] = static_cast< char >(val);
+ len = 1;
+ } else if (val < 0x800) {
+ buf[0] = static_cast< char >((val >> 6) | 0xC0);
+ buf[1] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 2;
+ } else if (val < 0x10000) {
+ buf[0] = static_cast< char >((val >> 12) | 0xE0);
+ buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+ buf[2] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 3;
+ } else {
+ buf[0] = static_cast< char >((val >> 18) | 0xF0);
+ buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
+ buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
+ buf[3] = static_cast< char >((val & 0x3F) | 0x80);
+ len = 4;
+ }
+ pad_.addEphemeral(buf, len);
+ return position;
+ } else {
+ struct EntityRef {
+ char const * inBegin;
+ sal_Int32 inLength;
+ char const * outBegin;
+ sal_Int32 outLength;
+ };
+ static EntityRef const refs[] = {
+ { RTL_CONSTASCII_STRINGPARAM("amp;"),
+ RTL_CONSTASCII_STRINGPARAM("&") },
+ { RTL_CONSTASCII_STRINGPARAM("lt;"),
+ RTL_CONSTASCII_STRINGPARAM("<") },
+ { RTL_CONSTASCII_STRINGPARAM("gt;"),
+ RTL_CONSTASCII_STRINGPARAM(">") },
+ { RTL_CONSTASCII_STRINGPARAM("apos;"),
+ RTL_CONSTASCII_STRINGPARAM("'") },
+ { RTL_CONSTASCII_STRINGPARAM("quot;"),
+ RTL_CONSTASCII_STRINGPARAM("\"") } };
+ for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
+ if (rtl_str_shortenedCompare_WithLength(
+ position, end - position, refs[i].inBegin, refs[i].inLength,
+ refs[i].inLength) ==
+ 0)
+ {
+ position += refs[i].inLength;
+ pad_.add(refs[i].outBegin, refs[i].outLength);
+ return position;
+ }
+ }
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+}
+
+Span XmlReader::handleAttributeValue(
+ char const * begin, char const * end, bool fullyNormalize)
+{
+ pad_.clear();
+ if (fullyNormalize) {
+ while (begin != end && isSpace(*begin)) {
+ ++begin;
+ }
+ while (end != begin && isSpace(end[-1])) {
+ --end;
+ }
+ char const * p = begin;
+ enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+ // a single true space character can go into the current span,
+ // everything else breaks the span
+ Space space = SPACE_NONE;
+ while (p != end) {
+ switch (*p) {
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ switch (space) {
+ case SPACE_NONE:
+ pad_.add(begin, p - begin);
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ space = SPACE_BREAK;
+ break;
+ case SPACE_SPAN:
+ pad_.add(begin, p - begin);
+ space = SPACE_BREAK;
+ break;
+ case SPACE_BREAK:
+ break;
+ }
+ begin = ++p;
+ break;
+ case ' ':
+ switch (space) {
+ case SPACE_NONE:
+ ++p;
+ space = SPACE_SPAN;
+ break;
+ case SPACE_SPAN:
+ pad_.add(begin, p - begin);
+ begin = ++p;
+ space = SPACE_BREAK;
+ break;
+ case SPACE_BREAK:
+ begin = ++p;
+ break;
+ }
+ break;
+ case '&':
+ pad_.add(begin, p - begin);
+ p = handleReference(p, end);
+ begin = p;
+ space = SPACE_NONE;
+ break;
+ default:
+ ++p;
+ space = SPACE_NONE;
+ break;
+ }
+ }
+ pad_.add(begin, p - begin);
+ } else {
+ char const * p = begin;
+ while (p != end) {
+ switch (*p) {
+ case '\x09':
+ case '\x0A':
+ pad_.add(begin, p - begin);
+ begin = ++p;
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ break;
+ case '\x0D':
+ pad_.add(begin, p - begin);
+ ++p;
+ if (peek() == '\x0A') {
+ ++p;
+ }
+ begin = p;
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ break;
+ case '&':
+ pad_.add(begin, p - begin);
+ p = handleReference(p, end);
+ begin = p;
+ break;
+ default:
+ ++p;
+ break;
+ }
+ }
+ pad_.add(begin, p - begin);
+ }
+ return pad_.get();
+}
+
+XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
+ OSL_ASSERT(nsId != 0 && localName);
+ char const * nameBegin = pos_;
+ char const * nameColon = 0;
+ if (!scanName(&nameColon)) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * nameEnd = pos_;
+ NamespaceList::size_type inheritedNamespaces = namespaces_.size();
+ bool hasDefaultNs = false;
+ int defaultNsId = NAMESPACE_NONE;
+ attributes_.clear();
+ for (;;) {
+ char const * p = pos_;
+ skipSpace();
+ if (peek() == '/' || peek() == '>') {
+ break;
+ }
+ if (pos_ == p) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "missing whitespace before attribute in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * attrNameBegin = pos_;
+ char const * attrNameColon = 0;
+ if (!scanName(&attrNameColon)) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * attrNameEnd = pos_;
+ skipSpace();
+ if (read() != '=') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ skipSpace();
+ char del = read();
+ if (del != '\'' && del != '"') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * valueBegin = pos_;
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM(
+ "unterminated attribute value in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * valueEnd = pos_ + i;
+ pos_ += i + 1;
+ if (attrNameColon == 0 &&
+ Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
+ RTL_CONSTASCII_STRINGPARAM("xmlns")))
+ {
+ hasDefaultNs = true;
+ defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
+ } else if (attrNameColon != 0 &&
+ Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
+ RTL_CONSTASCII_STRINGPARAM("xmlns")))
+ {
+ namespaces_.push_back(
+ NamespaceData(
+ Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
+ scanNamespaceIri(valueBegin, valueEnd)));
+ } else {
+ attributes_.push_back(
+ AttributeData(
+ attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
+ valueEnd));
+ }
+ }
+ if (!hasDefaultNs && !elements_.empty()) {
+ defaultNsId = elements_.top().defaultNamespaceId;
+ }
+ firstAttribute_ = true;
+ if (peek() == '/') {
+ state_ = STATE_EMPTY_ELEMENT_TAG;
+ ++pos_;
+ } else {
+ state_ = STATE_CONTENT;
+ }
+ if (peek() != '>') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ ++pos_;
+ elements_.push(
+ ElementData(
+ Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
+ defaultNsId));
+ if (nameColon == 0) {
+ *nsId = defaultNsId;
+ *localName = Span(nameBegin, nameEnd - nameBegin);
+ } else {
+ *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
+ *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
+ }
+ return RESULT_BEGIN;
+}
+
+XmlReader::Result XmlReader::handleEndTag() {
+ if (elements_.empty()) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ char const * nameBegin = pos_;
+ char const * nameColon = 0;
+ if (!scanName(&nameColon) ||
+ !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
+ {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ handleElementEnd();
+ skipSpace();
+ if (peek() != '>') {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ ++pos_;
+ return RESULT_END;
+}
+
+void XmlReader::handleElementEnd() {
+ OSL_ASSERT(!elements_.empty());
+ namespaces_.resize(elements_.top().inheritedNamespaces);
+ elements_.pop();
+ state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
+}
+
+XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
+ for (;;) {
+ sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
+ if (i < 0) {
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ }
+ pos_ += i + 1;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (!skipComment() && !scanCdataSection().is()) {
+ skipDocumentTypeDeclaration();
+ }
+ break;
+ case '/':
+ ++pos_;
+ return handleEndTag();
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ break;
+ default:
+ return handleStartTag(nsId, data);
+ }
+ }
+}
+
+XmlReader::Result XmlReader::handleRawText(Span * text) {
+ pad_.clear();
+ for (char const * begin = pos_;;) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ case '\x0D':
+ pad_.add(begin, pos_ - begin);
+ ++pos_;
+ if (peek() != '\x0A') {
+ pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
+ }
+ begin = pos_;
+ break;
+ case '&':
+ pad_.add(begin, pos_ - begin);
+ pos_ = handleReference(pos_, end_);
+ begin = pos_;
+ break;
+ case '<':
+ pad_.add(begin, pos_ - begin);
+ ++pos_;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (!skipComment()) {
+ Span cdata(scanCdataSection());
+ if (cdata.is()) {
+ normalizeLineEnds(cdata);
+ } else {
+ skipDocumentTypeDeclaration();
+ }
+ }
+ begin = pos_;
+ break;
+ case '/':
+ *text = pad_.get();
+ ++pos_;
+ state_ = STATE_END_TAG;
+ return RESULT_TEXT;
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ begin = pos_;
+ break;
+ default:
+ *text = pad_.get();
+ state_ = STATE_START_TAG;
+ return RESULT_TEXT;
+ }
+ break;
+ default:
+ ++pos_;
+ break;
+ }
+ }
+}
+
+XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
+ pad_.clear();
+ char const * flowBegin = pos_;
+ char const * flowEnd = pos_;
+ enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
+ // a single true space character can go into the current flow,
+ // everything else breaks the flow
+ Space space = SPACE_START;
+ for (;;) {
+ switch (peek()) {
+ case '\0': // i.e., EOF
+ throw css::uno::RuntimeException(
+ (rtl::OUString(
+ RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
+ fileUrl_),
+ css::uno::Reference< css::uno::XInterface >());
+ case '\x09':
+ case '\x0A':
+ case '\x0D':
+ switch (space) {
+ case SPACE_START:
+ case SPACE_BREAK:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ space = SPACE_BREAK;
+ break;
+ }
+ ++pos_;
+ break;
+ case ' ':
+ switch (space) {
+ case SPACE_START:
+ case SPACE_BREAK:
+ break;
+ case SPACE_NONE:
+ space = SPACE_SPAN;
+ break;
+ case SPACE_SPAN:
+ space = SPACE_BREAK;
+ break;
+ }
+ ++pos_;
+ break;
+ case '&':
+ switch (space) {
+ case SPACE_START:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ pad_.add(flowBegin, pos_ - flowBegin);
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ break;
+ }
+ pos_ = handleReference(pos_, end_);
+ flowBegin = pos_;
+ flowEnd = pos_;
+ space = SPACE_NONE;
+ break;
+ case '<':
+ ++pos_;
+ switch (peek()) {
+ case '!':
+ ++pos_;
+ if (skipComment()) {
+ space = SPACE_BREAK;
+ } else {
+ Span cdata(scanCdataSection());
+ if (cdata.is()) {
+ // CDATA is not normalized (similar to character
+ // references; it keeps the code simple), but it might
+ // arguably be better to normalize it:
+ switch (space) {
+ case SPACE_START:
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ pad_.add(flowBegin, pos_ - flowBegin);
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ break;
+ }
+ normalizeLineEnds(cdata);
+ flowBegin = pos_;
+ flowEnd = pos_;
+ space = SPACE_NONE;
+ } else {
+ skipDocumentTypeDeclaration();
+ }
+ }
+ break;
+ case '/':
+ ++pos_;
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ *text = pad_.get();
+ state_ = STATE_END_TAG;
+ return RESULT_TEXT;
+ case '?':
+ ++pos_;
+ skipProcessingInstruction();
+ space = SPACE_BREAK;
+ break;
+ default:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ *text = pad_.get();
+ state_ = STATE_START_TAG;
+ return RESULT_TEXT;
+ }
+ break;
+ default:
+ switch (space) {
+ case SPACE_START:
+ flowBegin = pos_;
+ break;
+ case SPACE_NONE:
+ case SPACE_SPAN:
+ break;
+ case SPACE_BREAK:
+ pad_.add(flowBegin, flowEnd - flowBegin);
+ pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
+ flowBegin = pos_;
+ break;
+ }
+ flowEnd = ++pos_;
+ space = SPACE_NONE;
+ break;
+ }
+ }
+}
+
+int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
+ OSL_ASSERT(pos <= INT_MAX);
+ return static_cast< int >(pos);
+}
+
+}