summaryrefslogtreecommitdiff
path: root/sc/inc
diff options
context:
space:
mode:
authorKohei Yoshida <kohei.yoshida@suse.com>2011-10-21 13:32:49 -0400
committerKohei Yoshida <kohei.yoshida@suse.com>2011-10-21 15:13:59 -0400
commitea44e5464a487519de84e30a5f299387127c78ea (patch)
treec940ba773cd76579b7bf025905711c30788d5a2a /sc/inc
parentc955cce91f21e6381f9d17b83c73671ad135d791 (diff)
Updated csv parser from the orcus repository.
This change should allow handling double-quation inside quoted cells correctly.
Diffstat (limited to 'sc/inc')
-rw-r--r--sc/inc/orcus/csv_parser.hpp134
1 files changed, 110 insertions, 24 deletions
diff --git a/sc/inc/orcus/csv_parser.hpp b/sc/inc/orcus/csv_parser.hpp
index cc2337489330..8d119f977a01 100644
--- a/sc/inc/orcus/csv_parser.hpp
+++ b/sc/inc/orcus/csv_parser.hpp
@@ -1,4 +1,3 @@
-/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
*
* Copyright (c) 2011 Kohei Yoshida
@@ -76,8 +75,10 @@ public:
private:
bool has_char() const { return m_pos < m_length; }
+ bool has_next() const { return m_pos + 1 < m_length; }
void next();
char cur_char() const;
+ char next_char() const;
bool is_delim(char c) const;
bool is_text_qualifier(char c) const;
@@ -87,8 +88,12 @@ private:
void cell();
void quoted_cell();
+ void parse_cell_with_quote(const char* p0, size_t len0);
void skip_blanks();
+ void init_cell_buf();
+ void append_to_cell_buf(const char* p, size_t len);
+
/**
* Push cell value to the handler.
*/
@@ -102,9 +107,11 @@ private:
private:
handler_type& m_handler;
const csv_parser_config& m_config;
+ std::string m_cell_buf;
const char* mp_char;
size_t m_pos;
size_t m_length;
+ size_t m_cell_buf_size;
};
template<typename _Handler>
@@ -141,6 +148,12 @@ char csv_parser<_Handler>::cur_char() const
}
template<typename _Handler>
+char csv_parser<_Handler>::next_char() const
+{
+ return *(mp_char+1);
+}
+
+template<typename _Handler>
bool csv_parser<_Handler>::is_delim(char c) const
{
return m_config.delimiters.find(c) != std::string::npos;
@@ -210,42 +223,99 @@ void csv_parser<_Handler>::cell()
template<typename _Handler>
void csv_parser<_Handler>::quoted_cell()
{
+#if ORCUS_DEBUG_CSV
+ using namespace std;
+ cout << "--- quoted cell" << endl;
+#endif
char c = cur_char();
assert(is_text_qualifier(c));
next(); // Skip the opening quote.
if (!has_char())
return;
- const char* p = mp_char;
- size_t len = 0;
- for (c = cur_char(); !is_text_qualifier(c); c = cur_char())
+ const char* p0 = mp_char;
+ size_t len = 1;
+ for (; has_char(); next(), ++len)
{
- ++len;
- next();
- if (!has_char())
+ c = cur_char();
+#if ORCUS_DEBUG_CSV
+ cout << "'" << c << "'" << endl;
+#endif
+ if (!is_text_qualifier(c))
+ continue;
+
+ // current char is a quote. Check if the next char is also a text
+ // qualifier.
+
+ if (has_next() && is_text_qualifier(next_char()))
{
- // Stream ended prematurely. Handle it gracefully.
- push_cell_value(p, len);
+ next();
+ parse_cell_with_quote(p0, len);
return;
}
+
+ // Closing quote.
+ push_cell_value(p0, len-1);
+ next();
+ skip_blanks();
+ return;
}
- assert(is_text_qualifier(c));
- next(); // Skip the closing quote.
+ // Stream ended prematurely. Handle it gracefully.
+ push_cell_value(p0, len);
+ next();
skip_blanks();
- c = cur_char();
- if (!is_delim(c) && c != '\n')
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
+{
+#if ORCUS_DEBUG_CSV
+ using namespace std;
+ cout << "--- parse cell with quote" << endl;
+#endif
+ assert(is_text_qualifier(cur_char()));
+
+ // Push the preceding chars to the temp buffer.
+ init_cell_buf();
+ append_to_cell_buf(p0, len0);
+
+ // Parse the rest, until the closing quote.
+ next();
+ const char* p_cur = mp_char;
+ size_t cur_len = 0;
+ for (; has_char(); next(), ++cur_len)
{
- std::ostringstream os;
- os << "A quoted cell value must be immediately followed by a delimiter. ";
- os << "'" << c << "' is found instead.";
- throw csv_parse_error(os.str());
- }
+ char c = cur_char();
+#if ORCUS_DEBUG_CSV
+ cout << "'" << c << "'" << endl;
+#endif
+ if (!is_text_qualifier(c))
+ continue;
- if (!len)
- p = NULL;
+ if (has_next() && is_text_qualifier(next_char()))
+ {
+ // double quotation. Copy the current segment to the cell buffer.
+ append_to_cell_buf(p_cur, cur_len);
- push_cell_value(p, len);
+ next(); // to the 2nd quote.
+ p_cur = mp_char;
+ cur_len = 0;
+ continue;
+ }
+
+ // closing quote. Flush the current segment to the cell
+ // buffer, push the value to the handler, and exit normally.
+ append_to_cell_buf(p_cur, cur_len);
+
+ push_cell_value(&m_cell_buf[0], m_cell_buf_size);
+ next();
+ skip_blanks();
+ return;
+ }
+
+ // Stream ended prematurely.
+ throw csv_parse_error("stream ended prematurely while parsing quoted cell.");
}
template<typename _Handler>
@@ -259,6 +329,24 @@ void csv_parser<_Handler>::skip_blanks()
}
template<typename _Handler>
+void csv_parser<_Handler>::init_cell_buf()
+{
+ m_cell_buf_size = 0;
+}
+
+template<typename _Handler>
+void csv_parser<_Handler>::append_to_cell_buf(const char* p, size_t len)
+{
+ size_t size_needed = m_cell_buf_size + len;
+ if (m_cell_buf.size() < size_needed)
+ m_cell_buf.resize(size_needed);
+
+ char* p_dest = &m_cell_buf[m_cell_buf_size];
+ std::strncpy(p_dest, p, len);
+ m_cell_buf_size += len;
+}
+
+template<typename _Handler>
void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
{
size_t len = n;
@@ -286,12 +374,10 @@ void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
m_handler.cell(p, len);
#if ORCUS_DEBUG_CSV
- cout << "(cell:'" << std::string(p, len) << "')";
+ cout << "(cell:'" << std::string(p, len) << "')" << endl;
#endif
}
}
#endif
-
-/* vim:set shiftwidth=4 softtabstop=4 expandtab: */