1 files changed, 108 insertions, 43 deletions
diff --git a/to-wiki/wikiconv2.py b/to-wiki/wikiconv2.py
index be968cbfba..0e3bb5bf97 100755
--- a/to-wiki/wikiconv2.py
+++ b/to-wiki/wikiconv2.py
@@ -7,7 +7,7 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-import os, sys, thread, threading, time
+import os, sys, thread, threading, time, re, copy
 import xml.parsers.expat
 import codecs
 from threading import Thread
@@ -179,31 +179,79 @@ def escape_equals_sign(text):
 
     return t
 
-def load_localization_data(sdf_file):
-    global localization_data
-    localization_data = {}
-    try:
-        file = codecs.open(sdf_file, "r", "utf-8")
-    except:
-        sys.stderr.write('Error: Cannot open .sdf file "%s"\n'% sdf_file)
-        return False
+def xopen(path, mode, encoding):
+    """Wrapper around open() for python2 and python3; encoding is only passed on python3."""
+    if sys.version_info >= (3,):
+        return open(path, mode, encoding=encoding)  # python3: open() takes the encoding keyword
+    else:
+        return open(path, mode)  # python2: the encoding argument is dropped here
+
+# used by escape_help_text: matches <tag>, </tag> and <tag/>, optionally carrying name="value" attribute text
+helptagre = re.compile('''<[/]??[a-z_\-]+?(?:| +[a-zA-Z]+?=[\\\\]??".*?") *[/]??>''')
+
+def escape_help_text(text):
+    """Return text with the known help tags backslash-escaped, as they would be in an SDF file."""
+
+    for tag in helptagre.findall(text):  # every tag-like substring that helptagre finds in text
+        escapethistag = False
+        for escape_tag in ["ahelp", "link", "item", "emph", "defaultinline", "switchinline", "caseinline", "variable", "bookmark_value", "image", "embedvar", "alt"]:
+            if tag.startswith("<%s" % escape_tag) or tag == "</%s>" % escape_tag:  # opening tag (prefix match) or exact closing tag
+                escapethistag = True
+        if tag in ["<br/>", "<help-id-missing/>"]:  # these two are escaped only in exactly this self-closing form
+            escapethistag = True
+        if escapethistag:
+            escaped_tag = ("\\<" + tag[1:-1] + "\\>")  # put a backslash before the opening < and before the closing >
+            text = text.replace(tag, escaped_tag)  # replaces every occurrence of this exact tag at once
+    return text
 
-    for line in file:
-        line = line.strip()
-        if line[0] == '#':
-            continue
-        spl = line.split("\t")
-
-        # the form of the key is like
-        # source/text/shared/explorer/database/02010100.xhp#hd_id3149233
-        # otherwise we are getting duplicates
-        key = '%s#%s'% (spl[1].replace('\\', '/'), spl[4])
-        try:
-            localization_data[key] = spl[10]
-        except:
-            sys.stderr.write('Warning: Ignored line "%s"\n'% line.encode('utf-8'))
 
-    file.close()
+def load_localization_data(po_root):  # rebuilds the global localization_data from all .po files below po_root; always returns True
+    global localization_data
+    localization_data = {}
+    for root, dirs, files in os.walk(po_root):
+        for file in files:
+            if re.search(r'\.po$', file) == None:  # skip everything that is not a .po file
+                continue
+            path = "%s/%s" % (root, file)
+            sock = xopen(path, "r", encoding='utf-8')  # NOTE(review): sock is never closed in this function
+            hashKey = None  # key of the entry whose msgstr is still expected, or None
+            transCollecting = False  # True while reading the continuation lines of a multi-line msgstr
+            trans = ""
+            it = iter(sock)
+            line = next(it, None)
+            while line != None:
+                line=line.decode("utf-8")  # NOTE(review): only valid for python2 byte strings; a python3 str has no decode()
+                if line.startswith('msgctxt ""'): # constructing the hashKey
+                    key=[]
+                    allGood = True
+                    i=0
+                    while i<2 and allGood:  # the next two quoted lines are taken as the two key parts
+                        msgctxt_line = next(it, None);
+                        if  msgctxt_line != None and msgctxt_line.strip().startswith('"'):
+                            key.append( msgctxt_line[1:-4] ) #-4 cuts \\n"\n from the end of the line
+                            i=i+1
+                        else:
+                            allGood = False
+                    if i==2: #hash key is allowed to be constructed
+                        hashKey = '#'.join( (re.sub(r'^.*helpcontent2/source/', r'source/', path[:-3]) + '/' + key[0] , key[1]) )  # path (prefix cut to source/, ".po" dropped) + "/" + key[0] + "#" + key[1]
+                    else:
+                        hashKey = None
+                elif hashKey != None: # constructing trans value for hashKey
+                    if transCollecting:
+                        if line.startswith('"'):
+                            trans= trans + line.strip()[1:-1]  # append the continuation line without its surrounding quotes
+                        else:
+                            transCollecting = False  # the first non-quoted line ends the multi-line msgstr
+                            localization_data[hashKey] = escape_help_text(trans)
+                            hashKey = None
+                    elif line.startswith('msgstr '):
+                        trans = line.strip()[8:-1]  # the text between the quotes of: msgstr "..."
+                        if trans == '': # possibly multiline
+                            transCollecting = True
+                        else:
+                            localization_data[hashKey] = escape_help_text(trans)
+                            hashKey = None
+                line = next(it, None)  # NOTE(review): if the file ends while transCollecting is True, the pending trans is never stored
+    return True
 
 def unescape(str):
@@ -250,6 +298,9 @@ def href_to_fname_id(href):
 
     return [fname, id]
 
+# Exception classes
+class UnhandledItemType(Exception):  # raised after the 'Unhandled item type' message has been written to stderr
+    pass
 # Base class for all the elements
 #
 # self.name - name of the element, to drop the self.child_parsing flag
@@ -500,7 +551,7 @@ class TableCell(ElementBase):
             if parser.follow_embed:
                 self.embed_href(parser, fname, id)
         elif name == 'paragraph':
-            parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(TableContentParagraph, attrs, self)
         elif name == 'section':
             self.parse_child(Section(attrs, self))
         elif name == 'bascode':
@@ -541,7 +592,7 @@ class BasicCode(ElementBase):
 
     def start_element(self, parser, name, attrs):
         if name == 'paragraph':
-            parser.parse_localized_paragraph(BasicCodeParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(BasicCodeParagraph, attrs, self)
         else:
             self.unhandled_element(parser, name)
 
@@ -580,7 +631,7 @@ class ListItem(ElementBase):
             if parser.follow_embed:
                 self.embed_href(parser, fname, id)
         elif name == 'paragraph':
-            parser.parse_localized_paragraph(ListItemParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(ListItemParagraph, attrs, self)
         elif name == 'list':
             self.parse_child(List(attrs, self))
         else:
@@ -683,7 +734,7 @@ class Meta(ElementBase):
 class Section(ElementBase):
     def __init__(self, attrs, parent):
         ElementBase.__init__(self, 'section', parent)
-        self.id = attrs['id']
+        self.id = attrs[ 'id' ]
 
     def start_element(self, parser, name, attrs):
         if name == 'bookmark':
@@ -970,8 +1021,12 @@ class Item(ElementBase):
                    text + \
                    self.replace_type['end'][self.type]
         except:
-            sys.stderr.write('Unhandled item type "%s".\n'% self.type)
-
+            try:
+                sys.stderr.write('Unhandled item type "%s".\n'% self.type)
+            except:
+                sys.stderr.write('Unhandled item type. Possibly type has been localized.\n')
+            finally:
+                raise UnhandledItemType
         return replace_text(self.text)
 
 
@@ -1062,7 +1117,10 @@ class Paragraph(ElementBase):
                 role = 'tablenextpara'
 
         # the text itself
-        children = ElementBase.get_all(self)
+        try:
+            children = ElementBase.get_all(self)
+        except UnhandledItemType:
+            raise UnhandledItemType('Paragraph id: '+str(self.id))
         if self.role != 'emph' and self.role != 'bascode' and self.role != 'logocode':
             children = children.strip()
 
@@ -1196,23 +1254,30 @@ class ParserBase:
     def get_variable(self, id):
         return self.head_obj.get_variable(id)
 
-    def parse_localized_paragraph(self, paragraph, attrs, obj):
+    def parse_localized_paragraph(self, Paragraph_type, attrs, obj):  # Paragraph_type is a paragraph class; it is called as Paragraph_type(attrs, obj)
         localized_text = ''
         try:
             localized_text = get_localized_text(self.filename, attrs['id'])
         except:
             pass
 
+        paragraph = Paragraph_type(attrs, obj)  # built here so a second, fresh instance can be made if parsing fails
         if localized_text != '':
             # parse the localized text
             text = u'<?xml version="1.0" encoding="UTF-8"?><localized>' + localized_text + '</localized>'
-            ParserBase(self.filename, self.follow_embed, self.embedding_app, \
-                    self.current_app, self.wiki_page_name, self.lang, \
-                    paragraph, text.encode('utf-8'))
-            # add it to the overall structure
-            obj.objects.append(paragraph)
-            # and ignore the original text
-            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
+            try:
+                ParserBase(self.filename, self.follow_embed, self.embedding_app, \
+                        self.current_app, self.wiki_page_name, self.lang, \
+                        paragraph, text.encode('utf-8'))
+            except xml.parsers.expat.ExpatError:  # expat rejected the wrapped localized text
+                sys.stderr.write( 'Invalid XML in translated text. Using the original text. Error location:\n'\
+                                  + 'Curren xhp: ' + self.filename + '\nParagraph id: ' + attrs['id'] + '\n')
+                obj.parse_child(Paragraph_type(attrs, obj)) # new paragraph must be created because "paragraph" is corrupted by "ParserBase"
+            else:  # the localized text parsed without an ExpatError
+                # add it to the overall structure
+                obj.objects.append(paragraph)
+                # and ignore the original text
+                obj.parse_child(Ignore(attrs, obj, 'paragraph'))
         else:
             obj.parse_child(paragraph)
 
@@ -1229,7 +1294,7 @@ class ParserBase:
         if ignore_this:
             obj.parse_child(Ignore(attrs, obj, 'paragraph'))
         else:
-            self.parse_localized_paragraph(Paragraph(attrs, obj), attrs, obj)
+            self.parse_localized_paragraph(Paragraph, attrs, obj)
 
 class XhpParser(ParserBase):
     def __init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang):
@@ -1329,7 +1394,7 @@ def write_redirects():
                 write_link(r, target)
 
 # Main Function
-def convert(generate_redirects, lang, sdf_file):
+def convert(generate_redirects, lang, po_root):
     if lang == '':
         print 'Generating the main wiki pages...'
     else:
@@ -1343,8 +1408,8 @@ def convert(generate_redirects, lang, sdf_file):
     loadallfiles("alltitles.csv")
 
     if lang != '':
-        sys.stderr.write('Using localizations from "%s"\n'% sdf_file)
-        if not load_localization_data(sdf_file):
+        sys.stderr.write('Using localizations from "%s"\n'% po_root)
+        if not load_localization_data(po_root):
             return
 
     for title in titles: