move help structure one directory up

Change-Id: Ie970e39fbb6795a92d9fdd13510409d7dcd071bc
author: Norbert Thiebaud <nthiebaud@gmail.com> 2012-09-01 09:51:27 -0500
committer: Norbert Thiebaud <nthiebaud@gmail.com> 2012-10-16 11:07:30 -0500
commit: 61173c1b58efa79c0ba6b08348d2796a249d0186 (patch)
tree: 00ebf544db18942e2a1ecfc5e5fa16931127d38f /to-wiki
parent: 3dc2e7497f1798ae4ff6c5c8c562666bc10a393c (diff)
2 files changed, 1530 insertions, 0 deletions
diff --git a/to-wiki/getalltitles.py b/to-wiki/getalltitles.py
new file mode 100755
index 0000000000..18a2dc0828
--- /dev/null
+++ b/to-wiki/getalltitles.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import xml.parsers.expat
+
+# Wiki titles emitted so far; consulted to keep every generated title unique.
+alltitles=[]
+
+def is_present(title):
+    """Return True if `title` has already been recorded in `alltitles`.
+
+    Titles are compared with surrounding whitespace ignored.  A `title`
+    without a strip() method (e.g. None) is reported as not present.
+    """
+    try:
+        wanted = title.strip()
+    except AttributeError:
+        return False
+
+    for known in alltitles:
+        try:
+            if known.strip() == wanted:
+                return True
+        except AttributeError:
+            # skip a malformed entry instead of abandoning the whole search
+            continue
+    return False
+
+def make_unique(title):
+    """Return `title`, or `title` with the first free "_<n>" suffix appended.
+
+    Suffixes are tried as _1, _2, ... until is_present() no longer reports
+    the candidate as already recorded.
+    """
+    candidate = title
+    counter = 0
+    while is_present(candidate):
+        counter += 1
+        candidate = title + "_%d" % counter
+    return candidate
+
+# [search text, replacement] pairs applied by wiki_text() to turn a help title
+# into a wiki page name.  replace_text() applies the pairs in list order and
+# the order matters: "$[officename]" has to be handled before the bare '[' and
+# ']' rules, and the quoted operators ('"/"' etc.) before the bare '/' rule.
+replace_text_list = [
+        ["$[officename]",""], # to fit both LibreOffice and BrOffice
+        ["%PRODUCTNAME",""], # to fit both LibreOffice and BrOffice
+        ["$PRODUCTNAME",""], # to fit both LibreOffice and BrOffice
+        ['"+"',"Plus"],
+        ['"*"',"Star"],
+        ['"-"',"Minus"],
+        ['"/"',"Slash"],
+        ['"^"',"Cap"],
+        ['#',"No"],
+        [')','_'],
+        ['(','_'],
+        [']','_'],
+        ['[','_'],
+        ['\\','_'],
+        ['/','_'],
+        ['&',"and"],
+        [';','_']
+        ]
+
+# [search text, replacement] pairs applied by readable_text(): the product
+# name placeholders become the {{ProductName}} wiki template.
+replace_readable_list = [
+        ["$[officename]","{{ProductName}}"],
+        ["%PRODUCTNAME","{{ProductName}}"],
+        ["$PRODUCTNAME","{{ProductName}}"]
+        ]
+
+# [directory name, wiki module name] pairs; get_module() looks for
+# "/<directory name>/" in a file path and returns the first match.
+modules_list = [
+        ["sbasic","Basic"],
+        ["scalc","Calc"],
+        ["schart","Chart"],
+        ["sdraw","Draw"],
+        ["simpress","Impress"],
+        ["smath","Math"],
+        ["swriter","Writer"],
+        ["shared","Common"]
+        ]
+
+def get_module(text):
+    """Return the wiki module name for the path `text`.
+
+    The first modules_list entry whose directory name occurs in `text` as
+    "/<name>/" wins; "" is returned when none matches.
+    """
+    for entry in modules_list:
+        marker = '/' + entry[0] + '/'
+        if text.find(marker) < 0:
+            continue
+        return entry[1]
+    return ""
+
+def replace_text(text, replace_list):
+    """Apply every [search, replacement] pair of `replace_list` to `text`.
+
+    The pairs are applied one after another in list order, each one to the
+    result of the previous one.  Returns the resulting string.
+    """
+    for pair in replace_list:
+        needle = pair[0]
+        if text.find(needle) < 0:
+            continue
+        text = text.replace(needle, pair[1])
+    return text
+
+def wiki_text(text):
+    """Turn a help title into the text used for the wiki page name.
+
+    Applies replace_text_list; when nothing is left of the title, the fixed
+    name 'LibreOffice' is used instead.  The result is stripped of
+    surrounding whitespace.
+    """
+    cleaned = replace_text(text, replace_text_list)
+    if cleaned == '':
+        cleaned = 'LibreOffice' # hardcoded fallback
+    cleaned = cleaned.strip()
+    return cleaned
+
+def readable_text(text):
+    """Return `text` with the replace_readable_list substitutions applied."""
+    readable = replace_text(text, replace_readable_list)
+    return readable
+
+class TitleParser:
+    """Expat handlers that gather the character data found inside <title>."""
+
+    # class-level defaults; an instance shadows them on its first assignment
+    title = ''
+    is_title = False
+
+    def start_element(self, name, attrs):
+        """Start collecting text when a <title> element opens."""
+        if name != 'title':
+            return
+        self.is_title = True
+
+    def end_element(self, name):
+        """Stop collecting text when the <title> element closes."""
+        if name != 'title':
+            return
+        self.is_title = False
+
+    def char_data(self, data):
+        """Append `data` to the title, but only while inside <title>."""
+        if not self.is_title:
+            return
+        self.title += data
+
+    def get_title(self):
+        """Return the collected title without surrounding whitespace."""
+        collected = self.title
+        return collected.strip()
+
+def parsexhp(filename):
+    """Print the "<file>;<wiki title>;<readable title>" line for one help file.
+
+    filename - path of the .xhp file; it has to contain one of the
+               "/<module>/" directories known to get_module(), otherwise the
+               file is silently skipped
+
+    Files that expat cannot parse are reported on stderr and skipped; files
+    without a <title> text produce no output.  The generated wiki title is
+    made unique and recorded in `alltitles`.
+    """
+    module = get_module(filename)
+    if module == '':
+        return
+
+    tp = TitleParser()
+    p = xml.parsers.expat.ParserCreate()
+    p.StartElementHandler = tp.start_element
+    p.EndElementHandler = tp.end_element
+    p.CharacterDataHandler = tp.char_data
+
+    # 'with' closes the file on every path, including a failing read()
+    with open(filename,"r") as xhp:
+        buf = xhp.read()
+
+    try:
+        p.Parse(buf)
+    except xml.parsers.expat.ExpatError:
+        sys.stderr.write('Cannot parse %s, skipping it\n'% filename)
+        return
+
+    title = tp.get_title()
+    if len(title) > 0:
+        readable_title = readable_text(title)
+        title = module + '/' + wiki_text(title)
+        # wiki page names use '_' for spaces; collapse the runs this creates
+        title = title.replace(' ', '_')
+        title = title.replace('___', '_')
+        title = title.replace('__', '_')
+        title = title.strip('_')
+        title = make_unique(title)
+        alltitles.append(title)
+        # single parenthesised argument: same output as the print statement
+        print(filename + ';' + title + ';' + readable_title)
+
+# Script body: walk the directory given on the command line and print one
+# "<file>;<wiki title>;<readable title>" line per help file found.
+if len(sys.argv) < 2:
+    print("getalltitles.py <directory>")
+    print("e.g. getalltitles.py source/text/scalc")
+    sys.exit(1)
+
+# Match the file extension; a substring test for "xhp" would also pick up
+# backup copies or patch leftovers such as "foo.xhp.orig".
+pattern = ".xhp"
+
+for root, dirs, files in os.walk(sys.argv[1]):
+    for i in files:
+        if i.endswith(pattern):
+            # keep '/' as the separator: get_module() looks for "/<module>/"
+            parsexhp(root+"/"+i)
+
+# vim:set shiftwidth=4 softtabstop=4 expandtab:
diff --git a/to-wiki/wikiconv2.py b/to-wiki/wikiconv2.py
new file mode 100755
index 0000000000..762484d225
--- /dev/null
+++ b/to-wiki/wikiconv2.py
@@ -0,0 +1,1383 @@
+#!/usr/bin/env python
+
+import os, sys, thread, threading, time
+import xml.parsers.expat
+import codecs
+from threading import Thread
+
+# NOTE(review): presumably the directory prefix of the help sources; nothing
+# in this part of the file reads it (embed_href() spells out 'source/' itself)
+root="source/"
+# NOTE(review): presumably the limit for concurrently running conversion
+# threads - confirm against the threading code later in the file
+max_threads = 25
+
+# get_link_filename() reads each entry as title[0] (text matched against the
+# link target) and title[1] (the wiki page name)
+titles = []
+
+# map of id -> localized text
+localization_data = {}
+
+# to collect a list of pages that will be redirections to the pages with nice
+# names
+redirects = []
+
+# to collect images that we will up-load later
+images = set()
+
+# various types of paragraphs
+#
+# Three parallel tables, each keyed by the paragraph role:
+#   'start' - wiki markup emitted before the paragraph text
+#   'end'   - wiki markup emitted after the paragraph text
+#   'templ' - True for the roles whose start/end markup is a {{Template|...}}
+#             (note, tip, warning); presumably used to decide whether '=' in
+#             the text has to be escaped - confirm against the Paragraph code
+# Every role has to be present in all three tables.
+replace_paragraph_role = \
+    {'start':{'bascode': '',
+              'code': '<code>',
+              'codeintip': '<code>',
+              'emph' : '', # must be empty to be able to strip empty <emph/>
+              'example': '<code>',
+              'heading1': '= ',
+              'heading2': '== ',
+              'heading3': '=== ',
+              'heading4': '==== ',
+              'heading5': '===== ',
+              'heading6': '====== ',
+              'head1': '= ', # used only in one file, probably in error?
+              'head2': '== ', # used only in one file, probably in error?
+              'listitem': '',
+              'note': '{{Note|',
+              'null': '', # special paragraph for Variable, CaseInline, etc.
+              'paragraph': '',
+              'related': '', # used only in one file, probably in error?
+              'relatedtopics': '', # used only in one file, probably in error?
+              'sup' : '',
+              'tablecontent': '| | ',
+              'tablecontentcode': '| | <code>',
+              'tablehead': '! scope="col" | ',
+              'tablenextpara': '\n',
+              'tablenextparacode': '\n<code>',
+              'tip': '{{Tip|',
+              'variable': '',
+              'warning': '{{Warning|',
+             },
+     'end':{'bascode': '\n',
+            'code': '</code>\n\n',
+            'codeintip': '</code>\n\n',
+            'emph' : '',
+            'example': '</code>\n\n',
+            'heading1': ' =\n\n',
+            'heading2': ' ==\n\n',
+            'heading3': ' ===\n\n',
+            'heading4': ' ====\n\n',
+            'heading5': ' =====\n\n',
+            'heading6': ' ======\n\n',
+            'head1': ' =\n\n', # used only in one file, probably in error?
+            'head2': ' ==\n\n', # used only in one file, probably in error?
+            'listitem': '',
+            'note': '}}\n\n',
+            'null': '', # special paragraph for Variable, CaseInline, etc.
+            'paragraph': '\n\n',
+            'related': '\n\n', # used only in one file, probably in error?
+            'relatedtopics': '\n\n', # used only in one file, probably in error?
+            'sup' : '',
+            'tablecontent': '\n',
+            'tablecontentcode': '</code>\n',
+            'tablehead': '\n',
+            'tablenextpara': '\n',
+            'tablenextparacode': '</code>\n',
+            'tip': '}}\n\n',
+            'variable': '',
+            'warning': '}}\n\n',
+           },
+     'templ':{'bascode': False,
+              'code': False,
+              'codeintip': False,
+              'emph' : False,
+              'example': False,
+              'heading1': False,
+              'heading2': False,
+              'heading3': False,
+              'heading4': False,
+              'heading5': False,
+              'heading6': False,
+              'head1': False,
+              'head2': False,
+              'listitem': False,
+              'note': True,
+              'null': False,
+              'paragraph': False,
+              'related': False,
+              'relatedtopics': False,
+              'sup' : False,
+              'tablecontent': False,
+              'tablecontentcode': False,
+              'tablehead': False,
+              'tablenextpara': False,
+              'tablenextparacode': False,
+              'tip': True,
+              'variable': False,
+              'warning': True,
+           }
+    }
+
+# section id -> name of the wiki template the section content is wrapped in
+# (see Section.get_all()); sections with other ids are emitted unwrapped
+section_id_mapping = \
+    {'relatedtopics': 'RelatedTopics'}
+
+# text snippets that we need to convert
+# ([search text, replacement] pairs, applied in order by replace_text())
+replace_text_list = \
+    [["$[officename]", "{{ProductName}}"],
+     ["%PRODUCTNAME", "{{ProductName}}"],
+     ["$PRODUCTNAME", "{{ProductName}}"]
+    ]
+
+def get_link_filename(link, name):
+    """Resolve the target of a help link to a wiki page name.
+
+    link - the href of the link
+    name - text that is looked up instead of `link` when `link` starts with
+           'http'
+
+    Returns a (page, fragment) tuple.  `page` is the stripped title[1] of the
+    first entry in `titles` whose title[0] contains the lookup text, and
+    `fragment` is the "#..." part of `link` (or '').  When nothing matches,
+    (link, '') is returned - the fragment is then still part of `link`.
+    """
+    text = link.strip()
+    fragment = ''
+    if text.startswith('http'):
+        text = name
+    else:
+        f = text.find('#')
+        if f >= 0:
+            fragment = text[f:]
+            text = text[0:f]
+
+    for title in titles:
+        try:
+            if title[0].find(text) >= 0:
+                return (title[1].strip(), fragment)
+        except (IndexError, AttributeError, TypeError, UnicodeError):
+            # malformed entry (too short, not text, or not decodable when
+            # byte and unicode strings get mixed); keep looking
+            continue
+    return (link, '')
+
+def replace_text(text):
+    """Return `text` with every replace_text_list substitution applied in order."""
+    for pair in replace_text_list:
+        placeholder = pair[0]
+        if text.find(placeholder) < 0:
+            continue
+        text = text.replace(placeholder, pair[1])
+    return text
+
+# modify the text so that in templates like {{Name|something}}, the 'something'
+# does not look like template params
+def escape_equals_sign(text):
+    """Replace every '=' that is not nested in brackets by '&#61;'.
+
+    '{', '[' and '<' open a nesting level, '}', ']' and '>' close one (the
+    level never drops below zero); '=' inside any nesting level is kept.
+    """
+    openers = ('{', '[', '<')
+    closers = ('}', ']', '>')
+    pieces = []
+    nesting = 0
+    for ch in text:
+        if ch == '=':
+            if nesting == 0:
+                pieces.append('&#61;')
+            else:
+                pieces.append('=')
+            continue
+        pieces.append(ch)
+        if ch in openers:
+            nesting += 1
+        elif ch in closers:
+            # an unbalanced closer must not make the level negative
+            nesting = max(nesting - 1, 0)
+
+    return ''.join(pieces)
+
+def load_localization_data(sdf_file):
+    """(Re)load the global `localization_data` map from a tab separated .sdf file.
+
+    sdf_file - path of the UTF-8 encoded .sdf file
+
+    Blank lines and lines starting with '#' are skipped.  For every other
+    line, field 10 (the localized text) is stored under the key
+    "<field 1 with '\\' turned into '/'>#<field 4>"; lines with too few
+    fields are reported on stderr and ignored.
+
+    Returns True on success, False when the file cannot be opened.
+    """
+    global localization_data
+    localization_data = {}
+    try:
+        sdf = codecs.open(sdf_file, "r", "utf-8")
+    except (IOError, OSError, TypeError):
+        sys.stderr.write('Error: Cannot open .sdf file "%s"\n'% sdf_file)
+        return False
+
+    try:
+        for line in sdf:
+            line = line.strip()
+            # blank lines carry no data, and line[0] would raise on them
+            if line == '' or line[0] == '#':
+                continue
+            spl = line.split("\t")
+
+            try:
+                # the form of the key is like
+                # source/text/shared/explorer/database/02010100.xhp#hd_id3149233
+                # otherwise we are getting duplicates
+                key = '%s#%s'% (spl[1].replace('\\', '/'), spl[4])
+                localization_data[key] = spl[10]
+            except IndexError:
+                sys.stderr.write('Warning: Ignored line "%s"\n'% line.encode('utf-8'))
+    finally:
+        # close the file even when reading or decoding fails half-way
+        sdf.close()
+    return True
+
+def unescape(str):
+    """Resolve backslash escapes and entity-encode the remaining markup characters.
+
+    A backslash makes the following character literal (and is dropped
+    itself); "\\\\" yields one backslash.  An unescaped '<', '>' or '&' is
+    replaced by its entity, every other character is copied.  A single
+    trailing backslash is dropped.
+    """
+    # replacement for the characters that appear *without* a backslash
+    entities = {'<': '&lt;', '>': '&gt;', '&': '&amp;', '"': '"'}
+    pieces = []
+    escaped = False
+    for ch in str:
+        if ch == '\\' and not escaped:
+            # remember the backslash, the next character decides
+            escaped = True
+            continue
+        if escaped:
+            # escaped characters (including a second backslash) stay literal
+            pieces.append(ch)
+        else:
+            pieces.append(entities.get(ch, ch))
+        escaped = False
+
+    return ''.join(pieces)
+
+def get_localized_text(filename, id):
+    """Return the unescaped localized text stored for "<filename>#<id>".
+
+    Returns '' when `localization_data` has no such entry.
+    """
+    try:
+        key = '%s#%s'% (filename, id)
+        raw = localization_data[key]
+    except:
+        return ''
+
+    localized = unescape(raw)
+    return localized
+
+def href_to_fname_id(href):
+    """Split an embed href of the form "file#id" into [file, id].
+
+    Double quotes are removed from `href` first, and the split happens at
+    the first '#'.  When there is no '#', a message is written to stderr and
+    the id is returned as ''.
+    """
+    link = href.replace('"', '')
+    fname, separator, id = link.partition('#')
+    if separator == '':
+        # terminate the message like every other diagnostic in this file
+        sys.stderr.write('Reference without a "#" in "%s".\n'% link)
+
+    return [fname, id]
+
+# Base class for all the elements
+#
+# self.name - name of the element, to drop the self.child_parsing flag
+# self.objects - collects the child objects that are constructed during
+#                parsing of the child elements
+# self.child_parsing - flag whether we are parsing a child, or the object
+#                      itself
+# self.parent - parent object
+class ElementBase:
+    def __init__(self, name, parent):
+        self.name = name
+        self.objects = []
+        self.child_parsing = False
+        self.parent = parent
+
+    # called for every child element that opens while this object is the
+    # current one; the base class ignores them all
+    def start_element(self, parser, name, attrs):
+        pass
+
+    # when our own element closes, the parent is no longer parsing a child
+    def end_element(self, parser, name):
+        if name == self.name:
+            self.parent.child_parsing = False
+
+    def char_data(self, parser, data):
+        pass
+
+    # the innermost object that is currently being parsed: the most recently
+    # added child (recursively) while a child is open, otherwise self
+    def get_curobj(self):
+        if self.child_parsing:
+            return self.objects[-1].get_curobj()
+        return self
+
+    # start parsing a child element
+    def parse_child(self, child):
+        self.child_parsing = True
+        self.objects.append(child)
+
+    # construct the wiki representation of this object, including the objects
+    # held in self.objects (here only the text of the objects)
+    def get_all(self):
+        text = u''
+        for i in self.objects:
+            text = text + i.get_all()
+        return text
+
+    # for handling variables, and embedding in general
+    # id - the variable name we want to get
+    # returns the first match found in the children, or None
+    def get_variable(self, id):
+        for i in self.objects:
+            if i is not None:
+                var = i.get_variable(id)
+                if var is not None:
+                    return var
+        return None
+
+    # embed part of another file into current structure
+    # parent_parser - the parser of the embedding file; its application,
+    #                 wiki page name and language are handed on
+    # fname, id - file (relative to 'source/') and id of the part to embed
+    def embed_href(self, parent_parser, fname, id):
+        # parse another xhp
+        parser = XhpParser('source/' + fname, False, \
+                parent_parser.current_app, parent_parser.wiki_page_name, \
+                parent_parser.lang)
+        var = parser.get_variable(id)
+
+        if var is not None:
+            try:
+                # an embedded variable is rendered like a normal paragraph
+                if var.role == 'variable':
+                    var.role = 'paragraph'
+            except AttributeError:
+                # not every embeddable object has a role (e.g. Section)
+                pass
+            self.objects.append(var)
+        elif parser.follow_embed:
+            sys.stderr.write('Cannot find reference "#%s" in "%s".\n'% \
+                    (id, fname))
+
+    def unhandled_element(self, parser, name):
+        sys.stderr.write('Warning: Unhandled element "%s" in "%s" (%s)\n'% \
+                        (name, self.name, parser.filename))
+
+# Base class for trivial elements that operate on char_data
+#
+# Like <comment>, or <title>
+#
+# start / end - wiki markup that is put before / after the collected text
+# templ - True when start/end form a template call, so that '=' in the text
+#         has to be escaped
+class TextElementBase(ElementBase):
+    def __init__(self, attrs, parent, element_name, start, end, templ):
+        ElementBase.__init__(self, element_name, parent)
+        self.start = start
+        self.end = end
+        self.templ = templ
+        self.text = u''
+
+    # collect the character data of the element
+    def char_data(self, parser, data):
+        self.text += data
+
+    # the collected text, with the product name placeholders replaced,
+    # wrapped in the start/end markup
+    def get_all(self):
+        body = replace_text(self.text)
+        if self.templ:
+            body = escape_equals_sign(body)
+        return self.start + body + self.end
+
+# Root object of a parsed .xhp file; collects the top-level elements
+class XhpFile(ElementBase):
+    def __init__(self):
+        ElementBase.__init__(self, None, None)
+
+    # create the object that matches the element and make it the child that
+    # is currently being parsed
+    def start_element(self, parser, name, attrs):
+        if name in ('body', 'helpdocument'):
+            # ignored, we flatten the structure
+            return
+
+        if name in ('embed', 'embedvar'):
+            if parser.follow_embed:
+                (fname, id) = href_to_fname_id(attrs['href'])
+                self.embed_href(parser, fname, id)
+            return
+
+        if name == 'paragraph':
+            parser.parse_paragraph(attrs, self)
+            return
+
+        if name == 'bookmark':
+            child = Bookmark(attrs, self, 'div', parser)
+        elif name == 'comment':
+            child = Comment(attrs, self)
+        elif name == 'list':
+            child = List(attrs, self)
+        elif name == 'meta':
+            child = Meta(attrs, self)
+        elif name == 'section':
+            child = Section(attrs, self)
+        elif name == 'sort':
+            child = Sort(attrs, self)
+        elif name == 'switch':
+            child = Switch(attrs, self, parser.embedding_app)
+        elif name == 'table':
+            child = Table(attrs, self)
+        elif name == 'bascode':
+            child = BasicCode(attrs, self)
+        else:
+            self.unhandled_element(parser, name)
+            return
+
+        self.parse_child(child)
+
+# <bookmark>: an anchor inside the page; bookmarks of the 'hid/' branch also
+# produce an entry in the global list of redirects
+#
+# type - 'div' or 'span', the html element used for the anchor
+class Bookmark(ElementBase):
+    def __init__(self, attrs, parent, type, parser):
+        ElementBase.__init__(self, 'bookmark', parent)
+
+        self.type = type
+
+        self.id = attrs['id']
+
+        # values of a bookmark that does not lead to a redirect
+        self.app = ''
+        self.redirect = ''
+        self.target = ''
+        self.authoritative = False
+
+        # let's construct the name of the redirect, so that we can point
+        # to the wikihelp directly from the LO code; wiki then takes care of
+        # the correct redirect
+        branch = attrs['branch']
+        if not branch.startswith('hid/'):
+            return
+        if parser.current_app_raw == '' and not parser.follow_embed:
+            return
+
+        self.app = parser.current_app_raw
+        self.target = parser.wiki_page_name
+        self.authoritative = parser.follow_embed
+        # everything after the first '/' of the branch
+        self.redirect = branch[branch.find('/') + 1:]
+
+    def get_all(self):
+        global redirects
+        # first of all, we need to create a redirect page for this one
+        wants_redirect = self.redirect != '' and self.target != ''
+        if wants_redirect:
+            entry = [self.app, self.redirect, \
+                '%s#%s'% (self.target, self.id), \
+                self.authoritative]
+            redirects.append(entry)
+
+        # then we also have to setup ID inside the page
+        anchors = {'div': '<div id="%s"></div>\n',
+                   'span': '<span id="%s"></span>'}
+        if self.type in anchors:
+            return anchors[self.type]% self.id
+
+        sys.stderr.write('Unknown bookmark type "%s"'% self.type)
+        return ''
+
+# <image>: becomes an [[Image:...]] wiki link; the text of the nested <alt>
+# element is used as the caption
+class Image(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'image', parent)
+        self.src = attrs['src']
+        self.align = 'left'
+        # True while we are inside the <alt> element
+        self.alt = False
+        self.alttext = ""
+
+    def start_element(self, parser, name, attrs):
+        if name != 'alt':
+            self.unhandled_element(parser, name)
+            return
+        self.alt = True
+
+    def end_element(self, parser, name):
+        ElementBase.end_element(self, parser, name)
+
+        if name == 'alt':
+            self.alt = False
+
+    # only the text inside <alt> is collected
+    def char_data(self, parser, data):
+        if not self.alt:
+            return
+        self.alttext += data
+
+    def get_all(self):
+        global images
+        # remember the image, so that it can be uploaded later
+        images.add(self.src)
+
+        # the wiki knows the image by its file name only
+        name = self.src[self.src.rfind('/') + 1:]
+        return "[[Image:" + name + "|border|" + self.align + "|" + \
+            self.alttext + "]]"
+
+    # <alt> is handled by the image itself, never by a child object
+    def get_curobj(self):
+        return self
+
+# <br>: a plain '<br/>' in the wiki text
+class Br(TextElementBase):
+    def __init__(self, attrs, parent):
+        TextElementBase.__init__(self, attrs, parent, element_name='br',
+                                 start='<br/>', end='', templ=False)
+
+# <comment>: the text is kept, wrapped in '<!-- ' and ' -->'
+class Comment(TextElementBase):
+    def __init__(self, attrs, parent):
+        TextElementBase.__init__(self, attrs, parent, element_name='comment',
+                                 start='<!-- ', end=' -->', templ=False)
+
+# <help-id-missing>: becomes the {{MissingHelpId}} template
+class HelpIdMissing(TextElementBase):
+    def __init__(self, attrs, parent):
+        TextElementBase.__init__(self, attrs, parent,
+                                 element_name='help-id-missing',
+                                 start='{{MissingHelpId}}', end='',
+                                 templ=False)
+
+# Plain character data; offers the same get_all() / get_variable() methods
+# as the elements, so that it can live in the objects list of an element
+class Text:
+    def __init__(self, text):
+        # the product name placeholders are replaced right away
+        converted = replace_text(text)
+        self.wikitext = converted
+
+    def get_all(self):
+        return self.wikitext
+
+    # plain text never contains a variable
+    def get_variable(self, id):
+        return None
+
+# <tablecell>: the content of the cell comes from its child elements; a cell
+# without any child element is emitted as an empty header / content cell
+class TableCell(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'tablecell', parent)
+        # set as soon as any child element is seen
+        self.cellHasChildElement = False
+
+    def start_element(self, parser, name, attrs):
+        self.cellHasChildElement = True
+        if name == 'bookmark':
+            self.parse_child(Bookmark(attrs, self, 'div', parser))
+        elif name == 'comment':
+            self.parse_child(Comment(attrs, self))
+        elif name == 'embed' or name == 'embedvar':
+            (fname, id) = href_to_fname_id(attrs['href'])
+            if parser.follow_embed:
+                self.embed_href(parser, fname, id)
+        elif name == 'paragraph':
+            parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self)
+        elif name == 'section':
+            self.parse_child(Section(attrs, self))
+        elif name == 'bascode':
+            # ignored, do not syntax highlight in table cells
+            pass
+        else:
+            self.unhandled_element(parser, name)
+
+    def get_all(self):
+        text = ''
+        if not self.cellHasChildElement: # an empty element
+            # NOTE(review): isTableHeader is not initialised by TableRow in
+            # this part of the file - presumably a paragraph of an earlier
+            # cell of the row sets it.  When the first cell of a row is the
+            # empty one nothing has set it yet, so fall back to a content
+            # cell instead of failing with an AttributeError.
+            if getattr(self.parent, 'isTableHeader', False): # get from TableRow Element
+                role = 'tablehead'
+            else:
+                role = 'tablecontent'
+            text = text + replace_paragraph_role['start'][role]
+            text = text + replace_paragraph_role['end'][role]
+        text = text + ElementBase.get_all(self)
+        return text
+
+# <tablerow>: a '|-' line followed by the cells of the row
+class TableRow(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'tablerow', parent)
+        # Read by TableCell.get_all() for empty cells.  NOTE(review):
+        # presumably overwritten by the paragraphs of the cells later in the
+        # file; the default keeps a row that starts with an empty cell from
+        # failing with an AttributeError.
+        self.isTableHeader = False
+
+    def start_element(self, parser, name, attrs):
+        if name == 'tablecell':
+            self.parse_child(TableCell(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+    def get_all(self):
+        text = '|-\n' + ElementBase.get_all(self)
+        return text
+
+# <bascode>: Basic source code, emitted as a <source lang="oobas"> block
+class BasicCode(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'bascode', parent)
+
+    # only paragraphs are expected inside
+    def start_element(self, parser, name, attrs):
+        if name != 'paragraph':
+            self.unhandled_element(parser, name)
+            return
+        parser.parse_localized_paragraph(BasicCodeParagraph(attrs, self), attrs, self)
+
+    def get_all(self):
+        code = ElementBase.get_all(self)
+        return '<source lang="oobas">\n' + code + '</source>\n\n'
+
+# <table>: becomes a wiki table of class "wikitable"
+class Table(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'table', parent)
+
+    # rows and comments are the only children we know
+    def start_element(self, parser, name, attrs):
+        if name == 'tablerow':
+            self.parse_child(TableRow(attrs, self))
+        elif name == 'comment':
+            self.parse_child(Comment(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+    def get_all(self):
+        # + ' align="left"' etc.?
+        rows = ElementBase.get_all(self)
+        return '{| class="wikitable"\n' + rows + '|}\n\n'
+
+# <listitem>: one entry of a list; the markup depends on the parent List
+class ListItem(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'listitem', parent)
+
+    def start_element(self, parser, name, attrs):
+        if name == 'paragraph':
+            parser.parse_localized_paragraph(ListItemParagraph(attrs, self), attrs, self)
+        elif name == 'bookmark':
+            self.parse_child(Bookmark(attrs, self, 'span', parser))
+        elif name in ('embed', 'embedvar'):
+            (fname, id) = href_to_fname_id(attrs['href'])
+            if parser.follow_embed:
+                self.embed_href(parser, fname, id)
+        else:
+            self.unhandled_element(parser, name)
+
+    def get_all(self):
+        # lists with an explicit start number are written as html,
+        # the others as wiki lists
+        if self.parent.startwith > 0:
+            prefix, postfix = '<li>', '</li>'
+        elif self.parent.type == 'ordered':
+            prefix, postfix = '#', '\n'
+        else:
+            prefix, postfix = '*', '\n'
+
+        # add the text itself, the objects separated by line breaks
+        parts = [i.get_all() for i in self.objects]
+        return prefix + '<br/>'.join(parts) + postfix
+
+# <list>: the items are emitted one after another; a list with a positive
+# 'startwith' attribute is wrapped in <ol start="..."> ... </ol>
+class List(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'list', parent)
+
+        self.type = attrs['type']
+        # a missing or non-numeric 'startwith' counts as 0
+        try:
+            start = int(attrs['startwith'])
+        except:
+            start = 0
+        self.startwith = start
+
+    def start_element(self, parser, name, attrs):
+        if name != 'listitem':
+            self.unhandled_element(parser, name)
+            return
+        self.parse_child(ListItem(attrs, self))
+
+    def get_all(self):
+        items = ElementBase.get_all(self)
+        if self.startwith > 0:
+            opening = '<ol start="%d">\n'% self.startwith
+            return opening + items + '\n</ol>\n'
+        return items + '\n'
+
+# To handle elements that should be completely ignored
+# (nothing is collected, so get_all() of the base class returns no text)
+class Ignore(ElementBase):
+    def __init__(self, attrs, parent, element_name):
+        ElementBase.__init__(self, name=element_name, parent=parent)
+
+# <title> of a page in the original language: {{OrigLang|<title>}}
+class OrigTitle(TextElementBase):
+    def __init__(self, attrs, parent):
+        TextElementBase.__init__(self, attrs, parent, element_name='title',
+                                 start='{{OrigLang|', end='}}\n', templ=True)
+
+# <title> of a localized page: {{Lang|<title>}}
+#
+# localized_title - the translated title; when it is not empty, it replaces
+#                   the text collected from the file
+class Title(TextElementBase):
+    def __init__(self, attrs, parent, localized_title):
+        TextElementBase.__init__(self, attrs, parent, element_name='title',
+                                 start='{{Lang|', end='}}\n', templ=True)
+        self.localized_title = localized_title
+
+    def get_all(self):
+        translation = self.localized_title
+        if translation != '':
+            self.text = translation
+        return TextElementBase.get_all(self)
+
+# <topic>: holds the <title> of the page; <filename> is ignored
+class Topic(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'topic', parent)
+
+    def start_element(self, parser, name, attrs):
+        if name == 'filename':
+            self.parse_child(Ignore(attrs, self, name))
+        elif name != 'title':
+            self.unhandled_element(parser, name)
+        elif parser.lang == '':
+            # no language given, we are converting the original text
+            self.parse_child(OrigTitle(attrs, self))
+        else:
+            localized = get_localized_text(parser.filename, 'tit')
+            self.parse_child(Title(attrs, self, localized))
+
+# <meta>: only the <topic> is of interest, <history> and <lastedited> are
+# ignored
+class Meta(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'meta', parent)
+
+    def start_element(self, parser, name, attrs):
+        if name in ('history', 'lastedited'):
+            self.parse_child(Ignore(attrs, self, name))
+        elif name == 'topic':
+            self.parse_child(Topic(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+# <section>: a group of elements with an id; the section can be embedded
+# into other pages under that id
+class Section(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'section', parent)
+        self.id = attrs['id']
+
+    def start_element(self, parser, name, attrs):
+        if name in ('embed', 'embedvar'):
+            (fname, id) = href_to_fname_id(attrs['href'])
+            if parser.follow_embed:
+                self.embed_href(parser, fname, id)
+            return
+
+        if name == 'paragraph':
+            parser.parse_paragraph(attrs, self)
+            return
+
+        if name == 'bookmark':
+            child = Bookmark(attrs, self, 'div', parser)
+        elif name == 'comment':
+            child = Comment(attrs, self)
+        elif name == 'list':
+            child = List(attrs, self)
+        elif name == 'section':
+            # sections can be nested
+            child = Section(attrs, self)
+        elif name == 'switch':
+            child = Switch(attrs, self, parser.embedding_app)
+        elif name == 'table':
+            child = Table(attrs, self)
+        elif name == 'bascode':
+            child = BasicCode(attrs, self)
+        else:
+            self.unhandled_element(parser, name)
+            return
+
+        self.parse_child(child)
+
+    def get_all(self):
+        # some of the section ids are used as real id's, some of them have
+        # function (like relatetopics), and have to be templatized
+        template = section_id_mapping.get(self.id, '')
+        content = ElementBase.get_all(self)
+        if template == '':
+            return content
+        return '{{%s|%s}}\n\n'% (template, escape_equals_sign(content))
+
+    # a variable found inside the section wins over the section itself
+    def get_variable(self, id):
+        var = ElementBase.get_variable(self, id)
+        if var != None:
+            return var
+        if self.id == id:
+            return self
+        return None
+
+# <sort>: emits the contained sections sorted by their id
+#
+# self.order - the 'order' attribute, 'asc' when it is missing
+class Sort(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'sort', parent)
+
+        self.order = attrs.get('order', 'asc')
+
+    def start_element(self, parser, name, attrs):
+        if name == 'section':
+            self.parse_child(Section(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+    def get_all(self):
+        # 'asc' has to give the ascending order: reverse the sort only for
+        # 'desc' (the flag used to be inverted, so that 'asc' sorted
+        # descending and vice versa)
+        descending = (self.order == 'desc')
+        self.objects = sorted(self.objects, key=lambda obj: obj.id,
+                              reverse=descending)
+
+        return ElementBase.get_all(self)
+
+# <link>: an external link for http targets, a wiki link otherwise
+#
+# lang - language of the page; when not empty, wiki links point to the
+#        "<page>/<lang>" subpage
+class Link(ElementBase):
+    def __init__(self, attrs, parent, lang):
+        ElementBase.__init__(self, 'link', parent)
+
+        self.link = attrs['href']
+        try:
+            self.lname = attrs['name']
+        except:
+            # no name given, use the last part of the target
+            last_slash = self.link.rfind("/")
+            self.lname = self.link[last_slash+1:]
+        # Override lname
+        self.default_name = self.lname
+        resolved = get_link_filename(self.link, self.lname)
+        (self.lname, self.fragment) = resolved
+        self.wikitext = ""
+        self.lang = lang
+
+    # the text of the link
+    def char_data(self, parser, data):
+        self.wikitext += data
+
+    def get_all(self):
+        # a link without text shows its name
+        if self.wikitext == "":
+            self.wikitext = self.default_name
+
+        self.wikitext = replace_text(self.wikitext)
+        if self.link.startswith("http"):
+            return '[%s %s]'% (self.link, self.wikitext)
+        if self.lang != '':
+            return '[[%s/%s%s|%s]]'% (self.lname, self.lang, self.fragment, self.wikitext)
+        return '[[%s%s|%s]]'% (self.lname, self.fragment, self.wikitext)
+
+# <switchinline>: inline text that depends on the system ('sys') or on the
+# application ('appl')
+#
+# self.switch - the 'select' attribute, i.e. what we switch on
+# self.embedding_app - the application the text is embedded into, '' if none
+class SwitchInline(ElementBase):
+    def __init__(self, attrs, parent, app):
+        ElementBase.__init__(self, 'switchinline', parent)
+        self.switch = attrs['select']
+        self.embedding_app = app
+
+    # the children are the cases; <defaultinline> is a case that is flagged
+    # as the default one
+    def start_element(self, parser, name, attrs):
+        if name == 'caseinline':
+            self.parse_child(CaseInline(attrs, self, False))
+        elif name == 'defaultinline':
+            self.parse_child(CaseInline(attrs, self, True))
+        else:
+            self.unhandled_element(parser, name)
+
+    # returns '' for a switch without cases, for an unknown 'select' value,
+    # and for an 'appl' switch that has no case for the embedding application
+    def get_all(self):
+        if len(self.objects) == 0:
+            return ''
+        elif self.switch == 'sys':
+            # collect the text per system, then build one {{System|...}}
+            # template call that contains the non-empty ones
+            system = {'MAC':'', 'UNIX':'', 'WIN':'', 'default':''}
+            for i in self.objects:
+                if i.case == 'MAC' or i.case == 'UNIX' or \
+                   i.case == 'WIN' or i.case == 'default':
+                    system[i.case] = i.get_all()
+                elif i.case == 'OS2':
+                    # ignore, there is only one mention of OS2, which is a
+                    # 'note to translators', and no meat
+                    pass
+                elif i.case == 'HIDE_HERE':
+                    # do what the name suggest ;-)
+                    pass
+                else:
+                    sys.stderr.write('Unhandled "%s" case in "sys" switchinline.\n'% \
+                            i.case )
+            text = '{{System'
+            # [case name in the help, parameter name of the template]
+            for i in [['default', 'default'], ['MAC', 'mac'], \
+                      ['UNIX', 'unx'], ['WIN', 'win']]:
+                if system[i[0]] != '':
+                    text = '%s|%s=%s'% (text, i[1], system[i[0]])
+            return text + '}}'
+        elif self.switch == 'appl':
+            # we want directly use the right text, when inlining something
+            # 'shared' into an 'app'
+            if self.embedding_app == '':
+                text = ''
+                default = ''
+                for i in self.objects:
+                    # case name -> suffix of the {{WhenIn...}} template; ''
+                    # marks the cases that provide the default text
+                    appls = {'BASIC':'Basic', 'CALC':'Calc', \
+                             'CHART':'Chart', 'DRAW':'Draw', \
+                             'IMAGE':'Draw', 'IMPRESS': 'Impress', \
+                             'MATH':'Math', 'WRITER':'Writer', \
+                             'OFFICE':'', 'default':''}
+                    # note that the try covers i.get_all() too, so any
+                    # failure in there is reported as an unhandled case
+                    try:
+                        app = appls[i.case]
+                        all = i.get_all()
+                        if all == '':
+                            pass
+                        elif app == '':
+                            default = all
+                        else:
+                            text = text + '{{WhenIn%s|%s}}'% (app, escape_equals_sign(all))
+                    except:
+                        sys.stderr.write('Unhandled "%s" case in "appl" switchinline.\n'% \
+                                i.case)
+
+                # without application specific text the default is used
+                # directly, otherwise it is added as {{WhenDefault|...}}
+                if text == '':
+                    text = default
+                elif default != '':
+                    text = text + '{{WhenDefault|%s}}'% escape_equals_sign(default)
+
+                return text
+            else:
+                # embedded into an application: use the text of the first
+                # case for that application
+                for i in self.objects:
+                    if i.case == self.embedding_app:
+                        return i.get_all()
+
+        return ''
+
+class Case(ElementBase):
+    def __init__(self, attrs, parent, is_default):
+        ElementBase.__init__(self, 'case', parent)
+
+        if is_default:
+            self.name = 'default'
+            self.case = 'default'
+        else:
+            self.case = attrs['select']
+
+    def start_element(self, parser, name, attrs):
+        if name == 'bookmark':
+            self.parse_child(Bookmark(attrs, self, 'div', parser))
+        elif name == 'comment':
+            self.parse_child(Comment(attrs, self))
+        elif name == 'embed' or name == 'embedvar':
+            if parser.follow_embed:
+                (fname, id) = href_to_fname_id(attrs['href'])
+                self.embed_href(parser, fname, id)
+        elif name == 'list':
+            self.parse_child(List(attrs, self))
+        elif name == 'paragraph':
+            parser.parse_paragraph(attrs, self)
+        elif name == 'section':
+            self.parse_child(Section(attrs, self))
+        elif name == 'table':
+            self.parse_child(Table(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+class Switch(SwitchInline):
+    def __init__(self, attrs, parent, app):
+        SwitchInline.__init__(self, attrs, parent, app)
+        self.name = 'switch'
+
+    def start_element(self, parser, name, attrs):
+        self.embedding_app = parser.embedding_app
+        if name == 'case':
+            self.parse_child(Case(attrs, self, False))
+        elif name == 'default':
+            self.parse_child(Case(attrs, self, True))
+        else:
+            self.unhandled_element(parser, name)
+
+class Item(ElementBase):
+    replace_type = \
+            {'start':{'input': '<code>',
+                      'keycode': '{{KeyCode|',
+                      'tasto': '{{KeyCode|',
+                      'litera': '<code>',
+                      'literal': '<code>',
+                      'menuitem': '{{MenuItem|',
+                      'mwnuitem': '{{MenuItem|',
+                      'OpenOffice.org': '',
+                      'productname': '',
+                      'unknown': '<code>'
+                     },
+             'end':{'input': '</code>',
+                    'keycode': '}}',
+                    'tasto': '}}',
+                    'litera': '</code>',
+                    'literal': '</code>',
+                    'menuitem': '}}',
+                    'mwnuitem': '}}',
+                    'OpenOffice.org': '',
+                    'productname': '',
+                    'unknown': '</code>'
+                   },
+             'templ':{'input': False,
+                      'keycode': True,
+                      'tasto': True,
+                      'litera': False,
+                      'literal': False,
+                      'menuitem': True,
+                      'mwnuitem': True,
+                      'OpenOffice.org': False,
+                      'productname': False,
+                      'unknown': False
+                     }}
+
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'item', parent)
+
+        try:
+            self.type = attrs['type']
+        except:
+            self.type = 'unknown'
+        self.text = ''
+
+    def char_data(self, parser, data):
+        self.text = self.text + data
+
+    def get_all(self):
+        try:
+            text = ''
+            if self.replace_type['templ'][self.type]:
+                text = escape_equals_sign(replace_text(self.text))
+            else:
+                text = replace_text(self.text)
+            return self.replace_type['start'][self.type] + \
+                   text + \
+                   self.replace_type['end'][self.type]
+        except:
+            sys.stderr.write('Unhandled item type "%s".\n'% self.type)
+
+        return replace_text(self.text)
+
+
+class Paragraph(ElementBase):
+    def __init__(self, attrs, parent):
+        ElementBase.__init__(self, 'paragraph', parent)
+
+        try:
+            self.role = attrs['role']
+        except:
+            self.role = 'paragraph'
+
+        try:
+            self.id = attrs['id']
+        except:
+            self.id = ""
+
+        try:
+            self.level = int(attrs['level'])
+        except:
+            self.level = 0
+
+        self.is_first = (len(self.parent.objects) == 0)
+
+    def start_element(self, parser, name, attrs):
+        if name == 'ahelp':
+            try:
+                if attrs['visibility'] == 'hidden':
+                    self.parse_child(Ignore(attrs, self, name))
+            except:
+                pass
+        elif name == 'br':
+            self.parse_child(Br(attrs, self))
+        elif name == 'comment':
+            self.parse_child(Comment(attrs, self))
+        elif name == 'emph':
+            self.parse_child(Emph(attrs, self))
+        elif name == 'sup':
+            self.parse_child(Sup(attrs, self))
+        elif name == 'embedvar':
+            if parser.follow_embed:
+                (fname, id) = href_to_fname_id(attrs['href'])
+                self.embed_href(parser, fname, id)
+        elif name == 'help-id-missing':
+            self.parse_child(HelpIdMissing(attrs, self))
+        elif name == 'image':
+            self.parse_child(Image(attrs, self))
+        elif name == 'item':
+            self.parse_child(Item(attrs, self))
+        elif name == 'link':
+            self.parse_child(Link(attrs, self, parser.lang))
+        elif name == 'localized':
+            # we ignore this tag, it is added arbitrary for the paragraphs
+            # that come from .sdf files
+            pass
+        elif name == 'switchinline':
+            self.parse_child(SwitchInline(attrs, self, parser.embedding_app))
+        elif name == 'variable':
+            self.parse_child(Variable(attrs, self))
+        else:
+            self.unhandled_element(parser, name)
+
+    def char_data(self, parser, data):
+        if self.role == 'paragraph' or self.role == 'heading' or \
+                self.role == 'listitem' or self.role == 'variable':
+            if data != '' and data[0] == ' ':
+                data = ' ' + data.lstrip()
+            data = data.replace('\n', ' ')
+
+        if len(data):
+            self.objects.append(Text(data))
+
+    def get_all(self):
+        role = self.role
+        if role == 'heading':
+            if self.level <= 0:
+                sys.stderr.write('Heading, but the level is %d.\n'% self.level)
+            elif self.level < 6:
+                role = 'heading%d'% self.level
+            else:
+                role = 'heading6'
+
+        # if we are not the first para in the table, we need special handling
+        if not self.is_first and role.find('table') == 0:
+            if role == 'tablecontentcode':
+                role = 'tablenextparacode'
+            else:
+                role = 'tablenextpara'
+
+        # the text itself
+        children = ElementBase.get_all(self)
+        if self.role != 'emph' and self.role != 'bascode':
+            children = children.strip()
+
+        if len(children) == 0:
+            return ''
+
+        # prepend the markup according to the role
+        text = ''
+        try:
+            text = text + replace_paragraph_role['start'][role]
+        except:
+            sys.stderr.write( "Unknown paragraph role start: " + role + "\n" )
+
+        if replace_paragraph_role['templ'][role]:
+            text = text + escape_equals_sign(children)
+        else:
+            text = text + children
+
+        # append the markup according to the role
+        try:
+            text = text + replace_paragraph_role['end'][role]
+        except:
+            sys.stderr.write( "Unknown paragraph role end: " + role + "\n" )
+
+        return text
+
+class Variable(Paragraph):
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        self.name = 'variable'
+        self.role = 'variable'
+        self.id = attrs['id']
+
+    def get_variable(self, id):
+        if id == self.id:
+            return self
+        return None
+
+class CaseInline(Paragraph):
+    def __init__(self, attrs, parent, is_default):
+        Paragraph.__init__(self, attrs, parent)
+
+        self.role = 'null'
+        if is_default:
+            self.name = 'defaultinline'
+            self.case = 'default'
+        else:
+            self.name = 'caseinline'
+            self.case = attrs['select']
+
+class Emph(Paragraph):
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        self.name = 'emph'
+        self.role = 'emph'
+
+    def get_all(self):
+        text = Paragraph.get_all(self)
+        if len(text):
+            return "'''" + text + "'''"
+        return ''
+
+class Sup(Paragraph):
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        self.name = 'sup'
+        self.role = 'sup'
+
+    def get_all(self):
+        text = Paragraph.get_all(self)
+        if len(text):
+            return "<sup>" + text + "</sup>"
+        return ''
+
+class ListItemParagraph(Paragraph):
+    """Paragraph whose role is forced to 'listitem', whatever 'attrs' say."""
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        # override the role that Paragraph.__init__ took from the attributes
+        self.role = 'listitem'
+
+class BasicCodeParagraph(Paragraph):
+    """Paragraph whose role is forced to 'bascode', whatever 'attrs' say."""
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        # 'bascode' text is neither whitespace-normalized by
+        # Paragraph.char_data() nor stripped by Paragraph.get_all()
+        self.role = 'bascode'
+
+class TableContentParagraph(Paragraph):
+    def __init__(self, attrs, parent):
+        Paragraph.__init__(self, attrs, parent)
+        if self.role != 'tablehead' and self.role != 'tablecontent':
+            if self.role == 'code':
+                self.role = 'tablecontentcode'
+            elif self.role == 'bascode':
+                self.role = 'tablecontentcode'
+            else:
+                self.role = 'tablecontent'
+        if self.role == 'tablehead':
+            self.parent.parent.isTableHeader = True # self.parent.parent is TableRow Element
+        else:
+            self.parent.parent.isTableHeader = False
+
+class ParserBase:
+    def __init__(self, filename, follow_embed, embedding_app, current_app, wiki_page_name, lang, head_object, buffer):
+        self.filename = filename
+        self.follow_embed = follow_embed
+        self.embedding_app = embedding_app
+        self.current_app = current_app
+        self.wiki_page_name = wiki_page_name
+        self.lang = lang
+        self.head_obj = head_object
+
+        p = xml.parsers.expat.ParserCreate()
+        p.StartElementHandler = self.start_element
+        p.EndElementHandler = self.end_element
+        p.CharacterDataHandler = self.char_data
+
+        p.Parse(buffer)
+
+    def start_element(self, name, attrs):
+        self.head_obj.get_curobj().start_element(self, name, attrs)
+
+    def end_element(self, name):
+        self.head_obj.get_curobj().end_element(self, name)
+
+    def char_data(self, data):
+        self.head_obj.get_curobj().char_data(self, data)
+
+    def get_all(self):
+        return self.head_obj.get_all()
+
+    def get_variable(self, id):
+        return self.head_obj.get_variable(id)
+
+    def parse_localized_paragraph(self, paragraph, attrs, obj):
+        localized_text = ''
+        try:
+            localized_text = get_localized_text(self.filename, attrs['id'])
+        except:
+            pass
+
+        if localized_text != '':
+            # parse the localized text
+            text = u'<?xml version="1.0" encoding="UTF-8"?><localized>' + localized_text + '</localized>'
+            ParserBase(self.filename, self.follow_embed, self.embedding_app, \
+                    self.current_app, self.wiki_page_name, self.lang, \
+                    paragraph, text.encode('utf-8'))
+            # add it to the overall structure
+            obj.objects.append(paragraph)
+            # and ignore the original text
+            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
+        else:
+            obj.parse_child(paragraph)
+
+    def parse_paragraph(self, attrs, obj):
+        ignore_this = False
+        try:
+            if attrs['role'] == 'heading' and int(attrs['level']) == 1 \
+                    and self.ignore_heading and self.follow_embed:
+                self.ignore_heading = False
+                ignore_this = True
+        except:
+            pass
+
+        if ignore_this:
+            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
+        else:
+            self.parse_localized_paragraph(Paragraph(attrs, obj), attrs, obj)
+
+class XhpParser(ParserBase):
+    def __init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang):
+        # we want to ignore the 1st level="1" heading, because in most of the
+        # cases, it is the only level="1" heading in the file, and it is the
+        # same as the page title
+        self.ignore_heading = True
+
+        current_app = ''
+        self.current_app_raw = ''
+        for i in [['sbasic', 'BASIC'], ['scalc', 'CALC'], \
+                  ['sdatabase', 'DATABASE'], ['sdraw', 'DRAW'], \
+                  ['schart', 'CHART'], ['simpress', 'IMPRESS'], \
+                  ['smath', 'MATH'], ['swriter', 'WRITER']]:
+            if filename.find('/%s/'% i[0]) >= 0:
+                self.current_app_raw = i[0]
+                current_app = i[1]
+                break
+
+        if embedding_app == '':
+            embedding_app = current_app
+
+        file = codecs.open(filename, "r", "utf-8")
+        buf = file.read()
+        file.close()
+
+        ParserBase.__init__(self, filename, follow_embed, embedding_app,
+                current_app, wiki_page_name, lang, XhpFile(), buf.encode('utf-8'))
+
+def loadallfiles(filename):
+    global titles
+    titles = []
+    file = codecs.open(filename, "r", "utf-8")
+    for line in file:
+        title = line.split(";", 2)
+        titles.append(title)
+    file.close()
+
+class WikiConverter(Thread):
+    def __init__(self, inputfile, wiki_page_name, lang, outputfile):
+        Thread.__init__(self)
+        self.inputfile = inputfile
+        self.wiki_page_name = wiki_page_name
+        self.lang = lang
+        self.outputfile = outputfile
+
+    def run(self):
+        parser = XhpParser(self.inputfile, True, '', self.wiki_page_name, self.lang)
+        file = codecs.open(self.outputfile, "wb", "utf-8")
+        file.write(parser.get_all())
+        file.close()
+
+def write_link(r, target):
+    fname = 'wiki/%s'% r
+    try:
+        file = open(fname, "w")
+        file.write('#REDIRECT [[%s]]\n'% target)
+        file.close()
+    except:
+        sys.stderr.write('Unable to write "%s".\n'%'wiki/%s'% fname)
+
+def write_redirects():
+    print 'Generating the redirects...'
+    written = {}
+    # in the first pass, immediately writte the links that are embedded, so that
+    # we can always point to that source versions
+    for redir in redirects:
+        app = redir[0]
+        redirect = redir[1]
+        target = redir[2]
+        authoritative = redir[3]
+    
+        if app != '':
+            r = '%s/%s'% (app, redirect)
+            if authoritative:
+                write_link(r, target)
+                written[r] = True
+            else:
+                try:
+                    written[r]
+                except:
+                    written[r] = False
+    
+    # in the second pass, output the wiki links
+    for redir in redirects:
+        app = redir[0]
+        redirect = redir[1]
+        target = redir[2]
+    
+        if app == '':
+            for i in ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', \
+                      'schart', 'sbasic', 'sdatabase']:
+                write_link('%s/%s'% (i, redirect), target)
+        else:
+            r = '%s/%s'% (app, redirect)
+            if not written[r]:
+                write_link(r, target)
+
+# Main Function
+def convert(generate_redirects, lang, sdf_file):
+    if lang == '':
+        print 'Generating the main wiki pages...'
+    else:
+        print 'Generating the wiki pages for language %s...'% lang
+
+    global redirects
+    redirects = []
+    global images
+    images = set()
+
+    loadallfiles("alltitles.csv")
+
+    if lang != '':
+        sys.stderr.write('Using localizations from "%s"\n'% sdf_file)
+        if not load_localization_data(sdf_file):
+            return
+
+    for title in titles:
+        while threading.active_count() > max_threads:
+            time.sleep(0.001)
+    
+        infile = title[0].strip()
+        wikiname = title[1].strip()
+        articledir = 'wiki/' + wikiname
+        try:
+            os.mkdir(articledir)
+        except:
+            pass
+
+        outfile = ''
+        if lang != '':
+            wikiname = '%s/%s'% (wikiname, lang)
+            outfile = '%s/%s'% (articledir, lang)
+        else:
+            outfile = '%s/MAIN'% articledir
+
+        try:
+            file = open(outfile, 'r')
+        except:
+            try:
+                wiki = WikiConverter(infile, wikiname, lang, outfile)
+                wiki.start()
+                continue
+            except:
+                print 'Failed to convert "%s" into "%s".\n'% \
+                        (infile, outfile)
+        sys.stderr.write('Warning: Skipping: %s > %s\n'% (infile, outfile))
+        file.close()
+    
+    # wait for everyone to finish
+    while threading.active_count() > 1:
+        time.sleep(0.001)
+
+    if lang == '':
+        # set of the images used here
+        print 'Generating "images.txt", the list of used images...'
+        file = open('images.txt', "w")
+        for image in images:
+            file.write('%s\n'% image)
+        file.close()
+
+        # generate the redirects
+        if generate_redirects:
+            write_redirects()
+
+# vim:set shiftwidth=4 softtabstop=4 expandtab:
author	Norbert Thiebaud <nthiebaud@gmail.com>	2012-09-01 09:51:27 -0500
committer	Norbert Thiebaud <nthiebaud@gmail.com>	2012-10-16 11:07:30 -0500
commit	61173c1b58efa79c0ba6b08348d2796a249d0186 (patch)
tree	00ebf544db18942e2a1ecfc5e5fa16931127d38f /to-wiki
parent	3dc2e7497f1798ae4ff6c5c8c562666bc10a393c (diff)