#!/usr/bin/env python
"""Convert .xhp help files into MediaWiki markup.

The entry point is convert().  It reads the list of pages from
"alltitles.csv", parses every listed .xhp file with expat into a tree of
ElementBase objects, and writes the wiki text produced by get_all() below
the "wiki/" directory.  Optionally it also writes redirect pages for the
bookmarks found on the way and "images.txt", the list of used images.
"""

import codecs
import os
import sys
import threading
import time
import xml.parsers.expat
from threading import Thread

try:
    import thread  # Python 2 name; not used below, kept for compatibility
except ImportError:
    import _thread as thread  # the same module under its Python 3 name

root="source/"
max_threads = 25

# list of [xhp file name, wiki page name, ...] filled in by loadallfiles()
titles = []

# map of id -> localized text
localization_data = {}

# to collect a list of pages that will be redirections to the pages with nice
# names
redirects = []

# to collect images that we will up-load later
images = set()

# various types of paragraphs
#
# 'start' / 'end' hold the markup put before / after the paragraph text,
# 'templ' says whether the text ends up inside a {{Template|...}} call and
# therefore needs escape_equals_sign().
replace_paragraph_role = \
    {'start':{'bascode': '',
              'code': '',
              'codeintip': '',
              'emph' : '', # must be empty so that empty emph elements can be stripped
              'example': '',
              'heading1': '= ',
              'heading2': '== ',
              'heading3': '=== ',
              'heading4': '==== ',
              'heading5': '===== ',
              'heading6': '====== ',
              'head1': '= ', # used only in one file, probably in error?
              'head2': '== ', # used only in one file, probably in error?
              'listitem': '',
              'note': '{{Note|',
              'null': '', # special paragraph for Variable, CaseInline, etc.
              'paragraph': '',
              'related': '', # used only in one file, probably in error?
              'relatedtopics': '', # used only in one file, probably in error?
              'sup' : '',
              'tablecontent': '| | ',
              'tablecontentcode': '| | ',
              'tablehead': '! scope="col" | ',
              'tablenextpara': '\n',
              'tablenextparacode': '\n',
              'tip': '{{Tip|',
              'variable': '',
              'warning': '{{Warning|',
             },
     'end':{'bascode': '\n',
            'code': '\n\n',
            'codeintip': '\n\n',
            'emph' : '',
            'example': '\n\n',
            'heading1': ' =\n\n',
            'heading2': ' ==\n\n',
            'heading3': ' ===\n\n',
            'heading4': ' ====\n\n',
            'heading5': ' =====\n\n',
            'heading6': ' ======\n\n',
            'head1': ' =\n\n', # used only in one file, probably in error?
            'head2': ' ==\n\n', # used only in one file, probably in error?
            'listitem': '',
            'note': '}}\n\n',
            'null': '', # special paragraph for Variable, CaseInline, etc.
            'paragraph': '\n\n',
            'related': '\n\n', # used only in one file, probably in error?
            'relatedtopics': '\n\n', # used only in one file, probably in error?
            'sup' : '',
            'tablecontent': '\n',
            'tablecontentcode': '\n',
            'tablehead': '\n',
            'tablenextpara': '\n',
            'tablenextparacode': '\n',
            'tip': '}}\n\n',
            'variable': '',
            'warning': '}}\n\n',
           },
     'templ':{'bascode': False,
              'code': False,
              'codeintip': False,
              'emph' : False,
              'example': False,
              'heading1': False,
              'heading2': False,
              'heading3': False,
              'heading4': False,
              'heading5': False,
              'heading6': False,
              'head1': False,
              'head2': False,
              'listitem': False,
              'note': True,
              'null': False,
              'paragraph': False,
              'related': False,
              'relatedtopics': False,
              'sup' : False,
              'tablecontent': False,
              'tablecontentcode': False,
              'tablehead': False,
              'tablenextpara': False,
              'tablenextparacode': False,
              'tip': True,
              'variable': False,
              'warning': True,
             }
    }

# section ids that are rendered as a wiki template of the given name
section_id_mapping = \
    {'relatedtopics': 'RelatedTopics'}

# text snippets that we need to convert
replace_text_list = \
    [["$[officename]", "{{ProductName}}"],
     ["%PRODUCTNAME", "{{ProductName}}"],
     ["$PRODUCTNAME", "{{ProductName}}"],
     ["font size", u"\u200dfont size"]
    ]

def get_link_filename(link, name):
    """Map a link target to a (wiki page name, '#fragment') tuple.

    For links starting with 'http' the lookup is done with 'name' instead
    of the link.  The first entry of 'titles' whose file name contains the
    looked up text wins; when nothing matches, (link, '') is returned.
    """
    text = link.strip()
    fragment = ''
    if text.find('http') == 0:
        text = name
    else:
        f = text.find('#')
        if f >= 0:
            fragment = text[f:]
            text = text[0:f]

    for title in titles:
        try:
            if title[0].find(text) >= 0:
                return (title[1].strip(), fragment)
        except (IndexError, AttributeError):
            # malformed entry of the titles list (e.g. no wiki name), skip it
            pass

    return (link, '')

def replace_text(text):
    """Return 'text' with all the snippets of replace_text_list replaced."""
    for old, new in replace_text_list:
        # only touch the text when the snippet is really there
        if text.find(old) >= 0:
            text = text.replace(old, new)
    return text

# modify the text so that in templates like {{Name|something}}, the 'something'
# does not look like template params
def escape_equals_sign(text):
    """Escape every '=' that is not nested in {...}, [...] or <...>.

    A top-level '=' would turn the text into a named template parameter, so
    it is replaced with the character reference '&#61;'.  Nested ones (inner
    template calls, links, tag attributes) are kept untouched.
    """
    depth = 0
    pieces = []
    for i in text:
        if i == '=' and depth == 0:
            pieces.append('&#61;')
        else:
            pieces.append(i)

        if i == '{' or i == '[' or i == '<':
            depth = depth + 1
        elif i == '}' or i == ']' or i == '>':
            # unbalanced closing brackets must not push the depth below zero
            depth = max(depth - 1, 0)

    return ''.join(pieces)

def load_localization_data(sdf_file):
    """Fill localization_data from the tab separated 'sdf_file'.

    Lines starting with '#' and blank lines are skipped, lines with less
    than 11 fields are reported on stderr and ignored.  Returns False when
    the file cannot be opened, True otherwise.
    """
    global localization_data
    localization_data = {}
    try:
        file = codecs.open(sdf_file, "r", "utf-8")
    except (IOError, OSError):
        sys.stderr.write('Error: Cannot open .sdf file "%s"\n'% sdf_file)
        return False

    try:
        for line in file:
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            spl = line.split("\t")
            if len(spl) < 11:
                sys.stderr.write('Warning: Ignored line "%s"\n'% line.encode('utf-8'))
                continue

            # the form of the key is like
            # source/text/shared/explorer/database/02010100.xhp#hd_id3149233
            # otherwise we are getting duplicates
            key = '%s#%s'% (spl[1].replace('\\', '/'), spl[4])
            localization_data[key] = spl[10]
    finally:
        file.close()

    return True

def unescape(str):
    """Remove the backslash escaping from a localized string.

    A doubled backslash yields one backslash, a single backslash is dropped
    and only marks the following character as escaped.
    """
    # NOTE(review): the escaped (True) and unescaped (False) variants are
    # identical for every character, so this map currently has no effect
    # beyond the backslash removal.  Presumably one of the variants was meant
    # to be the XML entity -- verify against upstream before changing.
    unescape_map = {'<': {True:'<', False:'<'},
                    '>': {True:'>', False:'>'},
                    '&': {True:'&', False:'&'},
                    '"': {True:'"', False:'"'}}
    result = []
    escape = False
    for c in str:
        if c == '\\':
            if escape:
                result.append('\\')
                escape = False
            else:
                escape = True
            continue

        variants = unescape_map.get(c)
        if variants is None:
            result.append(c)
        else:
            result.append(variants[escape])
        escape = False

    return ''.join(result)

def get_localized_text(filename, id):
    """Return the unescaped localized text of 'filename#id', '' if unknown."""
    try:
        localized = localization_data['%s#%s'% (filename, id)]
    except KeyError:
        return ''

    return unescape(localized)

def href_to_fname_id(href):
    """Split 'file#id' into [file, id]; warn when there is no '#'."""
    link = href.replace('"', '')
    fname = link
    id = ''
    pos = link.find("#")
    if pos >= 0:
        fname = link[:pos]
        id = link[pos+1:]
    else:
        sys.stderr.write('Reference without a "#" in "%s".\n'% link)
    return [fname, id]

# Base class for all the elements
#
# self.name - name of the element, to drop the self.child_parsing flag
# self.objects - collects the child objects that are constructed during
#                parsing of the child elements
# self.child_parsing - flag whether we are parsing a child, or the object
#                      itself
# self.parent - parent object
class ElementBase:
    def __init__(self, name, parent):
        self.name = name
        self.objects = []
        self.child_parsing = False
        self.parent = parent

    def start_element(self, parser, name, attrs):
        pass

    def end_element(self, parser, name):
        # our own closing tag: the parent is not parsing a child any more
        if name == self.name:
            self.parent.child_parsing = False

    def char_data(self, parser, data):
        pass

    def get_curobj(self):
        """Return the innermost object that is being parsed right now."""
        if self.child_parsing:
            return self.objects[-1].get_curobj()
        return self

    # start parsing a child element
    def parse_child(self, child):
        self.child_parsing = True
        self.objects.append(child)

    # construct the wiki representation of this object, including the objects
    # held in self.objects (here only the text of the objects)
    def get_all(self):
        return u''.join([i.get_all() for i in self.objects])

    # for handling variables, and embedding in general
    # id - the variable name we want to get
    def get_variable(self, id):
        for i in self.objects:
            if i is not None:
                var = i.get_variable(id)
                if var is not None:
                    return var
        return None

    # embed part of another file into current structure
    def embed_href(self, parent_parser, fname, id):
        # parse another xhp
        parser = XhpParser(root + fname, False, \
                parent_parser.current_app, parent_parser.wiki_page_name, \
                parent_parser.lang)
        var = parser.get_variable(id)

        if var is not None:
            # an embedded variable is rendered as a normal paragraph; other
            # embeddable objects (sections) have no role at all
            if getattr(var, 'role', None) == 'variable':
                var.role = 'paragraph'
            self.objects.append(var)
        elif parser.follow_embed:
            sys.stderr.write('Cannot find reference "#%s" in "%s".\n'% \
                    (id, fname))

    def unhandled_element(self, parser, name):
        sys.stderr.write('Warning: Unhandled element "%s" in "%s" (%s)\n'% \
                (name, self.name, parser.filename))

# Base class for trivial elements that operate on char_data
#
# Like the comment, br or title elements
class TextElementBase(ElementBase):
    def __init__(self, attrs, parent, element_name, start, end, templ):
        ElementBase.__init__(self, element_name, parent)
        self.text = u''
        self.start = start
        self.end = end
        self.templ = templ

    def char_data(self, parser, data):
        self.text = self.text + data

    def get_all(self):
        text = replace_text(self.text)
        if self.templ:
            # the text is a template parameter
            text = escape_equals_sign(text)
        return self.start + text + self.end

# The root object of a parsed .xhp file
class XhpFile(ElementBase):
    def __init__(self):
        ElementBase.__init__(self, None, None)

    def start_element(self, parser, name, attrs):
        if name == 'body':
            # ignored, we flatten the structure
            pass
        elif name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            if parser.follow_embed:
                (fname, id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, id)
        elif name == 'helpdocument':
            # ignored, we flatten the structure
            pass
        elif name == 'list':
            self.parse_child(List(attrs, self))
        elif name == 'meta':
            self.parse_child(Meta(attrs, self))
        elif name == 'paragraph':
            parser.parse_paragraph(attrs, self)
        elif name == 'section':
            self.parse_child(Section(attrs, self))
        elif name == 'sort':
            self.parse_child(Sort(attrs, self))
        elif name == 'switch':
            self.parse_child(Switch(attrs, self, parser.embedding_app))
        elif name == 'table':
            self.parse_child(Table(attrs, self))
        elif name == 'bascode':
            self.parse_child(BasicCode(attrs, self))
        else:
            self.unhandled_element(parser, name)

# An anchor inside the page; for 'hid/...' branches also a redirect page
class Bookmark(ElementBase):
    def __init__(self, attrs, parent, type, parser):
        ElementBase.__init__(self, 'bookmark', parent)

        self.type = type
        self.id = attrs['id']
        self.app = ''
        self.redirect = ''
        self.target = ''
        self.authoritative = False

        # let's construct the name of the redirect, so that we can point
        # to the wikihelp directly from the LO code; wiki then takes care of
        # the correct redirect
        branch = attrs['branch']
        if branch.find('hid/') == 0 and \
                (parser.current_app_raw != '' or parser.follow_embed):
            name = branch[branch.find('/') + 1:]

            self.app = parser.current_app_raw
            self.target = parser.wiki_page_name
            self.authoritative = parser.follow_embed
            self.redirect = name

    def get_all(self):
        global redirects
        # first of all, we need to create a redirect page for this one
        if self.redirect != '' and self.target != '':
            redirects.append([self.app, self.redirect, \
                    '%s#%s'% (self.target, self.id), \
                    self.authoritative])

        # then we also have to setup ID inside the page
        if self.type == 'div':
            return '<div id="%s"></div>\n'% self.id
        elif self.type == 'span':
            return '<span id="%s"></span>'% self.id
        else:
            sys.stderr.write('Unknown bookmark type "%s"\n'% self.type)

        return ''

class Image(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'image', parent)
        self.src = attrs['src']
        self.align = 'left'
        # True while we are inside of the 'alt' child element
        self.alt = False
        self.alttext = ""

    def start_element(self, parser, name, attrs):
        if name == 'alt':
            self.alt = True
        else:
            self.unhandled_element(parser, name)

    def end_element(self, parser, name):
        ElementBase.end_element(self, parser, name)

        if name == 'alt':
            self.alt = False

    def char_data(self, parser, data):
        # only the alternative text is of interest
        if self.alt:
            self.alttext = self.alttext + data

    def get_all(self):
        global images
        images.add(self.src)

        # the wiki knows the images by the plain file name
        name = self.src[self.src.rfind('/') + 1:]
        wikitext = "[[Image:"+name+"|border|"+self.align+"|"
        wikitext = wikitext + self.alttext+"]]"
        return wikitext

    def get_curobj(self):
        # the image handles its children itself
        return self

class Br(TextElementBase):
    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent, 'br', '<br/>', '', False)

class Comment(TextElementBase):
    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent, 'comment', '<!-- ', ' -->', False)

class HelpIdMissing(TextElementBase):
    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent, 'help-id-missing', '{{MissingHelpId}}', '', False)

# A piece of plain text; a leaf of the tree, not an ElementBase
class Text:
    def __init__(self, text):
        self.wikitext = replace_text(text)

    def get_all(self):
        return self.wikitext

    def get_variable(self, id):
        return None

class TableCell(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'tablecell', parent)
        self.cellHasChildElement = False

    def start_element(self, parser, name, attrs):
        self.cellHasChildElement = True
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
        elif name == 'paragraph':
            parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self)
        elif name == 'section':
            self.parse_child(Section(attrs, self))
        elif name == 'bascode':
            # ignored, do not syntax highlight in table cells
            pass
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = ''
        if not self.cellHasChildElement: # an empty element
            if self.parent.isTableHeader: # get from TableRow Element
                role = 'tablehead'
            else:
                role = 'tablecontent'
            text = text + replace_paragraph_role['start'][role]
            text = text + replace_paragraph_role['end'][role]
        text = text + ElementBase.get_all(self)
        return text

class TableRow(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'tablerow', parent)
        # updated by TableContentParagraph; has to exist even when the row
        # starts with an empty cell, TableCell.get_all() reads it
        self.isTableHeader = False

    def start_element(self, parser, name, attrs):
        if name == 'tablecell':
            self.parse_child(TableCell(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = '|-\n' + ElementBase.get_all(self)
        return text

class BasicCode(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'bascode', parent)

    def start_element(self, parser, name, attrs):
        if name == 'paragraph':
            parser.parse_localized_paragraph(BasicCodeParagraph(attrs, self), attrs, self)
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = '<source lang="oobas">\n' + ElementBase.get_all(self) + '</source>\n\n'
        return text

class Table(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'table', parent)

    def start_element(self, parser, name, attrs):
        if name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'tablerow':
            self.parse_child(TableRow(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        # + ' align="left"' etc.?
        text = '{| class="wikitable"\n' + \
            ElementBase.get_all(self) + \
            '|}\n\n'
        return text

class ListItem(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'listitem', parent)

    def start_element(self, parser, name, attrs):
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'span', parser))
        elif name == 'embed' or name == 'embedvar':
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
        elif name == 'paragraph':
            parser.parse_localized_paragraph(ListItemParagraph(attrs, self), attrs, self)
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = '*'
        postfix = '\n'
        if self.parent.startwith > 0:
            # lists with an explicit start number are written as html
            text = '<li>'
            postfix = '</li>'
        elif self.parent.type == 'ordered':
            text = '#'

        # add the text itself, the children separated by line breaks
        return text + '<br/>'.join([i.get_all() for i in self.objects]) + postfix

class List(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'list', parent)

        self.type = attrs['type']
        try:
            self.startwith = int(attrs['startwith'])
        except (KeyError, ValueError, TypeError):
            # no (usable) start number
            self.startwith = 0

    def start_element(self, parser, name, attrs):
        if name == 'listitem':
            self.parse_child(ListItem(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = ""
        if self.startwith > 0:
            text = text + '<ol start="%d">\n'% self.startwith

        text = text + ElementBase.get_all(self)

        if self.startwith > 0:
            text = text + '\n</ol>\n'
        else:
            text = text + '\n'
        return text

# To handle elements that should be completely ignored
class Ignore(ElementBase):
    def __init__(self, attrs, parent, element_name):
        ElementBase.__init__(self, element_name, parent)

class OrigTitle(TextElementBase):
    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent, 'title', '{{OrigLang|', '}}\n', True)

class Title(TextElementBase):
    def __init__(self, attrs, parent, localized_title):
        TextElementBase.__init__(self, attrs, parent, 'title', '{{Lang|', '}}\n', True)
        self.localized_title = localized_title

    def get_all(self):
        # the localized title, when we have one, wins over the original
        if self.localized_title != '':
            self.text = self.localized_title
        return TextElementBase.get_all(self)

class Topic(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'topic', parent)

    def start_element(self, parser, name, attrs):
        if name == 'title':
            if parser.lang == '':
                self.parse_child(OrigTitle(attrs, self))
            else:
                self.parse_child(Title(attrs, self, get_localized_text(parser.filename, 'tit')))
        elif name == 'filename':
            self.parse_child(Ignore(attrs, self, name))
        else:
            self.unhandled_element(parser, name)

class Meta(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'meta', parent)

    def start_element(self, parser, name, attrs):
        if name == 'topic':
            self.parse_child(Topic(attrs, self))
        elif name == 'history' or name == 'lastedited':
            self.parse_child(Ignore(attrs, self, name))
        else:
            self.unhandled_element(parser, name)

class Section(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'section', parent)
        self.id = attrs['id']

    def start_element(self, parser, name, attrs):
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
        elif name == 'list':
            self.parse_child(List(attrs, self))
        elif name == 'paragraph':
            parser.parse_paragraph(attrs, self)
        elif name == 'section':
            # sections can be nested
            self.parse_child(Section(attrs, self))
        elif name == 'switch':
            self.parse_child(Switch(attrs, self, parser.embedding_app))
        elif name == 'table':
            self.parse_child(Table(attrs, self))
        elif name == 'bascode':
            self.parse_child(BasicCode(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        mapping = section_id_mapping.get(self.id, '')

        # some of the section ids are used as real id's, some of them have
        # function (like relatetopics), and have to be templatized
        if mapping != '':
            return '{{%s|%s}}\n\n'% (mapping, \
                    escape_equals_sign(ElementBase.get_all(self)))

        return ElementBase.get_all(self)

    def get_variable(self, id):
        # the children first, the section as a whole is the last resort
        var = ElementBase.get_variable(self, id)
        if var is not None:
            return var
        if id == self.id:
            return self
        return None

class Sort(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'sort', parent)

        self.order = attrs.get('order', 'asc')

    def start_element(self, parser, name, attrs):
        if name == 'section':
            self.parse_child(Section(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        # NOTE(review): 'asc' sorts the section ids in reverse order, which
        # looks inverted; kept as it is -- confirm the intent before changing
        rev = (self.order == 'asc')
        self.objects = sorted(self.objects, key=lambda obj: obj.id, reverse=rev)

        return ElementBase.get_all(self)

class Link(ElementBase):
    def __init__(self, attrs, parent, lang):
        ElementBase.__init__(self, 'link', parent)

        self.link = attrs['href']
        try:
            self.lname = attrs['name']
        except KeyError:
            # no name, use the last part of the path
            self.lname = self.link[self.link.rfind("/")+1:]
        # Override lname
        self.default_name = self.lname
        (self.lname, self.fragment) = get_link_filename(self.link, self.lname)
        self.wikitext = ""
        self.lang = lang

    def char_data(self, parser, data):
        self.wikitext = self.wikitext + data

    def get_all(self):
        # links without any text show their name
        if self.wikitext == "":
            self.wikitext = self.default_name

        self.wikitext = replace_text(self.wikitext)
        if self.link.find("http") == 0:
            text = '[%s %s]'% (self.link, self.wikitext)
        elif self.lang != '':
            text = '[[%s/%s%s|%s]]'% (self.lname, self.lang, self.fragment, self.wikitext)
        else:
            text = '[[%s%s|%s]]'% (self.lname, self.fragment, self.wikitext)
        return text

class SwitchInline(ElementBase):
    # rendering of the 'sys' cases: [case, parameter of {{System}}]
    system_params = [['default', 'default'], ['MAC', 'mac'], \
                     ['UNIX', 'unx'], ['WIN', 'win']]

    # rendering of the 'appl' cases: case -> suffix of {{WhenIn...}}, empty
    # for the cases that act as the default
    appls = {'BASIC':'Basic', 'CALC':'Calc', \
             'CHART':'Chart', 'DRAW':'Draw', \
             'IMAGE':'Draw', 'IMPRESS': 'Impress', \
             'MATH':'Math', 'WRITER':'Writer', \
             'OFFICE':'', 'default':''}

    def __init__(self, attrs, parent, app):
        ElementBase.__init__(self, 'switchinline', parent)
        self.switch = attrs['select']
        self.embedding_app = app

    def start_element(self, parser, name, attrs):
        if name == 'caseinline':
            self.parse_child(CaseInline(attrs, self, False))
        elif name == 'defaultinline':
            self.parse_child(CaseInline(attrs, self, True))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        if len(self.objects) == 0:
            return ''
        elif self.switch == 'sys':
            return self.get_all_sys()
        elif self.switch == 'appl':
            # we want directly use the right text, when inlining something
            # 'shared' into an 'app'
            if self.embedding_app == '':
                return self.get_all_appl()

            for i in self.objects:
                if i.case == self.embedding_app:
                    return i.get_all()

        return ''

    # the {{System|...}} template with one parameter per non-empty case
    def get_all_sys(self):
        system = {'MAC':'', 'UNIX':'', 'WIN':'', 'default':''}
        for i in self.objects:
            if i.case in system:
                system[i.case] = i.get_all()
            elif i.case == 'OS2':
                # ignore, there is only one mention of OS2, which is a
                # 'note to translators', and no meat
                pass
            elif i.case == 'HIDE_HERE':
                # do what the name suggest ;-)
                pass
            else:
                sys.stderr.write('Unhandled "%s" case in "sys" switchinline.\n'% \
                        i.case )

        text = '{{System'
        for case, param in self.system_params:
            if system[case] != '':
                text = '%s|%s=%s'% (text, param, system[case])
        return text + '}}'

    # one {{WhenIn<App>|...}} per application, {{WhenDefault|...}} for the rest
    def get_all_appl(self):
        text = ''
        default = ''
        for i in self.objects:
            try:
                app = self.appls[i.case]
            except KeyError:
                sys.stderr.write('Unhandled "%s" case in "appl" switchinline.\n'% \
                        i.case)
                continue

            content = i.get_all()
            if content == '':
                continue
            if app == '':
                default = content
            else:
                text = text + '{{WhenIn%s|%s}}'% (app, escape_equals_sign(content))

        if text == '':
            # nothing application specific, the default needs no template
            text = default
        elif default != '':
            text = text + '{{WhenDefault|%s}}'% escape_equals_sign(default)

        return text

class Case(ElementBase):
    def __init__(self, attrs, parent, is_default):
        ElementBase.__init__(self, 'case', parent)

        if is_default:
            self.name = 'default'
            self.case = 'default'
        else:
            self.case = attrs['select']

    def start_element(self, parser, name, attrs):
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            if parser.follow_embed:
                (fname, id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, id)
        elif name == 'list':
            self.parse_child(List(attrs, self))
        elif name == 'paragraph':
            parser.parse_paragraph(attrs, self)
        elif name == 'section':
            self.parse_child(Section(attrs, self))
        elif name == 'table':
            self.parse_child(Table(attrs, self))
        else:
            self.unhandled_element(parser, name)

class Switch(SwitchInline):
    def __init__(self, attrs, parent, app):
        SwitchInline.__init__(self, attrs, parent, app)
        self.name = 'switch'

    def start_element(self, parser, name, attrs):
        self.embedding_app = parser.embedding_app
        if name == 'case':
            self.parse_child(Case(attrs, self, False))
        elif name == 'default':
            self.parse_child(Case(attrs, self, True))
        else:
            self.unhandled_element(parser, name)

class Item(ElementBase):
    # markup per item type, same layout as replace_paragraph_role
    replace_type = \
            {'start':{'input': '<code>',
                      'keycode': '{{KeyCode|',
                      'tasto': '{{KeyCode|',
                      'litera': '<code>',
                      'literal': '<code>',
                      'menuitem': '{{MenuItem|',
                      'mwnuitem': '{{MenuItem|',
                      'OpenOffice.org': '',
                      'productname': '',
                      'unknown': '<code>'
                     },
             'end':{'input': '</code>',
                    'keycode': '}}',
                    'tasto': '}}',
                    'litera': '</code>',
                    'literal': '</code>',
                    'menuitem': '}}',
                    'mwnuitem': '}}',
                    'OpenOffice.org': '',
                    'productname': '',
                    'unknown': '</code>'
                   },
             'templ':{'input': False,
                      'keycode': True,
                      'tasto': True,
                      'litera': False,
                      'literal': False,
                      'menuitem': True,
                      'mwnuitem': True,
                      'OpenOffice.org': False,
                      'productname': False,
                      'unknown': False
                     }}

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'item', parent)

        self.type = attrs.get('type', 'unknown')
        self.text = ''

    def char_data(self, parser, data):
        self.text = self.text + data

    def get_all(self):
        text = replace_text(self.text)
        try:
            templ = self.replace_type['templ'][self.type]
            start = self.replace_type['start'][self.type]
            end = self.replace_type['end'][self.type]
        except KeyError:
            # unknown type, the plain text is the best we can do
            sys.stderr.write('Unhandled item type "%s".\n'% self.type)
            return text

        if templ:
            text = escape_equals_sign(text)
        return start + text + end

class Paragraph(ElementBase):
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'paragraph', parent)

        self.role = attrs.get('role', 'paragraph')
        self.id = attrs.get('id', "")

        try:
            self.level = int(attrs['level'])
        except (KeyError, ValueError, TypeError):
            self.level = 0

        # we are constructed before the parent gets us as its child
        self.is_first = (len(self.parent.objects) == 0)

    def start_element(self, parser, name, attrs):
        if name == 'ahelp':
            # only the hidden ones are dropped, the content of the others
            # becomes part of the paragraph
            if attrs.get('visibility') == 'hidden':
                self.parse_child(Ignore(attrs, self, name))
        elif name == 'br':
            self.parse_child(Br(attrs, self))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'emph':
            self.parse_child(Emph(attrs, self))
        elif name == 'sup':
            self.parse_child(Sup(attrs, self))
        elif name == 'embedvar':
            if parser.follow_embed:
                (fname, id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, id)
        elif name == 'help-id-missing':
            self.parse_child(HelpIdMissing(attrs, self))
        elif name == 'image':
            self.parse_child(Image(attrs, self))
        elif name == 'item':
            self.parse_child(Item(attrs, self))
        elif name == 'link':
            self.parse_child(Link(attrs, self, parser.lang))
        elif name == 'localized':
            # we ignore this tag, it is added arbitrary for the paragraphs
            # that come from .sdf files
            pass
        elif name == 'switchinline':
            self.parse_child(SwitchInline(attrs, self, parser.embedding_app))
        elif name == 'variable':
            self.parse_child(Variable(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def char_data(self, parser, data):
        # normalize the whitespace of the flowing text; the other roles
        # keep the data as it is
        if self.role == 'paragraph' or self.role == 'heading' or \
                self.role == 'listitem' or self.role == 'variable':
            if data != '' and data[0] == ' ':
                data = ' ' + data.lstrip()
            data = data.replace('\n', ' ')

        if len(data):
            self.objects.append(Text(data))

    def get_all(self):
        role = self.role
        if role == 'heading':
            if self.level <= 0:
                sys.stderr.write('Heading, but the level is %d.\n'% self.level)
            elif self.level < 6:
                role = 'heading%d'% self.level
            else:
                role = 'heading6'

        # if we are not the first para in the table, we need special handling
        if not self.is_first and role.find('table') == 0:
            if role == 'tablecontentcode':
                role = 'tablenextparacode'
            else:
                role = 'tablenextpara'

        # the text itself
        children = ElementBase.get_all(self)
        if self.role != 'emph' and self.role != 'bascode':
            children = children.strip()

        if len(children) == 0:
            return ''

        # prepend the markup according to the role
        text = ''
        try:
            text = text + replace_paragraph_role['start'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role start: " + role + "\n" )

        # unknown roles are not templates, they have been reported above
        if replace_paragraph_role['templ'].get(role, False):
            text = text + escape_equals_sign(children)
        else:
            text = text + children

        # append the markup according to the role
        try:
            text = text + replace_paragraph_role['end'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role end: " + role + "\n" )

        return text

class Variable(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = 'variable'
        self.role = 'variable'
        self.id = attrs['id']

    def get_variable(self, id):
        if id == self.id:
            return self
        return None

class CaseInline(Paragraph):
    def __init__(self, attrs, parent, is_default):
        Paragraph.__init__(self, attrs, parent)

        self.role = 'null'
        if is_default:
            self.name = 'defaultinline'
            self.case = 'default'
        else:
            self.name = 'caseinline'
            self.case = attrs['select']

class Emph(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = 'emph'
        self.role = 'emph'

    def get_all(self):
        text = Paragraph.get_all(self)
        if len(text):
            return "'''" + text + "'''"
        return ''

class Sup(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = 'sup'
        self.role = 'sup'

    def get_all(self):
        text = Paragraph.get_all(self)
        if len(text):
            return "<sup>" + text + "</sup>"
        return ''

class ListItemParagraph(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.role = 'listitem'

class BasicCodeParagraph(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.role = 'bascode'

class TableContentParagraph(Paragraph):
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        # only the table roles make sense inside of a table cell
        if self.role != 'tablehead' and self.role != 'tablecontent':
            if self.role == 'code' or self.role == 'bascode':
                self.role = 'tablecontentcode'
            else:
                self.role = 'tablecontent'

        # self.parent.parent is TableRow Element
        self.parent.parent.isTableHeader = (self.role == 'tablehead')

# Feeds the expat events to the object that is being parsed right now
class ParserBase:
    def __init__(self, filename, follow_embed, embedding_app, current_app, wiki_page_name, lang, head_object, buffer):
        self.filename = filename
        self.follow_embed = follow_embed
        self.embedding_app = embedding_app
        self.current_app = current_app
        self.wiki_page_name = wiki_page_name
        self.lang = lang
        self.head_obj = head_object

        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = self.start_element
        p.EndElementHandler = self.end_element
        p.CharacterDataHandler = self.char_data

        p.Parse(buffer)

    def start_element(self, name, attrs):
        self.head_obj.get_curobj().start_element(self, name, attrs)

    def end_element(self, name):
        self.head_obj.get_curobj().end_element(self, name)

    def char_data(self, data):
        self.head_obj.get_curobj().char_data(self, data)

    def get_all(self):
        return self.head_obj.get_all()

    def get_variable(self, id):
        return self.head_obj.get_variable(id)

    def parse_localized_paragraph(self, paragraph, attrs, obj):
        localized_text = ''
        try:
            localized_text = get_localized_text(self.filename, attrs['id'])
        except KeyError:
            # paragraph without an id cannot be localized
            pass

        if localized_text != '':
            # parse the localized text
            text = u'<?xml version="1.0" encoding="UTF-8"?><localized>' + localized_text + '</localized>'
            ParserBase(self.filename, self.follow_embed, self.embedding_app, \
                    self.current_app, self.wiki_page_name, self.lang, \
                    paragraph, text.encode('utf-8'))
            # add it to the overall structure
            obj.objects.append(paragraph)
            # and ignore the original text
            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
        else:
            obj.parse_child(paragraph)

    def parse_paragraph(self, attrs, obj):
        ignore_this = False
        try:
            if attrs['role'] == 'heading' and int(attrs['level']) == 1 \
                    and self.ignore_heading and self.follow_embed:
                self.ignore_heading = False
                ignore_this = True
        except (KeyError, ValueError, AttributeError):
            # no role or level; ignore_heading exists in XhpParser only
            pass

        if ignore_this:
            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
        else:
            self.parse_localized_paragraph(Paragraph(attrs, obj), attrs, obj)

class XhpParser(ParserBase):
    def __init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang):
        # we want to ignore the 1st level="1" heading, because in most of the
        # cases, it is the only level="1" heading in the file, and it is the
        # same as the page title
        self.ignore_heading = True

        # the application is given by the directory the file lives in
        current_app = ''
        self.current_app_raw = ''
        for i in [['sbasic', 'BASIC'], ['scalc', 'CALC'], \
                  ['sdatabase', 'DATABASE'], ['sdraw', 'DRAW'], \
                  ['schart', 'CHART'], ['simpress', 'IMPRESS'], \
                  ['smath', 'MATH'], ['swriter', 'WRITER']]:
            if filename.find('/%s/'% i[0]) >= 0:
                self.current_app_raw = i[0]
                current_app = i[1]
                break

        if embedding_app == '':
            embedding_app = current_app

        file = codecs.open(filename, "r", "utf-8")
        try:
            buf = file.read()
        finally:
            file.close()

        ParserBase.__init__(self, filename, follow_embed, embedding_app,
                current_app, wiki_page_name, lang, XhpFile(), buf.encode('utf-8'))

def loadallfiles(filename):
    """Load the ';' separated list of pages into the global 'titles'."""
    global titles
    titles = []
    file = codecs.open(filename, "r", "utf-8")
    try:
        for line in file:
            title = line.split(";", 2)
            titles.append(title)
    finally:
        file.close()

# Converts one .xhp file into one wiki page, in a thread of its own
class WikiConverter(Thread):
    def __init__(self, inputfile, wiki_page_name, lang, outputfile):
        Thread.__init__(self)
        self.inputfile = inputfile
        self.wiki_page_name = wiki_page_name
        self.lang = lang
        self.outputfile = outputfile

    def run(self):
        parser = XhpParser(self.inputfile, True, '', self.wiki_page_name, self.lang)
        # render before the output is created; convert() skips the pages
        # that exist, an empty leftover of a failure would never be redone
        text = parser.get_all()
        file = codecs.open(self.outputfile, "wb", "utf-8")
        try:
            file.write(text)
        finally:
            file.close()

def write_link(r, target):
    """Write the redirect page 'wiki/<r>' pointing to 'target'."""
    fname = 'wiki/%s'% r
    try:
        file = open(fname, "w")
        try:
            file.write('#REDIRECT [[%s]]\n'% target)
        finally:
            file.close()
    except (IOError, OSError, UnicodeError):
        # best effort, a missing redirect must not stop the conversion
        sys.stderr.write('Unable to write "%s".\n'% fname)

def write_redirects():
    """Write the redirect pages collected in the global 'redirects'."""
    print('Generating the redirects...')
    written = {}
    # in the first pass, immediately writte the links that are embedded, so that
    # we can always point to that source versions
    for redir in redirects:
        app = redir[0]
        redirect = redir[1]
        target = redir[2]
        authoritative = redir[3]

        if app != '':
            r = '%s/%s'% (app, redirect)
            if authoritative:
                write_link(r, target)
                written[r] = True
            else:
                # remember it for the second pass
                written.setdefault(r, False)

    # in the second pass, output the wiki links
    for redir in redirects:
        app = redir[0]
        redirect = redir[1]
        target = redir[2]

        if app == '':
            for i in ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', \
                      'schart', 'sbasic', 'sdatabase']:
                write_link('%s/%s'% (i, redirect), target)
        else:
            r = '%s/%s'% (app, redirect)
            if not written[r]:
                write_link(r, target)

# Main Function
def convert(generate_redirects, lang, sdf_file):
    """Convert all the pages listed in "alltitles.csv".

    generate_redirects - whether to write the redirect pages too
    lang - language of the output, '' for the original text
    sdf_file - file with the localizations, read only when lang is set

    The pages that already exist below "wiki/" are skipped.
    """
    global redirects
    global images

    if lang == '':
        print('Generating the main wiki pages...')
    else:
        print('Generating the wiki pages for language %s...'% lang)

    redirects = []
    images = set()

    loadallfiles("alltitles.csv")

    if lang != '':
        sys.stderr.write('Using localizations from "%s"\n'% sdf_file)
        if not load_localization_data(sdf_file):
            return

    for title in titles:
        # do not start more converters than allowed
        while threading.active_count() > max_threads:
            time.sleep(0.001)

        infile = title[0].strip()
        wikiname = title[1].strip()
        articledir = 'wiki/' + wikiname
        try:
            os.mkdir(articledir)
        except OSError:
            # most probably it exists already
            pass

        if lang != '':
            wikiname = '%s/%s'% (wikiname, lang)
            outfile = '%s/%s'% (articledir, lang)
        else:
            outfile = '%s/MAIN'% articledir

        if not os.path.exists(outfile):
            try:
                wiki = WikiConverter(infile, wikiname, lang, outfile)
                wiki.start()
                continue
            except Exception:
                # e.g. the thread cannot be started
                print('Failed to convert "%s" into "%s".\n'% \
                        (infile, outfile))

        sys.stderr.write('Warning: Skipping: %s > %s\n'% (infile, outfile))

    # wait for everyone to finish
    while threading.active_count() > 1:
        time.sleep(0.001)

    if lang == '':
        # set of the images used here
        print('Generating "images.txt", the list of used images...')
        file = open('images.txt', "w")
        try:
            # sorted, so that the output is stable between the runs
            for image in sorted(images):
                file.write('%s\n'% image)
        finally:
            file.close()

    # generate the redirects
    if generate_redirects:
        write_redirects()

# vim:set shiftwidth=4 softtabstop=4 expandtab: