I've migrated my company's Perspective wiki to DokuWiki. The Perspective wiki had dozens of collections, editors, and attachments, and hundreds of pages, and the import script worked reasonably well. Perspective is a Windows-hosted wiki, so this is a Windows Python script. (The path separators are hardcoded.)
ToDo:
name = GetNode(fields, "name", "page.name").firstChild.data doku_namespace = "" doku_name = "" if name.find(":") != -1: doku_namespace, doku_name = name.split(":") doku_namespace = "\\" + doku_namespace else: doku_name = name
with this code:
doku_namespace = "" doku_name = "" doku_namespace = "\\" + GetNode(fields, "name", "page.collection").firstChild.data doku_name = GetNode(fields, "name", "page.display-name").firstChild.data
This will also allow you to search for a full page name with spaces in the search term. Otherwise, you can only find the page by searching for its full name without spaces, or for a single word from the page title.
s = s.replace(u'\ufeff',"") s = s.replace(u'\uf06c',"o") s = s.replace(u'\xad', "<<") s = s.replace(u'\xae', ">>") s = s.replace(u'\xa7', " degrees ") s = s.replace(u'\xe0', "-") s = s.replace(u'\xef', "i")
Requirements:
Note: After running this script, run the indexer.php script in dokuwiki\bin. This will vastly improve the search functionality.
# ConvertPerspectiveToDokuwiki
# By: David Blume
#
# This was a quick hack, but it works reasonably well.
#
# Walks a Perspective wiki directory tree, converts the latest version of
# every ".page" directory into a DokuWiki text page, copies its attachments
# into DokuWiki's media directory, and records each import in
# data\imported.log.
#
# Path separators are hardcoded for Windows.  The syntax is kept compatible
# with both Python 2.6+ and Python 3.
import sys
import os
import time
import codecs
import shutil
from xml.dom import minidom

# DokuWiki "data" directory; set by main().
doku_dir = ""

# Stack of currently open list types: 'o' for <ol>, 'u' for <ul>.
doku_list_types = []

# Media ids already referenced inline by the page being converted.
attached_images = []

# Open handle on imported.log; set by main().
changes_log = None

# DokuWiki heading markup for each supported heading tag.
_HEADER_MARKUP = {
    u'h1': "======",
    u'h2': "=====",
    u'h3': "====",
    u'h4': "===",
    u'h5': "==",
}

_USAGE_TAIL = (
    "Where the perspective_dir contains the .col (collection) directories, and",
    "the dokuwiki_dir contains the dokuwiki's data directory as a child directory.",
)


def CopyAttachments(attachments, path, doku_page, doku_namespace):
    """Copy a page's attachments into the DokuWiki media directory.

    attachments    -- DOM node whose <attachment> children each carry a
                      <version> and a <name> element.
    path           -- the page's "versions" directory; attachments are read
                      from "<path>\\<version>-attachments\\<name>".
    doku_page      -- writable page file; a media link is appended for every
                      attachment the page body did not already reference.
    doku_namespace -- "" or "\\<namespace>".

    A failed copy is reported as a warning and does not abort the import.
    """
    for attachment in attachments.childNodes:
        if attachment.nodeName != u'attachment':
            continue
        version = attachment.getElementsByTagName("version")[0].firstChild.data
        name = attachment.getElementsByTagName("name")[0].firstChild.data
        doku_name = name.lower().replace(" ", "_")
        doku_media_path = doku_dir + "\\media" + doku_namespace
        if not os.path.exists(doku_media_path):
            os.mkdir(doku_media_path)
        source_file = path + "\\" + version + "-attachments\\" + name
        # Copy in-process rather than through a shell "copy" command: no
        # quoting problems with unusual file names, and failures are visible.
        try:
            shutil.copyfile(source_file, doku_media_path + "\\" + doku_name)
        except (IOError, OSError):
            print("WARNING: could not copy attachment " + repr(source_file))
        if len(doku_namespace):
            name = doku_namespace[1:] + ":" + name
        if name not in attached_images:
            doku_page.write("\nAutomatically Attached : {{" + name + "}}\n")


def _CleanText(s):
    """Return text node content *s* converted for use in a DokuWiki page."""
    # NOTE(review): the published listing showed these three replacements as
    # no-ops (e.g. replace("&", "&")), which looks like entity decoding that
    # was lost when the script was posted.  Confirm against real Perspective
    # exports.  "&amp;" goes last so nothing is decoded twice.
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&amp;", "&")
    # Map typographic characters to plain ASCII equivalents.
    s = s.replace(u'\u201c', "\"")
    s = s.replace(u'\u201d', "\"")
    s = s.replace(u'\xb4', "'")
    s = s.replace(u'\u2019', "'")
    s = s.replace(u'\u2013', "-")
    s = s.replace(u'\u2022', ".")
    s = s.replace(u'\u2026', "...")
    s = s.replace(u'\u2018', "'")
    s = s.replace(u'\xb7', "*")
    # A literal "**" would otherwise start bold markup.
    s = s.replace("**", "<nowiki>**</nowiki>")
    return s


def ParsePage(node, doku_page, doku_namespace, doku_name):
    """Recursively write DOM *node* to *doku_page* as DokuWiki markup.

    node           -- minidom node from the page contents.
    doku_page      -- object with a write() method receiving the markup.
    doku_namespace -- "" or "\\<namespace>"; used to qualify image names.
    doku_name      -- page name (currently unused, kept for callers).

    Inline images are recorded in the module-level attached_images list.
    """
    global doku_list_types
    global attached_images

    if node.nodeType == minidom.Node.TEXT_NODE:
        s = node.nodeValue.lstrip()
        if s:
            doku_page.write(_CleanText(s))
        return

    node_name = node.nodeName

    if node_name == u'img':
        src = node.getAttribute("src")
        src_name = src[src.find("name=") + 5:]
        if len(doku_namespace):
            src_name = doku_namespace[1:] + ":" + src_name
        doku_page.write("{{" + src_name + "}}")
        attached_images.append(src_name)
        return

    if node_name == u'link':
        link_dest = node.getElementsByTagName("name")[0].firstChild.data
        if link_dest.startswith(":") and len(doku_namespace):
            link_dest = link_dest[1:]
        link_text = node.getElementsByTagName("anchor")[0].firstChild.data
        doku_page.write("[[" + link_dest + "|" + link_text + "]]")
        return

    doku_list_pop = False
    bold = False
    italics = False
    underline = False
    anchor = False
    preformatted = False
    line_item = False
    header = ""

    if node_name == u'a':
        doku_page.write("[[" + node.getAttribute("href") + "|")
        anchor = True
    if node_name == u'ol':
        doku_list_types.append("o")
        doku_list_pop = True
    if node_name == u'ul':
        doku_list_types.append("u")
        doku_list_pop = True
    if node_name == u'li':
        # DokuWiki list items need two spaces of indentation per level.  An
        # <li> found outside any list is written as a top-level bullet.
        depth = max(1, len(doku_list_types))
        ordered = bool(doku_list_types) and doku_list_types[-1] != 'u'
        doku_page.write("  " * depth)
        if ordered:
            doku_page.write("- ")
        else:
            doku_page.write("* ")
        line_item = True
    # What about tables? How are they done?
    if node_name in _HEADER_MARKUP:
        header = _HEADER_MARKUP[node_name]
        doku_page.write(header + " ")
    if node_name == u'span':
        style = node.getAttribute("style")
        if style:
            if style.find("font-weight:bold;") != -1:
                doku_page.write("**")
                bold = True
            if style.find("text-decoration:underline;") != -1:
                doku_page.write("__")
                underline = True
            if style.find("font-style:italic;") != -1:
                doku_page.write("//")
                italics = True
    # <div style="margin-left:40px;"> is not converted.
    # Maybe one day we'll support indentation.
    if node_name == u'pre':
        doku_page.write("''")
        preformatted = True

    for subnode in node.childNodes:
        ParsePage(subnode, doku_page, doku_namespace, doku_name)

    # Close markup in the reverse of the order it was opened.
    if preformatted:
        doku_page.write("''")
    if italics:
        doku_page.write("//")
    if underline:
        doku_page.write("__")
    if bold:
        doku_page.write("**")
    if doku_list_pop:
        doku_list_types.pop()
        doku_page.write("\n")
    if len(header):
        doku_page.write(" " + header + "\n")
    if anchor:
        doku_page.write("]]")
    if line_item:
        doku_page.write("\n")
    # Paragraph and line breaks are suppressed inside lists, where they
    # would end the list item.
    if node_name == u'p' and len(doku_list_types) == 0:
        doku_page.write("\n\n")
    if (node_name == u'br' or node_name == u'div') and len(doku_list_types) == 0:
        doku_page.write("\\\\ \n")


def GetNode(nodes, attribute, name):
    """Return the first node in *nodes* whose *attribute* equals *name*.

    Returns None when no node matches.
    """
    for n in nodes:
        if n.getAttribute(attribute) == name:
            return n
    return None


def Walk(path):
    """Convert every ".page" directory found under *path*.

    Directories that do not end in ".page" are searched recursively.  For each
    page, the version named in latest.txt is converted, its attachments are
    copied, and one line is appended to the module-level changes_log.  Pages
    without a readable latest.txt are skipped with a warning.
    """
    global changes_log
    global attached_images

    for filename in os.listdir(path):
        fullpath = path + "\\" + filename
        if not os.path.isdir(fullpath):
            continue
        if not filename.endswith(".page"):
            Walk(fullpath)
            continue

        # Parse this page
        # Get the revision from "latest.txt"
        print("Parsing " + fullpath + "...")
        try:
            latest = open(fullpath + "\\latest.txt", "r")
            try:
                version = latest.read().strip()
            finally:
                latest.close()
        except IOError:
            print("WARNING: " + filename + " does not have a latest version.")
            continue

        source = minidom.parse(fullpath + "\\versions\\" + version + ".xml")
        try:
            assert source.documentElement.tagName == "page-data"
            fields = source.documentElement.getElementsByTagName("field")
            name = GetNode(fields, "name", "page.name").firstChild.data
            doku_namespace = ""
            doku_name = ""
            if name.find(":") != -1:
                doku_namespace, doku_name = name.split(":")
                doku_namespace = "\\" + doku_namespace
            else:
                doku_name = name
            doku_namespace = doku_namespace.lower().replace(" ", "_")
            doku_name = doku_name.lower().replace(" ", "_")
            if not os.path.isdir(doku_dir + "\\pages" + doku_namespace):
                os.mkdir(doku_dir + "\\pages" + doku_namespace)

            user = GetNode(fields, "name", "page.last-edit-username").firstChild.data.lower()
            date = GetNode(fields, "name", "page.last-edit-server-time").firstChild.data
            date = str(int(time.mktime(time.strptime(date, '%d/%b/%y %H:%M:%S'))))
            page = GetNode(fields, "name", "page.contents")
            attachments = GetNode(fields, "name", "page.attachments")

            # Write UTF-8 so characters outside ASCII do not abort the import.
            # codecs.open accepts both byte and unicode strings on Python 2.
            doku_page = codecs.open(
                doku_dir + "\\pages" + doku_namespace + "\\" + doku_name + ".txt",
                "w", encoding="utf-8")
            try:
                attached_images = []
                ParsePage(page, doku_page, doku_namespace, doku_name)
                if attachments:
                    CopyAttachments(attachments, fullpath + "\\versions",
                                    doku_page, doku_namespace)
            finally:
                doku_page.close()
        finally:
            # Break the DOM's reference cycles even when conversion fails.
            source.unlink()

        changes_log.write("\t".join([date,
                                     "127.0.0.1",
                                     doku_namespace[1:] + ":" + doku_name,
                                     user[user.find(":")+1:],
                                     "imported"]) + '\n')


def _PrintUsage():
    """Print the command line usage text."""
    print("usage: " + sys.argv[0] + " perspective_dir dokuwiki_dir")
    for line in _USAGE_TAIL:
        print(line)


def main(args):
    """Import a Perspective wiki into DokuWiki.

    args -- [perspective_dir, dokuwiki_dir].  Any other length prints the
            usage text and returns without converting anything.
    """
    global doku_dir
    global changes_log

    if len(args) != 2:
        _PrintUsage()
        return
    source_dir = args[0]
    doku_dir = args[1] + "\\data"
    changes_log = codecs.open(doku_dir + "\\imported.log", "w", encoding="utf-8")
    try:
        Walk(source_dir)
    finally:
        changes_log.close()
    print("Finished parsing " + source_dir)


if __name__ == '__main__':
    if len(sys.argv) == 3:
        main(sys.argv[1:])
    else:
        _PrintUsage()