)(\/\/+)(<\/nowiki>)([^\]]*)/\1\3\5/g
p
' \
| sed -r -n '
# See also: http://www.grymoire.com/Unix/Sed.html#uh-40
# http://en.wikipedia.org/wiki/Regular_expression
# This is pretty advanced sed syntax, so I ll try to explain as much as possible
################################################################################
# if line starts with a space, add it to the hold buffer
# we do this by 'branching' to :addtopre
/^ [ ]*[^ ][^ ]*/ b addtopre
# if line has only whitespace or is empty, the preformatted block is over, so we surround that with
# we do this by 'branching' to :outputpre
/^[ ]*$/ b outputpre
# if line starts with NO whitespace, the preformatted block is over, so we surround that with
/^[^ ].*$/ b outputpre
#else this is a normal line
#s/(.*)/NORMAL LINE: \1/g; p
# print the line
p
#delete the current pattern space (so new cycle is started -> jumps to top)
d
# this is a line that should be part of a CODE block
:addtopre
#add it to the hold buffer
H
#s/(.*)/ADDED LINE: \1/g; p
# if this is the last line of the file (end-of-file), empty this line and then output this last preformatted block
$ { s/.*//g
b outputpre
}
#delete the current pattern space (so new cycle is started -> jumps to top)
d
# this is where a paragraph is surrounded by
:outputpre
#s/(.*)/END OF CODE LINE: \1/g; p
# HOLD buffer is exchanged with the pattern space
x
# IF not empty, surround with and PRINT the pattern space
/(.+)/ {
# surround it with
s/(.+)/\1<\/pre>/g
p
}
# exchange pattern space and hold buffer again, pattern is now the current line (not part of the preformatted block) and PRINT this line
x
p
#delete the current pattern space
s/.*//g
#and exchange this again with the hold buffer, so that the hold buffer is empty again
x
#delete the current pattern space (so new cycle is started -> jumps to top)
d
' \
> mediawiki0
# Headings
# MediaWiki's "= x =" is the top heading level, DokuWiki's top level is
# "====== x ======", so the number of '=' has to be inverted. This is done in
# two passes through temporary HTML-style tags.

# Pass 1 (stdin -> stdout): replace the N leading and N trailing '=' of a
# heading line with <hN> and </hN>, for N = 1..6. The [^=] guard keeps the
# rule for level N from matching part of a longer run of '='.
m2d_sed_tag_headings() {
  sed -r \
    -e 's/^[ ]*=([^=])/<h1> \1/g' \
    -e 's/([^=])=[ ]*$/\1 <\/h1>/g' \
    -e 's/^[ ]*==([^=])/<h2> \1/g' \
    -e 's/([^=])==[ ]*$/\1 <\/h2>/g' \
    -e 's/^[ ]*===([^=])/<h3> \1/g' \
    -e 's/([^=])===[ ]*$/\1 <\/h3>/g' \
    -e 's/^[ ]*====([^=])/<h4> \1/g' \
    -e 's/([^=])====[ ]*$/\1 <\/h4>/g' \
    -e 's/^[ ]*=====([^=])/<h5> \1/g' \
    -e 's/([^=])=====[ ]*$/\1 <\/h5>/g' \
    -e 's/^[ ]*======([^=])/<h6> \1/g' \
    -e 's/([^=])======[ ]*$/\1 <\/h6>/g'
}

# Pass 2 (stdin -> stdout): replace every <hN> and </hN> with (7 - N) '='.
m2d_sed_untag_headings() {
  sed -r \
    -e 's/<\/?h1>/======/g' \
    -e 's/<\/?h2>/=====/g' \
    -e 's/<\/?h3>/====/g' \
    -e 's/<\/?h4>/===/g' \
    -e 's/<\/?h5>/==/g' \
    -e 's/<\/?h6>/=/g'
}

m2d_sed_tag_headings < mediawiki0 > mediawiki1
m2d_sed_untag_headings < mediawiki1 > mediawiki2
# lists
# MediaWiki marks the nesting depth by repeating the marker at the start of
# the line ("**", "*#", ...); the last marker decides the list type.
# DokuWiki wants two spaces of indentation per level followed by "*"
# (unordered) or "-" (ordered).
# Rules are ordered deepest-first and both marker types are handled at each
# depth, so a short rule such as ^\* can never consume the first character of
# a deeper, mixed marker like "*#". Depths 1 to 5 are supported.
# Reads stdin, writes stdout.
m2d_sed_lists() {
  sed -r \
    -e 's/^[*#][*#][*#][*#]\*/          * /' \
    -e 's/^[*#][*#][*#][*#]#/          - /' \
    -e 's/^[*#][*#][*#]\*/        * /' \
    -e 's/^[*#][*#][*#]#/        - /' \
    -e 's/^[*#][*#]\*/      * /' \
    -e 's/^[*#][*#]#/      - /' \
    -e 's/^[*#]\*/    * /' \
    -e 's/^[*#]#/    - /' \
    -e 's/^\*/  * /' \
    -e 's/^#/  - /'
}

m2d_sed_lists < mediawiki2 > mediawiki3
# External links, two steps with one intermediate file each:
#   [url text] => [url|text]   (mediawiki3 -> mediawiki4)
#   [link]     => [[link]]     (mediawiki4 -> mediawiki5)
# Both expressions only look at single brackets: the character before "["
# must not be "[" and the character after "]" must not be "]".
pipe_after_url='s/([^[]|^)(\[[^] ]*) ([^]]*\])([^]]|$)/\1\2|\3\4/g'
double_brackets='s/([^[]|^)(\[[^]]*\])([^]]|$)/\1[\2]\3/g'

sed -r -e "$pipe_after_url" mediawiki3 > mediawiki4
sed -r -e "$double_brackets" mediawiki4 > mediawiki5
# bold, italic
# MediaWiki: '''''x''''' = bold italic, '''x''' = bold, ''x'' = italic.
# DokuWiki:  //**x**//,                 **x**,          //x//.
# The five-quote form is handled first so that its markers come out properly
# nested. Its content is matched as "anything that contains no run of five
# quotes" (single characters, or 1-4 quotes followed by a non-quote); a plain
# greedy .* would swallow everything between the first and the last span on a
# line that holds several of them.
# Reads stdin, writes stdout.
m2d_sed_emphasis() {
  sed -r \
    -e "s/'''''(([^']|'{1,4}[^'])*)'''''/\/\/**\1**\/\//g" \
    -e "s/'''/**/g" \
    -e "s/''/\/\//g"
}

m2d_sed_emphasis < mediawiki5 > mediawiki6
# talks
# MediaWiki indents replies with leading colons (":", "::", ...); DokuWiki
# quotes them with the same number of ">".
# The first rule turns the first colon (optionally preceded by spaces) into
# ">". The loop then converts one further colon per iteration, but only while
# it directly follows the run of ">" at the start of the line, so a ">:" that
# occurs later in the text is left untouched and any depth is supported.
# Reads stdin, writes stdout.
m2d_sed_talks() {
  sed -r \
    -e 's/^[ ]*:/>/' \
    -e ':deeper' \
    -e 's/^(>+):/\1>/' \
    -e 'tdeeper'
}

m2d_sed_talks < mediawiki6 > mediawiki7
# Inline code and preformatted blocks. The order of the two steps matters:
# existing <code> tags are converted first, so the <code> tags produced from
# <pre> in the second step stay as they are.

# <code>x</code> => ''x'' (DokuWiki monospace). Reads stdin, writes stdout.
m2d_sed_inline_code() {
  sed -r \
    -e "s/<code>/''/g" \
    -e "s/<\/code>/''/g"
}

# <pre> ... </pre> => <code> ... </code>. Reads stdin, writes stdout.
m2d_sed_pre_blocks() {
  sed -r \
    -e 's/<pre>/<code>/g' \
    -e 's/<\/pre>/<\/code>/g'
}

m2d_sed_inline_code < mediawiki7 > mediawiki8
m2d_sed_pre_blocks < mediawiki8 > mediawiki9
#100720-MSe: remove "<\code>\n \n"
# Multi-line sed over mediawiki9, result in mediawiki10:
#   N;N  append the next two input lines to the pattern space
#   s    delete the first "</code>" that is followed by a newline, a line
#        made only of characters from [ \t], and another newline
#   P    print the pattern space up to its first newline
#   D    delete up to the first newline and restart the script on what is
#        left; the second D is never reached because D always ends the cycle
# NOTE(review): [ \t] relies on sed reading \t as a tab (GNU sed does) -
# confirm if another sed is used.
# NOTE(review): as written the match removes the closing </code> itself
# together with the blank line. The pattern may originally have ended in an
# opening tag (merging two adjacent blocks) - confirm against the upstream
# script before relying on this step.
cat mediawiki9 \
| sed 'N;N;s/<\/code>\n[ \t]*\n//;P;D;D;' \
> mediawiki10
#cat mediawiki10 > dokuwiki
# font (color, ...)
# Intended to rewrite <span ...> styling into <font ...> tags; only the
# closing-tag rule (</span> => </font>) is readable here.
# NOTE(review): the first and third sed expressions have an empty pattern and
# an empty replacement ('s///g'). An empty pattern means "reuse the previous
# regular expression", and each of these sed processes has none, so expect
# sed to reject them and mediawiki11 (and therefore dokuwiki) to end up
# empty. The original patterns appear to have been lost - restore them from
# the upstream script before using this step.
cat mediawiki10 \
| sed 's///g' \
| sed 's/<\/span>/<\/font>/g' \
| sed 's///g' \
> mediawiki11
# The last intermediate file is the final result.
cat mediawiki11 > dokuwiki
There is also an issue when bold and italic text are combined. I tested with the German Wikipedia article on UNIX, and two tags caused large parts of the generated DokuWiki page to be rendered in bold.
The following change fixes this behaviour:
$ diff mediawiki2dokuwiki.sh mediawiki2dokuwiki.sh.080925-1
7d6
< # changes by Reiner Rottmann: - fixed erroneous interpretation of combined bold and italic text.
165,169c164
<
< cat mediawiki9 \
< | sed -r "s/\*\*\/\//\/\/\*\*/g"> mediawiki10
<
< cat mediawiki10 > dokuwiki
---
> cat mediawiki9 > dokuwiki
===== Automatic script =====
This script gets the contents of your MediaWiki (via a database connection) and converts them to DokuWiki syntax.
It works out of the box; you only need to configure the database name and password.
Example:
cd mediawiki2dokuwiki
./getContent.sh
mv old /var/www/dokuwiki/data/pages
chmod a+r /var/www/dokuwiki/data/pages/old -R
Here you can find the package: http://dabax.net/files/mediawiki2dokuwiki.tar.gz (this link is broken as of 9/3/2012)
This is the main script.
#!/bin/bash
# getContent.sh - dump every page of a MediaWiki database and convert it to
# DokuWiki syntax.
#
# Page titles are read from the `cur` table. The text of each page is piped
# into $PHARSER, which is called with the destination file name as its only
# argument. Afterwards every file in $DEST with fewer than 25 words is deleted.

#About your mediawiki
WIKIDB="DATABSE_NAME"
WIKIPASS="DATABASE_PASSWORD"
#The destination folder
DEST="old"
#Dont touch this
# (TITLES is not referenced anywhere in this script.)
TITLES="titles"
PHARSER="./m2d.sh"

# Run mysql against $WIKIDB with the given options and print the result rows
# without the column-name header line.
# NOTE: the password is passed on the command line and is therefore visible
# in the process list while mysql runs.
mysql_wiki() {
  mysql --password="$WIKIPASS" --skip-column-names "$@" "$WIKIDB"
}

# Delete every regular file in directory $1 that has fewer than 25 words.
remove_short_pages() {
  local dir=$1 f words
  for f in "$dir"/*; do
    # In an empty directory the pattern stays unexpanded; skip that literal
    # (and anything else that is not a regular file).
    [[ -f "$f" ]] || continue
    words=$(wc -w < "$f")
    if (( words < 25 )); then
      echo "Deleting $f, too short"
      rm -f -- "$f"
    fi
  done
}

mkdir -p -- "$DEST"

quote="'"
mysql_wiki -e 'select cur_title from cur;' |
  while IFS= read -r title; do
    newtitle="$(printf '%s' "$title" | tr '[:upper:]' '[:lower:]').txt"
    printf '%s\n' "$newtitle"
    # Double single quotes so the title is a valid SQL string literal. The
    # backslash escapes of mysql's batch output are kept on purpose (read -r):
    # they are also valid inside an SQL string literal.
    sql_title=${title//$quote/$quote$quote}
    # --raw prints the text without batch-mode escaping, i.e. with real
    # newlines, so no "\n" post-processing is needed.
    mysql_wiki --raw -e "select cur_text from cur where cur_title='$sql_title';" < /dev/null |
      "$PHARSER" "$DEST/$newtitle"
  done

remove_short_pages "$DEST"

echo ""
echo "Done. Put the contents of $DEST to Path_Of_dokuwiki/data/pages/"
m2d.sh is the "sed version" published in this page, with some modifications.
Enjoy it!
==== ERROR 1146 (42S02) at line 1: Table 'dbname.cur' doesn't exist ====
Hi,
Using the above attached code, I'm having some problems:
ERROR 1146 (42S02) at line 1: Table 'pswiki.cur' doesn't exist
cat: old/*: No such file or directory
Deleting old/*, too short
Done. Put the contents of old to Path_Of_dokuwiki/data/pages/
(where pswiki is the database containing my MW data, which came from a Windows-based MySQL server via mysqldump.) The directory ./old is empty.
If I turn on MySQL statement logging:
SET GLOBAL general_log_file='/var/log/mysql/sql.log';
SET GLOBAL general_log='ON';
then I get the following only in that log file when I run ./getContent.sh:
110216 2:11:17 75 Connect root@localhost on pswiki
75 Query select @@version_comment limit 1
75 Query select cur_title from cur
75 Quit
It looks to me as if what's intended to be a cursor is being interpreted as a literal table name, but I'm getting out of my depth there. I have tried back-ticking `cur` in getContent.sh to no avail.
Can anyone shed light?
Thanks!
--- [[user>tomgreen|tomgreen]] //2011/02/16 03:06//
EDIT: I've used the web-based service linked above, and apart from some table funnies it worked very well - a lifesaver. Thanks!
--- [[user>tomgreen|tomgreen]] //2011/02/18 03:26//
I have solved the "cur problem". In my opinion, the script expects "cur" to exist as a VIEW.
Create this view, named "cur", with the following SQL statement:
-- Emulate the old `cur` table: one row per page holding the text of the
-- page's latest revision. The text row is reached through the revision table
-- (page.page_latest -> revision.rev_id, revision.rev_text_id -> text.old_id);
-- page_id and old_id are unrelated counters and only agree by coincidence.
-- NOTE(review): assumes the mw_ table prefix also applies to the revision
-- table and a schema that still has revision.rev_text_id - confirm.
CREATE VIEW cur AS SELECT mw_page.page_title AS cur_title, mw_text.old_text AS cur_text
FROM mw_page
JOIN mw_revision ON mw_revision.rev_id=mw_page.page_latest
JOIN mw_text ON mw_text.old_id=mw_revision.rev_text_id;
and re-run the shell.
Hope this helps!
--- [[user>gtournat|gtournat]] //2011/11/06 13:16//
====== Mediawiki 2 Dokuwiki Converter ======
#! /bin/sh
# Mediawiki2Dokuwiki Converter
# originally by Johannes Buchner
# License: GPL (http://www.gnu.org/licenses/gpl.txt)
# Headings
cat mediawiki | \
perl -pe 's/^[ ]*=([^=])/ ${1}/g' | \
perl -pe 's/([^=])=[ ]*$/${1} <\/h1>/g' | \
perl -pe 's/^[ ]*==([^=])/ ${1}/g' | \
perl -pe 's/([^=])==[ ]*$/${1} <\/h2>/g' | \
perl -pe 's/^[ ]*===([^=])/ ${1}/g' | \
perl -pe 's/([^=])===[ ]*$/${1} <\/h3>/g' | \
perl -pe 's/^[ ]*====([^=])/ ${1}/g' | \
perl -pe 's/([^=])====[ ]*$/${1} <\/h4>/g' | \
perl -pe 's/^[ ]*=====([^=])/ ${1}/g' | \
perl -pe 's/([^=])=====[ ]*$/${1} <\/h5>/g' | \
perl -pe 's/^[ ]*======([^=])/ ${1}/g' | \
perl -pe 's/([^=])======[ ]*$/${1} <\/h6>/g' \
> mediawiki1
cat mediawiki1 | \
perl -pe 's/<\/?h1>/======/g' | \
perl -pe 's/<\/?h2>/=====/g' | \
perl -pe 's/<\/?h3>/====/g' | \
perl -pe 's/<\/?h4>/===/g' | \
perl -pe 's/<\/?h5>/==/g' | \
perl -pe 's/<\/?h6>/=/g' | \
cat > mediawiki2
# lists
cat mediawiki2 |
perl -pe 's/^[\*#]{4}\*/ * /g' | \
perl -pe 's/^[\*#]{3}\*/ * /g' | \
perl -pe 's/^[\*#]{2}\*/ * /g' | \
perl -pe 's/^[\*#]{1}\*/ * /g' | \
perl -pe 's/^\*/ * /g' | \
perl -pe 's/^[\*#]{4}#/ \- /g' | \
perl -pe 's/^[\*\#]{3}\#/ \- /g' | \
perl -pe 's/^[\*\#]{2}\#/ \- /g' | \
perl -pe 's/^[\*\#]{1}\#/ \- /g' | \
perl -pe 's/^\#/ - /g' | \
cat > mediawiki3
#[link] => [[link]]
# One perl run applies the four bracket rules to each line in turn:
# "[" inside a line, "[" at the start of a line, "]" inside a line and "]"
# at the end of a line. Each rule requires the neighbouring character not to
# be the same bracket.
perl -pe '
  s/([^\[])\[([^\[])/${1}[[${2}/g;
  s/^\[([^\[])/[[${1}/g;
  s/([^\]])\]([^\]])/${1}]]${2}/g;
  s/([^\]])\]$/${1}]]/g;
' mediawiki3 > mediawiki4
#[[url text]] => [[url|text]]
# Replace the first space inside a double-bracket link with "|", provided
# the link does not already contain a "|".
url_text_rule='s/(\[\[[^| \]]*) ([^|\]]*\]\])/${1}|${2}/g'
perl -pe "$url_text_rule" mediawiki4 > mediawiki5
# bold, italic
cat mediawiki5 |
perl -pe "s/'''/**/g" |
perl -pe "s/''/\/\//g" \
> mediawiki6
# talks
cat mediawiki6 |
perl -pe "s/^[ ]*:/>/g" |
perl -pe "s/>:/>>/g" |
perl -pe "s/>>:/>>>/g" |
perl -pe "s/>>>:/>>>>/g" |
perl -pe "s/>>>>:/>>>>>/g" |
perl -pe "s/>>>>>:/>>>>>>/g" |
perl -pe "s/>>>>>>:/>>>>>>>/g" \
> mediawiki7
cat mediawiki7 |
perl -pe "s///g" |
perl -pe "s/<\/pre>/<\/code>/g" \
> mediawiki8
cat mediawiki8 > dokuwiki