#! /bin/perl
#
#           De-moron-ise Text from Microsoft Applications
# 
#                   by John Walker -- January 1998
#                      http://www.fourmilab.ch/
#
#               This program is in the public domain.
#
# This is the Unmoroniser fork
# Changelog:
# June 2003: Unicode added by Charlie Loyd

    $lineWrap = 72;                   # Wrap lines at this column
    $lineBreak1 = '[<]';              # Line break first pass candidates
    $lineBreak2 = '[>]';              # Line break second pass candidates

    #   Process command line options

    for ($i = 0; $i <= $#ARGV; $i++) {
        if ($ARGV[$i] =~ m/^-/) {
            $o = $ARGV[$i];
            splice(@ARGV, $i, 1);
            $i--;
            if (length($o) == 1) {
                last;
            }
            $opt = substr($o, 1, 1);
            $arg = substr($o, 2);

            #   -u                  -- Print how-to-call information

            if ($opt eq 'u' || $opt eq '?') {
                print("Usage: demoroniser [ options ] infile outfile\n");
                print("       Options:\n");
                print("             -u              Print this message.\n");
                print("             -wcols          Wrap lines at cols columns, 0 = no wrap.\n");
                exit(0);

            #   -wcols              -- Wrap lines at cols columns, 0 = no wrap

            } elsif ($opt eq 'w') {
                if ($arg =~ m/^\d+$/ && $arg >= 0) {
                    $lineWrap = $arg;
                    if ($lineWrap == 0) {
                        $lineWrap = 1 << 31;
                    }
                } else {
                    die("Invalid wrap length '$arg' in -w option.\n");
                }
            }
        }
    }

    #   Open input and output files

    $if = STDIN;
    $of = STDOUT;
    $ifname = "(stdin)";
    if ($#ARGV >= 0) {
        $if = IF;
        open($if, "<$ARGV[0]") || die("Cannot open input file $ARGV[0]: $!\n");
        $ifname = $ARGV[0];
    }
    if ($#ARGV >= 1) {
        $of = OF;
        open($of, ">$ARGV[1]") || die("Cannot open output file $ARGV[1]: $!\n");
    }

    $iline = 0;
    $oline = 0;

    while ($l = <$if>) {
        $iline++;

        $l1 = &demoronise($l);
        &printWrap($l1);
    }

    close($if);
    close($of);

#   demoronise  --  Translate moronic Microsoft bit-drool into
#                   vaguely readable and compatible HTML.

sub demoronise {
    local($s) = @_;
    local($i, $c);

    #   Eliminate idiot MS-DOS carriage returns from line terminator

    $s =~ s/\s+$//;
    $s .= "\n";

    #   Fix strategically non-standard characters 0x82 through 0x9f.
    #   Unicode!

	$s =~ s/\x80/&euro;/g; # Euro currency symbol (looks like e)
	$s =~ s/\x82/&sbquo;/g; # single low open quote (looks like ,)
	$s =~ s/\x83/&fnof;/g; # function, folder, and florin symbol (looks like f)
	$s =~ s/\x84/&bdquo;/g; # double low open quote (looks like ,,)
	$s =~ s/\x85/&hellip;/g; # horizontal ellipsis (looks like ...)
	$s =~ s/\x86/&dagger;/g; # dagger symbol (death or second footnote)
	$s =~ s/\x87/&Dagger;/g; # double dagger symbol (third footnote)
	$s =~ s/\x88/&circ;/g; # empty circumflex accent (looks like ^)
	$s =~ s/\x89/&permil;/g; # per-thousand symbol (looks like %0)
	$s =~ s/\x8a/&Scaron;/g; # capital s with caron (looks like S + v)
	$s =~ s/\x8b/&lsaquo;/g; # left single angle quote (looks like less-than)
	$s =~ s/\x8c/&OElig;/g; # capital o-e ligature (looks like Oe)
	$s =~ s/\x8e/&#x017d;/g; # capital z with caron (looks like Z + v)
	$s =~ s/\x91/&lsquo;/g; # left single quote (looks like `)
	$s =~ s/\x92/&rsquo;/g; # right single quote (looks like ')
	$s =~ s/\x93/&ldquo;/g; # left double quote (looks like ``)
	$s =~ s/\x94/&rdquo;/g; # right double quote (looks like ")
	$s =~ s/\x95/&bull;/g; # bullet (dot for lists)
	$s =~ s/\x96/&ndash;/g; # en dash (looks like -)
	$s =~ s/\x97/&mdash;/g; # em dash (looks like --)
	$s =~ s/\x98/&tilde;/g; # small tilde (looks like ~)
	$s =~ s/\x99/&trade;/g; # trademark symbol (looks like TM)
	$s =~ s/\x9a/&scaron;/g; # lowercase s with caron (looks like s + v)
	$s =~ s/\x9b/&rsaquo;/g; # right single angle quote (looks like greater-than)
	$s =~ s/\x9c/&oelig;/g; # lowercase o-e ligature (looks like oe)
	$s =~ s/\x9e/&#x017e;/g; # lowercase z with caron (looks like z + v)
	$s =~ s/\x9f/&Yuml;/g; # capital y with diaeresis or umlaut (looks like Y + ")

    #   That was Unicode.
    #   Now check for any remaining untranslated characters.

    if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) {
        for ($i = 0; $i < length($s); $i++) {
            $c = substr($s, $i, 1);
            if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) {
                printf(STDERR  "$ifname: warning--untranslated character 0x%02X in input line %d, output line(s) %d(...).\n",
                    unpack('C', $c), $iline, $oline + 1);
            }
        }
    }
    #   Supply missing semicolon at end of numeric entity if
    #   Billy's bozos left it out.

    $s =~ s/(&#[0-2]\d\d)\s/$1; /g;

    #   Fix dimbulb obscure numeric rendering of &lt; &gt; &amp;

    $s =~ s/&#038;/&amp;/g;
    $s =~ s/&#060;/&lt;/g;
    $s =~ s/&#062;/&gt;/g;

    #   Fix unquoted non-alphanumeric characters in table tags

    $s =~ s/(<TABLE\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TD\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TH\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;

    #   Correct PowerPoint mis-nesting of tags

    $s =~ s-(<Font .*>\s*<STRONG>.*)(</FONT>\s*</STRONG>)-$1</STRONG></Font>-gi;

    #   Translate bonehead PowerPoint misuse of <UL> to achieve
    #   paragraph breaks.

    $s =~ s-<P>\s*<UL>-<p>-gi;
    $s =~ s-</UL><UL>-<p>-gi;
    $s =~ s-</UL>\s*</P>--gi;

    #   Repair PowerPoint depredations in "text-only slides"

    $s =~ s-<P></P>--gi;
    $s =~ s- <TD HEIGHT=100- <tr><TD HEIGHT=100-ig;
    $s =~ s-<LI><H2>-<H2>-ig;

    $s;
}

#   printWrap  --  Print one or more lines with wrap at
#                  the specified column.

sub printWrap {
    local($s) = @_;
    local($l, $sep, $rem, $ter, $lwrap, $indent);

    #   Pick the input apart line by line and reformat each line,
    #   if necessary, so as not to exceed the maximum line length.

    $s =~ m/(\s*)(\S)/;
    $indent = $1;
    if ($2 eq '<') {
        $indent .= ' ';
    }
    while (length($s) > 0) {
        if (($s =~ s/(.*\n)//) != 1) {
            $aax = $_[0];
            print("printWrap arg = |$aax|\n");
            print("printWrap s = |$s|\n");
            $aal = length($s);
            print("printWrap length(s) = $aal\n");
            die("$ifname: Error splitting lines.");
        }
        $l = $1;

        $sep = '';
        $lwrap = '';
        while (length($l) > $lineWrap) {
            if (($l =~ s/(^.{1,$lineWrap})(\s)//o) || 
                ($l =~ s/(^.{1,$lineWrap})($lineBreak1)//o) ||
                ($l =~ s/(^.{1,$lineWrap})($lineBreak2)//o)
               ) {
                $rem = $1;
                $ter = $2;
                if ($ter =~ m/\s+/) {
                    $ter='';
                }
                $lwrap .= "$sep$rem$ter\n";
                $oline++;
                $l =~ s/^\s*//;
                $sep = $indent;
            } else {
                last;
            }
        }
        print($of "$lwrap$sep$l");
        $oline++;
    }
}
