#! /bin/perl
#
#   De-moron-ise Text from Microsoft Applications
#
#   by John Walker -- January 1998
#   http://www.fourmilab.ch/
#
#   This program is in the public domain.
#
#   This is the Unmoroniser fork
#   Changelog:
#       June 2003: Unicode added by Charlie Loyd
#
#   Usage: demoroniser [ options ] [ infile [ outfile ] ]
#
#   With no infile the text is read from standard input; with no
#   outfile the result is written to standard output.
#
#   All script-level variables are deliberately left as package
#   globals: demoronise reads $ifname, $iline and $oline, and other
#   subs later in the file may read the remaining ones.

$lineWrap   = 72;       # Wrap lines at this column
$lineBreak1 = '[<]';    # Line break first pass candidates
$lineBreak2 = '[>]';    # Line break second pass candidates

#   Process command line options
#
#   Every argument that starts with "-" is removed from @ARGV so that
#   only file names remain.  A lone "-" ends option processing.
#   Option letters other than u, ? and w are silently ignored.

for ($i = 0; $i <= $#ARGV; $i++) {
    next unless $ARGV[$i] =~ m/^-/;

    $o = $ARGV[$i];
    splice(@ARGV, $i, 1);
    $i--;                   # Re-examine the slot the splice just refilled
    last if length($o) == 1;

    $opt = substr($o, 1, 1);
    $arg = substr($o, 2);

    #   -u  --  Print how-to-call information
    if ($opt eq 'u' || $opt eq '?') {
        print("Usage: demoroniser [ options ] infile outfile\n");
        print(" Options:\n");
        print(" -u Print this message.\n");
        print(" -wcols Wrap lines at cols columns, 0 = no wrap.\n");
        exit(0);

    #   -wcols  --  Wrap lines at cols columns, 0 = no wrap
    } elsif ($opt eq 'w') {
        if ($arg =~ m/^\d+$/ && $arg >= 0) {
            $lineWrap = $arg;
            if ($lineWrap == 0) {
                $lineWrap = 1 << 31;    # "No wrap" is a very large column
            }
        } else {
            die("Invalid wrap length '$arg' in -w option.\n");
        }
    }
}

#   Open input and output files
#
#   Three-argument open is used so that a file name is always taken
#   literally.  With the two-argument form a name beginning with ">"
#   or "&", or carrying leading/trailing blanks, changes the open mode
#   or is silently trimmed.

$if = \*STDIN;
$of = \*STDOUT;
$ifname = "(stdin)";
$ofname = "(stdout)";

if ($#ARGV >= 0) {
    open(my $in, '<', $ARGV[0])
        or die("Cannot open input file $ARGV[0]: $!\n");
    $if = $in;
    $ifname = $ARGV[0];
}

if ($#ARGV >= 1) {
    open(my $out, '>', $ARGV[1])
        or die("Cannot open output file $ARGV[1]: $!\n");
    $of = $out;
    $ofname = $ARGV[1];
}

#   Translate the input one line at a time.  $iline counts input lines;
#   $oline is only initialised here.

$iline = 0;
$oline = 0;

while ($l = <$if>) {
    $iline++;
    $l1 = demoronise($l);
    printWrap($l1);
}

close($if);

#   Buffered write errors (full disk, broken pipe) only surface when
#   the handle is closed, so the result must be checked.
close($of)
    or die("Cannot close output file $ofname: $!\n");

# NOTE(review): the collapsed text on the next line is the opening of sub demoronise; the definition continues on the following lines and is carried over here exactly as found.
# demoronise -- Translate moronic Microsoft bit-drool into # vaguely readable and compatible HTML. sub demoronise { local($s) = @_; local($i, $c); # Eliminate idiot MS-DOS carriage returns from line terminator $s =~ s/\s+$//; $s .= "\n"; # Fix strategically non-standard characters 0x82 through 0x9f. # Unicode! 
$s =~ s/\x80/€/g; # Euro currency symbol (looks like e) $s =~ s/\x82/‚/g; # single low open quote (looks like ,) $s =~ s/\x83/ƒ/g; # function, folder, and florin symbol (looks like f) $s =~ s/\x84/„/g; # double low open quote (looks like ,,) $s =~ s/\x85/…/g; # horizontal ellipsis (looks like ...) $s =~ s/\x86/†/g; # dagger symbol (death or second footnote) $s =~ s/\x87/‡/g; # double dagger symbol (third footnote) $s =~ s/\x88/ˆ/g; # empty circumflex accent (looks like ^) $s =~ s/\x89/‰/g; # per-thousand symbol (looks like %0) $s =~ s/\x8a/Š/g; # capital s with caron (looks like S + v) $s =~ s/\x8b/‹/g; # left single angle quote (looks like less-than) $s =~ s/\x8c/Œ/g; # capital o-e ligature (looks like Oe) $s =~ s/\x8e/Ž/g; # capital z with caron (looks like Z + v) $s =~ s/\x91/‘/g; # left single quote (looks like `) $s =~ s/\x92/’/g; # right single quote (looks like ') $s =~ s/\x93/“/g; # left double quote (looks like ``) $s =~ s/\x94/”/g; # right double quote (looks like ") $s =~ s/\x95/•/g; # bullet (dot for lists) $s =~ s/\x96/–/g; # en dash (looks like -) $s =~ s/\x97/—/g; # em dash (looks like --) $s =~ s/\x98/˜/g; # small tilde (looks like ~) $s =~ s/\x99/™/g; # trademark symbol (looks like TM) $s =~ s/\x9a/š/g; # lowercase s with caron (looks like s + v) $s =~ s/\x9b/›/g; # right single angle quote (looks like greater-than) $s =~ s/\x9c/œ/g; # lowercase o-e ligature (looks like oe) $s =~ s/\x9e/ž/g; # lowercase z with caron (looks like z + v) $s =~ s/\x9f/Ÿ/g; # capital y with diaeresis or umlaut (looks like Y + ") # That was Unicode. # Now check for any remaining untranslated characters. 
if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) { for ($i = 0; $i < length($s); $i++) { $c = substr($s, $i, 1); if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) { printf(STDERR "$ifname: warning--untranslated character 0x%02X in input line %d, output line(s) %d(...).\n", unpack('C', $c), $iline, $oline + 1); } } } # Supply missing semicolon at end of numeric entity if # Billy's bozos left it out. $s =~ s/([0-2]\d\d)\s/$1; /g; # Fix dimbulb obscure numeric rendering of < > & $s =~ s/&/&/g; $s =~ s/</</g; $s =~ s/>/>/g; # Fix unquoted non-alphanumeric characters in table tags $s =~ s/(
- |