#!/opt/perl/bin/perl -w
$vernum = "1.0f - 26 MAR 2003" ;           # Code version and modify date
#
# Convert file that has been munged by many PC-type tools so it conforms
# better to standard specs.
#
# This code copyright 1999-2003 by
# D. W. Eaton, Artronic Development, Phoenix, AZ -- dwe@arde.com
#
# This software is made freely available under the provisions of the Perl
# "Artistic" license:  http://language.perl.com/misc/Artistic.html
#
# This code is not supported and is not warranted to perform any particular
# function. Contact dwe@arde.com for additional information.
# If you find bugs or make enhancements, it would be appreciated if you
# sent them on to the author at dwe@arde.com.
#

use Getopt::Long;
#
# Constants
$false = 0;
$true  = 1;
$all   = 2;  # Getopt::Long ignorecase level: ignore case for bundled one-letter options too
use vars qw($false $true $all);
#
# Option defaults (GetOptions overwrites these from the command line).
# The Getopt::Long option spec itself (@opts) is built where the command
# line is processed, further down.
$opt{'codeswap'}     = $false; # FALSE = do not swap bad MS char codes
$opt{'help'}         = $false; # TRUE to show help message
$opt{'nowrap'}       = $false; # TRUE to suppress word wrap
$opt{'replacechr'}   = 'XXX';  # What to replace the bad characters with
$opt{'specchrswap'}  = $false; # FALSE to not transform special chars
$opt{'transphrase'}  = $false; # FALSE = do not transform MS phrases
$opt{'verbose'}      = $false; # No expanded diagnostic messages
#
# Initialize
$wrap_len            = 78 ;                # max line length

# Some special characters to watch for.
# NOTE(review): the 0x8x/0x9x values match Windows-1252 punctuation
# (0x96 en dash, 0x97 em dash, 0x92 apostrophe, 0x93/0x94 double quotes,
# 0x85 ellipsis); the 0xdx values presumably come from another PC/Mac
# code page -- confirm before changing any mapping.
$emdash2 = "\x96";  # long dash, always rewritten as "--"
$emdash = "\x97";   # long dash, always rewritten as "--"
$dashes0 = "\xd0";  # dash, rewritten as "-"   (with --specchrswap)
$dashes1 = "\xd1";  # dash, rewritten as "-"   (with --specchrswap)
$apost = "\x92";    # apostrophe, rewritten as "'"  (with --specchrswap)
$apost2 = "\x93";   # quote, rewritten as "&quot;"  (with --specchrswap)
$apost3 = "\x94";   # quote, rewritten as "&quot;"  (with --specchrswap)
$apost4 = "\xd4";   # apostrophe, rewritten as "'"  (with --specchrswap)
$apost5 = "\xd5";   # apostrophe, rewritten as "'"  (with --specchrswap)
$sngqot1 = "\xd2";  # single quote, rewritten as "`" (with --specchrswap)
$sngqot2 = "\xd3";  # single quote, rewritten as "'" (with --specchrswap)
$DOT = "\x85";   # code for "...", rewritten as three dots (with --specchrswap)
$huh = "\x06";   # Make this a space (with --specchrswap)
$zero = "\x00";  # Null (not usually found); treated as a bad char code
#
# Bad MS char code definition stuff.
# word_wrap flags 0x01, 0x00 and every byte in the inclusive range
# $startbad .. $endbad (it is used as a regex character-class range).
$firstbad = "\x01"; # 1 = first bad MS character
$startbad = "\x81"; # 129 = start bad MS character block
$endbad = "\x9c"; # 156 = end bad MS character block
$foundbadMScodes = ""; # Accumulates the line numbers where bad codes were seen

# ---------------------- logic -------------------------
#
# Process command-line
@opts = qw( codeswap|c replacechr|r=s specchrswap|s transphrase|t
            nowrap|n
            help|h verbose|v );
$Getopt::Long::bundling = $true;  # perl 5.003 and earlier will complain about this
$Getopt::Long::ignorecase = $all;

# GetOptions returns false when it meets an unknown or malformed option
# (it has already written its own diagnostic to STDERR by then).  Stop here
# rather than filtering the input with settings the user did not ask for.
unless (GetOptions (\%opt, @opts))
{
   syntax_message ();
   exit (2);
}

if ($opt{'help'})
{
   syntax_message ();
   exit (1);
}
if ($opt{'replacechr'} eq 'DEL')
{
 $opt{'replacechr'} = ''; # remove bad characters completely
}

$lineno = 0; # Count of input lines read so far (also read by word_wrap)
$line = "\n";

# Filter each input line in turn and print the cleaned-up result.
while (<>)
{
 $lineno++;
 $line = $_;

 # Alias $_ to $line for the duration of the clean-up substitutions.
 for ($line)
 {
  # Junk that is always removed:
  s/&#9\;/ /g;      # HT (tab) entities become spaces
  s/$emdash/--/g;   # em-dash becomes 2 dashes
  s/$emdash2/--/g;  # em-dash becomes 2 dashes
  # Also look at: ' for hex 92

  # Junk HTML phrases MS-type tools put in.  There may be remnants
  # left ... it is assumed you'll be hand-tweaking the file.
  if ($opt{'transphrase'})
  {
   s/<P>&nbsp;<\/P>//ig;
   s/<\/FONT>//ig;
   s/<FONT SIZE=\d>//ig;
   s/<FONT[^\>]*>//ig;
  }

  # Non-Unix line endings: CR/LF, then LF/CR, then any CR left over
  s/\r\n/\n/g;
  s/\n\r/\n/g;
  s/\r/\n/g;

  # Special characters which might have snuck in
  if ($opt{'specchrswap'})
  {
   s/$apost4/'/g;        # apostrophes
   s/$apost5/'/g;        # apostrophes
   s/$dashes0/-/g;       # dashes
   s/$dashes1/-/g;       # dashes
   s/$sngqot1/`/g;       # single quotes
   s/$sngqot2/'/g;       # single quotes
   s/$DOT/.../g;         # '...'
   s/$apost/'/g;         # '
   s/$apost2/&quot;/g;   # quote entity
   s/$apost3/&quot;/g;   # quote entity
   s/$huh/ /g;           # ???
  }
 }

 # Bad-code handling and (optional) word wrap
 $new = &word_wrap ($line);

 # Verbose mode prefixes every output line with its input line number
 print $opt{'verbose'} ? "$lineno: $new" : "$new";
}

# Summarize on STDERR the input lines on which bad MS char codes were
# seen, then finish.
if ($foundbadMScodes)
{
 # "Change" turns into "Changed to '<replacement>'" when --codeswap was on
 my $swap_note = $opt{'codeswap'} ? "d to '$opt{'replacechr'}'" : "";

 print STDERR "WARNING: Found bad MS char codes on lines\n"
            . "$foundbadMScodes\n"
            . "Change" . $swap_note . " with '--codeswap' argument\n";
}

exit (0);

# ----------
#
# Try to insert strategically-placed newlines to accomplish word-wrap
# and do character swaps (subject to command line options).
#
# Argument: one line of text (any trailing whitespace is discarded).
# Returns:  the processed text with a single "\n" appended.
#
# Reads the globals %opt (codeswap, replacechr, nowrap, verbose),
# $wrap_len, $startbad, $endbad, $firstbad, $zero and $lineno.
# Appends "$lineno " to $foundbadMScodes when the text contains a bad code.
sub word_wrap
{
   my ($message) = @_ ;

   # Continuation lines repeat the leading run of '*', blanks and tabs
   my ($indent) = $message =~ /^([\* \t]*)/ ;
   my $sep      = "\n$indent" ;
   my $lensep   = length ($sep) ;

   # Strip off all whitespace stuff at end (no need to wrap that)
   $message =~ s/\s+$// ;

   # Look for zero bytes and those bad MS char codes foisted on us from
   # FrontPage.  One pattern serves both detection and replacement so that
   # every code that gets reported (including $firstbad) is also swapped.
   my $bad_code = qr/[$startbad-$endbad]|$firstbad|$zero/ ;
   if ($message =~ $bad_code)
   {
    $foundbadMScodes .= "$lineno " ;
    if ($opt{'codeswap'})
    {
     $message =~ s/$bad_code/$opt{'replacechr'}/g ;
    }
   }

   unless ($opt{'nowrap'})
   {
    # $start_col is the last column that belongs to the previous line
    # (column 0 for the first line); columns at or before it are never
    # considered as break points.
    my $start_col = 0 ;
    while (1)
    {
      my $limit = $start_col + $wrap_len ;
      last if $limit > length ($message) ;      # remainder already fits

      # Prefer the last whitespace at or before the limit
      my $col = $limit ;
      --$col while ($col > $start_col && substr ($message, $col, 1) !~ /\s/) ;

      if ($col <= $start_col)
      {
        # A long phrase that can't wrap: leave it intact and break at the
        # first whitespace after it, or stop if there is none.
        last unless substr ($message, $limit) =~ /\s/ ;
        $col = $limit + $-[0] ;
      }

      # Keep the whitespace character and start a new line right after it
      # (the dangling whitespace is trimmed further down).
      substr ($message, $col + 1, 0, $sep) ;
      $start_col = $col + $lensep ;
      if ($opt{'verbose'})
      {
       print "$col: $message" ;
      }
    }
   }

   # Put every tag starting with "<p" on its own line, then collapse the
   # runs of newlines that this can create.
   if ($message =~ /\<p[ \>]/)
   {
    $message =~ s/\<p/\n\<p/g ;
    $message =~ s/\n[\n]*/\n/g ;
   }
   $message =~ s/\s+\n/\n/g ; # Remove trailing white space on new lines
   $message .= "\n" ;
   return $message ;
}
# --------------------
#
# Print syntax message
#
# Writes the usage/help text to STDERR.  Takes no arguments; reads the
# global $vernum for the version banner and $0 for the program name.
sub syntax_message
{
   # Take the program name from $0 in-process: shelling out to basename(1)
   # with an unquoted $0 breaks on paths containing blanks or shell
   # metacharacters.
   require File::Basename;
   my $progname = File::Basename::basename ($0);

   # A here-document is used because a bare qq followed only by whitespace
   # makes Perl take the next non-blank character ('$') as the delimiter.
   print STDERR <<"END_OF_SYNTAX";

$progname $vernum;

Sanitize output of some (usually PC) tools

Many PC tools use bad character ranges or bad HTML phrases when they
convert word processing files. This application trys to recognize and
remove the commonly found "junk" for more standard treatment. It is
expected you will review and further hand-modify the file before posting.

The result is wordwrapped.

Syntax:

   $progname [ options ] [pathname(s)]

Where 'options' are:

   -c, --codeswap
       Swap bad character codes encountered
   -h, --help
       Print this help message
   -n, --nowrap
       Do not word-wrap the results. (Default is to wrap.)
   -r, --replacechr=s
       What to replace the bad characters with -- default is 'XXX'.
       If this is set to 'DEL', remove bad characters completely.
   -s, --specchrswap
       Transform special characters to meaningful ones.
       (Refer to logic for details.)
   -t, --transphrase
       Transform extraneous or usualy faulty HTML phrases.
       (Eliminate most. Refer to logic for details.)
   -v, --verbose
       Verbose mode. Show additional diagnostics.

By default, input is from STDIN and output is to STDOUT. The
string "-" may be given as one of the input pathnames to read STDIN.

EXAMPLES
--------

Swap the special characters but leave the rest alone:

   $progname  -s <filename

END_OF_SYNTAX
}

