#!/usr/bin/perl -w # NAME / AUTHOR # =================================== # htmlcleaner.pl by Eduardo Unda-Sanzana # Antofagasta (CHILE) # # PURPOSE # =================================== # This script cleans and organize the HTML output from # programs such as Open Office. # use Text::Wrap; # Read arguments from the command line. if (@ARGV != 2) { print " \n"; print "htmlcleaner.pl - Eduardo Unda-Sanzana \n"; print " \n"; print "Usage: perl htmlcleaner.pl in out \n"; print " \n"; print "in : File to be processed. \n"; print "out : File to store the cleaned HTML file. \n"; print " \n"; die " \n"; } $in = shift; $out = shift; # Perform some control on the user's input. (-e "$in") || die "$in does not exist in this directory.\n"; (-f "$in") || die "$in it is not a plain file.\n"; (-e "$out") && die "$out exists in this directory.\n"; # Open the file. open(IN,"$in") || die "Sorry, I couldn't open $in\n"; open(OUT,">$out") || die "Sorry, I couldn't open $out for writing.\n"; undef $/; # This allows reading the whole file in one single string. $content = ; $/ = "\n"; # Then I restore the undef variable to its normal behaviour. # $content =~ s/([0-9])([0-9])([0-9])([0-9])-([0-9])([0-9])-([0-9])([0-9])/$7$8\/$5$6\/$3$4/g; # Remember to first eliminate everything useless. Only then # format the output. # Removes new lines. $content =~ s/\n/ /g; # Remove everything prior to and after body. This might be optional in a later version. $content =~ s/^(.*)//ig; $content =~ s/<\/BODY(.*?)$//ig; # Remove the header and footer div created by Open Office. $content =~ s/
(.*?)<\/DIV>//g; $content =~ s/
(.*?)<\/DIV>//g; # Remove useless blank space. $content =~ s/\s+/ /g; $content =~ s/>\s//ig; $content =~ s/<\/FONT>//ig; # Eliminate language stuff. $content =~ s///ig; $content =~ s/<\/SPAN>//ig; # Eliminate multicolumn stuff. $content =~ s///ig; $content =~ s/<\/MULTICOL>//ig; # Eliminate column stuff. $content =~ s///ig; # Remove parameters from inside tags. Also capitalize. $content =~ s/<[Hh]([1-6])(.*?)>//g; $content =~ s/<\/[Hh]([1-6])>/<\/h$1>/g; $content =~ s/<[Pp](.*?)>/

/g; $content =~ s/<\/[Pp]>/<\/p>/g; $content =~ s//

    /g; $content =~ s/<\/OL>/<\/ol>/g; $content =~ s//
      /g; $content =~ s/<\/UL>/<\/ul>/g; $content =~ s//
    • /g; $content =~ s/<\/LI>/<\/li>/g; $content =~ s///g; $content =~ s/<\/B>/<\/b>/g; $content =~ s///g; $content =~ s/<\/I>/<\/i>/g; $content =~ s///g; $content =~ s/<\/U>/<\/u>/g; $content =~ s///g; $content =~ s///g; $content =~ s/<\/A>/<\/a>/g; $content =~ s///g; $content =~ s/<\/SUP>/<\/sup>/g; $content =~ s///g; $content =~ s/<\/SUB>/<\/sub>/g; $content =~ s///g; $content =~ s/<\/TABLE>/<\/table>/g; $content =~ s///g; $content =~ s/<\/TR>/<\/tr>/g; $content =~ s///g; $content =~ s/<\/TD>/<\/td>/g; # Format table tags. # Delete empty structures. $content =~ s/

      \s*
      \s*<\/p>//g; $content =~ s/

      \s*
      \s*<\/p>//g; $content =~ s/

      \s*
      \s*<\/p>//g; $content =~ s/

      \s*
      \s*<\/p>//g; $content =~ s/­//g; # Check headers for img tags. $content =~ s/(.*?)()(.*?)<\/h([1-6])>/$2$4<\/h$5>$3/g; # Check

    • structures. Don't mess with the order in which I'm checking # this. It took me days to figure out that the first one must be indeed # the first one... By the way, the first test includes a nifty piece of # code. Note that the weird structure in the middle of the pattern instructs # the s operator to *not* match the pattern

      . In this way I can avoid # a problem arising from the combination of left-to-right replacement plus # greediness. So that you can remember what I'm talking about, consider # this bit of uncleaned HTML code: # #

      1. One.

      2. Two.

      # # How do you tell the s operator to pick: # #
    • Two.

      # # and do something with it? It is not trivial because something like # # s/
    • (.+?)<\/p><\/ol>/do something/g # # will take everything after the *first*

    • . # $content =~ s/
    • ((?:(?!

      ).)*)<\/p><\/ol>/

    • $1<\/li><\/ol>/sg; $content =~ s/
      1. (.+?)<\/p><\/ol>/

        1. $1<\/li><\/ol>/g; $content =~ s/
          1. (.+?)<\/p>

          2. /
            1. $1<\/li>
            2. /g; $content =~ s/
            3. ((?:(?!

              ).)*)<\/p><\/ul>/

            4. $1<\/li><\/ul>/sg; $content =~ s/
    • /g; $content =~ s///g; $content =~ s//\n
      \n/g; $content =~ s/<\/tr><\/table>/<\/tr><\/table>\n/g; $content =~ s// \n/g; $content =~ s/<\/td><\/tr>/<\/td>\n <\/tr>\n/g; $content =~ s/<\/td>/<\/td>\n/g; $content =~ s/<\/td>/\n<\/td>/g; $content =~ s/(.*?)<\/td>/format_td($1,$2)/ges; sub format_td { my($parameters,$text) = @_; my $string; $initial_tab = " "; # Space before first lines. $subsequent_tab = " "; # And the same for the other lines. $string = " "; $Text::Wrap::columns = 72; $string .= wrap($initial_tab, $subsequent_tab,$2); $string .= "\n "; return $string; } # Preserve content from "Sections" but do away with the tags. $content =~ s/
      (.*?)<\/DIV>/$1/isg; # Respect but format footnotes. They have part of the text! $content =~ s/
      (.*?)<\/DIV>/\n

      \nNotas al pie\n<\/h2>\n\n$1\n/gs; $content =~ s/
      (.*?)<\/DIV>/\n\n$1\n/gs; # Remove redundant new lines. Ugly, I know, but it works :-) $content =~ s/<\/p>\n\n+

      /<\/p>\n\n

      /sg; $content =~ s/<\/li>\n+<\/ol>/<\/li>\n<\/ol>/sg; $content =~ s/<\/li>\n+<\/ul>/<\/li>\n<\/ul>/sg; # Customize the file for my own use, adding standard header and footer. # You may change this by whatever you need. print OUT <<'END'; END print OUT $content; print OUT <<'END'; END close(OUT); close(IN);