#!/usr/bin/perl # Version 0.1.3 # April 29, 2004 # # The latest version of this file can be found at http://backspace.com/pagify # # This software is copyright (C) 2004 John Emerson. It is distributed # under the terms of the GNU General Public License (GPL). Because it is # licensed free of charge, there is NO WARRANTY, it is provided AS IS. # The author can not be held liable for any damage that might arise from # the use of this software. Use it at your own risk. # # See http://www.gnu.org/copyleft/gpl.html for more details. # # This script removes the cruft from Word's "Save as Web Page..." HTML output, # splits the file into separate HTML pages wherever a "Heading 1" appears, and converts # the endnotes (if any) to footnotes. # # # How to use it: # # 1. "Save as Web Page..." from Microsoft Word. # 2. Edit the file configuration information below. # 3. Run! # # There's still work to be done, but it does make a useful # first pass when coding big documents. # # For more information, visit http://backspace.com/pagify # $source_file = "/path/to/HTML_file_from_Word"; $output_root = "/path/to/output_directory/"; $title_front = qq|Document Title|; $date_stamp = "April 2004"; $language = "en"; # currently available: en, es, fr, pt # for "next" "index" "previous" links $verbose = "0"; # binary 1 or 0 $page_header1 = qq| |; $page_header2 = qq|
|; $footnote_separator = qq|


|; $page_footer = qq|
|; ##### End of configuration ##### if ($language eq "en" ) { $previous = "previous"; $index = "index"; $next = "next"; } if ($language eq "es" ) { $previous = "precédente"; $index = "índice"; $next = "proximo"; } if ($language eq "fr" ) { $previous = "précédente"; $index = "index"; $next = "suivant"; } if ($language eq "pt" ) { $previous = "precedente"; $index = "índice"; $next = "seguinte"; } # change line breaks from Mac to Unix `tr '\015' '\012' < '$source_file' > '$source_file.tmp'`; `mv '$source_file.tmp' '$source_file'`; $output_root =~ s/(\.htm|\.html)//; $n = "0"; $p = "0"; if ($verbose == "1") { print "Reading file\n"; } open (SOURCE, "<$source_file") || die("can't open $source_file"); while () { if ($_ =~ "[ ]*]*)>") { &makepage; } else { $source .= $_ } #end if } # end while &makepage; close (SOURCE); while ( $p < $n) { $output_filename = "$output_root/$p.htm"; ## call the first page index.htm if ($p == 0) { $output_filename = $output_root . "/index.htm"; } if ($verbose == "1") { print "testing $output_filename for footnotes\n"; } # do top and bottom navigation $previous_filename = ($p - 1) . ".htm"; $next_filename = ($p + 1) . ".htm"; $page_nav = qq!


!; if ($p != 1) { $page_nav .= qq!<<$previous  |  !; } $page_nav .= qq!$index  |  $next>>$date_stamp
!; if ($p == 0) { $page_nav = ""; } # get this page's footnotes $j = "1"; $f = @page_notes; while ($j < $f) { if ($page_notes[$j] =~ "$output_filename") { $this_page_notes .= "$footnote[$j]\n"; } ## endif $j++; } # write this page open (SOURCE, ">>$output_filename") || die("can't open $output_filename"); if ($this_page_notes) { print SOURCE $footnote_separator; $this_page_notes .= ""; print SOURCE $this_page_notes; } print SOURCE $page_nav; print SOURCE $page_footer; close (SOURCE); $this_page_notes = ""; $p++; } # remove next links from last page if ($verbose == "1") { print "Cleaning last page.\n"; } open (SOURCE, "$output_filename") || die("can't open $output_filename"); open (LAST_PAGE, "> $output_filename.tmp") || die("can't open $output_filename.tmp"); while () { $_ =~ s/ \ \|\  $next\>\><\/a>//; print LAST_PAGE "$_"; } close (SOURCE); close (LAST_PAGE); `mv '$output_filename.tmp' '$output_filename'`; if ($verbose == "1") { print "Done.\n"; } exit; ########## sub makepage { $source =~ s/\r//g; $source =~ s/\ / /g; &cleanlines; $source =~ s|

||g; $source =~ s/<([^>]*)(\n)([^>]*)>/<$1 $3>/g; $source =~ s/]*)>//; $source =~ s/]*)>//; $source =~ s/]*)>//g; $source =~ s/]*)>//g; $source =~ s///g; $source =~ s###g; $source =~ s###g; # remove "SmartTag"s and place names $source =~ s/]|\n)*)>//g; $source =~ s/]|\n)*)>//g; $source =~ s#]|\n)*)>##g; # remove comments $source =~ s///gsx; $source =~ s###g; $source =~ s###g; $source =~ s#dotted'>(.| )+([0-9])+#>#g; $source =~ s/()?\[([0-9]+)\](<\/span>)?<\/span>/\2<\/sup><\/font>/g; $source =~ s|

]+)margin-left([^>]+)>(.*)

|

\3

|g; $source =~ s|

]*)>(.*)

|

\2

|g; #if (@all_toc = $source =~ "(

]+)>([^\n]*)

)\n") { #for ($i=0;$i<$all_toc;$i=$i+4){ #$replace_with = qq|
    $3
|; #$source =~ s#$1#$replace_with#; # } #} $source =~ s/ class=(.*?)( |>)/\2/g; $source =~ s###g; $source =~ s/ //g; $source =~ s/([^<]*)(\n)*([^<]*)/
  • /g; $source =~ s/\x2211/
  • /g; $source =~ s/\xB7/
  • /g; $source =~ s/\s+>/>/g; &encode_entities; $source =~ s/\x93/“/g; $source =~ s/\x94/”/g; $source =~ s/\x91/‘/g; $source =~ s/\x92/’/g; $source =~ s/ //g; $source =~ s/ title=""//g; # recursively remove all nested, empty tags $pattern = qr{<([^>]*)>(?:(??{$pattern})| )*<\/\1>}; $source =~ s/$pattern//g; $source =~ s/
    <\/b>//g; # remove track changes $source =~ s/]*)>//g; $source =~ s/<\/ins>//g; $source =~ s///g; if ($p == 0) { $source =~ s/\.\.+ (.*)<\/a>/<\/a>/g; } if ($source =~ m/(]*)>(<([^>]*)>)*(.*)<\/h1>)(\n)*/) { $section_title = "$1"; $section_title =~ s/<([^>]*)>//g; if ($verbose == "1") { print "\n\n$section_title\n"; } } ### DO FOOTNOTES @allnotez = $source =~ m/(
    (((.*)\n)*?)<\/div>)/g; @allnotez = $source =~ m/(
    (((.*)\n)*?)<\/div>)/g; $count = @allnotez; $i = "0"; while ($i <= $count) { $allnotez[$i+2] =~ s/style='(.*?)'//g; $allnotez[$i+2]=~ s/]*)>//g; $allnotez[$i+2] =~ s/<\/span>//g; $allnotez[$i+2] =~ s/\[([0-9]+)\]/\1<\/sup><\/font>/g; $footnote[$allnotez[$i+1]] = "$allnotez[$i+2]"; $i = $i + 5; } $source =~ s/(
    ((.*)\n)*?<\/div>)//g; $source =~ s/(
    (((.*)\n)*?)<\/div>)//g; @allToc = $source =~ m/"(_Toc[0-9]+)"/g; $source =~ s/]*)>//g; $source =~ s/<\/div>//g; $source =~ s/]*)>//g; $source =~ s/<\/span>//g; $source =~ s/style='(([^>]|\n| )*)'//g; $source =~ s/<([^>]*)[ ]+>/<\1>/g; $source =~ s/\[([0-9]+)\]/\1<\/sup><\/font>/g; $source =~ s/>\[([0-9]+)\]\1<\/sup><\/font>
    \n\n
    \n\n//g; &cleanlines; # get those last empty tags $source =~ s/<([^>]*)>[ ]*<\/\1>\n\n//g; # fix those bullets in the blockquotes $source =~ s/

  • (.*)<\/p> <\/blockquote>/
    • \1<\/p><\/li><\/ul>/g; $source =~ s/

    • (.*)<\/p> <\/blockquote>/
      • \1<\/p><\/li><\/ul>/g; $source =~ s/<\/ul>\n\n

          /\n\n/g; $source =~ s|
          \n\n
          \n\n||g; $source =~ s|
    • \n\n
      \n\n||g; $output_filename = "$output_root/$n.htm"; if ($n == 0) { $output_filename = $output_root . "/index.htm"; } @page_notez = $source =~ m/#_ftn([0-9]+)/g; foreach $page_note (@page_notez) { $page_notes[$page_note] = "$output_filename"; } if ($source =~ m/

      (<([^>]*)>)*<\/h1>(\n)*/) { $source = "$_"; return; } $previous_filename = ($n - 1) . ".htm"; $next_filename = ($n + 1) . ".htm"; $page_nav = qq!

      !; if ($n != 1) { $page_nav .= qq!<<$previous  |  !; } $page_nav .= qq!$index  |  $next>>

      !; if ($n == 0) { $page_nav = ""; } open (OUTPUT_FILE, ">$output_filename"); print OUTPUT_FILE "$page_header1\n"; if ($n == 0) { print OUTPUT_FILE "$title_front\n"; } else { print OUTPUT_FILE "$title_front: $section_title\n"; } print OUTPUT_FILE "$page_header2\n"; print OUTPUT_FILE "$page_nav\n"; print OUTPUT_FILE "$source\n"; close (OUTPUT_FILE); if ($verbose == "1") { print "wrote $output_filename\n"; } $source = "$_"; $index_all = ""; open (INDEX, "< $output_root/index.htm"); while () { $index_all = $index_all . $_; } close (INDEX); foreach $toc_destination (@allToc) { if ($verbose == "1") { print "$toc_destination\n"; } $index_all =~ s/\#$toc_destination/$n.htm\#$toc_destination/g; } open (INDEX_TOC, ">$output_root/index.htm"); print INDEX_TOC $index_all; close (INDEX_TOC); $n++; }

# encode_entities
#
# Replaces every byte in the range 0x80-0xFF of the package global $source
# with an HTML character reference, editing $source in place:
#
#   * bytes listed in the table below become a named reference, e.g. the
#     byte 0xE9 becomes "&eacute;";
#   * every other high-bit byte becomes a decimal numeric reference, e.g.
#     the byte 0x93 becomes "&#147;".
#
# Takes no arguments and returns nothing.  Bytes below 0x80 -- including
# "&", "<" and ">" -- are left untouched, so markup and character references
# already present in $source survive unchanged.  $_ is not read or modified:
# the caller keeps its pending input line there.
#
# The table uses ISO-8859-1 code points, so $source is presumably expected to
# be a Latin-1 / Windows-1252 byte string -- TODO confirm against the files
# Word actually produces.
#
# Side effect kept for compatibility: the package hashes %entity
# (entity name => one-byte string) and %charentity (one-byte string => entity
# name) are (re)populated on every call.
sub encode_entities {
    # Entity name => ISO-8859-1 code point.
    my %codepoint_of = (
        aacute => 225, Aacute => 193,
        acirc  => 226, Acirc  => 194,
        agrave => 224, Agrave => 192,
        aring  => 229, Aring  => 197,
        atilde => 227, Atilde => 195,
        auml   => 228, Auml   => 196,
        aelig  => 230, AElig  => 198,
        ccedil => 231, Ccedil => 199,
        eth    => 240, ETH    => 208,
        eacute => 233, Eacute => 201,
        ecirc  => 234, Ecirc  => 202,
        egrave => 232, Egrave => 200,
        euml   => 235, Euml   => 203,
        iacute => 237, Iacute => 205,
        icirc  => 238, Icirc  => 206,
        igrave => 236, Igrave => 204,
        iuml   => 239, Iuml   => 207,
        ntilde => 241, Ntilde => 209,
        oacute => 243, Oacute => 211,
        ocirc  => 244, Ocirc  => 212,
        ograve => 242, Ograve => 210,
        oslash => 248, Oslash => 216,
        otilde => 245, Otilde => 213,
        ouml   => 246, Ouml   => 214,
        szlig  => 223,
        thorn  => 254, THORN  => 222,
        uacute => 250, Uacute => 218,
        ucirc  => 251, Ucirc  => 219,
        ugrave => 249, Ugrave => 217,
        uuml   => 252, Uuml   => 220,
        yacute => 253, Yacute => 221,
        yuml   => 255,
    );

    # chr() yields the single byte directly; the signed "c" pack template
    # would wrap (and warn) for every value above 127.
    %entity     = ();
    %charentity = ();
    for my $name (keys %codepoint_of) {
        my $byte = chr $codepoint_of{$name};
        $entity{$name}     = $byte;
        $charentity{$byte} = $name;
    }

    # Named reference when the byte is in the table, numeric otherwise.
    my $reference_for = sub {
        my ($byte) = @_;
        return exists $charentity{$byte}
            ? "&$charentity{$byte};"
            : '&#' . ord($byte) . ';';
    };

    # One pass over $source.  The replacement text contains only ASCII, so
    # nothing it produces can be matched and encoded a second time.
    $source =~ s/([\x80-\xFF])/$reference_for->($1)/ge;

    return;
}

sub cleanlines { $source =~ s#

      \n\n

      ##g; $source =~ s|\n( )+|\n|g; $source =~ s||/p>\n|g; $source =~ s||/h\1>\n|g; $source =~ s/\n\n+/\n\n/g; $source =~ s/\n\n/%%%makeabreak%%%/g; $source =~ s/\n/ /g; $source =~ s/%%%makeabreak%%%/\n\n/g; $source =~ s/ +/ /g; }