#!/usr/bin/perl # Version 0.1.3 # April 29, 2004 # # The latest version of this file can be found at http://backspace.com/pagify # # This software is copyright (C) 2004 John Emerson. It is distributed # under the terms of the GNU General Public License (GPL). Because it is # licensed free of charge, there is NO WARRANTY, it is provided AS IS. # The author can not be held liable for any damage that might arise from # the use of this software. Use it at your own risk. # # See http://www.gnu.org/copyleft/gpl.html for more details. # # This script removes the cruft from Word's "Save as Web Page..." HTML output, # splits the file into separate HTML pages wherever a "Heading 1 appears", and converts # the endnotes (if any) to footnotes. # # # How to use it: # # 1. "Save as Web Page..." from Microsoft Word. # 2. Edit the file configuration information below. # 3. Run! # # There's still work to be done, but it does make a useful # first pass when coding big documents. # # For more information, visit http://backspace.com/pagify # $source_file = "/path/to/HTML_file_from_Word"; $output_root = "/path/to/output_directory/"; $title_front = qq|Document Title|; $date_stamp = "April 2004"; $language = "en"; # currently avaiable: en, es, fr, pt # for "next" "index" "previous" links $verbose = "0"; # binary 1 or 0 $page_header1 = qq|
| !; if ($p != 1) { $page_nav .= qq!<<$previous | !; } $page_nav .= qq!$index | $next>> | $date_stamp |
]+)margin-left([^>]+)>(.*)
||g; $source =~ s|\3
]*)>(.*)
||g; #if (@all_toc = $source =~ "(\2
]+)>([^\n]*)
)\n") { #for ($i=0;$i<$all_toc;$i=$i+4){ #$replace_with = qq|(.*)<\/p> <\/blockquote>/
\1<\/p><\/li><\/ul>/g; $source =~ s/
\n\n- (.*)<\/p> <\/blockquote>/
\1<\/p><\/li><\/ul>/g; $source =~ s/<\/ul>\n\n
/\n\n/g; $source =~ s|
\n\n\n\n||g; $source =~ s|\n\n||g; $output_filename = "$output_root/$n.htm"; if ($n == 0) { $output_filename = $output_root . "/index.htm"; } @page_notez = $source =~ m/#_ftn([0-9]+)/g; foreach $page_note (@page_notez) { $page_notes[$page_note] = "$output_filename"; } if ($source =~ m/(<([^>]*)>)*<\/h1>(\n)*/) { $source = "$_"; return; } $previous_filename = ($n - 1) . ".htm"; $next_filename = ($n + 1) . ".htm"; $page_nav = qq!
!; if ($n != 1) { $page_nav .= qq!<<$previous | !; } $page_nav .= qq!$index | $next>>
!; if ($n == 0) { $page_nav = ""; } open (OUTPUT_FILE, ">$output_filename"); print OUTPUT_FILE "$page_header1\n"; if ($n == 0) { print OUTPUT_FILE "$title_front\n"; } else { print OUTPUT_FILE "$title_front: $section_title\n"; } print OUTPUT_FILE "$page_header2\n"; print OUTPUT_FILE "$page_nav\n"; print OUTPUT_FILE "$source\n"; close (OUTPUT_FILE); if ($verbose == "1") { print "wrote $output_filename\n"; } $source = "$_"; $index_all = ""; open (INDEX, "< $output_root/index.htm"); while () { $index_all = $index_all . $_; } close (INDEX); foreach $toc_destination (@allToc) { if ($verbose == "1") { print "$toc_destination\n"; } $index_all =~ s/\#$toc_destination/$n.htm\#$toc_destination/g; } open (INDEX_TOC, ">$output_root/index.htm"); print INDEX_TOC $index_all; close (INDEX_TOC); $n++; }

# encode_entities
#
# Rewrites the global $source in place, replacing single-byte characters in
# the 128-255 range with HTML character references: a named entity when the
# byte appears in the table below, a numeric "&#NNN;" reference otherwise.
#
# Takes no arguments and returns nothing useful.  As a side effect it fills
# the package hashes %entity (entity name => one-byte string) and %charentity
# (one-byte string => entity name).
sub encode_entities {
    # Entity name => code point.  atilde/Atilde were previously swapped
    # (195 is the capital letter, 227 the lower-case one).
    my %code_for = (
        aacute => 225, Aacute => 193,
        acirc  => 226, Acirc  => 194,
        agrave => 224, Agrave => 192,
        aring  => 229, Aring  => 197,
        atilde => 227, Atilde => 195,
        auml   => 228, Auml   => 196,
        aelig  => 230, AElig  => 198,
        ccedil => 231, Ccedil => 199,
        eth    => 240, ETH    => 208,
        eacute => 233, Eacute => 201,
        ecirc  => 234, Ecirc  => 202,
        egrave => 232, Egrave => 200,
        euml   => 235, Euml   => 203,
        iacute => 237, Iacute => 205,
        icirc  => 238, Icirc  => 206,
        igrave => 236, Igrave => 204,
        iuml   => 239, Iuml   => 207,
        ntilde => 241, Ntilde => 209,
        oacute => 243, Oacute => 211,
        ocirc  => 244, Ocirc  => 212,
        ograve => 242, Ograve => 210,
        oslash => 248, Oslash => 216,
        otilde => 245, Otilde => 213,
        ouml   => 246, Ouml   => 214,
        szlig  => 223,
        thorn  => 254, THORN  => 222,
        uacute => 250, Uacute => 218,
        ucirc  => 251, Ucirc  => 219,
        ugrave => 249, Ugrave => 217,
        uuml   => 252, Uuml   => 220,
        yacute => 253, Yacute => 221,
        yuml   => 255,
    );

    # "C" is the unsigned-char template; the signed "c" only produced the
    # right byte for values above 127 by wrapping (and warns when it does).
    foreach my $name (keys %code_for) {
        $entity{$name} = pack("C", $code_for{$name});
    }

    # Reverse lookup: one-byte string => entity name.
    foreach my $name (keys %entity) {
        $charentity{$entity{$name}} = $name;
    }

    # Get rid of ampersands first
    # NOTE(review): this substitution acts on $_ rather than $source, and its
    # replacement text ("&;") looks incomplete.  It is kept exactly as found;
    # confirm the intended form against the upstream script before changing.
    s/\&/\&\;/g;

    # Named entities.
    foreach $char (keys %charentity) {
        next if $char eq "&"; # already did ampersands
        $source =~ s/$char/\&$charentity{$char}\;/g;
    }

    # Whatever high-bit bytes remain become numeric references.
    foreach my $code (128 .. 255) {
        my $byte = pack("C", $code);
        $source =~ s/$byte/\&\#$code\;/g;
    }
}

# cleanlines
#
# Normalises whitespace in the global $source, in place.  Takes no arguments.
# The statements run in order; each works on the result of the previous one.
sub cleanlines {
    # Remove every newline-newline-space sequence.
    $source =~ s#\n\n ##g;

    # Drop runs of spaces that directly follow a newline.
    $source =~ s|\n( )+|\n|g;

    # NOTE(review): the next two substitutions look truncated.  As written,
    # each pattern is a single literal newline, and the second replacement
    # uses \1 although its pattern has no capture group.  Presumably they
    # once matched closing paragraph/heading markup; confirm against the
    # upstream script.  They are reproduced here exactly as found.
    $source =~ s|
|/p>\n|g;
    $source =~ s|
|/h\1>\n|g;

    # Collapse any run of two or more newlines to exactly two.
    $source =~ s/\n\n+/\n\n/g;

    # Protect the double newlines with a placeholder, turn the remaining
    # single newlines into spaces, then restore the double newlines.
    $source =~ s/\n\n/%%%makeabreak%%%/g;
    $source =~ s/\n/ /g;
    $source =~ s/%%%makeabreak%%%/\n\n/g;

    # Squeeze runs of spaces down to one space.
    $source =~ s/ +/ /g;
}