#!/usr/bin/perl -w
# Author     :   Guillaume Durr
#                echo "g.dull@free.fr" | sed 's/l/r/g';echo ".fr"
# Version    :   0.1   oct 29 2004
# Aim        :   Strip presentational tags to make an HTML file "HTML 4 compliant"
# Usage      :   css.pl file.html > new_file.html
#
########################################################################

use strict;
use warnings;

# DOCTYPE emitted ahead of the document when its first line has none.
# The trailing blanks on the second line are part of the historical output.
my $DOCTYPE = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"\n}
            . qq{        "http://www.w3.org/TR/html4/strict.dtd">     \n};

# <meta> inserted just before </head> when no charset declaration was seen.
my $CHARSET_META =
    q{<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">};

# _clean_line($line) -> cleaned line
#
# Applies every stateless clean-up to one line of HTML and returns the
# result.  The order of the substitutions matters (for instance runs of
# blanks are collapsed before the `" >` / `<div >` / `<p >` fix-ups).
# A line that ends up containing only whitespace is returned as the empty
# string, i.e. it disappears from the output, newline included.
sub _clean_line {
    my ($line) = @_;

    # Drop every attribute of <body>, and remove empty spacer paragraphs.
    $line =~ s/<body[^>]*>/<body>/g;
    $line =~ s{<p>&nbsp;</p>}{}g;

    # Presentational tags -> semantic equivalents.  Opening AND closing
    # tags are converted so the result stays balanced.
    $line =~ s{<b>}{<strong>}g;
    $line =~ s{</b>}{</strong>}g;
    $line =~ s{<i>}{<em>}g;
    $line =~ s{</i>}{</em>}g;

    # Remove table rows and cells (the cell content is kept).
    $line =~ s{</t[dr]>}{}g;
    $line =~ s/<t[dr][^>]*>//g;

    # Collapse runs of blanks and tabs into a single space.
    $line =~ s/[ \t]+/ /g;

    # Remove <font> markup and align attributes.
    $line =~ s/<font[^>]*>//g;
    $line =~ s{</font>}{}g;
    $line =~ s/align="?(?:center|left|right)"?//g;

    # Remove comments that open and close on this line.  Non-greedy, so the
    # text between two separate comments survives.
    $line =~ s/<!--.*?-->//g;

    # Useless layout tags: spacer images and <spacer> elements.  Each match
    # stops at the tag's own closing '>'.
    $line =~ s/<img[^>]+"spacer\.gif"[^>]*>//g;
    $line =~ s/<spacer[^>]*>//g;

    # Remove the table wrapper itself, with or without attributes.
    $line =~ s{</table>}{}g;
    $line =~ s/<table[^>]*>//g;

    # Accented-letter entities -> literal characters.
    # NOTE(review): the replacement characters are emitted in whatever
    # encoding this source file is saved in, while the <meta> added by
    # convert_html declares iso-8859-1 -- confirm the two agree.
    $line =~ s/&egrave;/è/g;
    $line =~ s/&eacute;/é/g;
    $line =~ s/&agrave;/à/g;
    $line =~ s/&acirc;/â/g;
    $line =~ s/&ocirc;/ô/g;
    $line =~ s/&icirc;/î/g;
    $line =~ s/&ccedil;/ç/g;

    # Tidy up the blanks left behind inside tags by the removals above.
    $line =~ s/" >/">/g;
    $line =~ s/<div >/<div>/g;
    $line =~ s/<p >/<p>/g;

    # Layout: drop whitespace-only lines, strip leading blanks, and start
    # every image / anchor on a line of its own.
    $line =~ s/^\s+$//g;
    $line =~ s/^[ \t]*//g;
    $line =~ s/(<img)/\n$1/g;
    $line =~ s/(<a)/\n\t$1/g;

    return $line;
}

# convert_html($in, $out)
#
# Reads HTML line by line from the filehandle $in and prints the cleaned
# document to the filehandle $out:
#   * CRLF and lone CR line endings become a single LF;
#   * a DOCTYPE is printed first when the first line does not contain one;
#   * a charset <meta> is inserted before </head> when no
#     "<meta http-equiv ... charset" has been seen up to that line;
#   * every line then goes through _clean_line().
sub convert_html {
    my ($in, $out) = @_;

    my $charset_seen  = 0;
    my $on_first_line = 1;

    while (my $line = <$in>) {
        # "\r\n" must become ONE newline, otherwise every line of a DOS
        # file would be followed by an extra blank line.
        $line =~ s/\r\n?/\n/g;

        if ($on_first_line) {
            print {$out} $DOCTYPE unless $line =~ /<!DOCTYPE/;
            $on_first_line = 0;
        }

        $charset_seen = 1 if $line =~ /<meta http-equiv.+charset/;

        # Reached </head> without any charset declaration: add one.
        if (!$charset_seen && $line =~ m{</head>}) {
            $line =~ s{</head>}{${CHARSET_META}</head>}g;
        }

        print {$out} _clean_line($line);
    }

    return;
}

# main(@args) -> exit status
#
# Command-line entry point: $args[0] is the HTML file to convert and the
# result goes to STDOUT.  Returns 0 on success and 1 when the file name is
# missing; dies when the file cannot be opened.
sub main {
    my @args = @_;

    if (!@args) {
        # STDOUT is normally redirected into the new HTML file, so the
        # usage text has to go to STDERR.
        print STDERR "Missing file name. Usage is : \n\tcss.pl file.html > new_file.html\n";
        return 1;
    }

    my $path = $args[0];

    # Three-argument open: the file name is never interpreted as a mode or
    # a pipe (as a consequence "-" no longer means STDIN).
    open my $fh, '<', $path or die "Can't open $path: $!";
    convert_html($fh, \*STDOUT);
    close $fh;

    return 0;
}

# Run only when executed as a script, so the subs above can be loaded (and
# tested) with do/require without triggering a conversion.
exit main(@ARGV) unless caller;

1;
#   s/<b>/<strong>/g;
