#!/usr/local/bin/perl -w

# $Id: ians,v 1.37.1.5 1999/06/01 14:48:49 brianr Exp $

$ourexefile=$0;
$version='$Revision: 1.37.1.5 $ ';
$version =~ m/\ (.*) /;
$version = "$1";
$fullversion = "IANS $1";
$urlctr = 0 ;

print "$$ [". time2str() ."] $version \n";
STDOUT->flush();

#  Internet Alternate Namespace 
#  Copyright (C) 1998 Brian Ristuccia <brianr@osiris.978.org>
#  With contributions by Bennett Haselton <bennett@peacefire.org>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software  
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# 

# Where's the redirection form website?
$website="http://www.osiris.978.org/~brianr/ians/";

{
  package RefFixer;
  require HTML::Filter;
  @ISA=qw(HTML::Filter);
  
  sub output {
    my $self = shift;
    my $text = $_[0];
    if ($self->{prohibtags}) {
#      if ( $text =~ m/\<\! IANS-VERSION \>/ ) {
#	push(@{$self->{fhtml}}, "*" . $main::fullversion);
#	return; 
#      } #elsif ( $_[0] eq '<! IANS VERSION >' )

      #push(@{$self->{fhtml}}, "<!-- JavaScript Removed for Security Reasons. Nesting " . $self->{prohibtags} . " -->");
      1;
    } else {
      # Skip javascript and other cruft. 
      push(@{$self->{fhtml}}, $text); 
    }
  }
  sub filtered_html { join("", @{$_[0]->{fhtml}}) }
  
  sub start {
    my $self = shift;
    my %props = %{$_[1]};
    my @order = @{$_[2]};
    my $fixurl;
    my $altbase;
    $self->{prohibtags}++ if $_[0] eq "script";
    $_[0] = "X-$_[0]" if $_[0] eq "meta";
    $compltext="<$_[0]";
    foreach $prop (@order) {

      if ( $prop =~ m/^on.+/ ) {
	# No javascrit on* tags. Comment em out.
	$props{$prop} = "// " . $props{$prop}
      }
      if ( $prop =~ m/(href|src|background|action)/ ) {
	$fixurl=undef;
	if ( ($props{$prop} =~ m/^[^:\/]*:/)  ) {
	  # Reference like http://www.site.com/file/
	  $fixurl="$props{$prop}";
	} elsif ($props{$prop} =~ m/^\// ) {
	  # root reference
	  $altbase = $main::base;
	  $altbase =~ m/([^:\/]*:\/\/[^\/]*)\//;
	  
	  $fixurl="$1$props{$prop}";
	} else {
	  #wicked relative
	  my $url1 = new URI::URL "$props{$prop}", "$main::base";
	  $fixurl=$url1->abs;
	}
	$fixurl = main::EncodeURL($fixurl);
	if ($fixurl) { $props{$prop}="/$fixurl"; };
      }
      $compltext="$compltext $prop=\"$props{$prop}\"";     
    }  
    $compltext="$compltext\>";
    $self->SUPER::start($_[0],$_[1],$_[2],$compltext);
  }
  
  sub end
    {
      my $self = shift;
      $self->SUPER::end(@_);
      $self->{prohibtags}-- if $_[0] eq "script";
    }
}

use HTTP::Daemon;
use HTTP::Status;
use HTTP::Date;
use LWP::UserAgent;
use URI::URL;
use Net::hostent;
use POSIX ":sys_wait_h";
use MIME::Base64;
use IO::Socket;
use HTML::Entities ();

# Set up our user agent to snag stuff for us. We're doing this before we fork
# because the kernel can copy-on-write parts of the UA as neccessary much 
# faster than perl can set up entire UA's.
$ua = new LWP::UserAgent;

# If our local squid is up, use it. 
$ua->env_proxy();

# Set the UA string
$ua->agent("$fullversion " . $ua->agent);

# Discourage abuse
@abuse_strings = (
		  # PHF Exploit
		  'phf?(.*?)(passwd|group|htpasswd|xterm|rxvt)',
		  # Chargen provides a continuous stream of characters, and can be used
		  # to flood out network links or deplete resources on the proxy.
		  '^http://(.*?):(chargen|19)'
		 );

sub IsAbuseURL {
  my($url) = @_;
  foreach $string (@abuse_strings) {
    if ($url =~ /$string/i) { 
      return 1;
    }
  }
  return 0;
}

sub DecodeFragment {
  print $urldsock "dec @_\n";
  $baz=<$urldsock>;
  chomp $baz;
  return $baz;
}

sub EncodeFragment {
  print $urldsock "enc @_\n";
  $baz=<$urldsock>;
  chomp $baz;
  return $baz;
}

sub EncodeURL {
  return EncodeFragment(@_);
    #$originalurl;
}

sub DecodeURL {
  my($originalurl) = @_;
  # $encodedurl =~ s/(..)/chr(hex($1))/eg;
  return DecodeFragment($originalurl);
    #decode_base64($originalurl);
  #return $originalurl;
}

# set up as a web server listening on whatever port
$d = new HTTP::Daemon(LocalPort => 8802,
		      Listen => 32,
		      Reuse => 1)
  or die("Couldn't establish listening socket for HTTP server");

# Sometimes we have socket accepts that fail, but this doesn't mean it's a 
# permenant failure condition. This keeps the program looping forever. 
while (1) {
  # Get a connection
  while ($c = $d->accept) {
    # Clean up any left over zombies
    while (($deadpid= waitpid(-1,&WNOHANG)) > 0) { 1; }
    
    # Now we fork off a separate copy of ourselves to read and process the
    # Reques. This makes us immediately available to process other connections.
    
    while ( !defined ($forkvalue = fork())) { 
      print "$$ [" . time2str . "] Couldn't fork -- collecting zombies, pausing, and then trying again.\n";
      STDOUT->flush();
      while (($deadpid= waitpid(-1,&WNOHANG)) > 0) { 1; }
      sleep 1;  
    }
    
    if ( $forkvalue eq "0" ) {
      
      # Get a connection to the URL daemon.
      while ( !defined ($urldsock = IO::Socket::INET->new('127.0.0.1:9000')) ) {
	# Couldn't connect? Maybe it got hosed. Start another. 
	system('urld') or die "Couldn't start URL Daemon: $!";
      }

      # Try to resolve the remote machine's hostname
      if ($remotehostname = gethost($c->peerhost)) {
	$remotehostname=$remotehostname->name;
      } else {
	$remotehostname=$c->peerhost;
      }
      
      # sometimes we don't get a request right away, so set up the status
      # indicator.
      $0="$ourexefile $remotehostname (0)\000\000";
      
      # Set a timeout on this socket so we can't get busied out forever. We
      # don't set the timeout right away because resolving the host on its
      # initial connection may take some time.
      $c->timeout(32);
      
      # Now we get the request. Note that since I implemented Keep-Alive, we
      # can get multiple requests on the same connection. 
      while  ($r = $c->get_request) {
	# Set some information about our current status
	$numberofconnections++;	
	$0="$ourexefile $remotehostname ($numberofconnections)\000\000";
	
	# Display some information for the log file
	print "$$ [" . time2str . "] Request: " . $r->url . " from " . $remotehostname . ":" 
	  . $c->peerport . " : ";
	
	# Look at the Host: header to determine the hostname we were 
	# accessed by, and use it in all redirects. We have no idea if 
	# we're being accessed through a port
	# redirector, DNS alias, etc. If we have't been told, make a
	# reasonable guess. 
	if (! ($myhost = $r->headers->header(Host))) {
	  $myhost= $c->sockhost() . ":" . $c->sockport();
	}
	$MYURL="http://" . $myhost. "/";
	

	# Get the path that was requested from the server and remove any 
	# leading slash.
	my $temppath = $r->url->full_path . "";

	if ($temppath =~ s!^/via-another-proxy-from/([^/]+)/!!) {
	  # Perhaps this could be abused to make an attack appear to come
	  # from another host, but they could do that via proxy servers or
	  # the Via: header anyway, so it's no real loss. 
	  $reallyfrom="$1, ";
	  #print "REALLY FROM $1 -- adjusted to $temppath -- \n";
	}

	$temppath =~ s/^\///;
	
	
	#print "\n* ---". $temppath . "---\n";
	if ($temppath eq "") {
	  # No URL? Show them a web site where they can enter one.
	  $r->url($website);
	} else {
	  # Separate any query information from GET requests and separate it 
	  # from the path we're about to decode. 
	  if ( $temppath =~ m/(.*)(\?.*)/ ) {
	    $temppath = $1;
	    $querypart =$2;
	  } else {
	    $querypart = "";
	  }
	  
	  # Decode the URL they gave us.
	  $temppath =DecodeURL($temppath);
	  #~ s/(..)/chr(hex($1))/eg;
	  
	  # Now set the request URL to the decoded path plus any query
	  # information
	  $r->url ( $temppath . $querypart );
	}
	
	# And print the decoded and nicified URL into the log. 
	print $r->url . "\n";
	
	#	  if ( $r->url eq "" ) { 
	#  $r->url($website); 
	#} els
	if ( &IsAbuseURL( $r->url ) ) {
	  # Someone thinks they're cool trying old-school phf lameness or other crap
	  $r->url ( $website . "abuse.html" );
	} elsif ( ! ( $r->url =~ m/^(http|ftp):\/\// ) ) {
	  # Unsupported protocol (or perhaps something nasty like file://)
	  $r->url ( $website . "otherprot.cgi?url=" . $r->url );
	}
	
	# If our document has a referer header, then we need to decode it,
	# because it's currently hexified. 
	if ( $referer = $r->headers->header(Referer) ) {
	  # strip us off the referer for sites that check.
	  if ( $referer =~ s/^$MYURL// ) {
	    $referer = DecodeURL($referer);
	    $r->headers->header(Referer => "$referer");
	    #	    print "\n*Referer is $referer\n";
	    #$referer =~ s/(..)/chr(hex($1))/eg;
	  }
	  
	}
	#      print "$$ Referer is fixed as " . $r->headers->header(Referer) . "\n";
	$r->headers->remove_header(Content_Length);
	$r->headers->remove_header(Host);
	$browserhaskeepalive = $r->headers->header(Connection => 'Close');
	
	
	# Inlcude valuable diagnostic information and discourage use as an anonymous
	# means to defraud or vandalize remote sites.
	#$via=$r->headers->header(Via);
	if ( defined ($via = $r->headers->header(Via)) ) {
	  $via=$via.", ";
	} else {
	  $via = "";
	}
	$r->headers->header(Via => $via . $r->protocol() . " $myhost ($fullversion request received from $remotehostname)");
	
	;
	if ( defined ($via=$r->headers->header(X_Forwarded_For)) ) {
	  $via=$via.", ";
	} else {
	  $via="";
	}
	$r->headers->header(X_Forwarded_For => $via . $reallyfrom . $c->peerhost );
	
	
	# show the request as a debug message
	#print $r->as_string;
	
	
	my $res = $ua->simple_request($r);
	#     print "$$ Got response\n";
	if ($res->headers->header(Location)) {
	  # lets check before tacking our URL onto the header... What if we're
	  # already on there (kaboom!)
	  $location=$res->headers->header(Location);
	  if ( !($location =~ m/^$MYURL/) ) {
	    # encode and update the location if it's not already.
	    #	  print "$$ Encoding location $location\n";
	    $location = EncodeURL($location);
	    #~ s/(.)/sprintf("%02x", ord($1))/ge;
	    # make this more relative (like the href encoder - should help
	    # SSL proxies, although might violate RFC a little bit.
	    $res->headers->header(Location => "/$location" );
	  }
	}
	
	#     print "$$ Sent response\n";
	if ( !($res->code =~ m/(400|500|501)/ ) ) {
	  
	  if ( ( $res->headers->header(Content_Type) || '' ) eq "text/html" ) {
	    # we need to do some fixup here.
	    $base=$res->base;
	    #	  print "$$ content base is " . $base . "\n";
	    $res->add_content("<HR><IMG SRC=\"$website/ians-tiny.gif\"> -");
	    $res->add_content("<A HREF=\"$website\"><IMG SRC=\"$website/another.gif\" BORDER=0></A>");
#<HR><CITE>Via Internet Alternate Namespace ' . $version  . ' Copyright &copy; 1998 '.
#			      'Brian Ristuccia</CITE>' .
#			      '<BR>[<A HREF="' . $website . '">visit another site</A>]');
	    $p = RefFixer->new->parse($res->content);
	    $res->content($p->filtered_html);

	    # could trigger keyword blocking. Sorry.
	    #$res->add_content( '[<A HREF="'. $r->url .'">view directly</A>]'. "\n");
	  }
	  
	  # Set the Content-Length so Keep-Alive doesn't break.
	  $res->headers->header(Content_Length => length($res->content()));
	  
	  # If the browser had Keep-Alive, enable it.
	  $res->headers->header(Connection => $browserhaskeepalive);
	  
	  # This header was returned by the server, and may have no basis
	  # in reality.
	  $res->headers->remove_header('Keep-Alive');
	  
	  $c->send_response($res);
	  
	} else {
	  #if ($res->code == 400) {
	  #  # Send user to 'Where do you want to go?' page. 
	  # $setmeup = $website;
	  # $setmeup =~ s/(.)/sprintf("%02x", ord($1))/ge;
	  # $c->send_redirect("$MYURL" . $setmeup);
	  #}# else {
	  $c->send_error ($res->code, $res->error_as_HTML);
	  #}
	}
	$c->flush;
      }
      $c = undef; 
      

      exit;
    } else {
      # This is what we do if we're the parent of the fork()
      $c = undef;		# close connection
    }
  }
}