#!/util/bin/perl
#
# check_url - fetch a page and verify that every link on it responds.
#
# Usage: check_url <url>
#
# Prints a count of reachable links, then one count per HTTP status
# line for the unreachable ones.

use strict;
use warnings;
use diagnostics -verbose;

use File::Find;
use HTML::LinkExtor;
use HTTP::Status;    # cwm
use LWP::Simple qw(get head);
use LWP::UserAgent;
use URI::Heuristic;

# BUG FIX: usage message previously omitted the required argument.
my $base_url = shift || die "usage: $0 <url>\n";
my @base_dir = ('/web/local');

# my @web_files = GetWebFiles (@base_dir);
# foreach my $wf (@web_files) {
#     print "$wf\n";
# }

my ($good, $bad) = CheckURL($base_url);

print "Good -> " . scalar(@{$good}) . "\n";
foreach my $err_code (sort keys %{$bad}) {
    print "Bad -> $err_code -> " . scalar(@{$bad->{$err_code}}) . "\n";
}

exit(0);

# GetWebFiles(@dirs) - recursively collect HTML-ish files (names ending
# in "htm"/"html", per the original pattern) under each given directory.
# Returns the list of full paths.
#
# NOTE(review): File::Find does not follow symlinks by default, so
# symlinked subtrees are skipped ("this chokes on symlinks").
sub GetWebFiles {
    # BUG FIX: was "my @base_dir = shift;", which kept only the first
    # directory and silently dropped the rest of the argument list.
    my @base_dir = @_;
    my @rv;

    # Call File::Find directly instead of the old prototype-redefining
    # "sub find(&@)" + glob-aliasing hack, which needed
    # "no warnings 'redefine'" and leaked a package-level *name alias.
    File::Find::find(
        sub {
            # $_ is the basename; $File::Find::name is the full path.
            push @rv, $File::Find::name if $_ =~ /htm[l]*$/;
        },
        @base_dir
    );

    return @rv;
}

# CheckURL($url) - download $url, extract every link on the page, and
# issue a HEAD request for each one.
#
# Returns (\@good, \%bad):
#   @good - URLs that answered successfully
#   %bad  - maps an HTTP status line to the list of URLs that failed
#           with that status
sub CheckURL {
    my $base_url = shift;
    my $good = [];    # array ref: reachable URLs
    my $bad  = {};    # hash ref: status line -> [unreachable URLs]

    # Expand shorthand (e.g. a bare hostname) into a full URL string.
    $base_url = URI::Heuristic::uf_urlstr($base_url);

    # Passing the base URL makes extracted links absolute URI objects.
    my $parser = HTML::LinkExtor->new(undef, $base_url);

    # BUG FIX: LWP::Simple::get() returns undef on failure; feeding that
    # to $parser->parse() produced a confusing downstream error.
    my $html = get($base_url);
    die "$0: could not fetch $base_url\n" unless defined $html;
    $parser->parse($html);
    my @links = $parser->links;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;

    foreach my $linkarray (@links) {
        my @element = @$linkarray;
        my $tag     = shift @element;    # typically <a>, <img>, ...

        # Remaining elements are (attr_name => URI object) pairs.
        while (@element) {
            my ($attr_name, $attr_value) = splice(@element, 0, 2);

            # Only probe schemes we can actually request.
            # BUG FIX: guard against an undefined scheme (e.g. "mailto:"-less
            # fragments), which triggered an uninitialized-value warning.
            my $scheme = $attr_value->scheme;
            next unless defined $scheme
                && $scheme =~ /\b(?:ftp|http|https|file)\b/;

            my $response = $ua->head($attr_value);
            if ($response->is_success) {
                push @{$good}, $attr_value;
            }
            else {
                print "$attr_value: " . $response->status_line . "\n";
                push @{ $bad->{ $response->status_line } }, $attr_value;
            }
        }
    }

    return ($good, $bad);
}